#include "TIPsDatabase.h"

//NOTE(review): this listing had lost its line breaks and everything that was
//written between angle brackets (template arguments). The element types
//restored below are inferred from how each container is used inside the
//function bodies; confirm them against the declarations in TIPsDatabase.h.

//shared by every TIPsDatabase instance
Database   *TIPsDatabase::m_db       = 0;
const char *TIPsDatabase::m_feederid = 0;

//public

//Stores the database connection and registers this feeder program with it.
TIPsDatabase::TIPsDatabase(Database *_db) {
  m_db = _db;
  registerFeeder();
}

//Releases the feeder id string that registerFeeder() obtained.
TIPsDatabase::~TIPsDatabase() {
  if (m_feederid) {free((void*)m_feederid); m_feederid = 0;}
}

//Saves a Summary through the padd_summary procedure.
//returns: the single numeric result of the procedure, or 0 when none was written
//argt is handed straight to Database::execute (presumably it marks the string arguments - confirm)
int TIPsDatabase::save(Summary *object, char **sError) {
  const char *argv[] = {m_feederid, object->m_title, object->m_body};
  const bool  argt[] = {false, true, true};
  int result = 0; //was uninitialised: a failed call returned an indeterminate value

  m_db->execute("padd_summary", 3, argv, argt, &result, sError); //single numeric result procedure
  return result;
}

//Saves an Article through the padd_article procedure.
//The tags and entities vectors are serialised to temporary multi-value strings
//which are released again on both the normal and the exceptional path.
//returns: the single numeric result of the procedure, or 0 when none was written
//throws:  whatever Database::execute throws (re-thrown after the cleanup)
int TIPsDatabase::save(Article *object, char **sError) {
  const int externalsourceid = object->m_ir->internetURIRequest()->domain()->domainID(); //assumption->assumption->assumption :)
  char sexternalsourceid[32];
  _SNPRINTF(sexternalsourceid, sizeof(sexternalsourceid), "%i", externalsourceid);

  const char *tags     = vectorToMultiValue(&object->m_tags);
  const char *entities = vectorToMultiValue(&object->m_entities);
  const char *url      = object->m_ir->internetURIRequest()->relativeHREF();

  const char *argv[] = {m_feederid, sexternalsourceid, url, object->m_title, object->m_body, object->m_origonal, entities, object->m_description, tags};
  const bool  argt[] = {false, false, true, true, true, true, true, true, true};
  int result = 0; //was uninitialised

  try {
    m_db->execute("padd_article", 9, argv, argt, &result, sError); //single numeric result procedure
  } catch (...) {
    //free resources and then bubble up the exception
    //vectorToMultiValue(...) cannot return 0 but just in case
    if (entities) free((void*)entities);
#ifdef _LINUX
    //seems to cause a crash in Windows, not in Linux (not sure why!)
    //memory leak in Windows!
    //NOTE(review): vectorToMultiValue() used to write one byte past its buffer
    //for an empty vector, which may have been the cause; re-test before removing this guard
    if (tags) free((void*)tags);
#endif
    throw; //re-throws the exception currently being handled
  }

  //success! free resources
  if (entities) free((void*)entities);
#ifdef _LINUX
  //seems to cause a crash in Windows, not in Linux (not sure why!)
  //memory leak in Windows!
  if (tags) free((void*)tags);
#endif

  return result;
}

//Saves a Company through padd_company and stores the new id on the object.
//Exceptions are swallowed (best effort); 0 is returned when no id was obtained.
int TIPsDatabase::save(Company *object, char **sError) {
  const char *argv[] = {m_feederid, object->m_name};
  const bool  argt[] = {false, true};
  int result = 0; //was uninitialised: the swallowed-exception path returned garbage

  try {
    m_db->execute("padd_company", 2, argv, argt, &result, sError); //single numeric result procedure
    object->id(result); //setting the id will set the DBObject to inDB = true
  } catch (...) {
    //Database::QueryFailed
    //Database::RequestFailed
  }
  return result;
}

//Saves a Product through padd_product and stores the new id on the object.
//Exceptions are swallowed (best effort); 0 is returned when no id was obtained.
int TIPsDatabase::save(Product *object, char **sError) {
  const char *argv[] = {m_feederid, object->m_name, object->m_company};
  const bool  argt[] = {false, true, true};
  int result = 0; //was uninitialised

  try {
    m_db->execute("padd_product", 3, argv, argt, &result, sError); //single numeric result procedure
    object->id(result); //setting the id will set the DBObject to inDB = true
  } catch (...) {
    //Database::QueryFailed
    //Database::RequestFailed
  }
  return result;
}

//Saves a Country through padd_country and stores the new id on the object.
//Exceptions are swallowed (best effort); 0 is returned when no id was obtained.
int TIPsDatabase::save(Country *object, char **sError) {
  const char *argv[] = {m_feederid, object->m_name};
  const bool  argt[] = {false, true};
  int result = 0; //was uninitialised

  try {
    m_db->execute("padd_country", 2, argv, argt, &result, sError); //single numeric result procedure
    object->id(result); //setting the id will set the DBObject to inDB = true
  } catch (...) {
    //Database::QueryFailed
    //Database::RequestFailed
  }
  return result;
}

//Saves a Person through padd_person and stores the new id on the object.
//A missing full name is sent as an empty string.
//Exceptions are swallowed (best effort); 0 is returned when no id was obtained.
int TIPsDatabase::save(Person *object, char **sError) {
  const char *argv[] = {m_feederid, object->m_name, (object->m_fullname ? object->m_fullname : "")};
  const bool  argt[] = {false, true, true};
  int result = 0; //was uninitialised

  try {
    m_db->execute("padd_person", 3, argv, argt, &result, sError); //single numeric result procedure
    object->id(result); //setting the id will set the DBObject to inDB = true
  } catch (...) {
    //Database::QueryFailed
    //Database::RequestFailed
  }
  return result;
}

//Product types are not written to the database: always returns 0.
int TIPsDatabase::save(ProductType *object, char **sError) {return 0;}

//Loads the object parser settings of one page group into the given FilterGroup.
//One Filter is created per row of pxobjectparsersettings; the FilterGroup owns them.
//returns: the result of Database::execute (0 = rows were loaded)
int TIPsDatabase::loadHTMLObjectParserSettings(const int pagegroupid, FilterGroup *settings, char **sError) const {
  RecordSet *recordset = 0;
  const char *pattern;
  unsigned int propertyid, type;
  bool required;
  char spagegroupid[32];

  _SNPRINTF(spagegroupid, sizeof(spagegroupid), "%i", pagegroupid);
  const char *argv[] = {spagegroupid};
  const bool  argt[] = {false};

  int ret = m_db->execute("pxobjectparsersettings", 1, argv, argt, &recordset, sError, true); //managed recordSet (frees value results itself)
  if (!ret) {
    for (RecordSet::iterator i = recordset->begin(); i != recordset->end(); i++) {
      pattern    = i(DBF_PARSERSETTING_REGEX);
      propertyid = i(DBF_PARSERSETTING_PROPERTYID);
      type       = i(DBF_PARSERSETTING_TYPE);
      required   = i(DBF_PARSERSETTING_REQUIRED);

      //new filter holds the settings: the FilterGroup will clear them up
      //(make_pair replaces the explicit pair whose template arguments were lost in this listing)
      settings->insert(make_pair(propertyid, new Filter(
        propertyid,                     //id
        type,                           //(runorder)
        pattern,
        "\\1",                          //replacement: is always the first capture group
        0,                              //exceptions
        (required ? "" : (char*)0),     //description
        (REGEX_FLAGS)0,                 //flags
        false                           //multipass
      )));
    }
    delete recordset;
    recordset = 0;
  }

  return ret;
}

//Loads the tag filters into fg.
int TIPsDatabase::loadTagFilters(FilterGroup *fg) const {
  //note that the tag is in the description field of the filter
  return loadFilters(fg, "ptagfilters");
}

//Loads every entity returned by pqualifierset_entities and inserts it into the
//map under its first name. Rows with an empty name are skipped.
//returns: the result of Database::execute (0 = rows were loaded)
//throws:  NoObjectType / UnknownObjectType when a row carries an unusable type
int TIPsDatabase::loadEntities(StringMultiMapCI *entities, char **sError) {
  RecordSet *recordset = 0;
  DBEntity *entity;
  const char *name;
  int id, type;
  bool commonword;

  //order by first name asc and then the number of words desc (so that longer names are checked first)
  int ret = m_db->execute("pqualifierset_entities", 0, 0, 0, &recordset, sError, true); //managed recordSet (frees value results itself)
  if (!ret) {
    try {
      for (RecordSet::iterator i = recordset->begin(); i != recordset->end(); i++) {
        id         = i(DBF_QUALIFIERSET_ENTITYID);
        type       = i(DBF_QUALIFIERSET_TYPE);
        name       = i(DBF_QUALIFIERSET_NAME);
        commonword = i(DBF_QUALIFIERSET_COMMONWORD);

        if (name && *name) {
          //name is not used, its parts are strdup'd
          switch (type) {
            case DBF_OBJECT_COMPANY:     {entity = new Company(    this, name, commonword, id); break;}
            case DBF_OBJECT_COUNTRY:     {entity = new Country(    this, name, commonword, id); break;}
            case DBF_OBJECT_PERSON:      {entity = new Person(     this, name, commonword, id); break;}
            case DBF_OBJECT_PRODUCT:     {entity = new Product(    this, name, commonword, id); break;}
            case DBF_OBJECT_PRODUCTTYPE: {entity = new ProductType(this, name, commonword, id); break;}
            case DBF_OBJECT_NOT: { //0 or null
              //the format has two specifiers: passing type as well shifted id into the %s slot
              DEBUGERROR("[TIPsDatabase]: NoObjectType() Entity has no type during instanciation [%i,%s]", id, name);
              throw NoObjectType();
            }
            default: {
              DEBUGERROR("[TIPsDatabase]: UnknownObjectType(%i) Entity during instanciation [%i,%s]", type, id, name);
              throw UnknownObjectType(type);
            }
          }
          entities->insert(make_pair(entity->firstname(), entity));
        }
      }
    } catch (...) {
      //do not leak the recordset when a row is rejected
      delete recordset;
      throw;
    }
    delete recordset;
    recordset = 0;
  }

  return ret;
}

//Loads the MD5 of every document currently in the system into md5s.
//returns: the result of Database::execute (0 = rows were loaded)
int TIPsDatabase::loadMD5s(StringMap *md5s, char **sError) {
  //get all the MD5s of all the current documents in the system
  //this allows us to check immediately if the document is already in
  //the database or not
  //MD5 = 32 digit hexadecimal
  //100,000 documents = 3,200,000 bytes of MD5
  //roughly 4MB per 100k documents checked (with char* pointer and map)
  RecordSet *recordset = 0;

  int ret = m_db->execute("pmd5s", 0, 0, 0, &recordset, sError, false); //unmanaged recordSet
  if (!ret) {
    for (RecordSet::iterator i = recordset->begin(); i != recordset->end(); i++) md5s->insert(make_pair(i(DBF_MD5), 1));
    delete recordset;
    recordset = 0;
  }

  return ret;
}

//Loads the external sources (Domains), attaches the parsers of their page groups
//and finally initialises all parsers.
//sql:     optional text appended to "select * from pexternalsources() "; may be 0
//domains: receives the new Domain objects (they are appended and not freed here)
//returns: the result of the external sources query (page group query results are not reported)
//throws:  UnknownParserType when a page group names an unsupported parser
//NOTE(review): element type vector<Domain*> is inferred from push_back(domain) - confirm against the header
int TIPsDatabase::loadExternalSources(const char *sql, vector<Domain*> *domains, char **sError) {
  int externalsourceid;
  Domain *domain;
  Parser *parser;
  FilterGroup *fg;
  RecordSet *externalsources = 0, *pagegroups = 0;
  int retEX, retPG;
  const char *startpage;
  //PageGroup fields
  char sexternalsourceid[32];
  int pagegroupid, type, parsertype;
  const char *pagegroupfilter, *pagegroupexcept, *description;

  //select external sources
  if (!sql) sql = ""; //strlen(0) would crash: treat a missing suffix as empty
  char *sqlFull = (char*) mallocCheck(256 + strlen(sql));
  sprintf(sqlFull, "select * from pexternalsources() %s", sql);
  retEX = m_db->execute((const char*) sqlFull, &externalsources, sError, true); //managed recordSet (frees value results itself)
  if (!retEX) {
    for (RecordSet::iterator iES = externalsources->begin(); iES != externalsources->end(); iES++) {
      //create external sources (persistent)
      startpage = iES(DBF_DOMAINS_STARTPAGE);
      domain = new Domain(
        iES(DBF_DOMAINS_DOMAINID),
        iES(DBF_DOMAINS_DOMAIN),
        iES(DBF_DOMAINS_ROOTPAGETITLE),
        (TIPsDatabase*) this,
        0,
        (startpage && *startpage ? startpage : "/"), //a null start page falls back to the root too
        0,
        true
      );
      domains->push_back(domain);
    }
    delete externalsources;
    externalsources = 0;
  }
  free(sqlFull); //the statement text was never released before

  //load the page groups for the domain
  for (vector<Domain*>::const_iterator iD = domains->begin(); iD != domains->end(); iD++) {
    domain = *iD;
    externalsourceid = domain->domainID();
    _SNPRINTF(sexternalsourceid, sizeof(sexternalsourceid), "%i", externalsourceid);
    const char *argv[] = {sexternalsourceid};
    const bool  argt[] = {false};

    retPG = m_db->execute("pxpagegroup", 1, argv, argt, &pagegroups, sError, true); //managed recordSet (frees value results itself)
    if (!retPG) {
      if (pagegroups->size()) {
        try {
          for (RecordSet::iterator iPG = pagegroups->begin(); iPG != pagegroups->end(); iPG++) {
            //fields
            pagegroupid     = iPG(DBF_DOMAINS_PAGEGROUPID);
            type            = iPG(DBF_DOMAINS_TYPEID);
            parsertype      = iPG(DBF_DOMAINS_PARSERTYPE);
            pagegroupfilter = iPG(DBF_DOMAINS_PAGEGROUPFILTER);
            pagegroupexcept = iPG(DBF_DOMAINS_EXCEPTIONSREGXX);
            description     = iPG(DBF_DOMAINS_DESCRIPTION);

            //Filter area (if any): Domain clears these up
            if (pagegroupfilter && *pagegroupfilter && strcmp(pagegroupfilter, ".*"))
              fg = new FilterGroup(new Filter(pagegroupid, type, pagegroupfilter, 0, pagegroupexcept, description, regex::NOCASE));
            else
              fg = 0; //general filter, applies to whole site

            //parser per domain create: Domain clears them up
            //note that these Parsers may additionally query the database for extra per domain settings
            switch (parsertype) {
              case DBF_PARSERTYPE_HTMLGENERALPARSER: {parser = new HTMLGeneralParser(pagegroupid, this, domain, type); break;}
              case DBF_PARSERTYPE_HTMLARTICLEPARSER: {parser = new HTMLArticleParser(pagegroupid, this, domain, type); break;}
              case DBF_PARSERTYPE_PDFARTICLEPARSER:  {parser = new PDFParser(        pagegroupid, this, domain, type); break;}
              case DBF_PARSERTYPE_HTMLOBJECTPARSER:  {parser = new HTMLObjectParser( pagegroupid, this, domain, type); break;}
              default: {
                DEBUGERROR("[TIPsDatabase]: pagegroupid:%i (%s) UnknownParserType(%i)", pagegroupid, description, parsertype);
                DEBUGERROR0(iPG.raw());
                delete fg; //never handed to the Domain, so nobody else would free it
                throw UnknownParserType(parsertype);
              }
            }

            //add
            domain->addParser(parser, fg);
          }
        } catch (...) {
          //do not leak the recordset when a page group is rejected
          delete pagegroups;
          throw;
        }
        //remaining areas of the site are left un-parsed
      } else {
        //no directives so add a general parser for the whole site (checks for Articles and Summaries)
        //domain->addParser(new HTMLGeneralParser(0, this, domain));
      }
      delete pagegroups;
      pagegroups = 0;
    }
  }

  //init parsers: here to avoid concurrent XML HTTP DB requests
  //entities, tagfilters (categories) etc. here to avoid concurrent XML HTTP DB requests
  for (vector<Domain*>::const_iterator iD = domains->begin(); iD != domains->end(); iD++) {
    domain = *iD;
    domain->initParsers();
  }

  return retEX;
}

//Converts a vector of delimited strings into a 2 dimensional multi-value
//{{...},{...},{...}[,{...}]}. Every input string becomes one inner value whose
//elements are split at the delimiter; '{' and '}' are escaped with a backslash.
//appendArraySizer: appends a final inner value holding one comma per element of
//                  the widest row (used to pad the array)
//returns: a malloc'd string, "{}" for an empty vector; caller frees result
//input vector is left untouched
//NOTE(review): element type vector<const char*> is inferred from strlen(*i) - confirm against the header
const char *TIPsDatabase::vectorToMultiValues(const vector<const char*> *v, const char delimiter, const bool appendArraySizer) {
  const char *multi2value, *pos;
  char *newpos, c;
  size_t len = 2;          //{...}
  size_t longest = 0;      //longest input string, bounds the size of the sizer row
  size_t maxelements = 0;  //used to pad the last value to make this a valid square 2 dimensional array
  size_t elementcount = 0; //number of elements in this value

  if (!v->size()) multi2value = strdupCheck("{}");
  else {
    //calc length (escaping, {} and comma)
    for (vector<const char*>::const_iterator i = v->begin(); i != v->end(); i++) {
      const size_t valuelen = strlen(*i);
      len += valuelen*2 + 3;
      if (valuelen > longest) longest = valuelen;
    }
    //the sizer row "{,,,}" was not budgeted for before: 2 brackets plus at most longest+1 commas
    if (appendArraySizer) len += longest + 3;

    //create and fill
    multi2value = (const char*)mallocCheck(len + 1); //zero terminator
    newpos = (char*)multi2value;
    *newpos++ = '{';
    for (vector<const char*>::const_iterator i = v->begin(); i != v->end(); i++) {
      *newpos++ = '{';
      elementcount = 1;
      for (pos = *i; (c = *pos) != 0; pos++) {
        //need to process special characters } for example
        //the single quotes will be escaped by the parameter system
        if (is(c, "}{")) *newpos++ = '\\';
        if (c == delimiter) {
          *newpos++ = ',';
          elementcount++;
        } else *newpos++ = c;
      }
      if (elementcount > maxelements) maxelements = elementcount;
      *newpos++ = '}';
      *newpos++ = ',';
    }
    if (appendArraySizer) {
      *newpos++ = '{';
      for (size_t i = 0; i < maxelements; i++) *newpos++ = ',';
      *newpos++ = '}';
    } else newpos--; //remove trailing comma
    *newpos++ = '}'; //1st dimension bracket close
    *newpos = 0;     //terminate
  }

  return multi2value;
}

//Converts a vector of unsigned integers into a multi-value, e.g. {1,2,33}.
//returns: a malloc'd string, "{}" for an empty vector; caller frees result
//NOTE(review): element type vector<unsigned int> is inferred from the %u format - confirm against the header
const char *TIPsDatabase::vectorToMultiValue(const vector<unsigned int> *v) {
  //an unsigned 32 bit integer prints as at most 10 digits (4294967295)
  //each element needs 10 + 1 (comma), the last comma becomes the closing bracket,
  //plus the opening bracket and the zero terminator: 11 * size() + 2
  //an empty vector still needs 3 bytes for "{}" and its terminator, hence + 3
  const size_t len = v->size();
  char *buffer = (char*) mallocCheck(11 * len + 3);
  char *pos = buffer;

  *pos++ = '{';
  for (vector<unsigned int>::const_iterator i = v->begin(); i != v->end(); i++) pos += sprintf(pos, "%u,", *i);
  if (len) pos--; //overwrite trailing comma
  *pos++ = '}';
  *pos = 0;       //zero terminate

  return buffer;
}

//Parses a multi-value of integers, e.g. "{1,-2,33,4}", and appends them to v.
//"{}" is valid although not returned by PostGRES. A null multivalue adds nothing.
//returns: the size of v after appending
//caller manages the multivalue and the by-reference vector
//NOTE(review): element type vector<int> is inferred from the minus sign handling - confirm against the header
size_t TIPsDatabase::multiValueToVector(const char *multivalue, vector<int> *v) {
  char c;
  const char *first = 0, *i;

  if (multivalue) {
    for (i = multivalue; (c = *i) != 0; i++) {
      if (isNumeric(c) || c == '-') { //number or minus sign
        if (!first) first = i;        //start of a number
      } else if (first) {             //end of a number
        v->push_back(atoi(first));    //also processes the minus signs
        first = 0;
      }
    }
    //a number that ran up to the end of the string was silently dropped before
    if (first) v->push_back(atoi(first));
  }

  return v->size();
}

//Parses a multi-value of unsigned integers, e.g. "{1,2,33,4}", and appends them to v.
//Minus signs are ignored. "{}" is valid although not returned by PostGRES.
//A null multivalue adds nothing.
//returns: the size of v after appending
//caller manages the multivalue and the by-reference vector
//NOTE(review): element type vector<unsigned int> is inferred from the comments - confirm against the header
size_t TIPsDatabase::multiValueToVector(const char *multivalue, vector<unsigned int> *v) {
  char c;
  const char *first = 0, *i;

  if (multivalue) {
    for (i = multivalue; (c = *i) != 0; i++) {
      if (isNumeric(c)) {      //number (ignores minus signs)
        if (!first) first = i; //start of a number
      } else if (first) {      //end of a number
        //strtoul keeps values above INT_MAX intact, atoi did not
        v->push_back((unsigned int) strtoul(first, 0, 10));
        first = 0;
      }
    }
    //a number that ran up to the end of the string was silently dropped before
    if (first) v->push_back((unsigned int) strtoul(first, 0, 10));
  }

  return v->size();
}

//Parses a multi-value of strings, e.g. "{test,o'reilly}", and appends a malloc'd
//copy of every value to v. A backslash escapes the character that follows it.
//Note that "{}" appends one empty string. A null multivalue adds nothing.
//returns: the size of v after appending
//caller manages the multivalue and the by-reference vector (and frees the copies)
//NOTE(review): element type vector<const char*> is a guess (char* is equally possible) - confirm against the header
size_t TIPsDatabase::multiValueToVector(const char *multivalue, vector<const char*> *v) {
  char c;
  const char *start = 0, *i;
  char *newvalue;

  if (!multivalue) return v->size();

  for (i = multivalue; (c = *i) != 0; i++) {
    if (c == '\\') {
      //jump escaped chars \{, \, \' etc. but never step over the zero terminator
      if (i[1]) i++;
    } else {
      if (start && is(c, ",}")) { //value end characters
        //start still points at the opening delimiter, so i-start leaves room for the terminator
        newvalue = (char*) mallocCheck(i - start);
        v->push_back(newvalue);
        for (++start; start < i; ++start) {
          //drop the escaping backslash and keep the escaped character (including an escaped backslash)
          if (*start == '\\' && start + 1 < i) ++start;
          *newvalue++ = *start;
        }
        *newvalue = 0; //zero terminate
        start = 0;     //restart looking for new value
      }
      if (!start && is(c, ",{")) start = i; //value start characters
    }
  }

  return v->size();
}

//Converts a comma separated list of space separated words into a 2 dimensional multi-value.
//input:  a string e.g. "some books, publisher o'reilly"
//output: a string e.g. "{{some,books},{publisher,o'reilly}}"
//returns: a malloc'd string, or 0 when value is 0 (null in = null out); caller frees result
const char *TIPsDatabase::commaspaceDelimitedToMultiValue(const char *value) {
  char c, last = 0;
  const char *pos;
  char *multivalue = 0, *newpos;

  if (value) {
    //every input char expands to at most 3 chars ("},{"), plus "{{", "}}" and the zero terminator
    //(the previous + 3 overflowed for "" and for any input made of commas)
    multivalue = (char*)mallocCheck(strlen(value)*3 + 5);
    newpos = multivalue;
    *newpos++ = '{';
    *newpos++ = '{';
    for (pos = value; (c = *pos) != 0; pos++) {
      //need to process special characters } for example
      //the single quotes will be escaped by the parameter system
      //normalise whitespace
      if (is(c, "}{")) *newpos++ = '\\';
      if (c == ' ') {
        //a space only separates words: not at the start, after a comma or after another space
        if (last && !is(last, " ,")) *newpos++ = ',';
      } else if (c == ',') {
        strcpy(newpos, "},{");
        newpos += 3;
      } else *newpos++ = c;
      last = c;
    }
    *newpos++ = '}';
    *newpos++ = '}';
    *newpos = 0;
  }

  return multivalue;
}

//Converts a delimited string into a 1 dimensional multi-value.
//input:  a string e.g. "some books publisher o'reilly", ' ' (delimiter = space)
//output: a string e.g. "{some,books,publisher,o'reilly}"
//returns: a malloc'd string, or 0 when value is 0 (null in = null out); caller frees the result if not 0
const char *TIPsDatabase::delimitedToMultiValue(const char *value, const char delimiter) {
  char c;
  const char *pos;
  char *multivalue = 0, *newpos;

  if (value) {
    multivalue = (char*)mallocCheck(strlen(value)*2 + 3); //{} and zero terminator and escape chars
    newpos = multivalue;
    *newpos++ = '{';
    for (pos = value; (c = *pos) != 0; pos++) {
      //need to process special characters } for example
      //the single quotes will be escaped by the parameter system
      if (is(c, "}{")) *newpos++ = '\\';
      if (c == delimiter) *newpos++ = ',';
      else                *newpos++ = c;
    }
    *newpos++ = '}';
    *newpos = 0;
  }

  return multivalue;
}

//Converts a vector of integers into a multi-value, e.g. {1,-2,33}.
//returns: a malloc'd string, "{}" for an empty vector; caller frees result
//NOTE(review): element type vector<int> is inferred from the %i format - confirm against the header
const char *TIPsDatabase::vectorToMultiValue(const vector<int> *v) {
  //a 32 bit integer prints as at most 11 chars including the minus sign (-2147483648)
  //each element needs 11 + 1 (comma), the last comma becomes the closing bracket,
  //plus the opening bracket and the zero terminator: 12 * size() + 2
  //(the previous 11 * size() + 2 was too small for negative values)
  //an empty vector still needs 3 bytes for "{}" and its terminator, hence + 3
  const size_t len = v->size();
  char *buffer = (char*) mallocCheck(12 * len + 3);
  char *pos = buffer;

  *pos++ = '{';
  for (vector<int>::const_iterator i = v->begin(); i != v->end(); i++) pos += sprintf(pos, "%i,", *i);
  if (len) pos--; //overwrite trailing comma
  *pos++ = '}';
  *pos = 0;       //zero terminate

  return buffer;
}

//Converts a vector of strings into a multi-value, e.g. {test,o\'reilly}.
//The characters , { } ' are escaped with a backslash.
//returns: a malloc'd string, "{}" for an empty vector; caller frees result
//NOTE(review): element type vector<const char*> is inferred from strlen(*i) - confirm against the header
const char *TIPsDatabase::vectorToMultiValue(const vector<const char*> *v) {
  //string size: every char possibly escaped (*2), one comma per item (the last one
  //becomes the closing bracket), the opening bracket and the zero terminator
  //an empty vector still needs 3 bytes for "{}" and its terminator, hence + 3
  const size_t len = v->size();
  size_t chars = 0;
  char *buffer, *pos, c;
  const char *value;

  for (vector<const char*>::const_iterator i = v->begin(); i != v->end(); i++) chars += strlen(*i)*2; //*2 for escaping
  pos = buffer = (char*) mallocCheck(chars + len + 3);

  *pos++ = '{';
  for (vector<const char*>::const_iterator i = v->begin(); i != v->end(); i++) {
    value = *i;
    while ((c = *value++) != 0) {
      if (is(c, ",{}'")) *pos++ = '\\';
      *pos++ = c;
    }
    *pos++ = ',';
  }
  if (len) pos--; //overwrite trailing comma
  *pos++ = '}';
  *pos = 0;       //zero terminate

  return buffer;
}

//utilities
//private funcs

//Registers this feeder program (name, version and build) through pregisterfeeder
//and stores the returned id string in the shared m_feederid.
//returns: m_feederid
const char *TIPsDatabase::registerFeeder(char **sError) const {
  const char *argv[] = {FEEDER_NAME, FEEDER_MAJORVERSION, FEEDER_MINORVERSION, FEEDER_LIVEBUILD, FEEDER_VERSIONNAME};
  const bool  argt[] = {true, false, false, false, true};

  DEBUGPRINT("registering feeder program: %s v%s.%s %s build %s", DEBUG_CHECK, FEEDER_NAME, FEEDER_MAJORVERSION, FEEDER_MINORVERSION, FEEDER_VERSIONNAME, FEEDER_LIVEBUILD);
  //ret is kept under this name because the DEBUG_RESULT_OK macro may refer to it (its definition is not visible here)
  int ret = m_db->execute("pregisterfeeder", 5, argv, argt, &m_feederid, sError); //single char* result procedure
  DEBUGPRINT("feederid:%s", DEBUG_LINE, m_feederid);
  DEBUG_RESULT_OK;

  return m_feederid;
}

//Builds one Filter per row of the recordset and merges them into fg.
//returns: always 0
int TIPsDatabase::loadFilters(FilterGroup *fg, RecordSet *recordset, const FilterGroup::enmConflictMode iConflictMode) const {
  /* return from the SQL statement needs to be:
       iFilterId:       id for the filter (can be non-unique as is only stored)
       runOrder:        process() uses this to order the work. matches() does not use this, may be NULL or 0
       regEx
       regExExceptions: see Filter class (negative post-match filter)
       replacement:     matches() does not use this
       description
       flags:           1=Pattern::CASE_INSENSITIVE
       multipass:       runs the replacement until the pattern is not found: maximum tries: _MULTIPASSMAX
       [iFilterGroup]:  this is an optional field to force loading only part of the PQresult based on the final field
                        must be included if a iOnlyFromFilterGroup!=-1 is sent through
  */
  //NOTE(review): fgNewGroup is not deleted here; presumably merge() takes it over - confirm
  FilterGroup *fgNewGroup = new FilterGroup();
  int iRunOrder;

  //this is a managed recordSet (private function) and all the value results will be freed in the destructor of the RecordSet
  for (RecordSet::iterator i = recordset->begin(); i != recordset->end(); i++) {
    iRunOrder = i(DBF_FILTERS_RUNORDER);
    //(make_pair replaces the explicit pair whose template arguments were lost in this listing)
    fgNewGroup->insert(make_pair(iRunOrder, new Filter(
      i(DBF_FILTERS_FILTERID),
      iRunOrder,
      i(DBF_FILTERS_REGEX),
      i(DBF_FILTERS_REPLACEMENT),
      i(DBF_FILTERS_REGEXCEPT),
      i(DBF_FILTERS_DESC),
      (REGEX_FLAGS)i(DBF_FILTERS_FLAGS),
      i(DBF_FILTERS_MULTIPASS)
    )));
  }
  fg->merge(fgNewGroup, iConflictMode);

  return 0;
}

//Runs the given procedure and merges the filters it returns into fg.
//returns: the result of Database::execute, or of loadFilters() when the query succeeded
int TIPsDatabase::loadFilters(FilterGroup *fg, const char *procedure, const FilterGroup::enmConflictMode iConflictMode) const {
  RecordSet *recordset = 0;

  int ret = m_db->execute(procedure, 0, 0, 0, &recordset, 0, true); //managed recordSet (frees value results itself)
  if (!ret) {
    ret = loadFilters(fg, recordset, iConflictMode);
    delete recordset;
    recordset = 0;
  }

  return ret;
}

//Runs the given procedure and creates a single Filter from its first row.
//returns: the result of Database::execute, or of loadFilter() when the query succeeded
//throws:  FilterNotFound when the procedure returns no rows
int TIPsDatabase::loadFilter(Filter **f, const char *procedure) const {
  RecordSet *recordset = 0;

  int ret = m_db->execute(procedure, 0, 0, 0, &recordset, 0, true); //managed recordSet (frees value results itself)
  if (!ret) {
    try {
      ret = loadFilter(f, recordset);
    } catch (...) {
      //do not leak the recordset when no filter was found
      delete recordset;
      throw;
    }
    delete recordset;
    recordset = 0;
  }

  return ret;
}

//Creates a single Filter from the first row of the recordset and stores it in *f.
//returns: always 0
//throws:  FilterNotFound when the recordset is empty
int TIPsDatabase::loadFilter(Filter **f, RecordSet *recordset) const {
  /* return from the SQL statement needs to be:
       iFilterId:       id for the filter (can be non-unique as is only stored)
       runOrder:        process() uses this to order the work. matches() does not use this, may be NULL or 0
       regEx
       regExExceptions: see Filter class (negative post-match filter)
       replacement:     matches() does not use this
       description
       flags:           1=Pattern::CASE_INSENSITIVE
       multipass:       runs the replacement until the pattern is not found: maximum tries: _MULTIPASSMAX
       [iFilterGroup]:  this is an optional field to force loading only part of the PQresult based on the final field
                        must be included if a iOnlyFromFilterGroup!=-1 is sent through
  */
  //this is a managed recordSet (private function) and all the value results will be freed in the destructor of the RecordSet
  if (!recordset->size()) {
    DEBUGPRINT0("[TIPSDATABASE]: FilterNotFound()", DEBUG_LINE);
    throw FilterNotFound();
  }

  RecordSet::iterator i = recordset->begin();
  *f = new Filter(
    i(DBF_FILTERS_FILTERID),
    i(DBF_FILTERS_RUNORDER),
    i(DBF_FILTERS_REGEX),
    i(DBF_FILTERS_REPLACEMENT),
    i(DBF_FILTERS_REGEXCEPT),
    i(DBF_FILTERS_DESC),
    (REGEX_FLAGS)i(DBF_FILTERS_FLAGS),
    i(DBF_FILTERS_MULTIPASS)
  );

  return 0;
}