#include "HTMLObjectParser.h"
#include "InternetResource.h"
#include "TIPsDatabase.h"

//NOTE(review): the template arguments in this file (vector<const char*>,
//vector<DBEntity*>) were reconstructed from how the values are used
//(free((void*)...) on the strings, push_back of a DBEntity*); confirm them
//against the declarations in HTMLObjectParser.h and Filter::findAll().

namespace {

  //Owns a list of heap-allocated C strings and free()s every element when it
  //goes out of scope, so that every exit path of parse() (normal return or
  //exception) releases the collected property values.
  struct OwnedStrings {
    vector<const char*> items;

    OwnedStrings() {}
    ~OwnedStrings() {
      for (vector<const char*>::iterator iS = items.begin(); iS != items.end(); ++iS)
        free((void*)*iS);
    }

  private:
    OwnedStrings(const OwnedStrings&);            //not copyable
    OwnedStrings &operator=(const OwnedStrings&); //not assignable
  };

}

//Forwards all arguments unchanged to the HTMLParser base class.
HTMLObjectParser::HTMLObjectParser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type):
  HTMLParser(_pagegroupid, _db, _domain, _type)
{}

HTMLObjectParser::~HTMLObjectParser() {}

//Identifies this parser as DBF_PARSERTYPE_HTMLOBJECTPARSER.
const int HTMLObjectParser::parsertype() const {return DBF_PARSERTYPE_HTMLOBJECTPARSER;}

//Loads the filter settings for this parser's page group into m_settings.
//Always returns 0.
size_t HTMLObjectParser::init() {
  //load domain - pagegroup specific settings
  //we need to free the contents of m_settings
  m_db->loadHTMLObjectParserSettings(m_pagegroupid, &m_settings);
  return 0;
}

//Extracts one entity (company, person or product, depending on m_type) from
//the body of an internet resource and appends it to the output list.
//
//Each filter in m_settings is run against the body and must yield exactly one
//non-empty match to supply a property; the first property is the entity name.
//A new entity whose first name matches an entry of the in-memory m_entities
//cache is discarded instead of being appended.
//
//  ir      - resource whose body is parsed
//  objects - output list; a newly created entity is appended to it
//
//Returns objects->size() after parsing, i.e. the total length of the list and
//not the number of entities added by this call.
//
//Throws UnknownObjectType when m_type is not one of the handled object types,
//and (in _DEBUG builds only) DomainMismatch when the resource belongs to a
//different domain. MissingProperty and NotEnoughProperties are raised
//internally and swallowed here: the resource then contributes no entity.
const size_t HTMLObjectParser::parse(const InternetResource *ir, vector<DBEntity*> *objects) const {
  const char *body = ir->body();

#ifdef _DEBUG
  if (m_domain != ir->internetURIRequest()->domain()) {
    DEBUGERROR0("[HTMLObjectParser]: DomainMismatch()");
    throw DomainMismatch();
  }
#endif

  //freed automatically on every way out of this function
  OwnedStrings properties;

  try {
    //gather all char values into a vector
    vector<const char*> matches;
    for (FilterGroup::const_iterator iF = m_settings.begin(); iF != m_settings.end(); ++iF) {
      Filter *f = iF->second;

      //one and only one match allowed (replacement is done automatically)
      //zero length string = invalid
      const unsigned int iMatchCount = f->findAll(body, 0, &matches);
      if (iMatchCount == 1 && *matches[0]) {
        //ownership of the matched string moves to the property list
        properties.items.push_back(matches[0]);
      } else {
        //rejected matches are not kept, so release them now
        for (unsigned int iM = 0; iM < iMatchCount; iM++) free((void*)matches[iM]);

        if (f->description()) {
          //the required field is stored in the filter description as 0 or 1
          //indicates required field
          DEBUGERROR("[%s]: cannot find property [%i] in [%s]", m_domain->m_domain, f->filterId(), ir->internetURIRequest()->absoluteURL());
          DEBUGERROR("[%s]: MissingProperty()", m_domain->m_domain);
          throw MissingProperty();
        }

        //indicates not required field: keep the position with an empty value
        properties.items.push_back(strdupCheck(""));
      }
      matches.clear();
    }

    if (properties.items.empty()) {
      DEBUGERROR("[%s]: NoProperties! MissingProperty()", m_domain->m_domain);
      throw MissingProperty();
    }

    //property 1 always: must always be the first property
    const char *name = properties.items[0];
    if (!name || !*name) {
      //must always be a valid string
      DEBUGERROR("[%s]: MissingProperty()", m_domain->m_domain);
      throw MissingProperty();
    }

    //create the object
    DBEntity *newentity = 0;
    switch (m_type) {
      case DBF_OBJECT_COMPANY: {
        if (properties.items.size() != 1) {
          DEBUGERROR("[%s]: NotEnoughProperties() for type [%i]", m_domain->m_domain, m_type);
          throw NotEnoughProperties();
        }
        newentity = new Company(m_db, ir, name);
        break;
      }
      case DBF_OBJECT_COUNTRY: {
        //no entity is created for this type
        break;
      }
      case DBF_OBJECT_PERSON: {
        if (properties.items.size() != 2) {
          DEBUGERROR("[%s]: NotEnoughProperties() for type [%i]", m_domain->m_domain, m_type);
          throw NotEnoughProperties();
        }
        //properties[0] = commonalias (Idi Amin)
        //properties[1] = full name   (Idi Amin Dada Oumee)
        newentity = new Person(m_db, ir, name, properties.items[1]);
        break;
      }
      case DBF_OBJECT_PRODUCT: {
        switch (properties.items.size()) {
          //case 1: {
          //  //properties[0] = company name with product name
          //  newentity = new Product(m_db, ir, name);
          //  break;
          //}
          case 2: {
            //properties[0] = product name
            //properties[1] = company name
            newentity = new Product(m_db, ir, name, properties.items[1]);
            break;
          }
          default: {
            DEBUGERROR("[%s]: NotEnoughProperties() for type [%i]", m_domain->m_domain, m_type);
            throw NotEnoughProperties();
          }
        }
        break;
      }
      case DBF_OBJECT_PRODUCTTYPE: {
        //no entity is created for this type
        break;
      }
      default: {
        //not caught below: propagates to the caller
        DEBUGERROR("[%s]: UnknownObjectType(%i) during construction", m_domain->m_domain, m_type);
        throw UnknownObjectType();
      }
    }

    if (newentity) {
      //search for the object in the cached in-memory map
      //Database conducts a more thorough search
      //this is just to try and weed out obvious matches
      const char *firstname = newentity->firstname();
      const StringMultiMapCI::const_iterator icoEnd = m_entities.end();

      //cycle through the entities with this first word in length desc order
      for (StringMultiMapCI::const_iterator icoCurrent = m_entities.find(firstname);
           icoCurrent != icoEnd && !_STRCASECMP(icoCurrent->first, firstname);
           ++icoCurrent) {
        DBEntity *entity = icoCurrent->second;
        //NOTE(review): a non-zero strlicmp() result is treated as "found";
        //confirm this against the definition of strlicmp
        if (strlicmp(firstname, entity->name())) {
          //found: delete and ignore
          delete newentity;
          newentity = 0;
          break;
        }
      }

      //add this to the list of objects to save to the DB
      if (newentity) objects->push_back(newentity);
    }
  }
  catch (const NotEnoughProperties&) {} //resource yields no entity
  catch (const MissingProperty&) {}     //resource yields no entity

  return objects->size();
}