#include "InternetResource.h"
using namespace std;
using namespace regex;

//Static state shared by every InternetResource
pthread_mutex_t InternetResource::m_hFirst_mutex = PTHREAD_MUTEX_INITIALIZER; //held around the one-off static set-up in the constructor
unsigned int    InternetResource::m_count = 0;  //next m_id to hand out
TIPsDatabase   *InternetResource::m_db    = 0;  //first non-null database passed to a constructor
FilterGroup     InternetResource::m_fgConc;     //URL canonicalisation filters, filled by the first constructor call

//Builds a resource for the response to request _ir.
//  _db              database; remembered in the static m_db if none has been stored yet
//  _ir              the request that produced this resource; it is told about this object
//  _body            response body; NOT copied (may be 0)
//  _responseCode    HTTP response code, also reported to the request's domain
//  _ifModifiedSince copied with strdupCheck() and released in the destructor
//  _checkDigit      content checksum; when 0 and a body is present it is calculated here
//  _manageBuffer    when true the destructor releases the body through freeBody()
//NOTE(review): m_count++ in the initialiser list runs outside m_hFirst_mutex - confirm
//that resources are never constructed concurrently or that duplicate ids are harmless
InternetResource::InternetResource(TIPsDatabase *_db, InternetURIRequest *_ir, const char *_body, const int _responseCode, const char *_ifModifiedSince, unsigned int _checkDigit, const bool _manageBuffer ):
  ResourceMonitor(_ir->domain()),
  m_id(m_count++),
  m_ir(_ir),
  m_ifModifiedSince(strdupCheck(_ifModifiedSince)),
  m_body(_body), //strdupCheck(_body)), //we don't need to copy the body because it is transient
  m_responseCode(_responseCode),
  m_checkDigit(_checkDigit),
  m_size(_body ? strlen(_body) : 0),
  m_manageBuffer(_manageBuffer)
{
  //one-off static set-up, serialised so that only the first constructor loads the filters
  pthread_mutex_lock(&m_hFirst_mutex);
  if (!m_db && _db) m_db = _db;
  if (!m_fgConc.size()) {
    //iFilterId, iRunOrder, sFilterRegEx, sReplacement=0, sFilterRegExExceptions=0, sDescription=0, iFlags, bMultipass, parameters=0
    // NOFLAGS       = 0x0000,
    // NOCASE        = 0x0001, // ignore case
    // GLOBAL        = 0x0002, // match everywhere in the string
    // MULTILINE     = 0x0004, // ^ and $ can match internal line breaks
    // SINGLELINE    = 0x0008, // . can match newline character
    // RIGHTMOST     = 0x0010, // start matching at the right of the string
    // NOBACKREFS    = 0x0020, // only meaningful when used with GLOBAL and substitute
    // FIRSTBACKREFS = 0x0040, // only meaningful when used with GLOBAL
    // ALLBACKREFS   = 0x0080, // only meaningful when used with GLOBAL
    // NORMALIZE     = 0x0100, // Preprocess patterns: "\\n" => "\n", etc.
    // EXTENDED      = 0x0200, // ignore whitespace in pattern
    DEBUGPRINT("[%s]: InternetResource static load of [fltsConicalisation]", DEBUG_LINE, m_ir->domain()->m_domain);
    //note that $PARAMS in the main regex are compiled in at the start and
    //static for all uses of the Filter.
    //$PARAMS in the replacement are dynamically calculated on every use of the Filter
    //Thus $RELATIVEHREF will be replaced each time
    //In order to be re-entrant the $PARAMS are passed into the call and held on the stack during matching
    m_fgConc.addFilter(new Filter(98, 1, "[\\n\\r\\t]", "", 0, "remove new lines", NOFLAGS, false));
    m_fgConc.addFilter(new Filter(89, 3, "^(file|javascript|mailto|gopher):.*","",0,"ignore unsupported protocols links",NOCASE,false));
    //m_fgConc.addFilter(new Filter(82,4," ","%20",0,"spaces",NOCASE,false));
    m_fgConc.addFilter(new Filter(79, 5, "^\\.$","/$RELATIVEHREF",0,"simple self reference",NOCASE,false));
    m_fgConc.addFilter(new Filter(80, 6, "^\\?(.*)","/$FOLDER$FILE?\\1",0,"query string only",NOCASE,false));
    m_fgConc.addFilter(new Filter(81, 7, "#(.*)","",0,"remove all anchors",NOCASE,false));
    m_fgConc.addFilter(new Filter(71, 10, "(.+)\\.+$","\\1",".*\\?.*","remove trailing dots (when no query string)",NOCASE,false));
    //this entry used to pass a stray "" between the flags and the multipass argument, which
    //put a string in the bMultipass slot and false in the parameters slot (see the argument
    //list above). A second pass can never match again once "http://" has been prepended,
    //so a plain single pass gives the same result
    m_fgConc.addFilter(new Filter(72, 20, "^www\\.","http://www.",0,"missing protocol on web address",NOCASE,false));
    m_fgConc.addFilter(new Filter(78, 50, "^([^/].*)","$PROTOCOL://$DOMAIN/$FOLDER\\1",".*://.*","relative reference (see exception)",NOCASE,false));
    m_fgConc.addFilter(new Filter(76, 55, "^/+(.*)","$PROTOCOL://$DOMAIN/\\1",0,"relative root references",NOCASE,false));
    m_fgConc.addFilter(new Filter(73, 70, "(/+\\.)+/+","/",0,"remove this directory double",NOCASE,false));
    m_fgConc.addFilter(new Filter(75, 80, "/+[^./]+/+\\.\\./+","/",0,"remove parent folders /..",NOCASE,true));
    m_fgConc.addFilter(new Filter(83, 90, "(/\\.\\.)+/","/",0,"remove bad parent folders /../..",NOCASE,false));
    m_fgConc.addFilter(new Filter(74, 100, "([^:])/{2,}","\\1/",0,"remove double directorys",NOCASE,false));
    m_fgConc.addFilter(new Filter(95, 105, "(amp%3b){2,}","&",0,"encoding mistakes (see greenpeace.org)",NOCASE,false));
    m_fgConc.addFilter(new Filter(97, 107, "([?&])CF(TOKEN|ID)=[0-9]+([&?]|$)","\\1",0,"coldfusion session querystring",NOFLAGS,true));
    m_fgConc.addFilter(new Filter(99, 108, "PHPSESSID=[a-fA-F0-9]+","",0,"php session id",NOFLAGS,false));
    m_fgConc.addFilter(new Filter(92, 110, "([&?])([^&?]+)[&?]\\2([&?]|$)","\\1\\2\\3",0,"duplicate query string variables",NOFLAGS,true));
    m_fgConc.addFilter(new Filter(100, 115, "([&?])([^&?]+)([&?].*)[&?]\\2([&?]|$)","\\1\\2\\3\\4",0,"duplicate query string variables (far apart)",NOFLAGS,true));
    m_fgConc.addFilter(new Filter(94, 130, "((?:/[^/]+){2,})\\1\\1","\\1",0,"directory command repeating",NOCASE,true));
    m_fgConc.addFilter(new Filter(96, 140, "(/[^/]+)\\1\\1\\1\\1\\1","\\1",0,"directory command repeating",NOCASE,true));
  }
  pthread_mutex_unlock(&m_hFirst_mutex);

  //m_responseCode is already set by the initialiser list
  m_ir->domain()->addHTTPResponseCode(m_responseCode);
  m_ir->setInternetResource(this);

  //if the derived class has not indicated the checkdigit but has passed through the body then calculate it based on that
  if (!m_checkDigit && m_body) {
    //calculate checkDigit of content
    //ignore numbers so that infinite numeric recursion can be spotted ?pageid=4535345
    //ignore query string parts of links to avoid textual recursion ?fontsize=large&fontsize=large&fontsize=large&...
    bool inQuery = false; //after a '?' inside a tag, until that tag closes
    bool inTag   = false; //between '<' and the closing '>'
    bool inAttr  = false; //inside a quoted attribute value of a tag
    m_checkDigit = 0;
    //walk from the first character (the scan used to start from m_body-1, a pointer before the buffer)
    for (const char *bodypos = m_body; *bodypos; ++bodypos) {
      const char digit = *bodypos;
      switch (digit) {
        case '<':  {inTag = true; break;}
        case '>':  {if (!inAttr) inTag = inAttr = inQuery = false; break;} //a '>' inside quotes does not close the tag
        case '\'':
        case '"':  {if (inTag) inAttr = !inAttr; break;}
        case '?':  {if (inTag) inQuery = true; break;}
      }
      if (!inQuery && (digit < '0' || digit > '9')) m_checkDigit += digit;
    }
  }

  memoryDelta((int)(sizeof(InternetResource) + m_size), this); //inform the domain which will, in turn, inform the Spider
}

//Releases the copied If-Modified-Since string and, only when this object manages
//the buffer, the body as well
InternetResource::~InternetResource() {
  if (m_ifModifiedSince) {
    free((void*)m_ifModifiedSince);
    m_ifModifiedSince = 0;
  }
  if (m_manageBuffer) freeBody(); //The creator handles the buffer spaces for IRs normally
  memoryDelta(-(int)sizeof(InternetResource), this); //inform the domain which will, in turn, inform the Spider
}

//Domain name text of the request this resource belongs to
const char *InternetResource::domain() const {
  return m_ir->domain()->m_domain;
}

//Absolute URL of the request this resource belongs to
const char *InternetResource::absoluteURL() const {
  return m_ir->absoluteURL();
}

//Bodies are managed by the creator normally.
//Calling this function will release the body that was used by this IR.
//Calling it again is harmless because m_body is cleared.
void InternetResource::freeBody() {
  if (m_body) {
    free((void*)m_body);
    m_body = 0;
    memoryDelta(-(int)m_size, this);
  }
}

//Collects the links found in the body by the filters in fg, drops absolute links
//that point at another domain and runs the rest through the canonicalisation filters.
//Returns a new vector of C strings: the caller deletes the vector but must also free
//its contents first.
//NOTE(review): the element type of the vectors is not visible in this copy of the
//source (bare "vector"); from the usage below the elements are C strings released
//with free() - confirm against InternetResource.h
vector *InternetResource::links(FilterGroup *fg) {
  vector rawLinks;            rawLinks.reserve(HTMLPAGE_NEWLINKS); //string results from the filters
  vector *links = new vector; links->reserve(HTMLPAGE_NEWLINKS);   //parsed valid URLs
  vector::iterator i;
  const char *initiallink = 0, *finallink = 0, *domainStart = 0;
  bool absoluteLink;

  //none of these are malloc'd or strdup. They are used temporarily from m_ir, owned by this object
  m_parameters.clear();
  m_parameters.insert(make_pair("DOMAIN",       m_ir->domain()->m_domain));
  m_parameters.insert(make_pair("PROTOCOL",     m_ir->protocolText()));
  m_parameters.insert(make_pair("FOLDER",       m_ir->folder()));
  m_parameters.insert(make_pair("FILE",         m_ir->file()));
  m_parameters.insert(make_pair("RELATIVEHREF", m_ir->relativeHREF()));
  DEBUGPRINT("[%s]: parameters for relative URL processing:\n\
 domain: [%s]\n\
 protocol: [%s]\n\
 folder: [%s]\n\
 file: [%s]\n\
 relativehref:[%s]\n\
 \n", DEBUG_LINE, m_ir->domain()->m_domain,
    m_parameters["DOMAIN"],
    m_parameters["PROTOCOL"],
    m_parameters["FOLDER"],
    m_parameters["FILE"],
    m_parameters["RELATIVEHREF"]);

  //nothing to scan (no filter group, or the body has already been released): no links
  if (!fg || !m_body) return links;

  //bare text links
  fg->findAll(m_body, &m_parameters, &rawLinks, true); //unique case sensitive findAll(N)

  //the domain does not change while looping, so measure it once
  const char  *thisDomain    = m_ir->domain()->m_domain;
  const size_t thisDomainLen = strlen(thisDomain);

  //bare text links -> links (no InternetURIRequest for them yet, only strings)
  for (i = rawLinks.begin(); i != rawLinks.end(); ++i) {
    initiallink = *i;
    finallink   = 0; //never carry the previous iteration's result over
    domainStart = 0;
    if      (!_STRNCASECMP(initiallink, "http://",  7)) domainStart = initiallink+7;
    else if (!_STRNCASECMP(initiallink, "https://", 8)) domainStart = initiallink+8;
    else if (!_STRNCASECMP(initiallink, "www.",     4)) domainStart = initiallink;
    absoluteLink = (domainStart != 0);

    if (absoluteLink && _STRNCASECMP(domainStart, thisDomain, thisDomainLen)) {
      //it is an absolute link for another domain, ignore it
      DEBUGPRINT("[%s]: %s -> %s", DEBUG_LINE, m_ir->domain()->m_domain, initiallink, "(ignored: other domain)");
      free((void*)initiallink); //the raw string is ours and goes nowhere else, so release it here
    } else {
      //need to pass absolute URLs through the filters as well for all sorts of weirdness
      //if (absoluteLink && domainStart != initiallink) finallink = initiallink; else
      m_fgConc.replace(initiallink, &m_parameters, &finallink); //finallink will either point to initiallink or be a new malloc
      DEBUGPRINT("[%s]: %s -> %s", DEBUG_LINE, m_ir->domain()->m_domain, initiallink, (finallink && *finallink ? finallink : "(ignored)"));

      bool keep = false; //true once ownership of finallink has moved into the result vector
      if (finallink && *finallink) {
        if (!_STRCMP(finallink, "http")) {
          DEBUGERROR("[%s]: strange http url in [%s]", m_ir->domain()->m_domain, m_ir->absoluteURL());
        } else {
          links->push_back(finallink);
          keep = true;
        }
      }

      if (finallink != initiallink) free((void*)initiallink); //if finallink is a new malloc then free initiallink
      //a result that is not handed to the caller must be released as well; this relies on
      //the replace() contract stated above (finallink is initiallink or a new malloc)
      if (!keep && finallink) free((void*)finallink);
    }
  }

  return links;
}

//Runs the parser that the domain holds for this resource.
//Returns what Parser::parse() returned, or 0 when there is no domain or no parser.
//NOTE(review): the element type of objects is not visible in this copy of the source
const size_t InternetResource::parse(vector *objects) {
  size_t numobjects = 0;

  if (m_ir && m_ir->domain()) {
    //Parsers are held on the domain for each area of the website
    Parser *parser = m_ir->domain()->parserFor(this);
    if (parser) {
      DEBUGPRINT("[%s]: Parser found", DEBUG_LINE, m_ir->domain()->m_domain);
      numobjects = parser->parse(this, objects);
    }
  } else {
    //the request or its domain is missing here, so neither may be dereferenced for the message
    DEBUGERROR("[%s]: can't reach domain", "(unknown)");
  }

  return numobjects;
}

//Writes the body to pages<DIRSPLITTER>[domain]-[escaped url].html, preceded by the base tag text.
//Always returns 0; failures are only reported through DEBUGERROR.
const int InternetResource::writeToFile() const {
  char path[1024], basetag[1024];

  //filename
  char *name = _STRDUP(m_ir->absoluteURL());
  if (!name) {
    DEBUGERROR("[%s]: can't copy url [%s] for the file name", m_ir->domain()->m_domain, m_ir->absoluteURL());
    return 0;
  }
  char *name_esc = name;
  if (!_STRNCMP(name_esc, "http://", 7)) {
    //skip leading protocol and domain name if there
    name_esc += 7;
    while (*name_esc && *name_esc++ != '/') {}
  }
  for (char *c = name_esc; *c; c++) {
    if (!((*c >= 'a' && *c <= 'z')||(*c >= 'A' && *c <= 'Z'))) *c = ' '; //illegal chars
  }
  _SNPRINTF(path, 1023, "pages%s[%s]-[%s].html", DIRSPLITTER, m_ir->domain()->m_domain, name_esc);
  path[sizeof(path)-1] = 0; //presumably _SNPRINTF may leave a truncated result unterminated on some platforms
  free(name);

  //point all pictures and CSS to the original
  //NOTE(review): the format string is empty in this copy of the source, so nothing is
  //written for the base tag and the domain argument is unused - confirm against the
  //original file whether markup was lost here
  _SNPRINTF(basetag, 1023, "", m_ir->domain()->m_domain);
  basetag[sizeof(basetag)-1] = 0;

  //file write
  FILE *fPage = fopen(path, "w" );
  if (fPage) {
    fputs(basetag, fPage);
    if (m_body) fputs(m_body, fPage); //the body may be absent or already released
    fflush(fPage);
    fclose(fPage);
    DEBUGPRINT("[%s]: wrote file [%s]", DEBUG_LINE, m_ir->domain()->m_domain, path);
  } else DEBUGERROR("[%s]: can't open file [%s]", m_ir->domain()->m_domain, path);

  return 0;
}