#include "HTMLParser.h"
#include "HTMLPage.h"
// Constructs an HTMLParser.
// The page group id, database pointer, domain pointer and type are handed
// straight to the Parser base class; the minimum byte density is kept in
// m_minimumByteDensity.
HTMLParser::HTMLParser(
    const int _pagegroupid,
    TIPsDatabase *_db,
    const Domain *_domain,
    const int _type,
    short _minimumByteDensity)
    : Parser(_pagegroupid, _db, _domain, _type),
      m_minimumByteDensity(_minimumByteDensity)
{
    // Nothing else to do: all state is set up in the initializer list.
}
// Returns a bool computed from a dynamic_cast (presumably applied to `ir`,
// testing whether the resource is an HTMLPage — confirm against the original).
// NOTE(review): in this copy of the file the return expression is cut off
// right after `dynamic_cast`; the cast target and operand must be restored
// before this translation unit can compile.
bool HTMLParser::accepts(const InternetResource *ir) const {
//all derived classes accept HTMLPage
return dynamic_cast
" ); while (*pos) pos++; para = true; } } //headings if (tc.m_headingTag) { sprintf(pos, "<%s>", tc.m_headingTag->text()); while (*pos) pos++; } //copy textcontent if (tc.m_type == chunk_text) { //vague limit check (only if size sent through) if (size && pos - newbody + tc.textLength() + 256 > size) { DEBUGERROR0("[Parser]: BodyOverflow()"); throw BodyOverflow(); } _STRNCPY(pos, tc.m_start, tc.textLength()); pos += tc.textLength(); //not zero terminated } //headings if (tc.m_headingTag) { sprintf(pos, "%s>", tc.m_headingTag->text()); while (*pos) pos++; } //separate text with whitespace *pos = ' '; pos++; } //remember previous chunk lastMarkupAreaID = tc.m_markupAreaID; } //finish trailing paragraph and zero terminate if (para) { strcpy(pos, "
"); while (*pos) pos++; } } *pos = 0; //just in case there are no selectedZones! return newbody; } const size_t HTMLParser::generateCodeString(const vector| code | \chunk | \css path | \
MUA | \depth | \text len | \
a1 | \gr | \wrd cnt | \
sent | \byte dens | \
a1 dens | \
word dens | \
inc? | \graph | \|
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| "); while (*newpos) newpos++; newpos = chunkcode(tc, newpos); strcpy(newpos, " | "); while (*newpos) newpos++; //checkbox, text and href strcpy(newpos, " "); while (*newpos) newpos++;
sprintf(newpos, "[%u]", i - iChunksBegin); while (*newpos) newpos++;
if (tc.m_headingTag) {sprintf(newpos, "<%s style=display:inline>", tc.m_headingTag->text()); while (*newpos) newpos++;}
if (tc.m_hrefStart) {
strcpy(newpos, ""); while (*newpos) newpos++;
}
switch (tc.m_type) {
case chunk_image: {
strcpy( newpos, " | "); while (*newpos) newpos++;
//csspath
strcpy(newpos, ""); while (*newpos) newpos++; //markup area strcpy(newpos, " | "); while (*newpos) newpos++; if (!lastMarkupAreaID || lastMarkupAreaID != tc.m_markupAreaID) {sprintf(newpos, "[%u] ", tc.m_markupAreaID); while (*newpos) newpos++;} strcpy(newpos, " | "); while (*newpos) newpos++; //depth strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_depth); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //textLength strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.textLength()); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //alphanumerics strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_alphanumerics); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //grammars strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_grammars); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //wordCount strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_wordCount); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //sentences strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_sentences); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //bytes //strcpy(newpos, ""); while (*newpos) newpos++; //sprintf(newpos, "%u", bytesLength); while (*newpos) newpos++; //strcpy(newpos, " | "); while (*newpos) newpos++; //text/byte density strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%hi%%", tc.textDensity()); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //a1/byte density strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%hi%%", tc.a1Density()); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //word/byte density strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.wordDensity()); while (*newpos) newpos++; 
strcpy(newpos, " | "); while (*newpos) newpos++; //include? strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "", (tc.a1Density() >= 40 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.m_wordCount >= 4 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.m_wordCount >= 20 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.m_sentences >= 2 ? "checked" : "")); while (*newpos) newpos++; strcpy(newpos, " | "); while (*newpos) newpos++; //graph - plot all the values! strcpy(newpos, ""); while (*newpos) newpos++;
sprintf(newpos, " |