// HTMLParser implementation.
//
// Visible responsibilities, function by function (as far as this copy of the file shows):
//   HTMLParser(...)       forwards _pagegroupid, _db, _domain, _type to Parser and stores
//                         _minimumByteDensity in m_minimumByteDensity.
//   accepts(ir)           true when the dynamic_cast of ir is non-null (existing comment: all
//                         derived classes accept HTMLPage).
//   selectedzoneText      strndupCheck copy of the first chunk of the first selected zone, or 0
//                         when there are no zones; existing comment says the caller frees it.
//   calcAggregates        resets the totals, sums m_wordCount / textLength() / m_bytes /
//                         m_alphanumerics over all chunks, counts headings, tracks the largest
//                         m_depth and a heading tag, then derives the averages; returns chunks->size().
//   selectedzonesToHTML   concatenates the chunk_text chunks of every selected zone into newbody
//                         (allocated with mallocCheck when newbody is 0); throws BodyOverflow when
//                         a non-zero size would be exceeded (with a 256 byte margin).
//   generateCodeString    allocates chunks->size() * 4 + 1 bytes and writes one chunkcode() per
//                         chunk, preceded by a space whenever m_markupAreaID changes; returns the
//                         written length; existing comment says the caller frees *rcodestring.
//   chunkcode             writes a type letter (H, V, T, I, A, D, K, Z or '?'), a size digit derived
//                         from m_wordCount and an optional 'a' when m_hrefStart is set; returns the
//                         position after the written characters.
//   selectChunks          runs f->replace() on the code string and converts the [ ... ] markers in
//                         the result into selectedzone entries pushed onto v; returns v->size().
//   getTextChunks         walks the body character by character, parsing tags and attributes and
//                         pushing attribute-derived TextChunk entries; returns chunks->size().
//   debugbody             writes a debug rendering of chunks and selected zones into a buffer of
//                         5 * 1024 * 1024 bytes obtained from mallocCheck; existing comment says the
//                         caller frees the result.
//
// NOTE(review): this copy of the file looks damaged - confirm against version control before editing:
//   - several definitions share one physical line, so each // comment swallows the code that follows it;
//   - template arguments are absent (dynamic_cast(ir), vector *chunks, vector csspath);
//   - string literals that presumably held markup are empty or split across lines (the paragraph
//     strcpy calls in selectedzonesToHTML, the sprintf format strings in debugbody; the closing
//     heading sprintf passes an argument to an empty format string);
//   - in getTextChunks nothing visible turns chunkStart / wordCount / alphanumerics into a TextChunk
//     and several flags (jumpTag, processingInstruction) are read without a visible assignment on
//     every path - presumably that code sat in the part that is missing after the start-tag test;
//   - in debugbody the per-row code that uses tc appears after the chunk loop has already closed.
//
// NOTE(review): points to verify once an intact copy is available:
//   - calcAggregates leaves avgTextHTMLDensity / avgWordDensity / avgAlphaNumeric untouched when the
//     divisor is zero, so callers presumably have to pre-initialise them;
//   - selectedzonesToHTML guards sz.lastChunk >= chunks->size() only in the sizing loop, not in the
//     copy loop;
//   - chunkcode maps a word count of 1 to size modifier 2 although the example strings in selectChunks
//     contain modifier 1 - possibly a typo; grammarBunch is only used by commented-out code; sm is a
//     size_t printed with "%u";
//   - selectChunks reads iZoneStart at ']' even if no '[' was seen, and iChunk - 1 wraps when iChunk is 0
//     (both are unsigned int);
//   - debugbody starts each zone at sz.firstChunk - 1 while selectedzonesToHTML starts at sz.firstChunk,
//     and no bounds check on the output buffer is visible.
#include "HTMLParser.h" #include "HTMLPage.h" HTMLParser::HTMLParser(const int _pagegroupid, TIPsDatabase *_db, const Domain *_domain, const int _type, short _minimumByteDensity): Parser(_pagegroupid, _db, _domain, _type), m_minimumByteDensity(_minimumByteDensity) {} bool HTMLParser::accepts(const InternetResource *ir) const { //all derived classes accept HTMLPage return dynamic_cast(ir) != 0; } const char *HTMLParser::selectedzoneText(const vector *chunks, const vector *selectedzones) const { //caller frees result if not 0 //gets *first* chunk of *first* selected zone text const char *textcopy = 0; if (selectedzones->size()) { const selectedzone& sz = selectedzones->at(0); const TextChunk& tc = chunks->at(sz.firstChunk); textcopy = strndupCheck(tc.m_start, tc.textLength()); } return textcopy; } const size_t HTMLParser::calcAggregates(vector *chunks, unsigned int &totalWordCount, unsigned int &totalChunksSize, unsigned int &totalAlphaNumerics, unsigned int &totalBytesSize, unsigned int &maxDepth, const HTMLTag **maxHeadingTag, unsigned int &numHeadings, PERCENTAGE &avgTextHTMLDensity, PERCENTAGE &avgAlphaNumeric, unsigned int &avgWordDensity ) const { //------------------------------------------------------------- 2nd pass: calc averages and totals vector::const_iterator i; //general chunks iterator totalWordCount = 0; totalChunksSize = 0; totalBytesSize = 0; totalAlphaNumerics = 0; maxDepth = 0; *maxHeadingTag = &HTMLTag::tag_h6; //higher value than tag_h1 numHeadings = 0; for (i = chunks->begin(); i != chunks->end(); i++) { const TextChunk& tc = *i; //to avoid copying the csspath again totalWordCount += tc.m_wordCount; totalChunksSize += tc.textLength(); totalBytesSize += tc.m_bytes; totalAlphaNumerics += tc.m_alphanumerics; if (tc.m_headingTag != 0) numHeadings++; if (HTMLTag::tag_h1 <= tc.m_headingTag && HTMLTag::tag_h6 >= tc.m_headingTag && tc.m_headingTag->operator>(*maxHeadingTag)) *maxHeadingTag = tc.m_headingTag; if (tc.m_depth > maxDepth) maxDepth = 
tc.m_depth; } //avoid division by 0 if (totalBytesSize) { avgTextHTMLDensity = totalChunksSize * 100 / totalBytesSize; avgWordDensity = totalWordCount * 1000 / totalBytesSize; } if (totalChunksSize) avgAlphaNumeric = totalAlphaNumerics * 100 / totalChunksSize; return chunks->size(); } const char *HTMLParser::selectedzonesToHTML(const vector *chunks, const vector *selectedzones, char *newbody, const size_t size) const { //if newbody is supplied then it is used for the result and returned, the // function assumes that the caller has allocated necessary space for the result // however, size can be sent through to limit the result size //otherwise a new area is malloc'd //caller frees result if newbody = 0 //reconstructs HTML from the selected textchunks back into a valid document fragment //TagStateStore tags; vector::const_iterator i; size_t lastMarkupAreaID = 0; bool para = false; size_t maxSize = 0; char *pos; if (!newbody) { //loop through the consecutive selected zones //calculate *max* size of output //HTML entities will reduce in size //only chunk_text processed at the moment //each chunk_text may have

or

placed around it //zero terminator //sz.firstChunk to sz.lastChunk *inclusive* for (i = selectedzones->begin(); i != selectedzones->end(); i++) { const selectedzone& sz = *i; if (sz.lastChunk >= chunks->size()) break; //oops for (size_t iChunk = sz.firstChunk; iChunk <= sz.lastChunk; iChunk++) { const TextChunk& tc = chunks->at(iChunk); maxSize += tc.textLength() + 10; } maxSize += 10; } newbody = (char*)mallocCheck(maxSize); } //now loop through concatenating all the chunks //sz.firstChunk to sz.lastChunk *inclusive* pos = newbody; for (i = selectedzones->begin(); i != selectedzones->end(); i++) { const selectedzone& sz = *i; for (size_t iChunk = sz.firstChunk; iChunk <= sz.lastChunk; iChunk++) { const TextChunk& tc = chunks->at(iChunk); if (tc.m_type == chunk_text) { //paragraph block control if (lastMarkupAreaID != tc.m_markupAreaID) { //block change if (para) { //paragraph is open so close it strcpy(pos, "

"); while (*pos) pos++; para = false; } if (!para && !tc.m_headingTag) { //no paragraph as yet, but have a normal text entry strcpy(pos, "

" ); while (*pos) pos++; para = true; } } //headings if (tc.m_headingTag) { sprintf(pos, "<%s>", tc.m_headingTag->text()); while (*pos) pos++; } //copy textcontent if (tc.m_type == chunk_text) { //vague limit check (only if size sent through) if (size && pos - newbody + tc.textLength() + 256 > size) { DEBUGERROR0("[Parser]: BodyOverflow()"); throw BodyOverflow(); } _STRNCPY(pos, tc.m_start, tc.textLength()); pos += tc.textLength(); //not zero terminated } //headings if (tc.m_headingTag) { sprintf(pos, "", tc.m_headingTag->text()); while (*pos) pos++; } //separate text with whitespace *pos = ' '; pos++; } //remember previous chunk lastMarkupAreaID = tc.m_markupAreaID; } //finish trailing paragraph and zero terminate if (para) { strcpy(pos, "

"); while (*pos) pos++; } } *pos = 0; //just in case there are no selectedZones! return newbody; } const size_t HTMLParser::generateCodeString(const vector *chunks, char **rcodestring) const { //caller frees result char *codestring = (char*) mallocCheck(chunks->size() * 4 + 1); *codestring = 0; //immediate zero terminate char *pos = codestring; //pos for progressive writing *rcodestring = codestring; //return pointer vector::const_iterator i; size_t lastMarkupAreaID = 0; for (i = chunks->begin(); i != chunks->end(); i++) { const TextChunk& tc = *i; //markuparea change if (lastMarkupAreaID != tc.m_markupAreaID) {*pos++ = ' '; *pos = 0;} lastMarkupAreaID = tc.m_markupAreaID; //write chunkcode into the stream pos = chunkcode(tc, pos); } return pos-codestring; } char *HTMLParser::chunkcode(const TextChunk &tc, char *pos) const { //create a encoded string version of the types of TextChunks //this is to allow us to select zones of TextChunks using regex //the selectChunks(...) function will do the selection from a regex size_t sm; char code = '?'; bool hasHeading = (tc.m_headingTag != 0); //0, tag_hx, tag_b, tag_font etc. 
bool hasLink = (tc.m_hrefStart != 0); bool bigText = (tc.a1Density() > 40 && tc.m_wordCount >= 8); bool grammarBunch = (tc.grammarDensity() > 80 && tc.m_wordCount >= 8); //base type (capital letter indicating the beginning of the chunk) switch (tc.m_type) { case chunk_text: { if (hasHeading) { code = 'H'; } else { //if (grammarBunch) // code = 'G'; if (bigText) code = 'V'; else code = 'T'; } break; } case chunk_image: {code = 'I'; break;} case chunk_alt: {code = 'A'; break;} case chunk_description: {code = 'D'; break;} case chunk_keywords: {code = 'K'; break;} case chunk_title: {code = 'Z'; break;} } *pos = code; pos++; //size modifier (always at least one word) if (tc.m_wordCount == 0 ) sm = 0; //should always be one word for normal texts else if (tc.m_wordCount == 1 ) sm = 2; else if (tc.m_wordCount == 2 ) sm = 2; else if (tc.m_wordCount == 3 ) sm = 3; else if (tc.m_wordCount == 4 ) sm = 4; else if (tc.m_wordCount == 5 ) sm = 5; else if (tc.m_wordCount <= 8 ) sm = 6; else if (tc.m_wordCount <= 14) sm = 7; else if (tc.m_wordCount <= 30) sm = 8; else sm = 9; // > 30 words sprintf(pos, "%u", sm); while (*pos) pos++; //modifiers: links, listitem, etc. sprintf(pos, "%s", (hasLink ? "a" : "") ); while (*pos) pos++; return pos; } const size_t HTMLParser::selectChunks(const char *codestring, Filter *f, vector *v) const { //allows selection of parts of an encodedstring using a regex //a vector of selected zones of that string is returned //a new chunk in the string is indicated by a capital letter //for example: // A1aL2H1aH2T1T2V8 (1st chunk = A1a, 2nd = L2, 3rd = H1a, etc.) // selecting: /H[1-9]+T[1-4]+/g (always replace with [\0]) // creates A1aL2H1a[H2T1T2]V8 intermediate string // and returns 1 selected zone of chunks 4-6 because there are 3 capital letters before the start of the selected zone ...[...] //these encoded strings are created by generateCodeString(...) 
const char *out, *pos; unsigned int iZoneStart, iZoneFinish, iChunk = 0; char c; int el; f->replace(codestring, 0, &out); //run the requested replacement regex if (out && codestring != out) { //some replacements have been made pos = out; //start traversal of the string while (c = *pos++) { if (isUpper(c)) iChunk++; //upper case indicates the start of a new chunk else { switch (c) { case ' ': {break;} //MUA break case '[': { //start selected zone iZoneStart = iChunk; break; } case ']': { //end selected zone (create) //last one is just before the ] iZoneFinish = iChunk - 1; if (iZoneFinish >= iZoneStart) { //ensure at least one chunk in the zone selectedzone sz = {iZoneStart, iZoneFinish}; v->push_back(sz); } break; } case '{': { //multi-elements el = atoi(pos); if (el > 1) iChunk += el-1; break; } } } } free((void*)out); } return v->size(); } const size_t HTMLParser::getTextChunks(const char *body, size_t bodysize, vector *chunks) const { //using text - tag density to spot relevant content //http://ai-depot.com/articles/the-easy-way-to-extract-useful-text-from-arbitrary-html/#more-90 //although we want to calculate the dense areas of text, //we also want to include lots of formatting so that the document can be re-produced from these selections //this function outputs chunks of HTML //thus: h1-6, font+/-, bold, italic, in-text images, etc. 
need to be included in the output from this function //declare local to keep re-entrant const char *bodyend = body + bodysize; vector::const_iterator i; //general chunks iterator vector csspath; csspath.reserve(200); //linking in the CSS hierarchy for calculating block level elements //------------------------------------------------------------- 1st pass: //compile text chunks into an array of positions and properties (no stats or selection yet) //anything that requires traversing and parsing (the rest is set calcs) char c, c1, was = 0; //current character const char *pos = body, //place in body parse *chunkStart = 0; //chunk of text (0 = no current chunk) unsigned int wordCount = 1, //counts initial word boundaries so start at 1 alphanumerics = 0, //num of alphanumeric chars grammars = 0, //num of grammatical chars (used for spotting lists like in footers) sentences = 1; //sentence count (dot space combos + 1) size_t markupareaID = 0; const char *lastFinish = body; //to record the number of HTML chars required to create the text chunk HTMLTag *currenttag, *droptag, //css tag just dropped *adjacentTagBefore, //only valid if there was a tag immediately before this one e.g.
*lasttag, //the last tag e.g. some text
*headingTag = 0; //if the current text has a heading level bool starttag, //, not selfenclosedtag, //
jumpTag, // processingInstruction; // HTMLAttribute *currentattribute; //tag attribute recognition tagmodifier tm; //for combinations of attributes chunktype ct; //chunktype to create (if any) from attribute const char *attributeValueStart; bool quoted; //quoted attributes allow spaces //start finish pairs pointing into the main document (not zero terminated) const char *hrefStart = 0, //if the current text is in a link *hrefFinish = 0, //end of link *cssidStart = 0, *cssidFinish = 0, *classnameStart = 0, *classnameFinish = 0, *styleStart = 0, *styleFinish = 0; while (pos < bodyend && (c = *pos)) { //EOF check, pos valid check //each section deals with multi-increments of pos accordingly //zero terminator also deals with stristr(...) requests //pos is set to bodyend if it is accidentally set to 0 currenttag = 0; if (c == '<' && (c1 = pos[1]) && !isWhiteSpace(c1) ) { //"< " is not a tag, it's a less than //we have a tag: can occur multiple times before a text break currenttag = HTMLTag::gettag(pos); starttag = (pos[1] != '/'); //will be true for comment starts ( if (!pos) pos = bodyend; jumpTag = true; } //) look for the end of the end tag (>) or EOF: the characters in the tag are not counted in chunks //and process attributes cssidStart = 0; cssidFinish = 0; classnameStart = 0; classnameFinish = 0; styleStart = 0; styleFinish = 0; tm = noTagModifer; while ((c = *pos) && c != '>') { //skip to next tag-end or EOF processing attributes pos++; if (c <= ' ' && (c = *pos) && isAlpha(c)) { //attribute start " checked ", " id=...", !" 
= " is not (=) //or erroneous value start: but we will simply get an unknownAttribute in this case currentattribute = HTMLAttribute::getattribute(pos); } else if (c == '=') { //we have a value for an registered, identifieable attribute //if we have a currentattribute then it is a registered one and we are thus interested in the value //move to standard place at beginning of name after '"=\s to EOF //to find the beginning of the attribute value //quotes are important becuse they decide what chars are allowed in the value quoted = false; while (is((c = *pos), "=\"'") || isWhiteSpace(c)) { if (c == '"' || c == '\'') quoted = true; pos++; } attributeValueStart = pos; //nearest space or > or / (
) or EOF. Allow spaces if there were quotes at the beginning //to find the end of the attribute value while ((c = *pos) && !( //criteria for the end of the attribute: ( quoted && (c == '"' || c == '\'')) //if quoted then a " ends the attribute || (!quoted && (c == '>' || c <= ' ')) //if !quoted then white space or > ends the attribute ) ) pos++; //check for id, class and alt attributes ct = chunk_none; //causes a new TextChunk to be created below if set if (HTMLAttribute::attribute_src == currentattribute) {if (HTMLTag::tag_img == currenttag) ct = chunk_image;} else if (HTMLAttribute::attribute_alt == currentattribute) { ct = chunk_alt;} else if (HTMLAttribute::attribute_id == currentattribute) {cssidStart = attributeValueStart; cssidFinish = pos;} else if (HTMLAttribute::attribute_class == currentattribute) {classnameStart = attributeValueStart; classnameFinish = pos;} else if (HTMLAttribute::attribute_style == currentattribute) {styleStart = attributeValueStart; styleFinish = pos;} else if (HTMLAttribute::attribute_href == currentattribute) {hrefStart = attributeValueStart; hrefFinish = pos;} else if (HTMLAttribute::attribute_onclick == currentattribute) {} //META Data control else if (HTMLTag::tag_meta == currenttag) { if (HTMLAttribute::attribute_name == currentattribute) { if (strlicmp("description", attributeValueStart)) tm = metaDescription; else if (strlicmp("keywords", attributeValueStart)) tm = metaKeywords; } else if (HTMLAttribute::attribute_content == currentattribute) { switch (tm) { case metaDescription: {ct = chunk_description; break;} case metaKeywords: {ct = chunk_keywords; break;} } } } //instruction to create a TextChunk out of this attribute if (ct) { const TextChunk tc = {ct, attributeValueStart, pos, pos - attributeValueStart, 0, pos - attributeValueStart, 0, 0, headingTag, hrefStart, hrefFinish, markupareaID, csspath.size(), csspath }; chunks->push_back(tc); //(runs mem copy constructor for line) } currentattribute = 0; //processed } } 
selfenclosedtag = (*(pos-1) == '/') || processingInstruction; //tags that are self-enclosed
pos++; //go one past tag-end (if not EOF) //) linking in the CSS hierarchy for calculating block level elements //need to link in the id and class attributes if (!jumpTag && !selfenclosedtag && currenttag && !processingInstruction) { if (starttag) { //going deeper cssinstance cssi = {currenttag, cssidStart, cssidFinish, classnameStart, classnameFinish, styleStart, styleFinish}; csspath.push_back(cssi); } else { //returning up a level (not necessarily valid XHTML img, br etc. so check levels) //

: need to clear back down to the div level, ignoring the (not) img close //
: this will fail, falling back to the last span if (csspath.size()) { //... > div > img > br do { droptag = csspath.back().t; //br (copy) csspath.pop_back(); //... > div > img } while (csspath.size() && droptag != currenttag); //br != div } } lasttag = currenttag; adjacentTagBefore = currenttag; } was = 0; } else if (c == '&') { //ignore entities (confuses alphanumerics count on small text chunks) //entities are translated later on when creating the DBObjects(s) while ((c = *++pos) && (isAlphaNumeric(c) || is(c, "#;"))) 000; //skip to entity-end or EOF } else { //in a normal text area (outside of a tag) if (isWordStart(c, was) && chunkStart) wordCount++; //word counting (ignore initial white-space, illegal chars EOF check done already in while condition) if (isSentenceEnd(was, c) && chunkStart) sentences++; //sentence counting (ignore initial dot space combos) if (isAlphaNumeric(c)) alphanumerics++; if (isGrammar(c)) grammars++; if (!chunkStart) chunkStart = pos; //start new sentence if not one in progress pos++; } //do not skip to the next < tag start here because we need to count words and alphanumeric if (!currenttag) adjacentTagBefore = 0; if (!pos) pos = bodyend; was = c; } return chunks->size(); } const char *HTMLParser::debugbody(vector *chunks, vector *selectedzones, const char *body, const size_t bodysize) const { //caller frees result if not 0 vector::const_iterator i; //general chunks iterator vector::const_iterator iChunksBegin = chunks->begin(); vector::const_iterator iChunksEnd = chunks->end(); vector::const_iterator iCSS; //output body to replace the current body (with markup) const char *newbody = (const char*)mallocCheck(5 * 1024 * 1024); char *newpos = (char*) newbody; //also output the graphs at this stage (expandable) size_t lastMarkupAreaID = 0; //so we know when it changes, to change colour bool bColor = false; //DHTML container strcpy(newpos, "

text chunks

"); while (*newpos) newpos++; //TABLE strcpy(newpos, "\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "); while (*newpos) newpos++; for (i = iChunksBegin; i != iChunksEnd; i++) { //situation of chunk in relation to previous, next and area const TextChunk& tc = *i; //TR and area change if (lastMarkupAreaID != tc.m_markupAreaID) bColor = !bColor; sprintf(newpos, "", (bColor ? "white" : "#d0d0d0")); while (*newpos) newpos++; //codestring strcpy(newpos, ""); while (*newpos) newpos++; //checkbox, text and href strcpy(newpos, ""); while (*newpos) newpos++; //csspath strcpy(newpos, ""); while (*newpos) newpos++; //markup area strcpy(newpos, ""); while (*newpos) newpos++; //depth strcpy(newpos, ""); while (*newpos) newpos++; //textLength strcpy(newpos, ""); while (*newpos) newpos++; //alphanumerics strcpy(newpos, ""); while (*newpos) newpos++; //grammars strcpy(newpos, ""); while (*newpos) newpos++; //wordCount strcpy(newpos, ""); while (*newpos) newpos++; //sentences strcpy(newpos, ""); while (*newpos) newpos++; //bytes //strcpy(newpos, ""); while (*newpos) newpos++; //text/byte density strcpy(newpos, ""); while (*newpos) newpos++; //a1/byte density strcpy(newpos, ""); while (*newpos) newpos++; //word/byte density strcpy(newpos, ""); while (*newpos) newpos++; //include? strcpy(newpos, ""); while (*newpos) newpos++; //graph - plot all the values! strcpy(newpos, ""); while (*newpos) newpos++; lastMarkupAreaID = tc.m_markupAreaID; } strcpy(newpos, "
codechunkcss
path
MUAdepthtext
len
a1grwrd
cnt
sentbyte
dens
a1
dens
word
dens
inc?graph
"); while (*newpos) newpos++; newpos = chunkcode(tc, newpos); strcpy(newpos, " "); while (*newpos) newpos++; sprintf(newpos, "[%u]", i - iChunksBegin); while (*newpos) newpos++; if (tc.m_headingTag) {sprintf(newpos, "<%s style=display:inline>", tc.m_headingTag->text()); while (*newpos) newpos++;} if (tc.m_hrefStart) { strcpy(newpos, ""); while (*newpos) newpos++; } switch (tc.m_type) { case chunk_image: { strcpy( newpos, ""); while (*newpos) newpos++; break; } case chunk_alt: { strcpy( newpos, "(alt):"); while (*newpos) newpos++; strncpy(newpos, tc.m_start, tc.textLength()); newpos += tc.textLength(); //does not include zero terminator break; } default: { strncpy(newpos, tc.m_start, tc.textLength()); newpos += tc.textLength(); //does not include zero terminator break; } } if (tc.m_hrefStart) {sprintf(newpos, ""); while (*newpos) newpos++;} if (tc.m_headingTag) {sprintf(newpos, " (%s)", tc.m_headingTag->text(), tc.m_headingTag->text()); while (*newpos) newpos++;} strcpy(newpos, ""); while (*newpos) newpos++; if (!lastMarkupAreaID || lastMarkupAreaID != tc.m_markupAreaID) {sprintf(newpos, "[%u] ", tc.m_markupAreaID); while (*newpos) newpos++;} strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_depth); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.textLength()); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_alphanumerics); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_grammars); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_wordCount); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.m_sentences); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; //sprintf(newpos, "%u", bytesLength); while (*newpos) newpos++; //strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, 
"%hi%%", tc.textDensity()); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%hi%%", tc.a1Density()); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "%u", tc.wordDensity()); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "", (tc.a1Density() >= 40 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.m_wordCount >= 4 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.m_wordCount >= 20 ? "checked" : "")); while (*newpos) newpos++; sprintf(newpos, "", (tc.m_sentences >= 2 ? "checked" : "")); while (*newpos) newpos++; strcpy(newpos, ""); while (*newpos) newpos++; sprintf(newpos, "
 
", tc.a1Density()); while (*newpos) newpos++; sprintf(newpos, "
 
", 50); while (*newpos) newpos++; strcpy(newpos, "
"); while (*newpos) newpos++; //------------------------------------------------------------- 5th pass: output the relevant text(s) strcpy(newpos, "

selectedzones

"); while (*newpos) newpos++; vector::const_iterator iZone; lastMarkupAreaID = 0; for (iZone = selectedzones->begin(); iZone != selectedzones->end(); iZone++) { const selectedzone& sz = *iZone; //output chunks (need to output the last chunk too) i = iChunksBegin + (sz.firstChunk - 1); do { const TextChunk& tc = *i++; if (tc.m_headingTag) {sprintf(newpos, "<%s>", tc.m_headingTag->text()); while (*newpos) newpos++;} if (tc.m_hrefStart) { strcpy(newpos, ""); while (*newpos) newpos++; } switch (tc.m_type) { case chunk_image: { strcpy( newpos, ""); while (*newpos) newpos++; break; } default: { strncpy(newpos, tc.m_start, tc.textLength()); newpos += tc.textLength(); //does not include zero terminator break; } } if (tc.m_hrefStart) {sprintf(newpos, ""); while (*newpos) newpos++;} if (tc.m_headingTag) {sprintf(newpos, " (%s)", tc.m_headingTag->text(), tc.m_headingTag->text()); while (*newpos) newpos++;} lastMarkupAreaID = tc.m_markupAreaID; } while (i <= iChunksBegin + sz.lastChunk); strcpy(newpos, "
"); while (*newpos) newpos++; //zone delimiter } //------------------------------------------------------------- output origonal body //strcpy(newpos, body); //includes zero terminator //newpos += bodysize; *newpos = 0; return newbody; }