#include "HTTP.h"

/**
 * Builds an HTTP protocol handler.
 *
 * Every argument is forwarded unchanged to the Protocol base class; the
 * HTTP-specific header state (header/body positions, parsed header values,
 * session ids) starts out empty.
 */
HTTP::HTTP(TIPsDatabase *_db, DomainConnection *_dc, InternetURIRequest *_ir, InternetResource **_resource, char *_buffer, const size_t _buffersize, PersistentData *_permData, const bool _manageBuffer):
  Protocol(_db, _dc, _ir, _resource, _buffer, _buffersize, _permData, _manageBuffer),
  m_headerEnd(0),               //0 also indicates that the header has not yet been retrieved
  m_bodyStart(0),
  //specific headers (or not)
  m_httpResponseCode(0),
  m_contentLength(0),
  m_lastModifiedStr(0),
  m_location(0),
  m_connectionClose(false),
  //text -> enum fields
  m_charSetText(0),
  m_contentTypeText(0),
  m_contentLanguageText(0),
  m_responseProtocol(unknownResponseProtocol),
  //persistent data (will be overwritten by last protocol if sent)
  m_PHPsessionID(0),
  m_ASPsessionID(0)
{}

/**
 * Called when the peer closed the connection while we were reading.
 *
 * Four cases, keyed on whether the close was announced (m_connectionClose)
 * and whether any bytes have arrived (m_totalBytesTransfered):
 *  - data received: stop listening, finalise and finish the conversation
 *    (with the result of finalise() if the close was expected, with `some`
 *    if it was not);
 *  - no data received: queue the request again for writing.
 * In every case a reconnect is started first.
 *
 * @param bufspace remaining buffer space; only used for diagnostics
 */
void HTTP::connectionClosedOnRead(const size_t bufspace) {
#ifdef _DEBUG
  //if it was not supposed to close the connection or no reply had been received:
  if (!m_connectionClose || !m_totalBytesTransfered) {
    const char *endSchemeNames[] = {"unknown", "notStated", "chunkedEncoding", "closeConnection", "contentLength"};
    const char *protocolNames[]  = {"unknown", "HTTP 1.0", "HTTP 1.1"};
    DEBUGERROR("[%s]: connection type (for bad drop) [%s,%s,%s,%s,%s,%s]",
      m_domain,
      protocolNames[m_responseProtocol],
      (m_contentEncoding == chunked ? "chunked" : "-"),
      (m_connectionClose ? "close" : "-"),
      (m_connectionKeepAlive ? "keep-alive" : "-"),
      (m_contentLength ? "length" : "-"),
      endSchemeNames[m_conversationEndScheme]
    );
  }
#endif

  //the format strings use %u, so size_t values are narrowed explicitly
  const unsigned int spaceLeft = (unsigned int)bufspace;

  if (m_connectionClose) {
    //expected connection close
    if (m_totalBytesTransfered) {
      //with data: ok
      DEBUGPRINT("[%s]: [%s]: dropped connection whilst trying read, (expected with %u bytes, %u buffer space left)", DEBUG_LINE, m_domain, m_ir->absoluteURL(), (unsigned int)m_totalBytesTransfered, spaceLeft);
      m_status = reconnecting;         //set to reconnecting, even if already connected, to start listening for write events
      m_dc->connect();                 //start reconnection immediately while current page is being analysed
      m_dc->waitForNothing();          //set DC mode to nothing so that no more events come through
      finishConversation(finalise());  //release the getResource() call which will delete this Protocol
    } else {
      //without data: re-try write
      DEBUGERROR("[%s]: [%s]: dropped connection whilst trying read, (expected, but 0 bytes recieved, %u buffer space left)", m_domain, m_ir->absoluteURL(), spaceLeft);
      m_status = reconnecting;
      m_dc->connect();
      m_dc->waitForWrite(generateRequest()); //generates in the owners buffer so no free
    }
  } else {
    //unexpected connection close
    if (m_totalBytesTransfered) {
      //with data: continue
#ifndef PROTOCOL_HIDELOSTCONNS
      DEBUGERROR("[%s]: [%s]: dropped connection whilst trying read, (unexpected with %u bytes, %u buffer space left)", m_domain, m_ir->absoluteURL(), (unsigned int)m_totalBytesTransfered, spaceLeft);
#endif
      m_status = reconnecting;
      m_dc->connect();
      m_dc->waitForNothing();
      finalise();
      finishConversation(some);        //release the getResource() call which will delete this Protocol
    } else {
      //without data: re-try write
#ifndef PROTOCOL_HIDELOSTCONNS
      DEBUGERROR("[%s]: [%s]: dropped connection whilst trying read, (unexpected with 0 bytes recieved, %u buffer space left)", m_domain, m_ir->absoluteURL(), spaceLeft);
#endif
      m_status = reconnecting;
      m_dc->connect();
      m_dc->waitForWrite(generateRequest());
    }
  }
}

//---------- finalise parts

/**
 * Records what kind of HTTP conversation was held on the Domain, turns a
 * Location header (if any) into a link, then runs Protocol::finalise().
 *
 * A Location that is an absolute http(s) URL for another host is ignored.
 *
 * @return the result of Protocol::finalise()
 */
Protocol::conversationResult HTTP::finalise() {
  DEBUGPRINT("[%s]: finalising the IR", DEBUG_LINE, m_domain);

  //tell the Domain what type of HTTP conversations are being had
  if (m_contentEncoding == chunked) m_ir->domain()->addConversationEndScheme(chunkedEncoding);
  if (m_connectionClose)            m_ir->domain()->addConversationEndScheme(closeConnection);
  if (m_contentLength)              m_ir->domain()->addConversationEndScheme(contentLength);
  m_ir->domain()->addResponseProtocol(m_responseProtocol);
  m_ir->domain()->addKeepAlive();

  //302 Location directs (the Spider will process these):
  if (m_location) {
    //parameter parts for incomplete urls (protocol:\1 domain:\2 port:\3 href:\4 folder:\5 file:\6 query:\7)
    //none of these parameters are malloc'd and we own the m_ir and domain
    StringMap parameters;
    const char *finallink = 0;
    parameters.insert(make_pair("DOMAIN",       m_ir->domain()->m_domain));
    parameters.insert(make_pair("PROTOCOL",     m_ir->protocolText()));
    parameters.insert(make_pair("FOLDER",       m_ir->folder()));
    parameters.insert(make_pair("FILE",         m_ir->file()));
    parameters.insert(make_pair("RELATIVEHREF", m_ir->relativeHREF()));
    InternetResource::m_fgConc.replace(m_location, &parameters, &finallink);

    //guard: nothing to inspect if the replacement produced no string
    if (finallink) {
      //find the host part of an absolute link
      const char *domainStart = 0;
      if      (!_STRNCASECMP(finallink, "http://",  7)) domainStart = finallink + 7;
      else if (!_STRNCASECMP(finallink, "https://", 8)) domainStart = finallink + 8;

      //an absolute link is "ours" only if the host is exactly m_domain:
      //the domain text must match AND be followed by the end of the host part,
      //otherwise "example.com.evil.org" would pass as "example.com"
      bool otherDomain = false;
      if (domainStart) {
        const size_t domainLength = strlen(m_domain);
        if (_STRNCASECMP(domainStart, m_domain, domainLength)) {
          otherDomain = true;
        } else {
          const char next = domainStart[domainLength];
          otherDomain = !(next == 0 || next == '/' || next == ':' || next == '?' || next == '#');
        }
      }

      if (otherDomain) {
        //it is an absolute link for another domain, ignore it
        DEBUGPRINT("[%s]: %s -> %s", DEBUG_LINE, m_domain, finallink, "(ignored: other domain)");
      } else {
        //link for this domain: add it
        DEBUGPRINT("[%s]: protocol links += [%s]", DEBUG_LINE, m_domain, finallink);
        m_links.push_back(_STRDUP(finallink));
      }

      //the replacement result is only freed when it is not m_location itself
      if (m_location != finallink) {free((void*)finallink); finallink = 0;}
    }
  }

  //now run the standard content decode, charset decode, etc
  return Protocol::finalise();
}

/**
 * Copies the session ids seen in this response, and the URL just fetched
 * (as the next referrer), into the caller's PersistentData struct.
 * Existing strings in the struct are freed before being replaced.
 * Does nothing when no PersistentData was supplied.
 *
 * @return always ok
 */
Protocol::conversationResult HTTP::updatePersistentData() {
  if (!m_permData) return ok;

  DEBUGPRINT("[%s]: Updating PermData", DEBUG_BLOCKSTART, m_domain);

  if (m_ASPsessionID) {
    //server has replied with an ASP session ID: store it in the callers struct
    DEBUGPRINT("[%s]: Updating ASP Session ID to [%s]", DEBUG_LINE, m_domain, m_ASPsessionID);
    if (m_permData->m_ASPsessionID) free(m_permData->m_ASPsessionID);
    m_permData->m_ASPsessionID = _STRDUP(m_ASPsessionID);
  }

  if (m_PHPsessionID) {
    //server has replied with a PHP session ID: store it in the callers struct
    DEBUGPRINT("[%s]: Updating PHP Session ID to [%s]", DEBUG_LINE, m_domain, m_PHPsessionID);
    if (m_permData->m_PHPsessionID) free(m_permData->m_PHPsessionID);
    m_permData->m_PHPsessionID = _STRDUP(m_PHPsessionID);
  }

  //there is always a valid m_ir->absoluteURL()
  DEBUGPRINT("[%s]: Updating referrer to [%s]", DEBUG_LINE, m_domain, m_ir->absoluteURL());
  if (m_permData->m_referrer) free(m_permData->m_referrer);
  m_permData->m_referrer = _STRDUP(m_ir->absoluteURL());

  DEBUGPRINT0("", DEBUG_BLOCKEND);
  return ok;
}

/**
 * Makes the body valid UTF-8, in place, after Protocol::charsetDecode().
 *
 * When the charset is unknown or not stated it is guessed from the body:
 * no byte > 127 -> us_ascii; a lone byte > 127 -> iso_8859_1; only runs of
 * two or more bytes > 127 -> utf_8.
 * ISO-8859-1/15 bodies are not really transcoded yet: every byte > 127 is
 * replaced by '?' as a temporary way of avoiding invalid UTF-8.
 *
 * @return the base class result if that was not ok; noBody when the header
 *         has not been parsed; otherwise ok
 */
Protocol::conversationResult HTTP::charsetDecode() {
  //always decodes the set to UTF-8
  Protocol::conversationResult ret = Protocol::charsetDecode();
  if (ret != ok) return ret;

  ret = unknownCharset;
  if (!m_bodyStart) {
    DEBUGERROR("[%s]: charsetDecode: [%s] no body start/header", m_domain, m_ir->absoluteURL());
    return noBody;
  }

  char *const bodyEnd = m_buffer + m_totalBytesTransfered;

  if (m_charSet == unknownCharSet || m_charSet == notStatedCharSet) {
    //guess charset from content
    //default to ascii if there are no bytes > 127
    //however if there are any isolated bytes > 127 then assume single byte ISO-8859
    //if all bytes > 127 are part of a multi-byte sequence then assume UTF-8
    m_charSet = us_ascii;
    for (char *pos = m_bodyStart; pos < bodyEnd; pos++) {
      if (!(*pos & 128)) continue;       //most significant bit clear: plain ascii

      ++pos;                             //look at the byte after the non-ascii one
      if (pos < bodyEnd && (*pos & 128)) {
        m_charSet = utf_8;               //multi byte non-ascii sequence
        //skip the rest of the sequence without leaving the received data
        //(the for-loop pos++ then also skips the ascii byte that ended it)
        do { ++pos; } while (pos < bodyEnd && (*pos & 128));
        //continue looking because we may find a single byte > 127 somewhere
      } else {
        m_charSet = iso_8859_1;          //found a single-byte sequence
        break;                           //stop looking because it cannot be utf-8
      }
    }
  }

  switch (m_charSet) {
    case iso_8859_15:
    case iso_8859_1: {
      //assume iso_8859_1 which is the default for web servers that do not state the charset
      //in that: web servers that give out utf-8 should be savvy enough to state it as well
      //the iso_8859_1 -> utf-8 translation may make the string bigger (maximum twice the size in european languages)
      //however we have a big reserved buffer space that should be able to handle the extension
      DEBUGPRINT("[%s]: Decode iso_8859_1 -> utf-8 (temp -> ?)", DEBUG_LINE, m_domain);
      for (char *pos = m_bodyStart; pos < bodyEnd; pos++) {
        //a byte with the top bit set is an iso_8859_1 specific character that
        //would need encoding; TODO: real transcoding (needs the body shifted forward)
        if (*pos & 128) *pos = '?';      //temp solution to invalid UTF-8 sequence
      }
      ret = ok;
      break;
    }
    case windows_1252:
    case us_ascii:                       //ascii is valid utf-8 already
    case utf_8:
    default: {
      //input already in utf-8 or valid equal, no conversion required
      ret = ok;
      break;
    }
  }

  return ret;
}

/**
 * Undoes the transfer encoding of the body, in place, after
 * Protocol::contentDecode().
 *
 * For chunked bodies each chunk's data is moved down over the chunk-size
 * fields so that the body becomes one contiguous, 0 terminated string.
 * The layout assumed by the offsets below is
 *   <size field> CRLF <data> CRLF ... 0 CRLF
 *
 * @return ok on success; noBody when the header has not been parsed;
 *         contentDecodeFailed when the chunk list could not be walked to its
 *         terminating 0-size chunk; unknownContentencoding for an
 *         unrecognised encoding
 */
Protocol::conversationResult HTTP::contentDecode() {
  Protocol::conversationResult ret = Protocol::contentDecode();
  if (ret != ok) return ret;

  switch (m_contentEncoding) {
    case chunked: {
      //this encoding reduces the size of the body
      //thus no extra space is malloc'd, the same m_body is used
      ret = contentDecodeFailed;
      if (!m_bodyStart) {
        DEBUGERROR("[%s]: contentDecode: no body start/header", m_domain);
        return noBody;
      }

      char *chunkStart   = m_bodyStart;          //start of the current chunk-size field
      char *bodyWritePos = m_bodyStart;          //where the next piece of decoded body goes
      char *endOfBuffer  = m_buffer + m_totalBytesTransfered;
      size_t chunkLength = 0, sizeFieldLength = 0;
      bool moreChunks;

      DEBUGPRINT("[%s]: analysis of chunked transfer encoding [%u]", DEBUG_BLOCKSTART, m_ir->domain()->m_domain, m_totalBytesTransfered);
      do {
        moreChunks = false;
        //hex field indicating chunk size
        if (!chunkSize(chunkStart, &chunkLength, &sizeFieldLength)) {
          DEBUGPRINT("[%s]: Chunk Size [%u]", DEBUG_LINE, m_ir->domain()->m_domain, chunkLength);
          if (chunkLength) {
            if (chunkStart + chunkLength + sizeFieldLength + 2 < endOfBuffer) {
              //source and destination overlap once a chunk is longer than its
              //size field, so memmove (not memcpy) is required here
              memmove(bodyWritePos, chunkStart + sizeFieldLength + 2, chunkLength);
              bodyWritePos += chunkLength;
              chunkStart   += chunkLength + sizeFieldLength + 4;  //size field, CRLF, data, CRLF
              moreChunks = true;
            } else {
              DEBUGPRINT("[%s]: buffer overrun!", DEBUG_LINE, m_ir->domain()->m_domain);
            }
          } else {
            *bodyWritePos = 0;           //0-size chunk: 0 terminate the body
            ret = ok;
          }
        } else {
          DEBUGPRINT("[%s]: can't get end of chunk size field\n[%s]", DEBUG_LINE, m_ir->domain()->m_domain, chunkStart - 30);
        }
      } while (moreChunks);
      DEBUGPRINT0("", DEBUG_BLOCKEND);
      //previously this case fell through into the next one, which overwrote
      //every decode failure with ok
      break;
    }
    case notStatedContentEncoding:
    case noContentEncoding: {
      ret = ok;
      break;
    }
    case unknownContentEncoding:
    default: {
      ret = unknownContentencoding;
      break;
    }
  }

  return ret;
}

/**
 * Creates the InternetResource subclass matching the response Content-Type
 * (XMLPage, HTMLPage or JavaScript) and stores it in *m_resource.
 * The resource is given m_bodyStart; for any other content type *m_resource
 * is set to 0.
 *
 * @return ok when a resource was created, unknownContenttype otherwise
 */
Protocol::conversationResult HTTP::createInternetResource() {
  //finishedRead(), checkForEOS() and then finalise() have been called.
  //Update the InternetResource that we were waiting for
  //InternetResource does not copy the body
  //XML type is required for the XMLHTTPDatabase
  InternetResource *resource = 0;

  if (ContentType::application_xml == m_contentType) {
    resource = new XMLPage(m_db, m_ir, m_bodyStart, m_httpResponseCode, m_lastModifiedStr, m_manageBuffer);
  } else if (ContentType::text_html == m_contentType) {
    resource = new HTMLPage(m_db, m_ir, m_bodyStart, m_httpResponseCode, m_lastModifiedStr, m_manageBuffer);
  } else if (ContentType::application_x_javascript == m_contentType) {
    resource = new JavaScript(m_db, m_ir, m_bodyStart, m_httpResponseCode, m_lastModifiedStr, m_manageBuffer);
  } else {
    DEBUGPRINT("[%s]: unhandled type [%s]", DEBUG_LINE, m_domain, m_contentTypeText);
  }

  *m_resource = resource;
  if (!resource) return unknownContenttype;
  return ok;
}

//---------- text -> enum
//Each of these maps header text to an enum value (case-insensitive, whole
//string). A 0 pointer maps to the "not stated" value; unrecognised text is
//reported to the Domain and maps to the "unknown" value.

/** Maps a compression name ("deflate", "gzip") to its enum value. */
HTTP::compression HTTP::textCompressionToEnum(const char *compression) const {
  if (!compression) return notStatedCompression;
  if (!_STRCASECMP(compression, "deflate")) return deflate;
  if (!_STRCASECMP(compression, "gzip"))    return gzip;

  m_ir->domain()->addUnknownCompression(compression);
  return unknownCompression;
}

/** Maps a Transfer-Encoding value ("chunked") to its enum value. */
HTTP::contentEncoding HTTP::textContentEncodingToEnum(const char *contentEncoding) const {
  if (!contentEncoding) return notStatedContentEncoding;
  if (!_STRCASECMP(contentEncoding, "chunked")) return chunked;

  m_ir->domain()->addUnknownContentEncoding(contentEncoding);
  return unknownContentEncoding;
}

/** Maps a Content-Language value ("en-GB") to its enum value. */
contentLanguage HTTP::textContentLanguageToEnum(const char *language) const {
  if (!language) return notStatedLanguage;
  if (!_STRCASECMP(language, "en-GB")) return en_GB;

  m_ir->domain()->addUnknownLanguage(language);
  return unknownContentLanguage;
}

/**
 * Maps the protocol token of the status line ("HTTP/1.0", "HTTP/1.1") to its
 * enum value. A 0 pointer gives unknownResponseProtocol without reporting it.
 */
HTTP::responseProtocol HTTP::textResponseProtocolToEnum(const char *protocolText) const {
  if (!protocolText) return unknownResponseProtocol;
  if (!_STRCASECMP(protocolText, "HTTP/1.0")) return http1_0;
  if (!_STRCASECMP(protocolText, "HTTP/1.1")) return http1_1;

  m_ir->domain()->addUnknownResponseProtocol(protocolText);
  return unknownResponseProtocol;
}

/**
 * Looks up the ContentType object for a MIME type string via
 * ContentType::contentType(). Never returns 0: a 0 pointer gives
 * notStatedContentType, an unrecognised string gives unknownContentType
 * (and is reported to the Domain).
 */
ContentType *HTTP::textContentTypeToEnum(const char *contentTypeText) const {
  if (!contentTypeText) return &ContentType::notStatedContentType;

  ContentType *found = ContentType::contentType(contentTypeText);
  if (found) return found;

  m_ir->domain()->addUnknownContentType(contentTypeText);
  return &ContentType::unknownContentType;
}

/**
 * Maps a charset name or alias to its enum value.
 * Alias list taken from the IANA character set registry:
 * http://www.iana.org/assignments/character-sets
 */
HTTP::charSet HTTP::textCharsetToEnum(const char *charSet) const {
  //the parameter hides the enum type name, hence the qualified HTTP::charSet
  struct CharSetAlias {
    const char   *name;
    HTTP::charSet value;
  };
  static const CharSetAlias aliases[] = {
    //utf-8
    {"UTF8", utf_8}, {"UTF-8", utf_8}, {"UTF_8", utf_8},
    //iso-8859-1
    {"ISO_8859-1:1987", iso_8859_1}, {"iso-ir-100", iso_8859_1}, {"ISO_8859-1", iso_8859_1},
    {"iso-8859-1", iso_8859_1}, {"iso8859-1", iso_8859_1}, {"latin1", iso_8859_1},
    {"l1", iso_8859_1}, {"IBM819", iso_8859_1}, {"CP819", iso_8859_1}, {"csISOLatin1", iso_8859_1},
    //iso_8859_15
    {"iso-8859-15", iso_8859_15},
    //windows_1252
    {"windows-1252", windows_1252},
    //ascii
    {"ANSI_X3.4-1968", us_ascii}, {"iso-ir-6", us_ascii}, {"ANSI_X3.4-1986", us_ascii},
    {"ISO_646.irv:1991", us_ascii}, {"ASCII", us_ascii}, {"ISO646-US", us_ascii},
    {"US-ASCII", us_ascii}, {"us", us_ascii}, {"IBM367", us_ascii}, {"cp367", us_ascii},
    {"csASCII", us_ascii}
  };

  if (!charSet) return notStatedCharSet;

  const size_t aliasCount = sizeof(aliases) / sizeof(aliases[0]);
  for (size_t i = 0; i < aliasCount; i++) {
    if (!_STRCASECMP(charSet, aliases[i].name)) return aliases[i].value;
  }

  m_ir->domain()->addUnknownCharSet(charSet);
  return unknownCharSet;
}

//---------- stream events

/**
 * Called after each successful read of `bytes` bytes into m_buffer.
 *
 * The first time the end of the header ("\r\n\r\n") is present in the buffer
 * the header is parsed IN PLACE: line ends, the status-line separators and
 * the Content-Type ';' are overwritten with 0 so that header values can be
 * used as C strings pointing into the buffer. From the parsed values the
 * conversation end scheme is chosen. After every read checkForEOS() decides
 * whether the response is complete, in which case the conversation is
 * finalised and finished.
 *
 * Only acts when m_status is written or reading.
 *
 * NOTE(review): m_connectionKeepAlive is only assigned here when a
 * Connection header is present but is read unconditionally below - confirm
 * that it is initialised elsewhere.
 *
 * @param bytes number of bytes just read
 */
void HTTP::finishedRead(const int bytes) {
  //the read has finished: by default assume a single request -> response paradigm and finalise the IR on EOS
  switch (m_status) {
    case written:
    case reading: {
      m_totalBytesTransfered += bytes;

#ifdef _DEBUG
      {
        //show the last 15 chars in the buffer but remove spurious chars (videos etc.)
        //(with fewer than 15 bytes received the whole buffer is the tail)
        char *debug = _STRDUP(m_totalBytesTransfered > 15 ? m_buffer + m_totalBytesTransfered - 15 : m_buffer);
        if (debug) {
          for (char *pos = debug; *pos; pos++) {
            if (*pos < 32 && *pos != 13 && *pos != 10) *pos = 32;
          }
          //DEBUGPRINT("[%s]: HTTP %i/%u bytes recieved [%s]", DEBUG_LINE, m_domain, bytes, m_totalBytesTransfered, debug);
          free(debug);
        }
      }
#endif

      if (!m_headerEnd && m_totalBytesTransfered > 12) {
        m_headerEnd = (char*)strstr((const char*)m_buffer, "\r\n\r\n"); //returns NULL (0) if not found
        if (m_headerEnd) {
          m_headerEnd[2] = 0; //null terminate header (after the last custom header because we want to have a full header with CRLF at the end)
          m_bodyStart = m_headerEnd + strlen("\r\n\r\n"); //already null terminated

          //header line positions, all zeroed
          const char *headerLineStart = 0, *headerLineValue = 0, *responseCode = 0;
          char *space = 0, *headerLineFinish = 0;
          size_t valueLength = 0;

          DEBUGPRINT("[%s]: header:[\n%s]", DEBUG_LINE, m_domain, m_buffer);

          //first header line: protocol responseCode text e.g. HTTP/1.1 200 OK
          headerLineFinish = strstr(m_buffer, "\r\n");
          if (headerLineFinish) {
            *headerLineFinish = 0;
            space = strchr(m_buffer, ' ');
            if (space) {
              *space = 0;                          //end of protocol
              responseCode = ++space;
              space = strchr(space, ' ');
              if (space) *space = 0;               //end of response code, start of text response
              //the reason text is optional: parse the code whether or not it follows
              m_httpResponseCode = atoi(responseCode);
              m_responseProtocol = textResponseProtocolToEnum(m_buffer);
            }
          }

          //other header information
          m_contentEncoding = notStatedContentEncoding;
          m_compression     = notStatedCompression;
          while (headerLineFinish) {
            //headerLineFinish is on the (zeroed) '\r': +1 is the '\n', the
            //pre-increment below then lands on the first char of the next line
            headerLineStart = headerLineFinish + 1;
            while (*++headerLineStart == ' ') {}   //skip leading spaces
            headerLineFinish = (char*)strstr(headerLineStart, "\r\n");
            if (!headerLineFinish) break;
            *headerLineFinish = 0;

            headerLineValue = strchr(headerLineStart, ':');
            if (!headerLineValue) continue;
            while (*++headerLineValue == ' ') {}   //skip the ':' and leading spaces
            if (!(headerLineValue < headerLineFinish && headerLineValue > headerLineStart)) continue;

            valueLength = headerLineFinish - headerLineValue; //without trailing 0 terminator

            //the following values are calculated primitive (int/bool)
            if (!_STRNCASECMP(headerLineStart, "Connection", 10)) {
              m_connectionClose     = (!_STRNCASECMP(headerLineValue, "close", 5));
              m_connectionKeepAlive = (!_STRNCASECMP(headerLineValue, "Keep-alive", 10));
            }
            else if (!_STRNCASECMP(headerLineStart, "Transfer-Encoding", 17)) m_contentEncoding = textContentEncodingToEnum(headerLineValue);
            else if (!_STRNCASECMP(headerLineStart, "Content-Length", 14))    m_contentLength   = atoin(headerLineValue, valueLength);
            else if (!_STRNCASECMP(headerLineStart, "Keep-alive", 10)) {
              //recognised but deliberately ignored: the value is not parsed
              //(m_keepalive is not assigned anywhere in this function)
            }
            //the following are pointers into the buffer area and will be invalid for the next page (reset)
            else if (!_STRNCASECMP(headerLineStart, "Content-Language", 16)) {
              m_contentLanguageText = headerLineValue;
              m_contentLanguage     = textContentLanguageToEnum(m_contentLanguageText);
            }
            else if (!_STRNCASECMP(headerLineStart, "Last-Modified", 13)) m_lastModifiedStr = headerLineValue;
            else if (!_STRNCASECMP(headerLineStart, "Location", 8))       m_location        = headerLineValue;
            else if (!_STRNCASECMP(headerLineStart, "Content-Type", 12)) {
              m_contentTypeText = headerLineValue;
              char *semicolon = (char*) strchr(headerLineValue, ';'); //check for 2 parameters
              if (semicolon) {
                //Content-Type maybe has two elements (MIMEType and charset separated by ;)
                m_charSetText = semicolon;
                while (*++m_charSetText == ' ') {} //skip leading spaces after ;
                if (!_STRNCASECMP(m_charSetText, "charset", 7)) {
                  m_charSetText = strchr(m_charSetText, '=');
                  if (m_charSetText) {
                    while (*++m_charSetText == ' ') {} //skip leading spaces after =
                    m_charSet = textCharsetToEnum(m_charSetText);
                  }
                }
                do *semicolon-- = 0; while (*semicolon == ' '); //finish the MIMEType (removing whitespace)
              }
              //content type enum
              m_contentType = textContentTypeToEnum(m_contentTypeText);
            }
            //the following values are persistent between pages and need to be copied/freed etc.
            else if (!_STRNCASECMP(headerLineStart, "Set-Cookie", 10)) {
              if (!_STRNCASECMP(headerLineValue, "PHPSESSID=", 10)) m_PHPsessionID = headerLineValue + 10;
              //Set-Cookie: ASPSESSIONIDCQSQBDBQ=LPBNONIBAGOLLBDHNFPIEADC; path=/
              if (!_STRNCASECMP(headerLineValue, "ASPSESSIONID", 12)) m_ASPsessionID = headerLineValue + 12;
            }
          }

          //HTTP conversation end scheme
          //connection close has highest priority because we MUST get the server to active close from its end first
          //to avoid closing at this end first and going into a TIMED_WAIT situation with stuff in the write buffer
          //see DomainConnection.h commenting on TCP basics
          if (m_responseProtocol == http1_0 && !m_connectionKeepAlive) m_connectionClose = true;
          if      (m_connectionClose)            m_conversationEndScheme = closeConnection;
          else if (m_contentEncoding == chunked) m_conversationEndScheme = chunkedEncoding;
          else if (m_contentLength)              m_conversationEndScheme = contentLength;
          else {
            m_conversationEndScheme = unknownConversationEndScheme;
            m_ir->domain()->addUnknownConversationEndScheme(m_ir->absoluteURL());
          }

#ifdef _DEBUG
          //header settings verification
          const char *charsetNames[]         = {"unknown", "notStated", "us_ascii", "iso_8859_1", "iso_8859_15", "windows_1252", "utf_8"};
          const char *languageNames[]        = {"unknown", "notStated", "en-GB"};
          const char *compressionNames[]     = {"unknown", "notStated", "none", "defalte", "gzip"};
          const char *contentEncodingNames[] = {"unknown", "notStated", "none", "chunked"};
          const char *protocolNames[]        = {"unknown", "HTTP 1.0", "HTTP 1.1"};
          const char *endSchemeNames[]       = {"unknown", "notStated", "chunkedEncoding", "closeConnection", "contentLength"};
          DEBUGPRINT("[%s]: Header:(URL:%s)\n"
            " Protocol:%s\n"
            " ReturnCode:%i\n"
            " Content-Language:%s (%s)\n"
            " Keep-alive:%u\n"
            " ConnectionKeepAlive:%i\n"
            " CloseConnection:%i\n"
            " Content-Encoding:%s\n"
            " Compression:%s\n"
            " Length:%i\n"
            " Content-Type:%s (%s)\n"
            " LastMod:%s\n"
            " Location:%s\n"
            " PHPSess:%s\n"
            " ASPSess:%s\n"
            " CharSet:%s (%s)\n"
            " ConversationEndScheme:%s\n",
            DEBUG_LINE,
            m_domain,
            m_ir->absoluteURL(),
            protocolNames[m_responseProtocol],
            m_httpResponseCode,
            languageNames[m_contentLanguage], m_contentLanguageText,
            m_keepalive,
            m_connectionKeepAlive,
            m_connectionClose,
            contentEncodingNames[m_contentEncoding],
            compressionNames[m_compression],
            m_contentLength,
            (m_contentType ? m_contentType->MIMETypeText() : 0), m_contentTypeText,
            m_lastModifiedStr,
            m_location,
            m_PHPsessionID,
            m_ASPsessionID,
            charsetNames[m_charSet], m_charSetText,
            endSchemeNames[m_conversationEndScheme]
          );
#endif
        }
      }

      if (checkForEOS()) finishConversation(finalise());
      break;
    }
    default:
      break;
  }
}

/**
 * Decides whether the complete response has been received, according to
 * m_conversationEndScheme:
 *  - chunkedEncoding: walks the chunk-size labels seen so far (remembering
 *    the position between calls) until the 0-size chunk is found;
 *  - contentLength: compares the number of body bytes received against
 *    m_contentLength; an over-long body is 0 terminated at m_contentLength;
 *  - anything else: never EOS here (a closing connection is handled in
 *    connectionClosedOnRead()).
 *
 * @return true when the end of the response has been reached
 */
const bool HTTP::checkForEOS() {
  //note that finished read and the header translation is called before this
  bool EOS = false;

  switch (m_conversationEndScheme) {
    case chunkedEncoding: {
      //need to keep track of the current label and size
      //look for the latest, if there is a new one
      char *bufferEnd = m_buffer + m_totalBytesTransfered;
      size_t sizeFieldLength = 0;

      DEBUGPRINT("[%s]: chunked transfer encoding EOS examine", DEBUG_LINE, m_ir->domain()->m_domain);
      if (!m_chunkedCurrentLabelPos) {
        m_chunkedCurrentLabelPos = m_bodyStart;
        m_chunkedLastSize = 1; //fake just to get the loop started
      }

      //look for latest chunk size label (if there is a new one)
      while (m_chunkedLastSize
        && m_chunkedCurrentLabelPos < bufferEnd
        && strstr(m_chunkedCurrentLabelPos, "\r\n")
      ) {
        if (chunkSize(m_chunkedCurrentLabelPos, &m_chunkedLastSize, &sizeFieldLength)) {
          //error reading chunk size: the read probably has not completed the chunk size field
          DEBUGERROR("[%s]: Failed to get chunk size", m_ir->domain()->m_domain);
          break;
        }

        if (m_chunkedLastSize) {
          DEBUGPRINT("[%s]: Chunk Size [%u]", DEBUG_LINE, m_ir->domain()->m_domain, m_chunkedLastSize);
          m_chunkedCurrentLabelPos += m_chunkedLastSize + sizeFieldLength + 4; //size field, CRLF, data, CRLF
        } else {
          DEBUGPRINT("[%s]: Chunk End [0]", DEBUG_LINE, m_ir->domain()->m_domain);
        }
      }

      EOS = (m_chunkedCurrentLabelPos && m_chunkedLastSize == 0); //have we found the 0 chunk size at the end?
      break;
    }
    case contentLength: {
      //check to see if the content-length has been equalled / exceeded
      unsigned int bodyBytesReceived = (unsigned int)m_totalBytesTransfered - (m_bodyStart - m_buffer);
      if (bodyBytesReceived == m_contentLength) {
        DEBUGPRINT("[%s]: Content-length finished", DEBUG_LINE, m_ir->domain()->m_domain);
        EOS = true;
      } else if (bodyBytesReceived > m_contentLength) {
        *(m_bodyStart + m_contentLength) = 0;
        DEBUGERROR("[%s]: Content-length exceeded by [%i], zero terminate and continue", m_ir->domain()->m_domain, bodyBytesReceived - m_contentLength);
        EOS = true;
      }
      break;
    }
    //case closeConnection: the close connection will be spotted and expected, no action necessary here
    //case notStatedConversationEndScheme: headers not received yet
    default: {
      break;
    }
  }

  if (EOS) DEBUGPRINT("[%s]: EOS totalBytesTransfered [%u]", DEBUG_LINE, m_ir->domain()->m_domain, m_totalBytesTransfered);
  return EOS;
}

/**
 * Writes the complete HTTP/1.1 request for m_ir into m_buffer and returns it.
 *
 * A request with parameters becomes a POST with a URL-encoded
 * name=value&name=value body, otherwise a GET. The referrer and session
 * cookies are added from m_permData when present.
 *
 * The result lives in the buffer supplied by the owner of this Protocol, so
 * the caller must not free it.
 *
 * NOTE(review): nothing here checks the amount written against the size of
 * m_buffer; the sprintf/strcpy calls rely on the buffer being large enough.
 *
 * @return m_buffer, holding the 0 terminated request
 */
const char *HTTP::generateRequest() const {
  //NOTE(review): the template arguments of this container were lost in the
  //reviewed copy of this file; they are reconstructed from how the elements
  //are used below - confirm against the declaration of m_parameters
  vector<pair<const char*, const char*> >& parameters = m_ir->m_parameters;
  vector<pair<const char*, const char*> >::const_iterator iParam;
  const bool postrequest = (parameters.size() != 0);
  size_t bodysize = 0;
  const char *name, *value;
  char c;
  char *pos = m_buffer, *bodystart;

  //need the body size for the header
  if (postrequest) {
    for (iParam = parameters.begin(); iParam != parameters.end(); iParam++) {
      //format name=value&name=value&...
      //value may require URL escaping to a larger amount
      //the body can potentially be MBs of length
      name  = iParam->first;
      value = iParam->second;
      bodysize += strlen(name) + 2; //= and &
      bodysize += Stream::URLEncodedLength(value);
    }
    bodysize--; //the last & is not needed (Content-Length is very important here)
  }

  //now assemble the request
  //----------- request type (depends on the body)
  strcpy(pos, postrequest ? "POST" : "GET");
  while (*pos) pos++;
  *pos++ = ' ';

  //----------- URL (with spaces replaced)
  sprintf(pos, "/%s", m_ir->relativeHREF());
  while ((c = *pos) != 0) {
    if (c == ' ') *pos = '+';
    pos++;
  }

  //----------- basic request headers
  sprintf(pos,
    " HTTP/1.1\r\n"
    "Host: %s\r\n"
    "User-Agent: %s\r\n"
    "Accept: %s\r\n"
    "Accept-Language: %s\r\n"
    "Accept-Encoding: %s\r\n"
    "Accept-Charset: %s\r\n"
    "Keep-Alive: %s\r\n"
    "Connection: %s\r\n",
    m_domain,
    USERAGENT,
    ACCEPT,
    ACCEPTLANGUAGE,
    ACCEPTENCODING,
    ACCEPTCHARSET,
    KEEPALIVE,
    CONNECTION
  );
  while (*pos) pos++;

  //extra POST headers
  if (postrequest) {
    sprintf(pos, "Content-Type: %s\r\n", FORMCONTENTTYPE);
    while (*pos) pos++;
    //size_t does not match %u on every platform: print it as unsigned long
    sprintf(pos, "Content-Length: %lu\r\n", (unsigned long)bodysize);
    while (*pos) pos++;
  }

  //----------- optional parts
  if (m_permData) {
    //referrer
    if (m_permData->m_referrer) {
      sprintf(pos, "Referer: %s\r\n", m_permData->m_referrer);
      while (*pos) pos++;
    }
    //sessions
    if (m_permData->m_ASPsessionID) {
      //Server sends:  Set-Cookie: ASPSESSIONIDCQSQBDBQ=LPBNONIBAGOLLBDHNFPIEADC; path=/
      //We send back:  Cookie: ASPSESSIONIDCQSQBDBQ=LPBNONIBAGOLLBDHNFPIEADC; path=/
      sprintf(pos, "Cookie: ASPSESSIONID%s\r\n", m_permData->m_ASPsessionID);
      while (*pos) pos++;
    }
    if (m_permData->m_PHPsessionID) {
      //Server sends:  Set-Cookie: PHPSESSID=LPBNONIBAGOLLBDHNFPIEADC; path=/
      //We send back:  Cookie: PHPSESSID=LPBNONIBAGOLLBDHNFPIEADC; path=/
      sprintf(pos, "Cookie: PHPSESSID=%s\r\n", m_permData->m_PHPsessionID);
      while (*pos) pos++;
    }
  }

  //----------- end of header
  strcpy(pos, "\r\n");
  while (*pos) pos++;

  //----------- body of POST parameters
  if (postrequest) {
    bodystart = pos;
    for (iParam = parameters.begin(); iParam != parameters.end(); iParam++) {
      //format name=value&name=value&...
      name  = iParam->first;
      value = iParam->second;
      strcpy(pos, name);
      while (*pos) pos++;
      *pos++ = '=';
      pos += Stream::URLEncode(value, pos);
      *pos++ = '&';
    }
    *--pos = 0; //remove last &
    DEBUGPRINT("[%s]: actual bodysize [%u]", DEBUG_LINE, m_ir->domain()->m_domain, (unsigned int)(pos - bodystart));
  }

  //not sent yet: If-Modified-Since: %s\r\n (page->m_ifModifiedSince)

  return m_buffer;
}