#include "Streams.h"

using namespace std;

/**
 * File-local helper shared by the two HTML entity routines below.
 *
 * Compacts `string` in place:
 *   - bytes below ' ' (control characters) are removed;
 *   - runs of consecutive spaces are collapsed to a single space;
 *   - '&' (the start of an entity) is currently dropped, because entity
 *     handling has not been implemented yet;
 *   - every other byte, including UTF-8 lead/continuation bytes (>= 0x80),
 *     is copied through unchanged.
 *
 * The result is never longer than the input, so writing over the input is
 * safe. The compacted string is zero terminated.
 *
 * @param string  zero-terminated buffer to compact; must not be null
 */
static void compactPrintableInPlace(char *string) {
  //unsigned so that bytes >= 0x80 compare as > ' ' instead of being treated as
  //negative "control" characters and stripped from UTF-8 text
  unsigned char current;
  unsigned char previous = 0; //0 can never equal ' ', so the first space is always kept
  char *out = string;

  for (const char *in = string; (current = (unsigned char) *in) != 0; in++) {
    if (current < ' ') continue; //control character: drop it, `previous` is left untouched

    if (current == '&') {
      //entity start: TODO translate the entity; for now the '&' is dropped
    } else if (current == ' ' && previous == ' ') {
      //normalise white space: swallow repeated spaces
    } else {
      *out++ = (char) current;
    }
    previous = current;
  }
  *out = 0; //terminate at the new, possibly shorter, length
}

/**
 * Quick test for whether a string looks like it is already URL encoded.
 *
 * Returns true when every character is alphanumeric or one of "%+-_".
 * This is a character-set test only: it does not verify that each '%' is
 * followed by two hex digits. An empty string yields true.
 *
 * @param string  zero-terminated string to inspect; must not be null
 * @return        false as soon as a character outside the allowed set is found
 */
bool Stream::isURLEncoded(const char *string) {
  unsigned char c; //same character type as URLEncode(...) so both classify bytes identically

  for (const char *i = string; (c = *i) != 0; i++) {
    if (!(isAlphaNumeric(c) || is(c, "%+-_"))) return false;
  }
  return true;
}

/**
 * URL encodes a string (application/x-www-form-urlencoded style), allocating
 * a new buffer only when one is needed.
 *
 * IMPORTANT: the caller frees the result only if result != string.
 *
 * Encoding rules, as implemented here:
 *   - alphanumeric characters and "-_" are copied unchanged;
 *   - a space becomes '+';
 *   - everything else becomes '%' followed by two hex digits (written by itox).
 * Note that '.' IS percent-encoded by this code, unlike PHP's urlencode()
 * (http://uk3.php.net/urlencode), which leaves "-_." untouched.
 * See also RFC 1738 - http://www.faqs.org/rfcs/rfc1738
 *
 * @param string                  zero-terminated input; must not be null. May be
 *                                overwritten, see reuseSpaces.
 * @param reuseSpaces             when true and spaces are the only characters that
 *                                need converting, the original string is overwritten
 *                                in place (the length does not change) and returned.
 * @param checkNotEncodedAlready  when true, a string consisting solely of
 *                                alphanumerics and "-_%+" is assumed to be encoded
 *                                already and is returned untouched.
 * @return  `string` itself when nothing was converted or the conversion was done in
 *          place; otherwise a new buffer obtained from mallocCheck.
 */
const char *Stream::URLEncode(char *string, const bool reuseSpaces, const bool checkNotEncodedAlready) {
  unsigned char c;
  size_t newsize = 0;
  bool needsConvert     = false, //at least one character is outside [a-zA-Z0-9_-]
       onlySpaces       = true,  //every character needing conversion so far was a space
       alreadyConverted = true;  //every character needing conversion so far was '%' or '+'
  char *newstring = string;
  const char *i;
  char *j;

  //traverse string looking for convert chars (zero terminated)
  for (i = string; (c = *i) != 0; i++) {
    newsize++;
    if (!(isAlphaNumeric(c) || is(c, "-_"))) {
      //needs conversion: spaces take up the same amount of space,
      //others require '%' plus a 2 digit hex value (0-255)
      needsConvert = true;
      if (c != ' ') {
        onlySpaces = false;
        newsize += 2;
      }
      if (!is(c, "%+")) alreadyConverted = false;
    }
  }

  if (!(checkNotEncodedAlready && alreadyConverted) && needsConvert) {
    //a fresh buffer is needed unless the caller allows space-only conversion in place
    if (!onlySpaces || !reuseSpaces) newstring = (char*) mallocCheck(newsize + 1); //+ zero terminator

    j = newstring;
    for (i = string; (c = *i) != 0; i++) {
      if (isAlphaNumeric(c) || is(c, "-_")) {
        *j++ = c;   //copy character directly
      } else if (c == ' ') {
        *j++ = '+'; //space is encoded as a plus sign
      } else {
        //convert char to %XX
        *j++ = '%';
        itox(c, j);
        j += 2;
      }
    }
    *j = 0; //zero terminate
  }

  return newstring;
}

/**
 * Calculates how many characters URLEncode(in, out) will write for `string`.
 *
 * Alphanumerics, "-_" and spaces take one character each; everything else
 * takes three ("%XX").
 *
 * @param string  zero-terminated input; must not be null
 * @return        encoded length WITHOUT the zero terminator
 */
size_t Stream::URLEncodedLength(const char *string) {
  size_t newsize = 0;
  unsigned char c; //same character type as URLEncode(in, out) so the two always agree

  for (const char *i = string; (c = *i) != 0; i++) {
    newsize++; //each char takes up at least one space
    if (!(isAlphaNumeric(c) || is(c, "-_ "))) newsize += 2; //spaces only take up 1 space, others 3
  }
  return newsize;
}

/**
 * URL encodes `in` into the caller supplied buffer `out`
 * (application/x-www-form-urlencoded style).
 *
 * Use URLEncodedLength(in) + 1 to size the out buffer.
 *
 * Encoding rules are the same as for URLEncode(char*, bool, bool):
 * alphanumerics and "-_" are copied, a space becomes '+', and everything else
 * (including '.') becomes '%' followed by two hex digits.
 * See RFC 1738 - http://www.faqs.org/rfcs/rfc1738 and PHP urlencode
 * (http://uk3.php.net/urlencode); spaces are encoded as '+' for historical reasons.
 *
 * @param in   zero-terminated input; must not be null
 * @param out  destination buffer; receives the zero-terminated result
 * @return     number of characters written, excluding the terminator
 *             (0 for a 0 length input)
 */
size_t Stream::URLEncode(const char *in, char *out) {
  unsigned char c;
  char *j = out;

  for (const char *i = in; (c = *i) != 0; i++) {
    if (isAlphaNumeric(c) || is(c, "-_")) {
      *j++ = c;   //copy character directly
    } else if (c == ' ') {
      *j++ = '+'; //space is encoded as a plus sign
    } else {
      //convert char to %XX
      *j++ = '%';
      itox(c, j);
      j += 2;
    }
  }
  *j = 0; //zero terminate

  return static_cast<size_t>(j - out);
}

/**
 * Intended to change all HTML entities in a UTF-8 string to the actual
 * characters, in place.
 *
 * Entity translation is NOT implemented yet. At present the function only
 * normalises the string: control characters (< ' ') are removed, runs of
 * spaces are collapsed to one, and '&' is dropped. The string is re-terminated
 * at its new length.
 *
 * @param string  zero-terminated buffer, modified in place; must not be null
 * @return        `string`
 */
const char *Stream::HTMLEntityDecodeToUTF8(char *string) {
  /*
  //disabled reference implementation, kept for future work
  //(it uses names such as newpos, iEntity, currententity and art that are not declared here)

  //word search vars
  const char *pos, *endpos, *word;
  char c;
  bool utf8Char, dot, capital, number, ignore, whiteSpace, letter, wordLetter;
  bool nameStart, wordStart, sentenceEnd;
  bool lastWasWhiteSpace, lastWasIgnore, lastWasDot;
  bool firstWord;
  size_t len;

  //indicate start of word boundary
  lastWasWhiteSpace = true;
  lastWasDot = true;
  sentenceEnd = true;
  lastWasIgnore = true;
  firstWord = true;

  //traverse area sanitising the HTML / entity formatting
  while ((c = *pos) && pos != endpos) { //until end of area or EOF
    switch (c) {
      //replace quotes with format entries?
      //normalise quotes?
      //keep in output?
      case '"':
      case '\'': {
        if (!lastWasWhiteSpace) *newpos++ = ' ';
        *newpos++ = '"';
        pos++;
        break;
      }
      case '&': {
        c = *++pos;
        if (c <= ' ') {
          //plain ampersand
          if (!lastWasWhiteSpace) *newpos++ = ' ';
          strcpy(newpos, "and ");
          newpos += 4;
          lastWasWhiteSpace = true;
        } else {
          if (c == '#') {
            //numeric entity
            c = *++pos;
            if (c == 'x') {
              //hexadecimal
              if (iEntity = strtol(++pos, 0, 16)) *newpos++ = (char)iEntity;
            } else {
              //decimal
              iEntity = atoi(pos);
              if (iEntity < 256) *newpos++ = (char)iEntity;
            }
          } else {
            //named entity
            currententity = HTMLEntity::getentity(pos);
            switch (currententity) {
              case entity_quot: {
                *newpos++ = '"';
                //Document::formatInstance fi = {Document::quote, newpos, 0, 0};
                //art->addFormat(fi);
                break;
              }
              case entity_amp: {
                if (!lastWasWhiteSpace) *newpos++ = ' ';
                strcpy(newpos, "and ");
                newpos += 4;
                lastWasWhiteSpace = true;
                break;
              }
              case entity_nbsp: {
                if (!lastWasWhiteSpace) {
                  *newpos++ = ' ';
                  lastWasWhiteSpace = true;
                }
                break;
              }
              case entity_copy: {
                if (!lastWasWhiteSpace) *newpos++ = ' ';
                strcpy(newpos, "copyright ");
                newpos += 10;
                lastWasWhiteSpace = true;
                break;
              }
            }
          }
          //move to the end of the entity
          while ((c = *++pos) && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '#' || c == ';')) 000;
        }
        break;
      }
      case '<': {
        break;
      }
      default: {
        if (c >= ' ') { //ignore weird chars
          //character groups
          dot        = (c == '.');
          capital    = (c >= 'A' && c <= 'Z');
          number     = (c >= '0' && c <= '9');
          ignore     = (c == '"' || c == '\'' || c == ',' || c == ';' || c == ':');
          utf8Char   = false; //(c > 127);
          whiteSpace = (c <= ' ');
          letter     = capital || utf8Char || (c >= 'a' && c <= 'z');
          wordLetter = letter || number;

          //combos
          nameStart   = (lastWasIgnore || lastWasWhiteSpace) && capital;
          wordStart   = (lastWasIgnore || lastWasWhiteSpace) && wordLetter;
          sentenceEnd = lastWasDot && whiteSpace;
          if (sentenceEnd) firstWord = true;

          if (nameStart) {
            if (firstWord) art->addNameAtSentenceStart(pos);
            else           art->addNameInText(pos);
          }

          //previous character saves
          lastWasWhiteSpace = whiteSpace;
          lastWasIgnore     = ignore;
          lastWasDot        = dot;
          if (wordStart) firstWord = false;
        }
        *newpos++ = c;
        pos++;
      }
    }
  */

  compactPrintableInPlace(string);
  return string;
}

/**
 * Intended to change special characters in a UTF-8 string to HTML entities.
 *
 * Entity encoding is NOT implemented yet. At present the function performs
 * exactly the same in-place normalisation as HTMLEntityDecodeToUTF8: control
 * characters (< ' ') are removed, runs of spaces are collapsed to one, and '&'
 * is dropped. The string is re-terminated at its new length.
 *
 * @param string  zero-terminated buffer, modified in place; must not be null
 * @return        `string`
 */
const char *Stream::UTF8ToHTMLEntityEncode(char *string) {
  compactPrintableInPlace(string);
  return string;
}