From bacc29586271996154884ff8c531520385318a84 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 25 Aug 2008 18:30:46 +0400 Subject: [PATCH] * Update URL normalizer (partially taken from GNU wget) --- url.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 382 insertions(+), 3 deletions(-) diff --git a/url.c b/url.c index 0efa4fc89..b09202edc 100644 --- a/url.c +++ b/url.c @@ -54,6 +54,89 @@ static const struct _proto protocol_backends[] = { { NULL, 0, NULL, 0, 0, 1, 0 }, }; +/* + Table of "reserved" and "unsafe" characters. Those terms are + rfc1738-speak, as such largely obsoleted by rfc2396 and later + specs, but the general idea remains. + + A reserved character is the one that you can't decode without + changing the meaning of the URL. For example, you can't decode + "/foo/%2f/bar" into "/foo///bar" because the number and contents of + path components is different. Non-reserved characters can be + changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The + unsafe characters are loosely based on rfc1738, plus "$" and ",", + as recommended by rfc2396, and minus "~", which is very frequently + used (and sometimes unrecognized as %7E by broken servers). + + An unsafe character is the one that should be encoded when URLs are + placed in foreign environments. E.g. space and newline are unsafe + in HTTP contexts because HTTP uses them as separator and line + terminator, so they must be encoded to %20 and %0A respectively. + "*" is unsafe in shell context, etc. + + We determine whether a character is unsafe through static table + lookup. This code assumes ASCII character set and 8-bit chars. */ + +enum { + /* rfc1738 reserved chars + "$" and ",". */ + urlchr_reserved = 1, + + /* rfc1738 unsafe chars, plus non-printables. */ + urlchr_unsafe = 2 +}; + +#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) +#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) +#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) +/* Convert an ASCII hex digit to the corresponding number between 0 + and 15. H should be a hexadecimal digit that satisfies isxdigit; + otherwise, the result is undefined. */ +#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : toupper (h) - 'A' + 10) +#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2)) +/* The reverse of the above: convert a number in the [0, 16) range to + the ASCII representation of the corresponding hexadecimal digit. + `+ 0' is there so you can't accidentally use it as an lvalue. */ +#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0) +#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0) + +/* Shorthands for the table: */ +#define R urlchr_reserved +#define U urlchr_unsafe +#define RU R|U + +static const unsigned char urlchr_table[256] = +{ + U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ + U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ + U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ + U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */ + U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */ + 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */ + 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ + 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */ + RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ + 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ + 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ + 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */ + U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ + 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ + 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ + 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */ + + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, + U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, +}; +#undef R +#undef U +#undef RU + static inline int end_of_dir(unsigned char c) { @@ -189,6 +272,292 @@ get_protocol_length(const unsigned char *url) return (*end == ':' || isdigit(*end)) ? end - url : 0; } +/* URL-unescape the string S. + + This is done by transforming the sequences "%HH" to the character + represented by the hexadecimal digits HH. If % is not followed by + two hexadecimal digits, it is inserted literally. + + The transformation is done in place. If you need the original + string intact, make a copy before calling this function. */ + +static void +url_unescape (char *s) +{ + char *t = s; /* t - tortoise */ + char *h = s; /* h - hare */ + + for (; *h; h++, t++) { + if (*h != '%') { + copychar: + *t = *h; + } + else { + char c; + /* Do nothing if '%' is not followed by two hex digits. */ + if (!h[1] || !h[2] || !(isxdigit (h[1]) && isxdigit (h[2]))) + goto copychar; + c = X2DIGITS_TO_NUM (h[1], h[2]); + /* Don't unescape %00 because there is no way to insert it + * into a C string without effectively truncating it. */ + if (c == '\0') + goto copychar; + *t = c; + h += 2; + } + } + *t = '\0'; +} + +/* The core of url_escape_* functions. Escapes the characters that + match the provided mask in urlchr_table. + + If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars + will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a + freshly allocated string will be returned in all cases. */ + +static char * +url_escape_1 (const char *s, unsigned char mask, int allow_passthrough) +{ + const char *p1; + char *p2, *newstr; + int newlen; + int addition = 0; + + for (p1 = s; *p1; p1++) + if (urlchr_test (*p1, mask)) + addition += 2; /* Two more characters (hex digits) */ + + if (!addition) + return allow_passthrough ? (char *)s : strdup (s); + + newlen = (p1 - s) + addition; + newstr = (char *) g_malloc (newlen + 1); + + p1 = s; + p2 = newstr; + while (*p1) { + /* Quote the characters that match the test mask. */ + if (urlchr_test (*p1, mask)) { + unsigned char c = *p1++; + *p2++ = '%'; + *p2++ = XNUM_TO_DIGIT (c >> 4); + *p2++ = XNUM_TO_DIGIT (c & 0xf); + } + else + *p2++ = *p1++; + } + *p2 = '\0'; + + return newstr; +} + +/* URL-escape the unsafe characters (see urlchr_table) in a given + string, returning a freshly allocated string. */ + +char * +url_escape (const char *s) +{ + return url_escape_1 (s, urlchr_unsafe, 0); +} + +/* URL-escape the unsafe characters (see urlchr_table) in a given + string. If no characters are unsafe, S is returned. */ + +static char * +url_escape_allow_passthrough (const char *s) +{ + return url_escape_1 (s, urlchr_unsafe, 1); +} + +/* Decide whether the char at position P needs to be encoded. (It is + not enough to pass a single char *P because the function may need + to inspect the surrounding context.) + + Return 1 if the char should be escaped as %XX, 0 otherwise. */ + +static inline int +char_needs_escaping (const char *p) +{ + if (*p == '%') { + if (isxdigit (*(p + 1)) && isxdigit (*(p + 2))) + return 0; + else + /* Garbled %.. sequence: encode `%'. */ + return 1; + } + else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) + return 1; + else + return 0; +} + +/* Translate a %-escaped (but possibly non-conformant) input string S + into a %-escaped (and conformant) output string. If no characters + are encoded or decoded, return the same string S; otherwise, return + a freshly allocated string with the new contents. + + After a URL has been run through this function, the protocols that + use `%' as the quote character can use the resulting string as-is, + while those that don't can use url_unescape to get to the intended + data. This function is stable: once the input is transformed, + further transformations of the result yield the same output. +*/ + +static char * +reencode_escapes (const char *s) +{ + const char *p1; + char *newstr, *p2; + int oldlen, newlen; + + int encode_count = 0; + + /* First pass: inspect the string to see if there's anything to do, + and to calculate the new length. */ + for (p1 = s; *p1; p1++) + if (char_needs_escaping (p1)) + ++encode_count; + + if (!encode_count) + /* The string is good as it is. */ + return (char *) s; /* C const model sucks. */ + + oldlen = p1 - s; + /* Each encoding adds two characters (hex digits). */ + newlen = oldlen + 2 * encode_count; + newstr = g_malloc (newlen + 1); + + /* Second pass: copy the string to the destination address, encoding + chars when needed. */ + p1 = s; + p2 = newstr; + + while (*p1) + if (char_needs_escaping (p1)) { + unsigned char c = *p1++; + *p2++ = '%'; + *p2++ = XNUM_TO_DIGIT (c >> 4); + *p2++ = XNUM_TO_DIGIT (c & 0xf); + } + else { + *p2++ = *p1++; + } + + *p2 = '\0'; + return newstr; +} +/* Unescape CHR in an otherwise escaped STR. Used to selectively + escaping of certain characters, such as "/" and ":". Returns a + count of unescaped chars. */ + +static void +unescape_single_char (char *str, char chr) +{ + const char c1 = XNUM_TO_DIGIT (chr >> 4); + const char c2 = XNUM_TO_DIGIT (chr & 0xf); + char *h = str; /* hare */ + char *t = str; /* tortoise */ + + for (; *h; h++, t++) { + if (h[0] == '%' && h[1] == c1 && h[2] == c2) { + *t = chr; + h += 2; + } + else { + *t = *h; + } + } + *t = '\0'; +} + +/* Escape unsafe and reserved characters, except for the slash + characters. */ + +static char * +url_escape_dir (const char *dir) +{ + char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1); + if (newdir == dir) + return (char *)dir; + + unescape_single_char (newdir, '/'); + return newdir; +} + +/* Resolve "." and ".." elements of PATH by destructively modifying + PATH and return non-zero if PATH has been modified, zero otherwise. + + The algorithm is in spirit similar to the one described in rfc1808, + although implemented differently, in one pass. To recap, path + elements containing only "." are removed, and ".." is taken to mean + "back up one element". Single leading and trailing slashes are + preserved. + + For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive + test examples are provided below. If you change anything in this + function, run test_path_simplify to make sure you haven't broken a + test case. */ + +static int +path_simplify (char *path) +{ + char *h = path; /* hare */ + char *t = path; /* tortoise */ + char *beg = path; /* boundary for backing the tortoise */ + char *end = path + strlen (path); + + while (h < end) { + /* Hare should be at the beginning of a path element. */ + if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) { + /* Ignore "./". */ + h += 2; + } + else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) { + /* Handle "../" by retreating the tortoise by one path + element -- but not past beggining. */ + if (t > beg) { + /* Move backwards until T hits the beginning of the + previous path element or the beginning of path. */ + for (--t; t > beg && t[-1] != '/'; t--); + } + else { + /* If we're at the beginning, copy the "../" literally + move the beginning so a later ".." doesn't remove + it. */ + beg = t + 3; + goto regular; + } + h += 3; + } + else { + regular: + /* A regular path element. If H hasn't advanced past T, + simply skip to the next path element. Otherwise, copy + the path element until the next slash. */ + if (t == h) { + /* Skip the path element, including the slash. */ + while (h < end && *h != '/') + t++, h++; + if (h < end) + t++, h++; + } + else { + /* Copy the path element, including the final slash. */ + while (h < end && *h != '/') + *t++ = *h++; + if (h < end) + *t++ = *h++; + } + } + } + + if (t != h) + *t = '\0'; + + return t != h; +} + static enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring) { @@ -201,8 +570,8 @@ parse_uri(struct uri *uri, unsigned char *uristring) /* Nothing to do for an empty url. */ if (!*uristring) return URI_ERRNO_EMPTY; - - uri->string = uristring; + + uri->string = reencode_escapes (uristring); uri->protocollen = get_protocol_length (uristring); /* Invalid */ @@ -211,7 +580,7 @@ parse_uri(struct uri *uri, unsigned char *uristring) /* Figure out whether the protocol is known */ uri->protocol = get_protocol (struri(uri), uri->protocollen); - prefix_end = uristring + uri->protocollen; /* ':' */ + prefix_end = struri (uri) + uri->protocollen; /* ':' */ /* Check if there's a digit after the protocol name. */ if (isdigit (*prefix_end)) { @@ -379,6 +748,16 @@ parse_uri(struct uri *uri, unsigned char *uristring) if (*prefix_end == POST_CHAR) { uri->post = prefix_end + 1; } + + convert_to_lowercase (uri->host, strlen (uri->host)); + /* Decode %HH sequences in host name. This is important not so much + to support %HH sequences in host names (which other browser + don't), but to support binary characters (which will have been + converted to %HH by reencode_escapes). */ + if (strchr (uri->host, '%')) { + url_unescape (uri->host); + } + path_simplify (uri->data); return URI_ERRNO_OK; } -- 2.39.5