aboutsummaryrefslogtreecommitdiffstats
path: root/src/url.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2008-11-01 18:01:05 +0300
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2008-11-01 18:01:05 +0300
commit2aa9c74f1c449da92f6faf870f8cc801a83bb08b (patch)
tree33f0f941f08583fd0c4c3653cadde8d6ce8426c2 /src/url.c
parentcc5343692b448c27485a24ea7f1b24d714bb82f6 (diff)
downloadrspamd-2aa9c74f1c449da92f6faf870f8cc801a83bb08b.tar.gz
rspamd-2aa9c74f1c449da92f6faf870f8cc801a83bb08b.zip
* Reorganize structure of source files
* Adopt build system for new structure --HG-- rename : cfg_file.h => src/cfg_file.h rename : cfg_file.l => src/cfg_file.l rename : cfg_file.y => src/cfg_file.y rename : cfg_utils.c => src/cfg_utils.c rename : controller.c => src/controller.c rename : filter.c => src/filter.c rename : filter.h => src/filter.h rename : fstring.c => src/fstring.c rename : fstring.h => src/fstring.h rename : main.c => src/main.c rename : main.h => src/main.h rename : mem_pool.c => src/mem_pool.c rename : mem_pool.h => src/mem_pool.h rename : memcached-test.c => src/memcached-test.c rename : memcached.c => src/memcached.c rename : memcached.h => src/memcached.h rename : perl.c => src/perl.c rename : perl.h => src/perl.h rename : plugins/regexp.c => src/plugins/regexp.c rename : plugins/surbl.c => src/plugins/surbl.c rename : protocol.c => src/protocol.c rename : protocol.h => src/protocol.h rename : upstream.c => src/upstream.c rename : upstream.h => src/upstream.h rename : url.c => src/url.c rename : url.h => src/url.h rename : util.c => src/util.c rename : util.h => src/util.h rename : worker.c => src/worker.c
Diffstat (limited to 'src/url.c')
-rw-r--r--src/url.c886
1 files changed, 886 insertions, 0 deletions
diff --git a/src/url.c b/src/url.c
new file mode 100644
index 000000000..83ee0195a
--- /dev/null
+++ b/src/url.c
@@ -0,0 +1,886 @@
+#include <sys/types.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+#include <syslog.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "url.h"
+#include "fstring.h"
+#include "main.h"
+
+#define POST_CHAR 1
+#define POST_CHAR_S "\001"
+
+/* Tcp port range */
+#define LOWEST_PORT 0
+#define HIGHEST_PORT 65535
+
+#define uri_port_is_valid(port) \
+ (LOWEST_PORT <= (port) && (port) <= HIGHEST_PORT)
+
+struct _proto {
+ unsigned char *name;
+ int port;
+ uintptr_t *unused;
+ unsigned int need_slashes:1;
+ unsigned int need_slash_after_host:1;
+ unsigned int free_syntax:1;
+ unsigned int need_ssl:1;
+};
+
+static const char *text_url = "((https?|ftp)://)?"
+"(\\b(?<![.\\@A-Za-z0-9-])"
+"(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\."
+"(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel"
+"|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se"
+"|[a-z]{2}"
+"(?(1)|(?=/)))"
+"(?!\\w)"
+"|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))"
+")"
+"(?::\\d{1,5})?" /* port */
+"(?!\\.\\w)" /* host part ended, no more of this further on */
+"(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?" /* path (&query) */
+"(?<![\\s>?!),.'\"\\]:])"
+"(?!@)"
+")";
+static const char *html_url = "(?: src|href)=\"("
+"((https?|ftp)://)?"
+"(\\b(?<![.\\@A-Za-z0-9-])"
+"(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\."
+"(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel"
+"|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se"
+"|[a-z]{2}"
+"(?(1)|(?=/)))"
+"(?!\\w)"
+"|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))"
+")"
+"(?::\\d{1,5})?" /* port */
+"(?!\\.\\w)" /* host part ended, no more of this further on */
+"(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?" /* path (&query) */
+"(?<![\\s>?!),.'\"\\]:])"
+"(?!@)"
+"))\"";
+
+static short url_initialized = 0;
+GRegex *text_re, *html_re;
+
+static const struct _proto protocol_backends[] = {
+ { "file", 0, NULL, 1, 0, 0, 0 },
+ { "ftp", 21, NULL, 1, 1, 0, 0 },
+ { "http", 80, NULL, 1, 1, 0, 0 },
+ { "https", 443, NULL, 1, 1, 0, 1 },
+
+ /* Keep these last! */
+ { NULL, 0, NULL, 0, 0, 1, 0 },
+};
+
+/*
+ Table of "reserved" and "unsafe" characters. Those terms are
+ rfc1738-speak, as such largely obsoleted by rfc2396 and later
+ specs, but the general idea remains.
+
+ A reserved character is the one that you can't decode without
+ changing the meaning of the URL. For example, you can't decode
+ "/foo/%2f/bar" into "/foo///bar" because the number and contents of
+ path components is different. Non-reserved characters can be
+ changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The
+ unsafe characters are loosely based on rfc1738, plus "$" and ",",
+ as recommended by rfc2396, and minus "~", which is very frequently
+ used (and sometimes unrecognized as %7E by broken servers).
+
+ An unsafe character is the one that should be encoded when URLs are
+ placed in foreign environments. E.g. space and newline are unsafe
+ in HTTP contexts because HTTP uses them as separator and line
+ terminator, so they must be encoded to %20 and %0A respectively.
+ "*" is unsafe in shell context, etc.
+
+ We determine whether a character is unsafe through static table
+ lookup. This code assumes ASCII character set and 8-bit chars. */
+
+enum {
+ /* rfc1738 reserved chars + "$" and ",". */
+ urlchr_reserved = 1,
+
+ /* rfc1738 unsafe chars, plus non-printables. */
+ urlchr_unsafe = 2
+};
+
+#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
+#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
+#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
+/* Convert an ASCII hex digit to the corresponding number between 0
+ and 15. H should be a hexadecimal digit that satisfies isxdigit;
+ otherwise, the result is undefined. */
+#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : toupper (h) - 'A' + 10)
+#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2))
+/* The reverse of the above: convert a number in the [0, 16) range to
+ the ASCII representation of the corresponding hexadecimal digit.
+ `+ 0' is there so you can't accidentally use it as an lvalue. */
+#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0)
+#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0)
+
+/* Shorthands for the table: */
+#define R urlchr_reserved
+#define U urlchr_unsafe
+#define RU R|U
+
+static const unsigned char urlchr_table[256] =
+{
+ U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */
+ U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */
+ U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */
+ U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */
+ U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */
+ 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */
+ 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */
+ RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */
+ 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */
+ U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */
+ 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */
+ 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */
+
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+ U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,
+};
+#undef R
+#undef U
+#undef RU
+
+static inline int
+end_of_dir(unsigned char c)
+{
+ return c == POST_CHAR || c == '#' || c == ';' || c == '?';
+}
+
+static inline int
+is_uri_dir_sep(struct uri *uri, unsigned char pos)
+{
+ return (pos == '/');
+}
+
+static int
+check_uri_file(unsigned char *name)
+{
+ static const unsigned char chars[] = POST_CHAR_S "#?";
+
+ return strcspn(name, chars);
+}
+
+static int
+url_init (void)
+{
+ GError *err = NULL;
+ if (url_initialized == 0) {
+ text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err);
+ if (err != NULL) {
+ msg_info ("url_init: cannot init text url parsing regexp: %s", err->message);
+ g_error_free (err);
+ return -1;
+ }
+ html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err);
+ if (err != NULL) {
+ msg_info ("url_init: cannot init html url parsing regexp: %s", err->message);
+ g_error_free (err);
+ return -1;
+ }
+ url_initialized = 1;
+ }
+
+ return 0;
+}
+
+enum protocol
+get_protocol(unsigned char *name, int namelen)
+{
+ /* These are really enum protocol values but can take on negative
+ * values and since 0 <= -1 for enum values it's better to use clean
+ * integer type. */
+ int start, end;
+ enum protocol protocol;
+ unsigned char *pname;
+ int pnamelen, minlen, compare;
+
+ /* Almost dichotomic search is used here */
+ /* Starting at the HTTP entry which is the most common that will make
+ * file and NNTP the next entries checked and amongst the third checks
+ * are proxy and FTP. */
+ start = 0;
+ end = PROTOCOL_UNKNOWN - 1;
+ protocol = PROTOCOL_HTTP;
+
+ while (start <= end) {
+ pname = protocol_backends[protocol].name;
+ pnamelen = strlen (pname);
+ minlen = MIN (pnamelen, namelen);
+ compare = strncasecmp (pname, name, minlen);
+
+ if (compare == 0) {
+ if (pnamelen == namelen)
+ return protocol;
+
+ /* If the current protocol name is longer than the
+ * protocol name being searched for move @end else move
+ * @start. */
+ compare = pnamelen > namelen ? 1 : -1;
+ }
+
+ if (compare > 0)
+ end = protocol - 1;
+ else
+ start = protocol + 1;
+
+ protocol = (start + end) / 2;
+ }
+
+ return PROTOCOL_UNKNOWN;
+}
+
+
+int
+get_protocol_port(enum protocol protocol)
+{
+ return protocol_backends[protocol].port;
+}
+
+int
+get_protocol_need_slashes(enum protocol protocol)
+{
+ return protocol_backends[protocol].need_slashes;
+}
+
+int
+get_protocol_need_slash_after_host(enum protocol protocol)
+{
+ return protocol_backends[protocol].need_slash_after_host;
+}
+
+int
+get_protocol_free_syntax(enum protocol protocol)
+{
+ return protocol_backends[protocol].free_syntax;
+}
+
+static int
+get_protocol_length(const unsigned char *url)
+{
+ unsigned char *end = (unsigned char *) url;
+
+ /* Seek the end of the protocol name if any. */
+ /* RFC1738:
+ * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
+ * (but per its recommendations we accept "upalpha" too) */
+ while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
+ end++;
+
+ /* Also return 0 if there's no protocol name (@end == @url). */
+ return (*end == ':') ? end - url : 0;
+}
+
+/* URL-unescape the string S.
+
+ This is done by transforming the sequences "%HH" to the character
+ represented by the hexadecimal digits HH. If % is not followed by
+ two hexadecimal digits, it is inserted literally.
+
+ The transformation is done in place. If you need the original
+ string intact, make a copy before calling this function. */
+
+static void
+url_unescape (char *s)
+{
+ char *t = s; /* t - tortoise */
+ char *h = s; /* h - hare */
+
+ for (; *h; h++, t++) {
+ if (*h != '%') {
+ copychar:
+ *t = *h;
+ }
+ else {
+ char c;
+ /* Do nothing if '%' is not followed by two hex digits. */
+ if (!h[1] || !h[2] || !(isxdigit (h[1]) && isxdigit (h[2])))
+ goto copychar;
+ c = X2DIGITS_TO_NUM (h[1], h[2]);
+ /* Don't unescape %00 because there is no way to insert it
+ * into a C string without effectively truncating it. */
+ if (c == '\0')
+ goto copychar;
+ *t = c;
+ h += 2;
+ }
+ }
+ *t = '\0';
+}
+
+/* The core of url_escape_* functions. Escapes the characters that
+ match the provided mask in urlchr_table.
+
+ If ALLOW_PASSTHROUGH is non-zero, a string with no unsafe chars
+ will be returned unchanged. If ALLOW_PASSTHROUGH is zero, a
+ freshly allocated string will be returned in all cases. */
+
+static char *
+url_escape_1 (const char *s, unsigned char mask, int allow_passthrough, memory_pool_t *pool)
+{
+ const char *p1;
+ char *p2, *newstr;
+ int newlen;
+ int addition = 0;
+
+ for (p1 = s; *p1; p1++)
+ if (urlchr_test (*p1, mask))
+ addition += 2; /* Two more characters (hex digits) */
+
+ if (!addition) {
+ if (allow_passthrough) {
+ return (char *)s;
+ }
+ else {
+ return memory_pool_strdup (pool, s);
+ }
+ }
+
+ newlen = (p1 - s) + addition;
+ newstr = (char *) memory_pool_alloc (pool, newlen + 1);
+
+ p1 = s;
+ p2 = newstr;
+ while (*p1) {
+ /* Quote the characters that match the test mask. */
+ if (urlchr_test (*p1, mask)) {
+ unsigned char c = *p1++;
+ *p2++ = '%';
+ *p2++ = XNUM_TO_DIGIT (c >> 4);
+ *p2++ = XNUM_TO_DIGIT (c & 0xf);
+ }
+ else
+ *p2++ = *p1++;
+ }
+ *p2 = '\0';
+
+ return newstr;
+}
+
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+ string, returning a freshly allocated string. */
+
+char *
+url_escape (const char *s, memory_pool_t *pool)
+{
+ return url_escape_1 (s, urlchr_unsafe, 0, pool);
+}
+
+/* URL-escape the unsafe characters (see urlchr_table) in a given
+ string. If no characters are unsafe, S is returned. */
+
+static char *
+url_escape_allow_passthrough (const char *s, memory_pool_t *pool)
+{
+ return url_escape_1 (s, urlchr_unsafe, 1, pool);
+}
+
+/* Decide whether the char at position P needs to be encoded. (It is
+ not enough to pass a single char *P because the function may need
+ to inspect the surrounding context.)
+
+ Return 1 if the char should be escaped as %XX, 0 otherwise. */
+
+static inline int
+char_needs_escaping (const char *p)
+{
+ if (*p == '%') {
+ if (isxdigit (*(p + 1)) && isxdigit (*(p + 2)))
+ return 0;
+ else
+ /* Garbled %.. sequence: encode `%'. */
+ return 1;
+ }
+ else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))
+ return 1;
+ else
+ return 0;
+}
+
+/* Translate a %-escaped (but possibly non-conformant) input string S
+ into a %-escaped (and conformant) output string. If no characters
+ are encoded or decoded, return the same string S; otherwise, return
+ a freshly allocated string with the new contents.
+
+ After a URL has been run through this function, the protocols that
+ use `%' as the quote character can use the resulting string as-is,
+ while those that don't can use url_unescape to get to the intended
+ data. This function is stable: once the input is transformed,
+ further transformations of the result yield the same output.
+*/
+
+static char *
+reencode_escapes (const char *s, memory_pool_t *pool)
+{
+ const char *p1;
+ char *newstr, *p2;
+ int oldlen, newlen;
+
+ int encode_count = 0;
+
+ /* First pass: inspect the string to see if there's anything to do,
+ and to calculate the new length. */
+ for (p1 = s; *p1; p1++)
+ if (char_needs_escaping (p1))
+ ++encode_count;
+
+ if (!encode_count) {
+ /* The string is good as it is. */
+ return memory_pool_strdup (pool, s);
+ }
+
+ oldlen = p1 - s;
+ /* Each encoding adds two characters (hex digits). */
+ newlen = oldlen + 2 * encode_count;
+ newstr = memory_pool_alloc (pool, newlen + 1);
+
+ /* Second pass: copy the string to the destination address, encoding
+ chars when needed. */
+ p1 = s;
+ p2 = newstr;
+
+ while (*p1)
+ if (char_needs_escaping (p1)) {
+ unsigned char c = *p1++;
+ *p2++ = '%';
+ *p2++ = XNUM_TO_DIGIT (c >> 4);
+ *p2++ = XNUM_TO_DIGIT (c & 0xf);
+ }
+ else {
+ *p2++ = *p1++;
+ }
+
+ *p2 = '\0';
+ return newstr;
+}
+/* Unescape CHR in an otherwise escaped STR. Used to selectively
+ escaping of certain characters, such as "/" and ":". Returns a
+ count of unescaped chars. */
+
+static void
+unescape_single_char (char *str, char chr)
+{
+ const char c1 = XNUM_TO_DIGIT (chr >> 4);
+ const char c2 = XNUM_TO_DIGIT (chr & 0xf);
+ char *h = str; /* hare */
+ char *t = str; /* tortoise */
+
+ for (; *h; h++, t++) {
+ if (h[0] == '%' && h[1] == c1 && h[2] == c2) {
+ *t = chr;
+ h += 2;
+ }
+ else {
+ *t = *h;
+ }
+ }
+ *t = '\0';
+}
+
+/* Escape unsafe and reserved characters, except for the slash
+ characters. */
+
+static char *
+url_escape_dir (const char *dir, memory_pool_t *pool)
+{
+ char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1, pool);
+ if (newdir == dir)
+ return (char *)dir;
+
+ unescape_single_char (newdir, '/');
+ return newdir;
+}
+
+/* Resolve "." and ".." elements of PATH by destructively modifying
+ PATH and return non-zero if PATH has been modified, zero otherwise.
+
+ The algorithm is in spirit similar to the one described in rfc1808,
+ although implemented differently, in one pass. To recap, path
+ elements containing only "." are removed, and ".." is taken to mean
+ "back up one element". Single leading and trailing slashes are
+ preserved.
+
+ For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive
+ test examples are provided below. If you change anything in this
+ function, run test_path_simplify to make sure you haven't broken a
+ test case. */
+
+static int
+path_simplify (char *path)
+{
+ char *h = path; /* hare */
+ char *t = path; /* tortoise */
+ char *beg = path; /* boundary for backing the tortoise */
+ char *end = path + strlen (path);
+
+ while (h < end) {
+ /* Hare should be at the beginning of a path element. */
+ if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) {
+ /* Ignore "./". */
+ h += 2;
+ }
+ else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) {
+ /* Handle "../" by retreating the tortoise by one path
+ element -- but not past beggining. */
+ if (t > beg) {
+ /* Move backwards until T hits the beginning of the
+ previous path element or the beginning of path. */
+ for (--t; t > beg && t[-1] != '/'; t--);
+ }
+ else {
+ /* If we're at the beginning, copy the "../" literally
+ move the beginning so a later ".." doesn't remove
+ it. */
+ beg = t + 3;
+ goto regular;
+ }
+ h += 3;
+ }
+ else {
+ regular:
+ /* A regular path element. If H hasn't advanced past T,
+ simply skip to the next path element. Otherwise, copy
+ the path element until the next slash. */
+ if (t == h) {
+ /* Skip the path element, including the slash. */
+ while (h < end && *h != '/')
+ t++, h++;
+ if (h < end)
+ t++, h++;
+ }
+ else {
+ /* Copy the path element, including the final slash. */
+ while (h < end && *h != '/')
+ *t++ = *h++;
+ if (h < end)
+ *t++ = *h++;
+ }
+ }
+ }
+
+ if (t != h)
+ *t = '\0';
+
+ return t != h;
+}
+
+enum uri_errno
+parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
+{
+ unsigned char *prefix_end, *host_end, *p;
+ unsigned char *lbracket, *rbracket;
+ int datalen, n, addrlen;
+ unsigned char *frag_or_post, *user_end, *port_end;
+
+ memset (uri, 0, sizeof (*uri));
+
+ /* Nothing to do for an empty url. */
+ if (!*uristring) return URI_ERRNO_EMPTY;
+
+ uri->string = reencode_escapes (uristring, pool);
+ msg_debug ("parse_uri: reencoding escapes in original url: '%s'", struri (uri));
+ uri->protocollen = get_protocol_length (struri (uri));
+
+ /* Assume http as default protocol */
+ if (!uri->protocollen || (uri->protocol = get_protocol (struri(uri), uri->protocollen)) == PROTOCOL_UNKNOWN) {
+ p = g_strconcat ("http://", uri->string, NULL);
+ g_free (uri->string);
+ uri->string = p;
+ uri->protocol = PROTOCOL_HTTP;
+ prefix_end = struri (uri) + 7;
+ }
+ else {
+ /* Figure out whether the protocol is known */
+ msg_debug ("parse_uri: getting protocol from url: %d", uri->protocol);
+
+ prefix_end = struri (uri) + uri->protocollen; /* ':' */
+
+ /* Check if there's a digit after the protocol name. */
+ if (isdigit (*prefix_end)) {
+ p = struri (uri);
+ uri->ip_family = p[uri->protocollen] - '0';
+ prefix_end++;
+ }
+ if (*prefix_end != ':') {
+ msg_debug ("parse_uri: invalid protocol in uri");
+ return URI_ERRNO_INVALID_PROTOCOL;
+ }
+ prefix_end++;
+
+ /* Skip slashes */
+
+ if (prefix_end[0] == '/' && prefix_end[1] == '/') {
+ if (prefix_end[2] == '/') {
+ msg_debug ("parse_uri: too many '/' in uri");
+ return URI_ERRNO_TOO_MANY_SLASHES;
+ }
+
+ prefix_end += 2;
+
+ } else {
+ msg_debug ("parse_uri: no '/' in uri");
+ return URI_ERRNO_NO_SLASHES;
+ }
+ }
+
+ if (get_protocol_free_syntax (uri->protocol)) {
+ uri->data = prefix_end;
+ uri->datalen = strlen (prefix_end);
+ return URI_ERRNO_OK;
+
+ } else if (uri->protocol == PROTOCOL_FILE) {
+ datalen = check_uri_file (prefix_end);
+ frag_or_post = prefix_end + datalen;
+
+ /* Extract the fragment part. */
+ if (datalen >= 0) {
+ if (*frag_or_post == '#') {
+ uri->fragment = frag_or_post + 1;
+ uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S);
+ frag_or_post = uri->fragment + uri->fragmentlen;
+ }
+ if (*frag_or_post == POST_CHAR) {
+ uri->post = frag_or_post + 1;
+ }
+ } else {
+ datalen = strlen(prefix_end);
+ }
+
+ uri->data = prefix_end;
+ uri->datalen = datalen;
+
+ return URI_ERRNO_OK;
+ }
+
+ /* Isolate host */
+
+ /* Get brackets enclosing IPv6 address */
+ lbracket = strchr (prefix_end, '[');
+ if (lbracket) {
+ rbracket = strchr (lbracket, ']');
+ /* [address] is handled only inside of hostname part (surprisingly). */
+ if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
+ uri->ipv6 = 1;
+ else
+ lbracket = rbracket = NULL;
+ } else {
+ rbracket = NULL;
+ }
+
+ /* Possibly skip auth part */
+ host_end = prefix_end + strcspn (prefix_end, "@");
+
+ if (prefix_end + strcspn (prefix_end, "/") > host_end
+ && *host_end) { /* we have auth info here */
+
+ /* Allow '@' in the password component */
+ while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
+ host_end = host_end + 1 + strcspn (host_end + 1, "@");
+
+ user_end = strchr (prefix_end, ':');
+
+ if (!user_end || user_end > host_end) {
+ uri->user = prefix_end;
+ uri->userlen = host_end - prefix_end;
+ } else {
+ uri->user = prefix_end;
+ uri->userlen = user_end - prefix_end;
+ uri->password = user_end + 1;
+ uri->passwordlen = host_end - user_end - 1;
+ }
+ prefix_end = host_end + 1;
+ }
+
+ if (uri->ipv6)
+ host_end = rbracket + strcspn (rbracket, ":/?");
+ else
+ host_end = prefix_end + strcspn (prefix_end, ":/?");
+
+ if (uri->ipv6) {
+ addrlen = rbracket - lbracket - 1;
+
+
+ uri->host = lbracket + 1;
+ uri->hostlen = addrlen;
+ } else {
+ uri->host = prefix_end;
+ uri->hostlen = host_end - prefix_end;
+
+ /* Trim trailing '.'s */
+ if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
+ return URI_ERRNO_TRAILING_DOTS;
+ }
+
+ if (*host_end == ':') { /* we have port here */
+ port_end = host_end + 1 + strcspn (host_end + 1, "/");
+
+ host_end++;
+
+ uri->port = host_end;
+ uri->portlen = port_end - host_end;
+
+ if (uri->portlen == 0)
+ return URI_ERRNO_NO_PORT_COLON;
+
+ /* We only use 8 bits for portlen so better check */
+ if (uri->portlen != port_end - host_end)
+ return URI_ERRNO_INVALID_PORT;
+
+ /* test if port is number */
+ for (; host_end < port_end; host_end++)
+ if (!isdigit (*host_end))
+ return URI_ERRNO_INVALID_PORT;
+
+ /* Check valid port value, and let show an error message
+ * about invalid url syntax. */
+ if (uri->port && uri->portlen) {
+
+ errno = 0;
+ n = strtol (uri->port, NULL, 10);
+ if (errno || !uri_port_is_valid (n))
+ return URI_ERRNO_INVALID_PORT;
+ }
+ }
+
+ if (*host_end == '/') {
+ host_end++;
+
+ } else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end != '?') {
+ /* The need for slash after the host component depends on the
+ * need for a host component. -- The dangerous mind of Jonah */
+ if (!uri->hostlen)
+ return URI_ERRNO_NO_HOST;
+
+ return URI_ERRNO_NO_HOST_SLASH;
+ }
+
+ /* Look for #fragment or POST_CHAR */
+ prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
+ uri->data = host_end;
+ uri->datalen = prefix_end - host_end;
+
+ if (*prefix_end == '#') {
+ uri->fragment = prefix_end + 1;
+ uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
+ prefix_end = uri->fragment + uri->fragmentlen;
+ }
+
+ if (*prefix_end == POST_CHAR) {
+ uri->post = prefix_end + 1;
+ }
+
+ convert_to_lowercase (uri->string, uri->protocollen);
+ convert_to_lowercase (uri->host, uri->hostlen);
+ /* Decode %HH sequences in host name. This is important not so much
+ to support %HH sequences in host names (which other browser
+ don't), but to support binary characters (which will have been
+ converted to %HH by reencode_escapes). */
+ if (strchr (uri->host, '%')) {
+ url_unescape (uri->host);
+ }
+ path_simplify (uri->data);
+
+ return URI_ERRNO_OK;
+}
+
+void
+url_parse_text (struct worker_task *task, GByteArray *content)
+{
+ GMatchInfo *info;
+ GError *err = NULL;
+ int pos = 0, start;
+ gboolean rc;
+ char *url_str = NULL;
+ struct uri *new;
+
+ if (url_init () == 0) {
+ do {
+ rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err);
+ if (rc) {
+ if (g_match_info_matches (info)) {
+ g_match_info_fetch_pos (info, 0, &start, &pos);
+ url_str = g_match_info_fetch (info, 0);
+ msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
+ if (url_str != NULL) {
+ new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+ if (new != NULL) {
+ parse_uri (new, url_str, task->task_pool);
+ TAILQ_INSERT_TAIL (&task->urls, new, next);
+ }
+ }
+ g_free (url_str);
+ }
+ g_match_info_free (info);
+ }
+ else if (err != NULL) {
+ msg_debug ("url_parse_text: error matching regexp: %s", err->message);
+ g_free (err);
+ }
+ else {
+ msg_debug ("url_parse_text: cannot find url pattern in given string");
+ }
+ } while (rc);
+ }
+}
+
+void
+url_parse_html (struct worker_task *task, GByteArray *content)
+{
+ GMatchInfo *info;
+ GError *err = NULL;
+ int pos = 0, start;
+ gboolean rc;
+ char *url_str = NULL;
+ struct uri *new;
+
+ if (url_init () == 0) {
+ do {
+ rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err);
+ if (rc) {
+ if (g_match_info_matches (info)) {
+ g_match_info_fetch_pos (info, 0, &start, &pos);
+ url_str = g_match_info_fetch (info, 1);
+ msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
+ if (url_str != NULL) {
+ new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+ if (new != NULL) {
+ parse_uri (new, url_str, task->task_pool);
+ TAILQ_INSERT_TAIL (&task->urls, new, next);
+ }
+ }
+ g_free (url_str);
+ }
+ g_match_info_free (info);
+ }
+ else if (err) {
+ msg_debug ("url_parse_html: error matching regexp: %s", err->message);
+ g_free (err);
+ }
+ else {
+ msg_debug ("url_parse_html: cannot find url pattern in given string");
+ }
+ } while (rc);
+ }
+}