#include #include #include #include #include #include #include #include #include #include #include "url.h" #include "fstring.h" #include "main.h" #define POST_CHAR 1 #define POST_CHAR_S "\001" /* Tcp port range */ #define LOWEST_PORT 0 #define HIGHEST_PORT 65535 #define uri_port_is_valid(port) \ (LOWEST_PORT <= (port) && (port) <= HIGHEST_PORT) struct _proto { unsigned char *name; int port; uintptr_t *unused; unsigned int need_slashes:1; unsigned int need_slash_after_host:1; unsigned int free_syntax:1; unsigned int need_ssl:1; }; static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)" "|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))" "\\\"?([^>\"<]+)\\\"?"; static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)"; static short url_initialized = 0; static pcre_extra *text_re_extra; static pcre *text_re; static pcre_extra *html_re_extra; static pcre *html_re; static const struct _proto protocol_backends[] = { { "file", 0, NULL, 1, 0, 0, 0 }, { "ftp", 21, NULL, 1, 1, 0, 0 }, { "http", 80, NULL, 1, 1, 0, 0 }, { "https", 443, NULL, 1, 1, 0, 1 }, /* Keep these last! */ { NULL, 0, NULL, 0, 0, 1, 0 }, }; static inline int end_of_dir(unsigned char c) { return c == POST_CHAR || c == '#' || c == ';' || c == '?'; } static inline int is_uri_dir_sep(struct uri *uri, unsigned char pos) { return (pos == '/'); } static int check_uri_file(unsigned char *name) { static const unsigned char chars[] = POST_CHAR_S "#?"; return strcspn(name, chars); } static int url_init (void) { if (url_initialized == 0) { text_re = pcre_compile (text_url, PCRE_CASELESS, NULL, 0, NULL); if (text_re == NULL) { msg_info ("url_init: cannot init url parsing regexp"); return -1; } text_re_extra = pcre_study (text_re, 0, NULL); html_re = pcre_compile (html_url, PCRE_CASELESS, NULL, 0, NULL); if (html_re == NULL) { msg_info ("url_init: cannot init url parsing regexp"); return -1; } html_re_extra = pcre_study (html_re, 0, NULL); url_initialized = 1; } return 0; } enum protocol get_protocol(unsigned char *name, int namelen) { /* These are really enum protocol values but can take on negative * values and since 0 <= -1 for enum values it's better to use clean * integer type. */ int start, end; enum protocol protocol; unsigned char *pname; int pnamelen, minlen, compare; /* Almost dichotomic search is used here */ /* Starting at the HTTP entry which is the most common that will make * file and NNTP the next entries checked and amongst the third checks * are proxy and FTP. */ start = 0; end = PROTOCOL_UNKNOWN - 1; protocol = PROTOCOL_HTTP; while (start <= end) { pname = protocol_backends[protocol].name; pnamelen = strlen (pname); minlen = MIN (pnamelen, namelen); compare = strncasecmp (pname, name, minlen); if (compare == 0) { if (pnamelen == namelen) return protocol; /* If the current protocol name is longer than the * protocol name being searched for move @end else move * @start. */ compare = pnamelen > namelen ? 1 : -1; } if (compare > 0) end = protocol - 1; else start = protocol + 1; protocol = (start + end) / 2; } return PROTOCOL_UNKNOWN; } int get_protocol_port(enum protocol protocol) { return protocol_backends[protocol].port; } int get_protocol_need_slashes(enum protocol protocol) { return protocol_backends[protocol].need_slashes; } int get_protocol_need_slash_after_host(enum protocol protocol) { return protocol_backends[protocol].need_slash_after_host; } int get_protocol_free_syntax(enum protocol protocol) { return protocol_backends[protocol].free_syntax; } static int get_protocol_length(const unsigned char *url) { unsigned char *end = (unsigned char *) url; /* Seek the end of the protocol name if any. */ /* RFC1738: * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ] * (but per its recommendations we accept "upalpha" too) */ while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.') end++; /* Now we make something to support our "IP version in protocol scheme * name" hack and silently chop off the last digit if it's there. The * IETF's not gonna notice I hope or it'd be going after us hard. */ if (end != url && isdigit(end[-1])) end--; /* Also return 0 if there's no protocol name (@end == @url). */ return (*end == ':' || isdigit(*end)) ? end - url : 0; } static enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring) { unsigned char *prefix_end, *host_end; unsigned char *lbracket, *rbracket; int datalen, n, addrlen; unsigned char *frag_or_post, *user_end, *port_end; memset (uri, 0, sizeof (*uri)); /* Nothing to do for an empty url. */ if (!*uristring) return URI_ERRNO_EMPTY; uri->string = uristring; uri->protocollen = get_protocol_length (uristring); /* Invalid */ if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL; /* Figure out whether the protocol is known */ uri->protocol = get_protocol (struri(uri), uri->protocollen); prefix_end = uristring + uri->protocollen; /* ':' */ /* Check if there's a digit after the protocol name. */ if (isdigit (*prefix_end)) { uri->ip_family = uristring[uri->protocollen] - '0'; prefix_end++; } if (*prefix_end != ':') return URI_ERRNO_INVALID_PROTOCOL; prefix_end++; /* Skip slashes */ if (prefix_end[0] == '/' && prefix_end[1] == '/') { if (prefix_end[2] == '/') return URI_ERRNO_TOO_MANY_SLASHES; prefix_end += 2; } else { return URI_ERRNO_NO_SLASHES; } if (get_protocol_free_syntax (uri->protocol)) { uri->data = prefix_end; uri->datalen = strlen (prefix_end); return URI_ERRNO_OK; } else if (uri->protocol == PROTOCOL_FILE) { datalen = check_uri_file (prefix_end); frag_or_post = prefix_end + datalen; /* Extract the fragment part. */ if (datalen >= 0) { if (*frag_or_post == '#') { uri->fragment = frag_or_post + 1; uri->fragmentlen = strcspn(uri->fragment, POST_CHAR_S); frag_or_post = uri->fragment + uri->fragmentlen; } if (*frag_or_post == POST_CHAR) { uri->post = frag_or_post + 1; } } else { datalen = strlen(prefix_end); } uri->data = prefix_end; uri->datalen = datalen; return URI_ERRNO_OK; } /* Isolate host */ /* Get brackets enclosing IPv6 address */ lbracket = strchr (prefix_end, '['); if (lbracket) { rbracket = strchr (lbracket, ']'); /* [address] is handled only inside of hostname part (surprisingly). */ if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/")) uri->ipv6 = 1; else lbracket = rbracket = NULL; } else { rbracket = NULL; } /* Possibly skip auth part */ host_end = prefix_end + strcspn (prefix_end, "@"); if (prefix_end + strcspn (prefix_end, "/") > host_end && *host_end) { /* we have auth info here */ /* Allow '@' in the password component */ while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?")) host_end = host_end + 1 + strcspn (host_end + 1, "@"); user_end = strchr (prefix_end, ':'); if (!user_end || user_end > host_end) { uri->user = prefix_end; uri->userlen = host_end - prefix_end; } else { uri->user = prefix_end; uri->userlen = user_end - prefix_end; uri->password = user_end + 1; uri->passwordlen = host_end - user_end - 1; } prefix_end = host_end + 1; } if (uri->ipv6) host_end = rbracket + strcspn (rbracket, ":/?"); else host_end = prefix_end + strcspn (prefix_end, ":/?"); if (uri->ipv6) { addrlen = rbracket - lbracket - 1; uri->host = lbracket + 1; uri->hostlen = addrlen; } else { uri->host = prefix_end; uri->hostlen = host_end - prefix_end; /* Trim trailing '.'s */ if (uri->hostlen && uri->host[uri->hostlen - 1] == '.') return URI_ERRNO_TRAILING_DOTS; } if (*host_end == ':') { /* we have port here */ port_end = host_end + 1 + strcspn (host_end + 1, "/"); host_end++; uri->port = host_end; uri->portlen = port_end - host_end; if (uri->portlen == 0) return URI_ERRNO_NO_PORT_COLON; /* We only use 8 bits for portlen so better check */ if (uri->portlen != port_end - host_end) return URI_ERRNO_INVALID_PORT; /* test if port is number */ for (; host_end < port_end; host_end++) if (!isdigit (*host_end)) return URI_ERRNO_INVALID_PORT; /* Check valid port value, and let show an error message * about invalid url syntax. */ if (uri->port && uri->portlen) { errno = 0; n = strtol (uri->port, NULL, 10); if (errno || !uri_port_is_valid (n)) return URI_ERRNO_INVALID_PORT; } } if (*host_end == '/') { host_end++; } else if (get_protocol_need_slash_after_host (uri->protocol)) { /* The need for slash after the host component depends on the * need for a host component. -- The dangerous mind of Jonah */ if (!uri->hostlen) return URI_ERRNO_NO_HOST; return URI_ERRNO_NO_HOST_SLASH; } /* Look for #fragment or POST_CHAR */ prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S); uri->data = host_end; uri->datalen = prefix_end - host_end; if (*prefix_end == '#') { uri->fragment = prefix_end + 1; uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S); prefix_end = uri->fragment + uri->fragmentlen; } if (*prefix_end == POST_CHAR) { uri->post = prefix_end + 1; } return URI_ERRNO_OK; } static unsigned char * normalize_uri(struct uri *uri, unsigned char *uristring) { unsigned char *parse_string = uristring; unsigned char *src, *dest, *path; int need_slash = 0; int parse = (uri == NULL); struct uri uri_struct; if (!uri) uri = &uri_struct; /* * We need to get the real (proxied) URI but lowercase relevant URI * parts along the way. */ if (parse && parse_uri (uri, parse_string) != URI_ERRNO_OK) return uristring; /* This is a maybe not the right place but both join_urls() and * get_translated_uri() through translate_url() calls this * function and then it already works on and modifies an * allocated copy. */ convert_to_lowercase (uri->string, uri->protocollen); if (uri->hostlen) convert_to_lowercase (uri->host, uri->hostlen); parse = 1; parse_string = uri->data; if (get_protocol_free_syntax (uri->protocol)) return uristring; if (uri->protocol != PROTOCOL_UNKNOWN) need_slash = get_protocol_need_slash_after_host (uri->protocol); /* We want to start at the first slash to also reduce URIs like * http://host//index.html to http://host/index.html */ path = uri->data - need_slash; dest = src = path; /* This loop mangles the URI string by removing directory elevators and * other cruft. Example: /.././etc////..//usr/ -> /usr/ */ while (*dest) { /* If the following pieces are the LAST parts of URL, we remove * them as well. See RFC 1808 for details. */ if (end_of_dir (src[0])) { /* URL data contains no more path. */ memmove (dest, src, strlen(src) + 1); break; } if (!is_uri_dir_sep (uri, src[0])) { /* This is to reduce indentation */ } else if (src[1] == '.') { if (!src[2]) { /* /. - skip the dot */ *dest++ = *src; *dest = 0; break; } else if (is_uri_dir_sep (uri, src[2])) { /* /./ - strip that.. */ src += 2; continue; } else if (src[2] == '.' && (is_uri_dir_sep (uri, src[3]) || !src[3])) { /* /../ or /.. - skip it and preceding element. */ /* First back out the last incrementation of * @dest (dest++) to get the position that was * last asigned to. */ if (dest > path) dest--; /* @dest might be pointing to a dir separator * so we decrement before any testing. */ while (dest > path) { dest--; if (is_uri_dir_sep (uri, *dest)) break; } if (!src[3]) { /* /.. - add ending slash and stop */ *dest++ = *src; *dest = 0; break; } src += 3; continue; } } else if (is_uri_dir_sep (uri, src[1])) { /* // - ignore first '/'. */ src += 1; continue; } /* We don't want to access memory past the NUL char. */ *dest = *src++; if (*dest) dest++; } return uristring; } void url_parse_text (struct worker_task *task, GByteArray *content) { int ovec[30]; int pos = 0, rc; char *url_str = NULL; struct uri *new; if (url_init () == 0) { while ((rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0, ovec, sizeof (ovec) / sizeof (ovec[0])) >= 0)) { if (rc > 0) { pos = ovec[1]; pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str); if (url_str != NULL) { new = g_malloc (sizeof (struct uri)); if (new != NULL) { parse_uri (new, url_str); normalize_uri (new, url_str); TAILQ_INSERT_TAIL (&task->urls, new, next); } } } } } } void url_parse_html (struct worker_task *task, GByteArray *content) { int ovec[30]; int pos = 0, rc; char *url_str = NULL; struct uri *new; if (url_init () == 0) { while ((rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0, ovec, sizeof (ovec) / sizeof (ovec[0])) >= 0)) { if (rc > 0) { pos = ovec[1]; pcre_get_substring ((const char *)content->data, ovec, rc, 3, (const char **)&url_str); if (url_str != NULL) { new = g_malloc (sizeof (struct uri)); if (new != NULL) { parse_uri (new, url_str); normalize_uri (new, url_str); TAILQ_INSERT_TAIL (&task->urls, new, next); } } } } } }