From 57e765ce78c6b9746cddab4c3415dc386552151f Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 9 Sep 2008 18:48:49 +0400 Subject: [PATCH] * Fix url parser (get regexp from our mail system) * Add some more tests to url testcase --- plugins/surbl.c | 1 - test/rspamd_test_suite.c | 5 +- test/rspamd_url_test.c | 71 +++++++++--- url.c | 229 ++++++++++++++------------------------- url.h | 1 - util.c | 3 +- 6 files changed, 143 insertions(+), 167 deletions(-) diff --git a/plugins/surbl.c b/plugins/surbl.c index e31f8cc5d..bcb5f38ba 100644 --- a/plugins/surbl.c +++ b/plugins/surbl.c @@ -462,7 +462,6 @@ redirector_callback (int fd, short what, void *arg) if (*p == '\0') { msg_info ("redirector_callback: got reply from redirector: '%s' -> '%s'", struri (param->url), c); parse_uri (param->url, c); - normalize_uri (param->url, c); register_memcached_call (param->url, param->task); param->task->save.saved ++; } diff --git a/test/rspamd_test_suite.c b/test/rspamd_test_suite.c index d9d7f63fe..7b699f40b 100644 --- a/test/rspamd_test_suite.c +++ b/test/rspamd_test_suite.c @@ -16,10 +16,13 @@ int main (int argc, char **argv) { + g_mem_set_vtable(glib_mem_profiler_table); + g_test_init (&argc, &argv, NULL); g_test_add_func ("/rspamd/url", rspamd_url_test_func); g_test_run (); - + + g_mem_profile (); } diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c index 5a4d9e5ba..0db5084bf 100644 --- a/test/rspamd_url_test.c +++ b/test/rspamd_url_test.c @@ -17,8 +17,54 @@ #include "../url.h" #include "tests.h" -const char *test_text = "This is test file with http://TesT%45.com/././ url"; -const char *test_html = "This is test file with http://TesT%45.com/././ url"; +const char *test_text = +"www.schemeless.ru\n" +"www.schemeless.rus\n" +" as ftp.schemeless.ru dasd \n" +"ftp12.schemeless.ru\n" +"ftpsearch.schemeless.ru\n" +"schemeless.ru\n" +"www.schemeless.microsoft\n" +"1.2.3.4\n" +"1.2.3.4/a\n" +"1.2.3\n" +"1.2.3.4.5\n" +"www.schemeless.ru,\n" +"www.schemeless.ru.\n" +"http://www.schemed.ru.\n" +"http://www.schemed.ru.\n" +"http://www.bolinfest.com/targetalert/'\n" +"http://www.bolinfest.com/targetalert/'';\n" +"https://www.schemed.ru.\n" +"ufps://www.schemed.ru.\n" +"http://ported.ru:8080\n" +"http://ported.ru:8080\n" +"http://1.2.3.4\n" +"http://1.2.3.4:80\n" +"1.2.3.4:80\n" +"www.a9.com\n" +"www.a-9.com\n" +"http://www.schemed.ru/a.txt:\n" +"http://www.schemed.ru/a.txt'\n" +"http://www.schemed.ru/a.txt\"\n" +"http://www.schemed.ru/a.txt>\n" +"http://www.schemed.ru/a=3&b=4\n" +"http://spam.ru/bad=user@domain.com\n" +"http://spam.ru/bad=user@domain.com\n" +"http://spam.ru user@domain.com\n" +"http://a.foto.radikal.ru/0604/de7793c6ca62.jpg\n" +"http://a.foto.radikal.ru/0604/de7793c6ca62.jpg\n" +"schemeless.gz\n" +"schemeless.jp\n" +"schemeless.ua\n" +"schemeless.gz/a\n" +"mysql.so\n" +"http://mysql.so\n" +"3com.com\n" +"lj-user.livejournal.com\n" +"http://lj-user.livejournal.com\n" +"http://vsem.ru?action;\n"; +const char *test_html = "This is test file with http://TesT.com/././?%45%46%20 url"; /* Function for using in glib test suite */ void @@ -39,12 +85,12 @@ rspamd_url_test_func () TAILQ_INIT (&task.urls); g_test_timer_start (); - g_test_message ("* Testing text URL regexp parser *"); - g_test_message ("Passing string: %s", test_text); + g_test_message ("Testing text URL regexp parser"); + msg_debug ("Passing string: %s", test_text); url_parse_text (&task, text); TAILQ_FOREACH (url, &task.urls, next) { - g_test_message ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data); + msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data); i ++; } @@ -54,18 +100,17 @@ rspamd_url_test_func () g_free (url->string); g_free (url); } - g_assert (i == 1); - - g_test_message ("Time elapsed: %.2f", g_test_timer_elapsed ()); + g_assert (i == 39); + msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ()); i = 0; g_test_timer_start (); - g_test_message ("* Testing html URL regexp parser *"); - g_test_message ("Passing string: %s", test_html); + g_test_message ("Testing html URL regexp parser"); + msg_debug ("Passing string: %s", test_html); url_parse_html (&task, html); TAILQ_FOREACH (url, &task.urls, next) { - g_test_message ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data); + msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data); i ++; } @@ -75,6 +120,6 @@ rspamd_url_test_func () g_free (url->string); g_free (url); } - g_assert (i == 2); - g_test_message ("Time elapsed: %.2f", g_test_timer_elapsed ()); + g_assert (i == 1); + msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ()); } diff --git a/url.c b/url.c index 3288bb2ff..9fc6ecc98 100644 --- a/url.c +++ b/url.c @@ -32,8 +32,39 @@ struct _proto { unsigned int need_ssl:1; }; -static const char *html_url = "((?:href\\s*=\\s*)?([^>\"<]+))?"; -static const char *text_url = "(https?://[^ ]+)"; +static const char *text_url = "((https?|ftp)://)?" +"(\\b(??!),.'\"\\]:])" +"(?!@)" +")"; +static const char *html_url = "(?: src|href)=\"(" +"((https?|ftp)://)?" +"(\\b(??!),.'\"\\]:])" +"(?!@)" +"))\""; static short url_initialized = 0; GRegex *text_re, *html_re; @@ -156,13 +187,13 @@ url_init (void) { GError *err = NULL; if (url_initialized == 0) { - text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err); + text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err); if (err != NULL) { msg_info ("url_init: cannot init text url parsing regexp: %s", err->message); g_error_free (err); return -1; } - html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err); + html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err); if (err != NULL) { msg_info ("url_init: cannot init html url parsing regexp: %s", err->message); g_error_free (err); @@ -258,14 +289,8 @@ get_protocol_length(const unsigned char *url) while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.') end++; - /* Now we make something to support our "IP version in protocol scheme - * name" hack and silently chop off the last digit if it's there. The - * IETF's not gonna notice I hope or it'd be going after us hard. */ - if (end != url && isdigit(end[-1])) - end--; - /* Also return 0 if there's no protocol name (@end == @url). */ - return (*end == ':' || isdigit(*end)) ? end - url : 0; + return (*end == ':') ? end - url : 0; } /* URL-unescape the string S. @@ -417,7 +442,7 @@ reencode_escapes (const char *s) if (!encode_count) /* The string is good as it is. */ - return (char *) s; /* C const model sucks. */ + return g_strdup (s); /* C const model sucks. */ oldlen = p1 - s; /* Each encoding adds two characters (hex digits). */ @@ -557,7 +582,7 @@ path_simplify (char *path) enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring) { - unsigned char *prefix_end, *host_end; + unsigned char *prefix_end, *host_end, *p; unsigned char *lbracket, *rbracket; int datalen, n, addrlen; unsigned char *frag_or_post, *user_end, *port_end; @@ -568,35 +593,49 @@ parse_uri(struct uri *uri, unsigned char *uristring) if (!*uristring) return URI_ERRNO_EMPTY; uri->string = reencode_escapes (uristring); - uri->protocollen = get_protocol_length (uristring); - - /* Invalid */ - if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL; - - /* Figure out whether the protocol is known */ - uri->protocol = get_protocol (struri(uri), uri->protocollen); + msg_debug ("parse_uri: reencoding escapes in original url: '%s'", struri (uri)); + uri->protocollen = get_protocol_length (struri (uri)); + + /* Assume http as default protocol */ + if (!uri->protocollen || (uri->protocol = get_protocol (struri(uri), uri->protocollen)) == PROTOCOL_UNKNOWN) { + p = g_strconcat ("http://", uri->string, NULL); + g_free (uri->string); + uri->string = p; + uri->protocol = PROTOCOL_HTTP; + prefix_end = struri (uri) + 7; + } + else { + /* Figure out whether the protocol is known */ + msg_debug ("parse_uri: getting protocol from url: %d", uri->protocol); - prefix_end = struri (uri) + uri->protocollen; /* ':' */ + prefix_end = struri (uri) + uri->protocollen; /* ':' */ - /* Check if there's a digit after the protocol name. */ - if (isdigit (*prefix_end)) { - uri->ip_family = uristring[uri->protocollen] - '0'; + /* Check if there's a digit after the protocol name. */ + if (isdigit (*prefix_end)) { + p = struri (uri); + uri->ip_family = p[uri->protocollen] - '0'; + prefix_end++; + } + if (*prefix_end != ':') { + msg_debug ("parse_uri: invalid protocol in uri"); + return URI_ERRNO_INVALID_PROTOCOL; + } prefix_end++; - } - if (*prefix_end != ':') - return URI_ERRNO_INVALID_PROTOCOL; - prefix_end++; - /* Skip slashes */ + /* Skip slashes */ - if (prefix_end[0] == '/' && prefix_end[1] == '/') { - if (prefix_end[2] == '/') - return URI_ERRNO_TOO_MANY_SLASHES; + if (prefix_end[0] == '/' && prefix_end[1] == '/') { + if (prefix_end[2] == '/') { + msg_debug ("parse_uri: too many '/' in uri"); + return URI_ERRNO_TOO_MANY_SLASHES; + } - prefix_end += 2; + prefix_end += 2; - } else { - return URI_ERRNO_NO_SLASHES; + } else { + msg_debug ("parse_uri: no '/' in uri"); + return URI_ERRNO_NO_SLASHES; + } } if (get_protocol_free_syntax (uri->protocol)) { @@ -721,7 +760,7 @@ parse_uri(struct uri *uri, unsigned char *uristring) if (*host_end == '/') { host_end++; - } else if (get_protocol_need_slash_after_host (uri->protocol)) { + } else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end != '?') { /* The need for slash after the host component depends on the * need for a host component. -- The dangerous mind of Jonah */ if (!uri->hostlen) @@ -745,7 +784,8 @@ parse_uri(struct uri *uri, unsigned char *uristring) uri->post = prefix_end + 1; } - convert_to_lowercase (uri->host, strlen (uri->host)); + convert_to_lowercase (uri->string, uri->protocollen); + convert_to_lowercase (uri->host, uri->hostlen); /* Decode %HH sequences in host name. This is important not so much to support %HH sequences in host names (which other browser don't), but to support binary characters (which will have been @@ -758,115 +798,6 @@ parse_uri(struct uri *uri, unsigned char *uristring) return URI_ERRNO_OK; } -unsigned char * -normalize_uri(struct uri *uri, unsigned char *uristring) -{ - unsigned char *parse_string = uristring; - unsigned char *src, *dest, *path; - int need_slash = 0; - int parse = (uri == NULL); - struct uri uri_struct; - - if (!uri) uri = &uri_struct; - - /* - * We need to get the real (proxied) URI but lowercase relevant URI - * parts along the way. - */ - if (parse && parse_uri (uri, parse_string) != URI_ERRNO_OK) - return uristring; - - - /* This is a maybe not the right place but both join_urls() and - * get_translated_uri() through translate_url() calls this - * function and then it already works on and modifies an - * allocated copy. */ - convert_to_lowercase (uri->string, uri->protocollen); - if (uri->hostlen) convert_to_lowercase (uri->host, uri->hostlen); - - parse = 1; - parse_string = uri->data; - - if (get_protocol_free_syntax (uri->protocol)) - return uristring; - - if (uri->protocol != PROTOCOL_UNKNOWN) - need_slash = get_protocol_need_slash_after_host (uri->protocol); - - /* We want to start at the first slash to also reduce URIs like - * http://host//index.html to http://host/index.html */ - path = uri->data - need_slash; - dest = src = path; - - /* This loop mangles the URI string by removing directory elevators and - * other cruft. Example: /.././etc////..//usr/ -> /usr/ */ - while (*dest) { - /* If the following pieces are the LAST parts of URL, we remove - * them as well. See RFC 1808 for details. */ - - if (end_of_dir (src[0])) { - /* URL data contains no more path. */ - memmove (dest, src, strlen(src) + 1); - break; - } - - if (!is_uri_dir_sep (uri, src[0])) { - /* This is to reduce indentation */ - - } else if (src[1] == '.') { - if (!src[2]) { - /* /. - skip the dot */ - *dest++ = *src; - *dest = 0; - break; - - } else if (is_uri_dir_sep (uri, src[2])) { - /* /./ - strip that.. */ - src += 2; - continue; - - } else if (src[2] == '.' - && (is_uri_dir_sep (uri, src[3]) || !src[3])) { - /* /../ or /.. - skip it and preceding element. */ - - /* First back out the last incrementation of - * @dest (dest++) to get the position that was - * last asigned to. */ - if (dest > path) dest--; - - /* @dest might be pointing to a dir separator - * so we decrement before any testing. */ - while (dest > path) { - dest--; - if (is_uri_dir_sep (uri, *dest)) break; - } - - if (!src[3]) { - /* /.. - add ending slash and stop */ - *dest++ = *src; - *dest = 0; - break; - } - - src += 3; - continue; - } - - } else if (is_uri_dir_sep (uri, src[1])) { - /* // - ignore first '/'. */ - src += 1; - continue; - } - - /* We don't want to access memory past the NUL char. */ - *dest = *src++; - if (*dest) dest++; - } - - return uristring; -} - - void url_parse_text (struct worker_task *task, GByteArray *content) { @@ -883,13 +814,12 @@ url_parse_text (struct worker_task *task, GByteArray *content) if (rc) { if (g_match_info_matches (info)) { g_match_info_fetch_pos (info, 0, &start, &pos); - url_str = g_match_info_fetch (info, 1); + url_str = g_match_info_fetch (info, 0); msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str); if (url_str != NULL) { new = g_malloc (sizeof (struct uri)); if (new != NULL) { parse_uri (new, url_str); - normalize_uri (new, url_str); TAILQ_INSERT_TAIL (&task->urls, new, next); } } @@ -924,13 +854,12 @@ url_parse_html (struct worker_task *task, GByteArray *content) if (rc) { if (g_match_info_matches (info)) { g_match_info_fetch_pos (info, 0, &start, &pos); - url_str = g_match_info_fetch (info, 2); + url_str = g_match_info_fetch (info, 1); msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str); if (url_str != NULL) { new = g_malloc (sizeof (struct uri)); if (new != NULL) { parse_uri (new, url_str); - normalize_uri (new, url_str); TAILQ_INSERT_TAIL (&task->urls, new, next); } } diff --git a/url.h b/url.h index ae845aca1..045bfa90e 100644 --- a/url.h +++ b/url.h @@ -83,6 +83,5 @@ enum protocol { void url_parse_html (struct worker_task *task, GByteArray *part); void url_parse_text (struct worker_task *task, GByteArray *part); enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring); -unsigned char * normalize_uri(struct uri *uri, unsigned char *uristring); #endif diff --git a/util.c b/util.c index 52591983a..ed6d18fb6 100644 --- a/util.c +++ b/util.c @@ -197,7 +197,8 @@ pass_signal_worker (struct workq *workers, int signo) void convert_to_lowercase (char *str, unsigned int size) { while (size--) { - *str = tolower (*str ++); + *str = tolower (*str); + str ++; } } -- 2.39.5