瀏覽代碼

* Fix url parser (get regexp from our mail system)

* Add some more tests to url testcase
tags/0.2.7
Vsevolod Stakhov 15 年之前
父節點
當前提交
57e765ce78
共有 6 個文件被更改,包括 143 次插入167 次删除
  1. 0
    1
      plugins/surbl.c
  2. 4
    1
      test/rspamd_test_suite.c
  3. 58
    13
      test/rspamd_url_test.c
  4. 79
    150
      url.c
  5. 0
    1
      url.h
  6. 2
    1
      util.c

+ 0
- 1
plugins/surbl.c 查看文件

@@ -462,7 +462,6 @@ redirector_callback (int fd, short what, void *arg)
if (*p == '\0') {
msg_info ("redirector_callback: got reply from redirector: '%s' -> '%s'", struri (param->url), c);
parse_uri (param->url, c);
normalize_uri (param->url, c);
register_memcached_call (param->url, param->task);
param->task->save.saved ++;
}

+ 4
- 1
test/rspamd_test_suite.c 查看文件

@@ -16,10 +16,13 @@
int
main (int argc, char **argv)
{
g_mem_set_vtable(glib_mem_profiler_table);

g_test_init (&argc, &argv, NULL);

g_test_add_func ("/rspamd/url", rspamd_url_test_func);

g_test_run ();

g_mem_profile ();
}

+ 58
- 13
test/rspamd_url_test.c 查看文件

@@ -17,8 +17,54 @@
#include "../url.h"
#include "tests.h"

const char *test_text = "This is test file with http://TesT%45.com/././ url";
const char *test_html = "<some_tag>This is test file with <a href=\"http://microsoft.com\">http://TesT%45.com/././ url</a></some_tag>";
const char *test_text =
"www.schemeless.ru\n"
"www.schemeless.rus\n"
" as ftp.schemeless.ru dasd \n"
"ftp12.schemeless.ru\n"
"ftpsearch.schemeless.ru\n"
"schemeless.ru\n"
"www.schemeless.microsoft\n"
"1.2.3.4\n"
"1.2.3.4/a\n"
"1.2.3\n"
"1.2.3.4.5\n"
"www.schemeless.ru,\n"
"www.schemeless.ru.\n"
"http://www.schemed.ru.\n"
"http://www.schemed.ru.\n"
"http://www.bolinfest.com/targetalert/'\n"
"http://www.bolinfest.com/targetalert/'';\n"
"https://www.schemed.ru.\n"
"ufps://www.schemed.ru.\n"
"http://ported.ru:8080\n"
"http://ported.ru:8080\n"
"http://1.2.3.4\n"
"http://1.2.3.4:80\n"
"1.2.3.4:80\n"
"www.a9.com\n"
"www.a-9.com\n"
"http://www.schemed.ru/a.txt:\n"
"http://www.schemed.ru/a.txt'\n"
"http://www.schemed.ru/a.txt\"\n"
"http://www.schemed.ru/a.txt>\n"
"http://www.schemed.ru/a=3&b=4\n"
"http://spam.ru/bad=user@domain.com\n"
"http://spam.ru/bad=user@domain.com\n"
"http://spam.ru user@domain.com\n"
"http://a.foto.radikal.ru/0604/de7793c6ca62.jpg\n"
"http://a.foto.radikal.ru/0604/de7793c6ca62.jpg\n"
"schemeless.gz\n"
"schemeless.jp\n"
"schemeless.ua\n"
"schemeless.gz/a\n"
"mysql.so\n"
"http://mysql.so\n"
"3com.com\n"
"lj-user.livejournal.com\n"
"http://lj-user.livejournal.com\n"
"http://vsem.ru?action;\n";
const char *test_html = "<some_tag>This is test file with <a href=\"http://microsoft.com\">http://TesT.com/././?%45%46%20 url</a></some_tag>";

/* Function for using in glib test suite */
void
@@ -39,12 +85,12 @@ rspamd_url_test_func ()
TAILQ_INIT (&task.urls);
g_test_timer_start ();
g_test_message ("* Testing text URL regexp parser *");
g_test_message ("Passing string: %s", test_text);
g_test_message ("Testing text URL regexp parser");
msg_debug ("Passing string: %s", test_text);
url_parse_text (&task, text);

TAILQ_FOREACH (url, &task.urls, next) {
g_test_message ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
i ++;
}

@@ -54,18 +100,17 @@ rspamd_url_test_func ()
g_free (url->string);
g_free (url);
}
g_assert (i == 1);

g_test_message ("Time elapsed: %.2f", g_test_timer_elapsed ());
g_assert (i == 39);

msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ());
i = 0;
g_test_timer_start ();
g_test_message ("* Testing html URL regexp parser *");
g_test_message ("Passing string: %s", test_html);
g_test_message ("Testing html URL regexp parser");
msg_debug ("Passing string: %s", test_html);
url_parse_html (&task, html);

TAILQ_FOREACH (url, &task.urls, next) {
g_test_message ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
i ++;
}

@@ -75,6 +120,6 @@ rspamd_url_test_func ()
g_free (url->string);
g_free (url);
}
g_assert (i == 2);
g_test_message ("Time elapsed: %.2f", g_test_timer_elapsed ());
g_assert (i == 1);
msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ());
}

+ 79
- 150
url.c 查看文件

@@ -32,8 +32,39 @@ struct _proto {
unsigned int need_ssl:1;
};

static const char *html_url = "((?:href\\s*=\\s*)?([^>\"<]+))?";
static const char *text_url = "(https?://[^ ]+)";
static const char *text_url = "((https?|ftp)://)?"
"(\\b(?<![.\\@A-Za-z0-9-])"
"(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\."
"(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel"
"|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se"
"|[a-z]{2}"
"(?(1)|(?=/)))"
"(?!\\w)"
"|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))"
")"
"(?::\\d{1,5})?" /* port */
"(?!\\.\\w)" /* host part ended, no more of this further on */
"(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?" /* path (&query) */
"(?<![\\s>?!),.'\"\\]:])"
"(?!@)"
")";
static const char *html_url = "(?: src|href)=\"("
"((https?|ftp)://)?"
"(\\b(?<![.\\@A-Za-z0-9-])"
"(?: [A-Za-z0-9][A-Za-z0-9-]*(?:\\.[A-Za-z0-9-]+)*\\."
"(?i:com|net|org|biz|edu|gov|info|name|int|mil|aero|coop|jobs|mobi|museum|pro|travel"
"|[rs]u|uk|ua|by|de|jp|fr|fi|no|no|ca|it|ro|cn|nl|at|nu|se"
"|[a-z]{2}"
"(?(1)|(?=/)))"
"(?!\\w)"
"|(?:\\d{1,3}\\.){3}\\d{1,3}(?(1)|(?=[/:]))"
")"
"(?::\\d{1,5})?" /* port */
"(?!\\.\\w)" /* host part ended, no more of this further on */
"(?:[/?][;/?:@&=+\\$,[\\]\\-_.!~*'()A-Za-z0-9#%]*)?" /* path (&query) */
"(?<![\\s>?!),.'\"\\]:])"
"(?!@)"
"))\"";

static short url_initialized = 0;
GRegex *text_re, *html_re;
@@ -156,13 +187,13 @@ url_init (void)
{
GError *err = NULL;
if (url_initialized == 0) {
text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err);
text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err);
if (err != NULL) {
msg_info ("url_init: cannot init text url parsing regexp: %s", err->message);
g_error_free (err);
return -1;
}
html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err);
html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_OPTIMIZE | G_REGEX_EXTENDED, 0, &err);
if (err != NULL) {
msg_info ("url_init: cannot init html url parsing regexp: %s", err->message);
g_error_free (err);
@@ -258,14 +289,8 @@ get_protocol_length(const unsigned char *url)
while (isalnum(*end) || *end == '+' || *end == '-' || *end == '.')
end++;

/* Now we make something to support our "IP version in protocol scheme
* name" hack and silently chop off the last digit if it's there. The
* IETF's not gonna notice I hope or it'd be going after us hard. */
if (end != url && isdigit(end[-1]))
end--;

/* Also return 0 if there's no protocol name (@end == @url). */
return (*end == ':' || isdigit(*end)) ? end - url : 0;
return (*end == ':') ? end - url : 0;
}

/* URL-unescape the string S.
@@ -417,7 +442,7 @@ reencode_escapes (const char *s)

if (!encode_count)
/* The string is good as it is. */
return (char *) s; /* C const model sucks. */
return g_strdup (s); /* C const model sucks. */

oldlen = p1 - s;
/* Each encoding adds two characters (hex digits). */
@@ -557,7 +582,7 @@ path_simplify (char *path)
enum uri_errno
parse_uri(struct uri *uri, unsigned char *uristring)
{
unsigned char *prefix_end, *host_end;
unsigned char *prefix_end, *host_end, *p;
unsigned char *lbracket, *rbracket;
int datalen, n, addrlen;
unsigned char *frag_or_post, *user_end, *port_end;
@@ -568,35 +593,49 @@ parse_uri(struct uri *uri, unsigned char *uristring)
if (!*uristring) return URI_ERRNO_EMPTY;
uri->string = reencode_escapes (uristring);
uri->protocollen = get_protocol_length (uristring);

/* Invalid */
if (!uri->protocollen) return URI_ERRNO_INVALID_PROTOCOL;

/* Figure out whether the protocol is known */
uri->protocol = get_protocol (struri(uri), uri->protocollen);
msg_debug ("parse_uri: reencoding escapes in original url: '%s'", struri (uri));
uri->protocollen = get_protocol_length (struri (uri));

/* Assume http as default protocol */
if (!uri->protocollen || (uri->protocol = get_protocol (struri(uri), uri->protocollen)) == PROTOCOL_UNKNOWN) {
p = g_strconcat ("http://", uri->string, NULL);
g_free (uri->string);
uri->string = p;
uri->protocol = PROTOCOL_HTTP;
prefix_end = struri (uri) + 7;
}
else {
/* Figure out whether the protocol is known */
msg_debug ("parse_uri: getting protocol from url: %d", uri->protocol);

prefix_end = struri (uri) + uri->protocollen; /* ':' */
prefix_end = struri (uri) + uri->protocollen; /* ':' */

/* Check if there's a digit after the protocol name. */
if (isdigit (*prefix_end)) {
uri->ip_family = uristring[uri->protocollen] - '0';
/* Check if there's a digit after the protocol name. */
if (isdigit (*prefix_end)) {
p = struri (uri);
uri->ip_family = p[uri->protocollen] - '0';
prefix_end++;
}
if (*prefix_end != ':') {
msg_debug ("parse_uri: invalid protocol in uri");
return URI_ERRNO_INVALID_PROTOCOL;
}
prefix_end++;
}
if (*prefix_end != ':')
return URI_ERRNO_INVALID_PROTOCOL;
prefix_end++;

/* Skip slashes */
/* Skip slashes */

if (prefix_end[0] == '/' && prefix_end[1] == '/') {
if (prefix_end[2] == '/')
return URI_ERRNO_TOO_MANY_SLASHES;
if (prefix_end[0] == '/' && prefix_end[1] == '/') {
if (prefix_end[2] == '/') {
msg_debug ("parse_uri: too many '/' in uri");
return URI_ERRNO_TOO_MANY_SLASHES;
}

prefix_end += 2;
prefix_end += 2;

} else {
return URI_ERRNO_NO_SLASHES;
} else {
msg_debug ("parse_uri: no '/' in uri");
return URI_ERRNO_NO_SLASHES;
}
}

if (get_protocol_free_syntax (uri->protocol)) {
@@ -721,7 +760,7 @@ parse_uri(struct uri *uri, unsigned char *uristring)
if (*host_end == '/') {
host_end++;

} else if (get_protocol_need_slash_after_host (uri->protocol)) {
} else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end != '?') {
/* The need for slash after the host component depends on the
* need for a host component. -- The dangerous mind of Jonah */
if (!uri->hostlen)
@@ -745,7 +784,8 @@ parse_uri(struct uri *uri, unsigned char *uristring)
uri->post = prefix_end + 1;
}
convert_to_lowercase (uri->host, strlen (uri->host));
convert_to_lowercase (uri->string, uri->protocollen);
convert_to_lowercase (uri->host, uri->hostlen);
/* Decode %HH sequences in host name. This is important not so much
to support %HH sequences in host names (which other browser
don't), but to support binary characters (which will have been
@@ -758,115 +798,6 @@ parse_uri(struct uri *uri, unsigned char *uristring)
return URI_ERRNO_OK;
}

unsigned char *
normalize_uri(struct uri *uri, unsigned char *uristring)
{
unsigned char *parse_string = uristring;
unsigned char *src, *dest, *path;
int need_slash = 0;
int parse = (uri == NULL);
struct uri uri_struct;

if (!uri) uri = &uri_struct;

/*
* We need to get the real (proxied) URI but lowercase relevant URI
* parts along the way.
*/
if (parse && parse_uri (uri, parse_string) != URI_ERRNO_OK)
return uristring;


/* This is a maybe not the right place but both join_urls() and
* get_translated_uri() through translate_url() calls this
* function and then it already works on and modifies an
* allocated copy. */
convert_to_lowercase (uri->string, uri->protocollen);
if (uri->hostlen) convert_to_lowercase (uri->host, uri->hostlen);

parse = 1;
parse_string = uri->data;

if (get_protocol_free_syntax (uri->protocol))
return uristring;

if (uri->protocol != PROTOCOL_UNKNOWN)
need_slash = get_protocol_need_slash_after_host (uri->protocol);

/* We want to start at the first slash to also reduce URIs like
* http://host//index.html to http://host/index.html */
path = uri->data - need_slash;
dest = src = path;

/* This loop mangles the URI string by removing directory elevators and
* other cruft. Example: /.././etc////..//usr/ -> /usr/ */
while (*dest) {
/* If the following pieces are the LAST parts of URL, we remove
* them as well. See RFC 1808 for details. */

if (end_of_dir (src[0])) {
/* URL data contains no more path. */
memmove (dest, src, strlen(src) + 1);
break;
}

if (!is_uri_dir_sep (uri, src[0])) {
/* This is to reduce indentation */

} else if (src[1] == '.') {
if (!src[2]) {
/* /. - skip the dot */
*dest++ = *src;
*dest = 0;
break;

} else if (is_uri_dir_sep (uri, src[2])) {
/* /./ - strip that.. */
src += 2;
continue;

} else if (src[2] == '.'
&& (is_uri_dir_sep (uri, src[3]) || !src[3])) {
/* /../ or /.. - skip it and preceding element. */

/* First back out the last incrementation of
* @dest (dest++) to get the position that was
* last asigned to. */
if (dest > path) dest--;

/* @dest might be pointing to a dir separator
* so we decrement before any testing. */
while (dest > path) {
dest--;
if (is_uri_dir_sep (uri, *dest)) break;
}

if (!src[3]) {
/* /.. - add ending slash and stop */
*dest++ = *src;
*dest = 0;
break;
}

src += 3;
continue;
}

} else if (is_uri_dir_sep (uri, src[1])) {
/* // - ignore first '/'. */
src += 1;
continue;
}

/* We don't want to access memory past the NUL char. */
*dest = *src++;
if (*dest) dest++;
}

return uristring;
}


void
url_parse_text (struct worker_task *task, GByteArray *content)
{
@@ -883,13 +814,12 @@ url_parse_text (struct worker_task *task, GByteArray *content)
if (rc) {
if (g_match_info_matches (info)) {
g_match_info_fetch_pos (info, 0, &start, &pos);
url_str = g_match_info_fetch (info, 1);
url_str = g_match_info_fetch (info, 0);
msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
if (url_str != NULL) {
new = g_malloc (sizeof (struct uri));
if (new != NULL) {
parse_uri (new, url_str);
normalize_uri (new, url_str);
TAILQ_INSERT_TAIL (&task->urls, new, next);
}
}
@@ -924,13 +854,12 @@ url_parse_html (struct worker_task *task, GByteArray *content)
if (rc) {
if (g_match_info_matches (info)) {
g_match_info_fetch_pos (info, 0, &start, &pos);
url_str = g_match_info_fetch (info, 2);
url_str = g_match_info_fetch (info, 1);
msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
if (url_str != NULL) {
new = g_malloc (sizeof (struct uri));
if (new != NULL) {
parse_uri (new, url_str);
normalize_uri (new, url_str);
TAILQ_INSERT_TAIL (&task->urls, new, next);
}
}

+ 0
- 1
url.h 查看文件

@@ -83,6 +83,5 @@ enum protocol {
void url_parse_html (struct worker_task *task, GByteArray *part);
void url_parse_text (struct worker_task *task, GByteArray *part);
enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring);
unsigned char * normalize_uri(struct uri *uri, unsigned char *uristring);

#endif

+ 2
- 1
util.c 查看文件

@@ -197,7 +197,8 @@ pass_signal_worker (struct workq *workers, int signo)
void convert_to_lowercase (char *str, unsigned int size)
{
while (size--) {
*str = tolower (*str ++);
*str = tolower (*str);
str ++;
}
}


Loading…
取消
儲存