From: Vsevolod Stakhov Date: Mon, 8 Sep 2008 15:45:45 +0000 (+0400) Subject: * Fix URL lengths when passing them to the normalizer X-Git-Tag: 0.2.7~382 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f1e17a0d63485d6317fc5d83b57618cfc08dea44;p=rspamd.git * Fix URL lengths when passing them to the normalizer TODO: fix the HTML parsing regexp (it currently does not work) --- diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c index 17565de80..5a4d9e5ba 100644 --- a/test/rspamd_url_test.c +++ b/test/rspamd_url_test.c @@ -31,10 +31,10 @@ rspamd_url_test_func () text = g_byte_array_new(); text->data = (gchar *)test_text; - text->len = sizeof (test_text); + text->len = strlen (test_text); html = g_byte_array_new(); html->data = (gchar *)test_html; - html->len = sizeof (test_html); + html->len = strlen (test_html); bzero (&task, sizeof (task)); TAILQ_INIT (&task.urls); diff --git a/url.c b/url.c index 97091c3e2..3288bb2ff 100644 --- a/url.c +++ b/url.c @@ -32,10 +32,8 @@ struct _proto { unsigned int need_ssl:1; }; -static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)" -"|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))" -"\\\"?([^>\"<]+)\\\"?"; -static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^ ]+)"; +static const char *html_url = "((?:href\\s*=\\s*)?([^>\"<]+))?"; +static const char *text_url = "(https?://[^ ]+)"; static short url_initialized = 0; GRegex *text_re, *html_re; @@ -906,7 +904,7 @@ url_parse_text (struct worker_task *task, GByteArray *content) else { msg_debug ("url_parse_text: cannot find url pattern in given string"); } - } while (rc > 0); + } while (rc); } } @@ -926,7 +924,7 @@ url_parse_html (struct worker_task *task, GByteArray *content) if (rc) { if (g_match_info_matches (info)) { g_match_info_fetch_pos 
(info, 0, &start, &pos); - url_str = g_match_info_fetch (info, 3); + url_str = g_match_info_fetch (info, 2); msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str); if (url_str != NULL) { new = g_malloc (sizeof (struct uri)); @@ -947,6 +945,6 @@ url_parse_html (struct worker_task *task, GByteArray *content) else { msg_debug ("url_parse_html: cannot find url pattern in given string"); } - } while (rc > 0); + } while (rc); } }