* Fix URL length when passing URLs to the normalizer

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Mon, 8 Sep 2008 15:45:45 +0000 (19:45 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Mon, 8 Sep 2008 15:45:45 +0000 (19:45 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 8 Sep 2008 15:45:45 +0000 (19:45 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 8 Sep 2008 15:45:45 +0000 (19:45 +0400)
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c

index 17565de80dd61e4320fdfcfed31995190afdf5d9..5a4d9e5ba7f9a180dcadb20e5f5fcb859c7542a2 100644 (file)
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -31,10 +31,10 @@ rspamd_url_test_func ()
  
         text = g_byte_array_new();
         text->data = (gchar *)test_text;
-       text->len = sizeof (test_text);
+       text->len = strlen (test_text);
         html = g_byte_array_new();
         html->data = (gchar *)test_html;
-       html->len = sizeof (test_html);
+       html->len = strlen (test_html);
         bzero (&task, sizeof (task));
         TAILQ_INIT (&task.urls);
         
diff --git a/url.c b/url.c

index 97091c3e25b7509c889170776069ceb547c84204..3288bb2ff768ce7a170e4131aa67c37f9c9d4519 100644 (file)
--- a/url.c
+++ b/url.c
@@ -32,10 +32,8 @@ struct _proto {
         unsigned int need_ssl:1;
  };
  
-static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)"
-"|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))"
-"\\\"?([^>\"<]+)\\\"?";
-static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^ ]+)";
+static const char *html_url = "((?:href\\s*=\\s*)?([^>\"<]+))?";
+static const char *text_url = "(https?://[^ ]+)";
  
  static short url_initialized = 0;
  GRegex *text_re, *html_re;
@@ -906,7 +904,7 @@ url_parse_text (struct worker_task *task, GByteArray *content)
                         else {
                                 msg_debug ("url_parse_text: cannot find url pattern in given string");
                         }
-               } while (rc > 0);
+               } while (rc);
         }
  }
  
@@ -926,7 +924,7 @@ url_parse_html (struct worker_task *task, GByteArray *content)
                         if (rc) {
                                 if (g_match_info_matches (info)) {
                                         g_match_info_fetch_pos (info, 0, &start, &pos);
-                                       url_str = g_match_info_fetch (info, 3);
+                                       url_str = g_match_info_fetch (info, 2);
                                         msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
                                         if (url_str != NULL) {
                                                 new = g_malloc (sizeof (struct uri));
@@ -947,6 +945,6 @@ url_parse_html (struct worker_task *task, GByteArray *content)
                         else {
                                 msg_debug ("url_parse_html: cannot find url pattern in given string");
                         }
-               } while (rc > 0);
+               } while (rc);
         }
  }
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Mon, 8 Sep 2008 15:45:45 +0000 (19:45 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Mon, 8 Sep 2008 15:45:45 +0000 (19:45 +0400)
test/rspamd_url_test.c		patch \| blob \| history
url.c		patch \| blob \| history