]> source.dussan.org Git - rspamd.git/commitdiff
* Add extracting urls from messages
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 16 Jun 2008 15:24:18 +0000 (19:24 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 16 Jun 2008 15:24:18 +0000 (19:24 +0400)
url.c
util.c
util.h

diff --git a/url.c b/url.c
index dc8e8057a1c34392d644c4bb4eb21f24bb808ffa..0efa4fc89659d376b2793b1bf29e5d1b5753b7cc 100644 (file)
--- a/url.c
+++ b/url.c
 #define POST_CHAR 1
 #define POST_CHAR_S "\001"
 
+/*
+ * Tcp port range: LOWEST_PORT and HIGHEST_PORT are the inclusive bounds
+ * tested by uri_port_is_valid().  The macro evaluates its argument twice,
+ * so do not pass an expression with side effects.
+ */
+#define LOWEST_PORT 0
+#define HIGHEST_PORT    65535
+
+#define uri_port_is_valid(port) \
+    (LOWEST_PORT <= (port) && (port) <= HIGHEST_PORT)
+
 struct _proto {
        unsigned char *name;
        int port;
@@ -26,10 +33,10 @@ struct _proto {
        unsigned int need_ssl:1;
 };
 
-static const char *html_url = "((?:href=)|(?:archive=)|(?:code=)|(?:codebase=)|(?:src=)|(?:cite=)"
-"|(:?background=)|(?:pluginspage=)|(?:pluginurl=)|(?:action=)|(?:dynsrc=)|(?:longdesc=)|(?:lowsrc=)|(?:src=)|(?:usemap=))"
+/*
+ * Pattern for URL-carrying HTML attributes (href, archive, code, codebase,
+ * src, cite, background, ...): the attribute name, '=' with optional
+ * whitespace on either side, then an optionally double-quoted value made
+ * of any characters except '>', '"' and '<'.
+ *
+ * Capture groups: 1 = attribute name including the '=', 3 = the value.
+ * NOTE(review): "(:?background" is a capturing group (it looks like a typo
+ * for "(?:"), which is why the value lands in group 3 instead of 2.
+ * url_parse_html () extracts group 3, so the pattern and that index have
+ * to be changed together.
+ */
+static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)"
+"|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))"
 "\\\"?([^>\"<]+)\\\"?";
-static const char *text_url = "((mailto\\:|(news|(ht|f)tp(s?))\\://){1}[^>\"<]+)";
+/*
+ * Pattern for URLs in plain text: "mailto:", or one of news://, http://,
+ * https://, ftp://, ftps://, followed by one or more characters other than
+ * '>', '"' and '<'.  Only the outermost group captures, so the whole URL
+ * is capture group 1.
+ * NOTE(review): the trailing character class does not exclude whitespace,
+ * so a match can run past the end of the URL - confirm this is intended.
+ */
+static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)";
 
 static short url_initialized = 0;
 static pcre_extra *text_re_extra;
@@ -59,6 +66,14 @@ is_uri_dir_sep(struct uri *uri, unsigned char pos)
        return (pos == '/');
 }
 
+/*
+ * Return the length of the leading part of name that contains none of
+ * POST_CHAR, '#' or '?'.  When none of them occurs this is the length of
+ * the whole string.
+ */
+static int
+check_uri_file(unsigned char *name)
+{
+       /* strcspn () takes plain char strings, so the reject set is plain char too */
+       static const char reject[] = POST_CHAR_S "#?";
+
+       /* Explicit casts: unsigned char * -> const char *, size_t -> int */
+       return (int)strcspn((const char *)name, reject);
+}
+
 static int
 url_init (void)
 {
@@ -480,15 +495,53 @@ normalize_uri(struct uri *uri, unsigned char *uristring)
+/*
+ * Find every URL in a plain-text part and append it to task->urls.
+ *
+ * content is searched repeatedly with the text URL regexp, each search
+ * starting where the previous match ended.  Nothing is done unless
+ * url_init () returns 0.
+ */
 void 
 url_parse_text (struct worker_task *task, GByteArray *content)
 {
+       int ovec[30];
+       int pos = 0, rc;
+       char *url_str;
+       struct uri *new;
+
        if (url_init () == 0) {
-               /* TODO: */
+               /*
+                * The assignment has its own parentheses so that rc holds the
+                * value returned by pcre_exec (the number of captured substrings)
+                * and not the 0/1 result of the ">= 0" comparison.
+                */
+               while ((rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0,
+                                               ovec, sizeof (ovec) / sizeof (ovec[0]))) >= 0) {
+                       /*
+                        * Resume after this match on every iteration; pcre_exec returns
+                        * 0 when ovec is too small, and skipping the update in that
+                        * case would repeat the same search forever.
+                        */
+                       pos = ovec[1];
+                       /* Reset so that a failed extraction cannot reuse the previous string */
+                       url_str = NULL;
+                       /* Group 1 of text_url is the whole URL */
+                       if (rc > 0 &&
+                           pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str) >= 0 &&
+                           url_str != NULL) {
+                               new = g_malloc (sizeof (struct uri));
+                               if (new != NULL) {
+                                       /*
+                                        * NOTE(review): url_str is allocated by pcre_get_substring
+                                        * and is not released here.  If parse_uri/normalize_uri do
+                                        * not keep pointers into it, free it with
+                                        * pcre_free_substring () - confirm against their code.
+                                        */
+                                       parse_uri (new, url_str);
+                                       normalize_uri (new, url_str);
+                                       TAILQ_INSERT_TAIL (&task->urls, new, next);
+                               }
+                       }
+               }
        }
 }
 
+/*
+ * Find every URL-carrying attribute in an HTML part and append its value
+ * to task->urls.
+ *
+ * content is searched repeatedly with the HTML URL regexp, each search
+ * starting where the previous match ended.  Nothing is done unless
+ * url_init () returns 0.
+ */
 void 
 url_parse_html (struct worker_task *task, GByteArray *content)
 {
+       int ovec[30];
+       int pos = 0, rc;
+       char *url_str;
+       struct uri *new;
+
        if (url_init () == 0) {
-               /* TODO: */
+               /*
+                * The assignment has its own parentheses so that rc holds the
+                * value returned by pcre_exec (the number of captured substrings)
+                * and not the 0/1 result of the ">= 0" comparison.
+                */
+               while ((rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0,
+                                               ovec, sizeof (ovec) / sizeof (ovec[0]))) >= 0) {
+                       /*
+                        * Resume after this match on every iteration; pcre_exec returns
+                        * 0 when ovec is too small, and skipping the update in that
+                        * case would repeat the same search forever.
+                        */
+                       pos = ovec[1];
+                       /* Reset so that a failed extraction cannot reuse the previous string */
+                       url_str = NULL;
+                       /*
+                        * The attribute value is group 3 of html_url: group 1 is the
+                        * attribute name and "(:?background...)" in the pattern is a
+                        * capturing group that takes number 2.
+                        */
+                       if (rc > 0 &&
+                           pcre_get_substring ((const char *)content->data, ovec, rc, 3, (const char **)&url_str) >= 0 &&
+                           url_str != NULL) {
+                               new = g_malloc (sizeof (struct uri));
+                               if (new != NULL) {
+                                       /*
+                                        * NOTE(review): url_str is allocated by pcre_get_substring
+                                        * and is not released here.  If parse_uri/normalize_uri do
+                                        * not keep pointers into it, free it with
+                                        * pcre_free_substring () - confirm against their code.
+                                        */
+                                       parse_uri (new, url_str);
+                                       normalize_uri (new, url_str);
+                                       TAILQ_INSERT_TAIL (&task->urls, new, next);
+                               }
+                       }
+               }
        }
 }
diff --git a/util.c b/util.c
index 65cafb97eaf6e6f2b0d7bd87fc2e95a49bdab80c..37ceacd44d681007eace8f09c6bc268b51af7183 100644 (file)
--- a/util.c
+++ b/util.c
@@ -194,6 +194,13 @@ pass_signal_worker (struct workq *workers, int signo)
        }
 }
 
+/*
+ * Convert the first size bytes of str to lowercase in place.
+ *
+ * str does not have to be NUL-terminated; exactly size bytes are
+ * processed, so the buffer must hold at least that many.
+ */
+void convert_to_lowercase (char *str, unsigned int size)
+{
+       while (size --) {
+               /*
+                * Store first, advance afterwards: "*str = tolower (*str ++)"
+                * modified str without sequencing it against the store through
+                * *str, which is undefined behaviour.  The cast keeps the
+                * argument in the range tolower () accepts when char is signed.
+                */
+               *str = (char)tolower ((unsigned char)*str);
+               str ++;
+       }
+}
+
 #ifndef HAVE_SETPROCTITLE
 
 static char *title_buffer = 0;
diff --git a/util.h b/util.h
index 6340b21946e8f44b202f660701ca73307863045e..e88521281e2b9416bf3b333c27b5145ab5c3f37d 100644 (file)
--- a/util.h
+++ b/util.h
@@ -31,6 +31,8 @@ int event_make_socket_nonblocking(int);
 void init_signals (struct sigaction *, sig_t);
 /* Send specified signal to each worker */
 void pass_signal_worker (struct workq *, int );
+/* Convert the first size bytes of str to lowercase, in place */
+void convert_to_lowercase (char *str, unsigned int size);
 
 #ifndef HAVE_SETPROCTITLE
 int init_title(int argc, char *argv[], char *envp[]);