summaryrefslogtreecommitdiffstats
path: root/url.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-06-16 19:24:18 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2008-06-16 19:24:18 +0400
commit e92bfae6a160b187f47092074c7f49989f71950d (patch)
tree 340f78a21f215676fdfa98bd22a99f186655a379 /url.c
parent 7cd13c464ff3e025a0ce70302dede40a1b2d3f29 (diff)
download rspamd-e92bfae6a160b187f47092074c7f49989f71950d.tar.gz
rspamd-e92bfae6a160b187f47092074c7f49989f71950d.zip
* Add extracting urls from messages
Diffstat (limited to 'url.c')
-rw-r--r-- url.c 63
1 files changed, 58 insertions, 5 deletions
diff --git a/url.c b/url.c
index dc8e8057a..0efa4fc89 100644
--- a/url.c
+++ b/url.c
@@ -16,6 +16,13 @@
#define POST_CHAR 1
#define POST_CHAR_S "\001"
+/* TCP port range: lowest and highest port numbers treated as valid */
+#define LOWEST_PORT 0
+#define HIGHEST_PORT 65535
+
+/*
+ * Evaluates to non-zero when port lies within [LOWEST_PORT, HIGHEST_PORT].
+ * The argument is expanded twice, so do not pass an expression with side
+ * effects (e.g. uri_port_is_valid (p++)).
+ */
+#define uri_port_is_valid(port) \
+ (LOWEST_PORT <= (port) && (port) <= HIGHEST_PORT)
+
struct _proto {
unsigned char *name;
int port;
@@ -26,10 +33,10 @@ struct _proto {
unsigned int need_ssl:1;
};
-static const char *html_url = "((?:href=)|(?:archive=)|(?:code=)|(?:codebase=)|(?:src=)|(?:cite=)"
-"|(:?background=)|(?:pluginspage=)|(?:pluginurl=)|(?:action=)|(?:dynsrc=)|(?:longdesc=)|(?:lowsrc=)|(?:src=)|(?:usemap=))"
+/*
+ * Pattern for URL-carrying HTML attributes: an attribute name, optional
+ * whitespace around '=', then an optionally quoted value that runs up to
+ * the next '>', '"' or '<'.
+ * Capture groups: 1 = attribute name including '=', 2 = the "background"
+ * alternative only, 3 = the attribute value.
+ * NOTE(review): the background alternative is written "(:?" rather than
+ * "(?:", which makes it a capturing group with an optional leading ':';
+ * this looks like a typo. Fixing it would renumber the value to group 2,
+ * so url_parse_html (which requests group 3) must be changed together.
+ */
+static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)"
+"|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))"
"\\\"?([^>\"<]+)\\\"?";
-static const char *text_url = "((mailto\\:|(news|(ht|f)tp(s?))\\://){1}[^>\"<]+)";
+/*
+ * Pattern for URLs in plain text: "mailto:", "news://", "http(s)://" or
+ * "ftp(s)://" followed by everything up to the next '>', '"' or '<'.
+ * Group 1 is the whole URL; all inner groups are non-capturing.
+ */
+static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)";
static short url_initialized = 0;
static pcre_extra *text_re_extra;
@@ -60,6 +67,14 @@ is_uri_dir_sep(struct uri *uri, unsigned char pos)
}
+/*
+ * Return the length of the initial part of name that contains none of the
+ * stop characters POST_CHAR, '#' or '?'. When none of them occurs the result
+ * equals the length of the whole string.
+ * name must be a NUL-terminated string.
+ */
static int
+check_uri_file(unsigned char *name)
+{
+	/* strcspn works on plain char strings, so keep the reject set as char */
+	static const char chars[] = POST_CHAR_S "#?";
+
+	/* Explicit casts: strcspn takes const char * and returns size_t */
+	return (int) strcspn ((const char *) name, chars);
+}
+
+static int
url_init (void)
{
if (url_initialized == 0) {
@@ -480,15 +495,53 @@ normalize_uri(struct uri *uri, unsigned char *uristring)
+/*
+ * Scan a plain-text part for URLs and append every match to task->urls.
+ *
+ * task    - worker task whose urls queue receives the new entries
+ * content - byte array holding the text to scan
+ *
+ * Each match of text_re is copied out with pcre_get_substring (capture
+ * group 1 = the whole URL), parsed, normalized and queued.
+ * NOTE(review): the substring returned by pcre_get_substring is not freed
+ * here; presumably the parsed struct uri keeps pointers into it - confirm
+ * against parse_uri before adding a pcre_free_substring call.
+ */
void
url_parse_text (struct worker_task *task, GByteArray *content)
{
+	int ovec[30];
+	int pos = 0, rc;
+	char *url_str;
+	struct uri *new;
+
	if (url_init () == 0) {
-		/* TODO: */
+		/*
+		 * The assignment must be parenthesized on its own: without that,
+		 * rc receives the result of the ">= 0" comparison (0 or 1) instead
+		 * of the number of captured substrings.
+		 */
+		while ((rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0,
+						ovec, sizeof (ovec) / sizeof (ovec[0]))) >= 0) {
+			if (rc == 0) {
+				/* ovec was too small: only the first ovecsize / 3 pairs are stored */
+				rc = (int)(sizeof (ovec) / sizeof (ovec[0]) / 3);
+			}
+			/* Reset so that a failed extraction cannot reuse the previous match */
+			url_str = NULL;
+			if (pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str) >= 0 &&
+					url_str != NULL) {
+				/* g_malloc aborts on allocation failure, so no NULL check is needed */
+				new = g_malloc (sizeof (struct uri));
+				parse_uri (new, url_str);
+				normalize_uri (new, url_str);
+				TAILQ_INSERT_TAIL (&task->urls, new, next);
+			}
+			/* Always move forward; step over an empty match to avoid looping forever */
+			pos = (ovec[1] > ovec[0]) ? ovec[1] : ovec[1] + 1;
+			if (pos > (int)content->len) {
+				break;
+			}
+		}
	}
}
+/*
+ * Scan an HTML part for URL-carrying attributes and append every extracted
+ * URL to task->urls.
+ *
+ * task    - worker task whose urls queue receives the new entries
+ * content - byte array holding the HTML to scan
+ *
+ * Each match of html_re yields the attribute value in capture group 3 of
+ * html_url (groups 1 and 2 cover the attribute name); the value is parsed,
+ * normalized and queued.
+ * NOTE(review): the substring returned by pcre_get_substring is not freed
+ * here; presumably the parsed struct uri keeps pointers into it - confirm
+ * against parse_uri before adding a pcre_free_substring call.
+ */
void
url_parse_html (struct worker_task *task, GByteArray *content)
{
+	int ovec[30];
+	int pos = 0, rc;
+	char *url_str;
+	struct uri *new;
+
	if (url_init () == 0) {
-		/* TODO: */
+		/*
+		 * The assignment must be parenthesized on its own: without that,
+		 * rc receives the result of the ">= 0" comparison (0 or 1) and the
+		 * request for group 3 below can never succeed.
+		 */
+		while ((rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0,
+						ovec, sizeof (ovec) / sizeof (ovec[0]))) >= 0) {
+			if (rc == 0) {
+				/* ovec was too small: only the first ovecsize / 3 pairs are stored */
+				rc = (int)(sizeof (ovec) / sizeof (ovec[0]) / 3);
+			}
+			/* Reset so that a failed extraction cannot reuse the previous match */
+			url_str = NULL;
+			if (pcre_get_substring ((const char *)content->data, ovec, rc, 3, (const char **)&url_str) >= 0 &&
+					url_str != NULL) {
+				/* g_malloc aborts on allocation failure, so no NULL check is needed */
+				new = g_malloc (sizeof (struct uri));
+				parse_uri (new, url_str);
+				normalize_uri (new, url_str);
+				TAILQ_INSERT_TAIL (&task->urls, new, next);
+			}
+			/* Always move forward; step over an empty match to avoid looping forever */
+			pos = (ovec[1] > ovec[0]) ? ovec[1] : ovec[1] + 1;
+			if (pos > (int)content->len) {
+				break;
+			}
+		}
	}
}