aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- main.c 1
-rw-r--r-- main.h 8
-rw-r--r-- test/rspamd_url_test.c 6
-rw-r--r-- url.c 110
4 files changed, 76 insertions, 49 deletions
diff --git a/main.c b/main.c
index dc1907f85..0478599ce 100644
--- a/main.c
+++ b/main.c
@@ -184,7 +184,6 @@ main (int argc, char **argv)
rspamd->cfg->cfg_name = strdup (FIXED_CONFIG_FILE);
read_cmd_line (argc, argv, rspamd->cfg);
- openlog("rspamd", LOG_PID, LOG_MAIL);
msg_warn ("(main) starting...");
#ifndef HAVE_SETPROCTITLE
diff --git a/main.h b/main.h
index ddfbf959f..cedec5eb2 100644
--- a/main.h
+++ b/main.h
@@ -32,10 +32,10 @@
#define SOFT_SHUTDOWN_TIME 60
/* Logging in postfix style */
-#define msg_err(args...) syslog(LOG_ERR, ##args)
-#define msg_warn(args...) syslog(LOG_WARNING, ##args)
-#define msg_info(args...) syslog(LOG_INFO, ##args)
-#define msg_debug(args...) syslog(LOG_DEBUG, ##args)
+/*
+ * Logging macros routed through GLib's message logging.
+ * msg_err maps to g_critical rather than g_error: GLib treats the ERROR log
+ * level as always fatal (g_error () aborts the process), while the
+ * syslog (LOG_ERR, ...) call replaced here only logged the message.
+ */
+#define msg_err g_critical
+#define msg_warn g_warning
+#define msg_info g_message
+#define msg_debug g_debug
/* Process type: main or worker */
enum process_type {
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c
index 29be737e1..17565de80 100644
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -33,13 +33,14 @@ rspamd_url_test_func ()
text->data = (gchar *)test_text;
text->len = sizeof (test_text);
html = g_byte_array_new();
- text->data = (gchar *)test_html;
- text->len = sizeof (test_html);
+ html->data = (gchar *)test_html;
+ html->len = sizeof (test_html);
bzero (&task, sizeof (task));
TAILQ_INIT (&task.urls);
g_test_timer_start ();
g_test_message ("* Testing text URL regexp parser *");
+ g_test_message ("Passing string: %s", test_text);
url_parse_text (&task, text);
TAILQ_FOREACH (url, &task.urls, next) {
@@ -60,6 +61,7 @@ rspamd_url_test_func ()
i = 0;
g_test_timer_start ();
g_test_message ("* Testing html URL regexp parser *");
+ g_test_message ("Passing string: %s", test_html);
url_parse_html (&task, html);
TAILQ_FOREACH (url, &task.urls, next) {
diff --git a/url.c b/url.c
index 2fb01f396..97091c3e2 100644
--- a/url.c
+++ b/url.c
@@ -2,7 +2,6 @@
#include <stdlib.h>
#include <ctype.h>
#include <errno.h>
-#include <pcre.h>
#include <syslog.h>
#include <sys/socket.h>
#include <arpa/inet.h>
@@ -36,13 +35,10 @@ struct _proto {
+/*
+ * NOTE(review): "(:?background" below looks like a typo for "(?:background".
+ * As written it is a capturing group, which makes the URL itself capture
+ * group 3; url_parse_html () fetches group 3, so fix both places together.
+ */
static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)"
"|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))"
"\\\"?([^>\"<]+)\\\"?";
-static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)";
+/* A text URL ends at any whitespace (not only ' ') or at one of > " < */
+static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^\\s>\"<]+)";
static short url_initialized = 0;
-static pcre_extra *text_re_extra;
-static pcre *text_re;
-static pcre_extra *html_re_extra;
-static pcre *html_re;
+/* Compiled once by url_init (); kept file-local like the pcre handles they replace */
+static GRegex *text_re, *html_re;
static const struct _proto protocol_backends[] = {
{ "file", 0, NULL, 1, 0, 0, 0 },
@@ -160,20 +156,22 @@ check_uri_file(unsigned char *name)
static int
url_init (void)
{
+ GError *err = NULL;
if (url_initialized == 0) {
- text_re = pcre_compile (text_url, PCRE_CASELESS, NULL, 0, NULL);
- if (text_re == NULL) {
- msg_info ("url_init: cannot init url parsing regexp");
+ text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err);
+ if (err != NULL) {
+ msg_info ("url_init: cannot init text url parsing regexp: %s", err->message);
+ g_error_free (err);
return -1;
}
- text_re_extra = pcre_study (text_re, 0, NULL);
- html_re = pcre_compile (html_url, PCRE_CASELESS, NULL, 0, NULL);
- if (html_re == NULL) {
- msg_info ("url_init: cannot init url parsing regexp");
+ html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err);
+ if (err != NULL) {
+ msg_info ("url_init: cannot init html url parsing regexp: %s", err->message);
+ g_error_free (err);
return -1;
}
- html_re_extra = pcre_study (html_re, 0, NULL);
url_initialized = 1;
+ msg_debug ("url_init: url regexps initialized successfully, text regexp: /%s/, html_regexp: /%s/", text_url, html_url);
}
return 0;
@@ -874,53 +872,81 @@ normalize_uri(struct uri *uri, unsigned char *uristring)
+/*
+ * Search `content' (content->len bytes at content->data) for plain-text URLs
+ * using the text_re regexp. Each search starts at the end offset of the
+ * previous match. For every match, capture group 1 is handed to parse_uri ()
+ * and normalize_uri () and the resulting struct uri is appended to task->urls.
+ * Nothing is done when url_init () fails.
+ */
void
url_parse_text (struct worker_task *task, GByteArray *content)
{
- int ovec[30];
- int pos = 0, rc;
+ GMatchInfo *info = NULL;
+ GError *err = NULL;
+ int pos = 0, start;
+ gboolean rc;
char *url_str = NULL;
struct uri *new;
if (url_init () == 0) {
- while ((rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0,
- ovec, sizeof (ovec) / sizeof (ovec[0])) >= 0)) {
- if (rc > 0) {
- pos = ovec[1];
- pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str);
- if (url_str != NULL) {
- new = g_malloc (sizeof (struct uri));
- if (new != NULL) {
- parse_uri (new, url_str);
- normalize_uri (new, url_str);
- TAILQ_INSERT_TAIL (&task->urls, new, next);
+ do {
+ rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err);
+ if (rc) {
+ if (g_match_info_matches (info)) {
+ /* pos becomes the end of the whole match: the next search resumes there */
+ g_match_info_fetch_pos (info, 0, &start, &pos);
+ url_str = g_match_info_fetch (info, 1);
+ if (url_str != NULL) {
+ msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
+ new = g_malloc (sizeof (struct uri));
+ if (new != NULL) {
+ parse_uri (new, url_str);
+ normalize_uri (new, url_str);
+ TAILQ_INSERT_TAIL (&task->urls, new, next);
+ }
}
+ g_free (url_str); /* NOTE(review): assumes parse_uri ()/normalize_uri () keep no pointers into url_str -- confirm */
}
}
- }
+ else if (err != NULL) {
+ msg_debug ("url_parse_text: error matching regexp: %s", err->message);
+ /* a GError owns its message string: release it with the GError API, not g_free () */
+ g_clear_error (&err);
+ }
+ else {
+ msg_debug ("url_parse_text: cannot find url pattern in given string");
+ }
+ /* the match info is allocated even when nothing matched, so free it on every pass */
+ g_match_info_free (info);
+ info = NULL;
+ } while (rc);
}
}
+/*
+ * Search `content' (content->len bytes at content->data) for URLs held in
+ * HTML attributes using the html_re regexp. Each search starts at the end
+ * offset of the previous match. For every match, capture group 3 is handed
+ * to parse_uri () and normalize_uri () and the resulting struct uri is
+ * appended to task->urls. Nothing is done when url_init () fails.
+ */
void
url_parse_html (struct worker_task *task, GByteArray *content)
{
- int ovec[30];
- int pos = 0, rc;
+ GMatchInfo *info = NULL;
+ GError *err = NULL;
+ int pos = 0, start;
+ gboolean rc;
char *url_str = NULL;
struct uri *new;
if (url_init () == 0) {
- while ((rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0,
- ovec, sizeof (ovec) / sizeof (ovec[0])) >= 0)) {
- if (rc > 0) {
- pos = ovec[1];
- pcre_get_substring ((const char *)content->data, ovec, rc, 3, (const char **)&url_str);
- if (url_str != NULL) {
- new = g_malloc (sizeof (struct uri));
- if (new != NULL) {
- parse_uri (new, url_str);
- normalize_uri (new, url_str);
- TAILQ_INSERT_TAIL (&task->urls, new, next);
+ do {
+ rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err);
+ if (rc) {
+ if (g_match_info_matches (info)) {
+ /* pos becomes the end of the whole match: the next search resumes there */
+ g_match_info_fetch_pos (info, 0, &start, &pos);
+ url_str = g_match_info_fetch (info, 3);
+ if (url_str != NULL) {
+ msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
+ new = g_malloc (sizeof (struct uri));
+ if (new != NULL) {
+ parse_uri (new, url_str);
+ normalize_uri (new, url_str);
+ TAILQ_INSERT_TAIL (&task->urls, new, next);
+ }
}
+ g_free (url_str); /* NOTE(review): assumes parse_uri ()/normalize_uri () keep no pointers into url_str -- confirm */
}
}
- }
+ else if (err != NULL) {
+ msg_debug ("url_parse_html: error matching regexp: %s", err->message);
+ /* a GError owns its message string: release it with the GError API, not g_free () */
+ g_clear_error (&err);
+ }
+ else {
+ msg_debug ("url_parse_html: cannot find url pattern in given string");
+ }
+ /* the match info is allocated even when nothing matched, so free it on every pass */
+ g_match_info_free (info);
+ info = NULL;
+ } while (rc);
}
}