From 5237a665760b680f2f2765322238b53c5988a40c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 8 Sep 2008 19:29:34 +0400 Subject: [PATCH] * Use glib logger and regexp matching functions --- main.c | 1 - main.h | 8 +-- test/rspamd_url_test.c | 6 ++- url.c | 110 +++++++++++++++++++++++++---------------- 4 files changed, 76 insertions(+), 49 deletions(-) diff --git a/main.c b/main.c index dc1907f85..0478599ce 100644 --- a/main.c +++ b/main.c @@ -184,7 +184,6 @@ main (int argc, char **argv) rspamd->cfg->cfg_name = strdup (FIXED_CONFIG_FILE); read_cmd_line (argc, argv, rspamd->cfg); - openlog("rspamd", LOG_PID, LOG_MAIL); msg_warn ("(main) starting..."); #ifndef HAVE_SETPROCTITLE diff --git a/main.h b/main.h index ddfbf959f..cedec5eb2 100644 --- a/main.h +++ b/main.h @@ -32,10 +32,10 @@ #define SOFT_SHUTDOWN_TIME 60 /* Logging in postfix style */ -#define msg_err(args...) syslog(LOG_ERR, ##args) -#define msg_warn(args...) syslog(LOG_WARNING, ##args) -#define msg_info(args...) syslog(LOG_INFO, ##args) -#define msg_debug(args...) syslog(LOG_DEBUG, ##args) +#define msg_err g_error +#define msg_warn g_warning +#define msg_info g_message +#define msg_debug g_debug /* Process type: main or worker */ enum process_type { diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c index 29be737e1..17565de80 100644 --- a/test/rspamd_url_test.c +++ b/test/rspamd_url_test.c @@ -33,13 +33,14 @@ rspamd_url_test_func () text->data = (gchar *)test_text; text->len = sizeof (test_text); html = g_byte_array_new(); - text->data = (gchar *)test_html; - text->len = sizeof (test_html); + html->data = (gchar *)test_html; + html->len = sizeof (test_html); bzero (&task, sizeof (task)); TAILQ_INIT (&task.urls); g_test_timer_start (); g_test_message ("* Testing text URL regexp parser *"); + g_test_message ("Passing string: %s", test_text); url_parse_text (&task, text); TAILQ_FOREACH (url, &task.urls, next) { @@ -60,6 +61,7 @@ rspamd_url_test_func () i = 0; g_test_timer_start (); g_test_message ("* Testing html URL regexp parser *"); + g_test_message ("Passing string: %s", test_html); url_parse_html (&task, html); TAILQ_FOREACH (url, &task.urls, next) { diff --git a/url.c b/url.c index 2fb01f396..97091c3e2 100644 --- a/url.c +++ b/url.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include @@ -36,13 +35,10 @@ struct _proto { static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)" "|(:?background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))" "\\\"?([^>\"<]+)\\\"?"; -static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)"; +static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^ ]+)"; static short url_initialized = 0; -static pcre_extra *text_re_extra; -static pcre *text_re; -static pcre_extra *html_re_extra; -static pcre *html_re; +GRegex *text_re, *html_re; static const struct _proto protocol_backends[] = { { "file", 0, NULL, 1, 0, 0, 0 }, @@ -160,20 +156,22 @@ check_uri_file(unsigned char *name) static int url_init (void) { + GError *err = NULL; if (url_initialized == 0) { - text_re = pcre_compile (text_url, PCRE_CASELESS, NULL, 0, NULL); - if (text_re == NULL) { - msg_info ("url_init: cannot init url parsing regexp"); + text_re = g_regex_new (text_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err); + if (err != NULL) { + msg_info ("url_init: cannot init text url parsing regexp: %s", err->message); + g_error_free (err); return -1; } - text_re_extra = pcre_study (text_re, 0, NULL); - html_re = pcre_compile (html_url, PCRE_CASELESS, NULL, 0, NULL); - if (html_re == NULL) { - msg_info ("url_init: cannot init url parsing regexp"); + html_re = g_regex_new (html_url, G_REGEX_CASELESS | G_REGEX_MULTILINE | G_REGEX_RAW, 0, &err); + if (err != NULL) { + msg_info ("url_init: cannot init html url parsing regexp: %s", err->message); + g_error_free (err); return -1; } - html_re_extra = pcre_study (html_re, 0, NULL); url_initialized = 1; + msg_debug ("url_init: url regexps initialized successfully, text regexp: /%s/, html_regexp: /%s/", text_url, html_url); } return 0; @@ -874,53 +872,81 @@ normalize_uri(struct uri *uri, unsigned char *uristring) void url_parse_text (struct worker_task *task, GByteArray *content) { - int ovec[30]; - int pos = 0, rc; + GMatchInfo *info; + GError *err = NULL; + int pos = 0, start; + gboolean rc; char *url_str = NULL; struct uri *new; if (url_init () == 0) { - while ((rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0, - ovec, sizeof (ovec) / sizeof (ovec[0])) >= 0)) { - if (rc > 0) { - pos = ovec[1]; - pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str); - if (url_str != NULL) { - new = g_malloc (sizeof (struct uri)); - if (new != NULL) { - parse_uri (new, url_str); - normalize_uri (new, url_str); - TAILQ_INSERT_TAIL (&task->urls, new, next); + do { + rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err); + if (rc) { + if (g_match_info_matches (info)) { + g_match_info_fetch_pos (info, 0, &start, &pos); + url_str = g_match_info_fetch (info, 1); + msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str); + if (url_str != NULL) { + new = g_malloc (sizeof (struct uri)); + if (new != NULL) { + parse_uri (new, url_str); + normalize_uri (new, url_str); + TAILQ_INSERT_TAIL (&task->urls, new, next); + } } + g_free (url_str); } + g_match_info_free (info); } - } + else if (err != NULL) { + msg_debug ("url_parse_text: error matching regexp: %s", err->message); + g_free (err); + } + else { + msg_debug ("url_parse_text: cannot find url pattern in given string"); + } + } while (rc > 0); } } void url_parse_html (struct worker_task *task, GByteArray *content) { - int ovec[30]; - int pos = 0, rc; + GMatchInfo *info; + GError *err = NULL; + int pos = 0, start; + gboolean rc; char *url_str = NULL; struct uri *new; if (url_init () == 0) { - while ((rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0, - ovec, sizeof (ovec) / sizeof (ovec[0])) >= 0)) { - if (rc > 0) { - pos = ovec[1]; - pcre_get_substring ((const char *)content->data, ovec, rc, 3, (const char **)&url_str); - if (url_str != NULL) { - new = g_malloc (sizeof (struct uri)); - if (new != NULL) { - parse_uri (new, url_str); - normalize_uri (new, url_str); - TAILQ_INSERT_TAIL (&task->urls, new, next); + do { + rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err); + if (rc) { + if (g_match_info_matches (info)) { + g_match_info_fetch_pos (info, 0, &start, &pos); + url_str = g_match_info_fetch (info, 3); + msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str); + if (url_str != NULL) { + new = g_malloc (sizeof (struct uri)); + if (new != NULL) { + parse_uri (new, url_str); + normalize_uri (new, url_str); + TAILQ_INSERT_TAIL (&task->urls, new, next); + } } + g_free (url_str); } + g_match_info_free (info); } - } + else if (err) { + msg_debug ("url_parse_html: error matching regexp: %s", err->message); + g_free (err); + } + else { + msg_debug ("url_parse_html: cannot find url pattern in given string"); + } + } while (rc > 0); } } -- 2.39.5