From e92bfae6a160b187f47092074c7f49989f71950d Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 16 Jun 2008 19:24:18 +0400
Subject: [PATCH] * Add extracting urls from messages

---
 url.c  | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 util.c | 13 +++++++++++++
 util.h |  2 ++
 3 files changed, 82 insertions(+), 5 deletions(-)

diff --git a/url.c b/url.c
index dc8e8057a..0efa4fc89 100644
--- a/url.c
+++ b/url.c
@@ -16,6 +16,14 @@
 #define POST_CHAR 1
 #define POST_CHAR_S "\001"
 
+/* Valid TCP port range (inclusive) */
+#define LOWEST_PORT 0
+#define HIGHEST_PORT 65535
+
+/* True when port lies within [LOWEST_PORT, HIGHEST_PORT]; note that port is evaluated twice */
+#define uri_port_is_valid(port) \
+	(LOWEST_PORT <= (port) && (port) <= HIGHEST_PORT)
+
 struct _proto {
 	unsigned char *name;
 	int port;
@@ -26,10 +34,12 @@ struct _proto {
 	unsigned int need_ssl:1;
 };
 
-static const char *html_url = "((?:href=)|(?:archive=)|(?:code=)|(?:codebase=)|(?:src=)|(?:cite=)"
-"|(:?background=)|(?:pluginspage=)|(?:pluginurl=)|(?:action=)|(?:dynsrc=)|(?:longdesc=)|(?:lowsrc=)|(?:src=)|(?:usemap=))"
+/* Capture group 1: attribute name up to and including '='; capture group 2: the attribute value (url) */
+static const char *html_url = "((?:href\\s*=\\s*)|(?:archive\\s*=\\s*)|(?:code\\s*=\\s*)|(?:codebase\\s*=\\s*)|(?:src\\s*=\\s*)|(?:cite\\s*=\\s*)"
+"|(?:background\\s*=\\s*)|(?:pluginspage\\s*=\\s*)|(?:pluginurl\\s*=\\s*)|(?:action\\s*=\\s*)|(?:dynsrc\\s*=\\s*)|(?:longdesc\\s*=\\s*)|(?:lowsrc\\s*=\\s*)|(?:usemap\\s*=\\s*))"
 "\\\"?([^>\"<]+)\\\"?";
-static const char *text_url = "((mailto\\:|(news|(ht|f)tp(s?))\\://){1}[^>\"<]+)";
+/* Capture group 1: the whole url, scheme included */
+static const char *text_url = "((?:mailto\\:|(?:news|(?:ht|f)tp(?:s?))\\://){1}[^>\"<]+)";
 
 static short url_initialized = 0;
 static pcre_extra *text_re_extra;
@@ -59,6 +69,18 @@ is_uri_dir_sep(struct uri *uri, unsigned char pos)
 	return (pos == '/');
 }
 
+/*
+ * Return the length of the leading part of name that contains none of
+ * the characters POST_CHAR, '#' or '?'.
+ */
+static int
+check_uri_file(const unsigned char *name)
+{
+	static const char chars[] = POST_CHAR_S "#?";
+
+	return (int)strcspn((const char *)name, chars);
+}
+
 static int
 url_init (void)
 {
@@ -480,15 +502,55 @@ normalize_uri(struct uri *uri, unsigned char *uristring)
 void
 url_parse_text (struct worker_task *task, GByteArray *content)
 {
+	int ovec[30];
+	int pos = 0, rc;
+	char *url_str = NULL;
+	struct uri *new;
+
 	if (url_init () == 0) {
-		/* TODO: */
+		/* A positive rc is a match; rc <= 0 (no match, error or ovec too small) ends the scan */
+		while ((rc = pcre_exec (text_re, text_re_extra, (const char *)content->data, content->len, pos, 0,
+						ovec, sizeof (ovec) / sizeof (ovec[0]))) > 0) {
+			/* Continue searching right after the current match */
+			pos = ovec[1];
+			/* Group 1 holds the whole url; a negative result means it could not be extracted */
+			if (pcre_get_substring ((const char *)content->data, ovec, rc, 1, (const char **)&url_str) < 0) {
+				continue;
+			}
+			/* g_malloc aborts the program on allocation failure, so no NULL check is needed */
+			new = g_malloc (sizeof (struct uri));
+			/* NOTE(review): url_str is never freed here; presumably the uri keeps pointers into it - confirm */
+			parse_uri (new, url_str);
+			normalize_uri (new, url_str);
+			TAILQ_INSERT_TAIL (&task->urls, new, next);
+		}
 	}
 }
 
 void
 url_parse_html (struct worker_task *task, GByteArray *content)
 {
+	int ovec[30];
+	int pos = 0, rc;
+	char *url_str = NULL;
+	struct uri *new;
+
 	if (url_init () == 0) {
-		/* TODO: */
+		/* A positive rc is a match; rc <= 0 (no match, error or ovec too small) ends the scan */
+		while ((rc = pcre_exec (html_re, html_re_extra, (const char *)content->data, content->len, pos, 0,
+						ovec, sizeof (ovec) / sizeof (ovec[0]))) > 0) {
+			/* Continue searching right after the current match */
+			pos = ovec[1];
+			/* Group 2 holds the attribute value; a negative result means it could not be extracted */
+			if (pcre_get_substring ((const char *)content->data, ovec, rc, 2, (const char **)&url_str) < 0) {
+				continue;
+			}
+			/* g_malloc aborts the program on allocation failure, so no NULL check is needed */
+			new = g_malloc (sizeof (struct uri));
+			/* NOTE(review): url_str is never freed here; presumably the uri keeps pointers into it - confirm */
+			parse_uri (new, url_str);
+			normalize_uri (new, url_str);
+			TAILQ_INSERT_TAIL (&task->urls, new, next);
+		}
 	}
 }
diff --git a/util.c b/util.c
index 65cafb97e..37ceacd44 100644
--- a/util.c
+++ b/util.c
@@ -194,6 +194,19 @@ pass_signal_worker (struct workq *workers, int signo)
 	}
 }
 
+/*
+ * Convert the first size bytes of str to lowercase in place.
+ */
+void
+convert_to_lowercase (char *str, unsigned int size)
+{
+	while (size--) {
+		/* tolower needs a value representable as unsigned char; advance str in a separate statement */
+		*str = tolower ((unsigned char)*str);
+		str++;
+	}
+}
+
 #ifndef HAVE_SETPROCTITLE
 
 static char *title_buffer = 0;
diff --git a/util.h b/util.h
index 6340b2194..e88521281 100644
--- a/util.h
+++ b/util.h
@@ -31,6 +31,8 @@ int event_make_socket_nonblocking(int);
 void init_signals (struct sigaction *, sig_t);
 /* Send specified signal to each worker */
 void pass_signal_worker (struct workq *, int );
+/* Convert the first size bytes of str to lowercase in place */
+void convert_to_lowercase (char *str, unsigned int size);
 
 #ifndef HAVE_SETPROCTITLE
 int init_title(int argc, char *argv[], char *envp[]);
-- 
2.39.5