about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-03 17:39:03 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-02-03 17:39:03 +0000
commit b4c0e9b59d3985726d9a346085172394a0495ce6 (patch)
tree f03a48c2206d37299f26c87f9ecd85d394a39271
parent aa8b1c618148c2ead44e1cc643e9eb23423a4843 (diff)
download rspamd-b4c0e9b59d3985726d9a346085172394a0495ce6.tar.gz
rspamd-b4c0e9b59d3985726d9a346085172394a0495ce6.zip
Remove old crap functions from url parser code.
-rw-r--r-- src/libmime/message.c 14
-rw-r--r-- src/libserver/html.c 18
-rw-r--r-- src/libserver/protocol.c 4
-rw-r--r-- src/libserver/url.c 723
-rw-r--r-- src/libserver/url.h 38
-rw-r--r-- src/libutil/util.c 4
-rw-r--r-- src/lua/lua_task.c 24
-rw-r--r-- src/plugins/regexp.c 2
-rw-r--r-- src/plugins/surbl.c 14
-rw-r--r-- src/plugins/surbl.h 4
10 files changed, 167 insertions, 678 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 869d0a06e..702b148cb 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1233,7 +1233,7 @@ process_text_part (struct rspamd_task *task,
decode_entitles (text_part->content->data,
&text_part->content->len);
}
- url_parse_text (task->task_pool, task, text_part, TRUE);
+ rspamd_url_text_extract (task->task_pool, task, text_part, TRUE);
rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
rspamd_mempool_add_destructor (task->task_pool,
@@ -1260,7 +1260,7 @@ process_text_part (struct rspamd_task *task,
type,
text_part);
text_part->orig = part_content;
- url_parse_text (task->task_pool, task, text_part, FALSE);
+ rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
@@ -1460,7 +1460,7 @@ process_message (struct rspamd_task *task)
GMimeDataWrapper *wrapper;
struct received_header *recv;
gchar *mid, *url_str, *p, *end, *url_end;
- struct uri *subject_url;
+ struct rspamd_url *subject_url;
gsize len;
gint rc;
@@ -1634,14 +1634,14 @@ process_message (struct rspamd_task *task)
while (p < end) {
/* Search to the end of url */
- if (url_try_text (task->task_pool, p, end - p, NULL, &url_end,
+ if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
&url_str, FALSE)) {
if (url_str != NULL) {
subject_url = rspamd_mempool_alloc0 (task->task_pool,
- sizeof (struct uri));
+ sizeof (struct rspamd_url));
if (subject_url != NULL) {
/* Try to parse url */
- rc = parse_uri (subject_url, url_str, task->task_pool);
+ rc = rspamd_url_parse (subject_url, url_str, task->task_pool);
if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
rc == URI_ERRNO_NO_HOST_SLASH) &&
subject_url->hostlen > 0) {
@@ -1656,7 +1656,7 @@ process_message (struct rspamd_task *task)
else if (rc != URI_ERRNO_OK) {
msg_info ("extract of url '%s' failed: %s",
url_str,
- url_strerror (rc));
+ rspamd_url_strerror (rc));
}
}
}
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 5055a9aae..7df9270c3 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -674,12 +674,12 @@ decode_entitles (gchar *s, guint * len)
static void
check_phishing (struct rspamd_task *task,
- struct uri *href_url,
+ struct rspamd_url *href_url,
const gchar *url_text,
gsize remain,
tag_id_t id)
{
- struct uri *new;
+ struct rspamd_url *new;
gchar *url_str;
const gchar *p, *c;
gchar tagbuf[128];
@@ -732,12 +732,12 @@ check_phishing (struct rspamd_task *task,
p++;
}
- if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str,
+ if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
TRUE) && url_str != NULL) {
- new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri));
+ new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
if (new != NULL) {
g_strstrip (url_str);
- rc = parse_uri (new, url_str, task->task_pool);
+ rc = rspamd_url_parse (new, url_str, task->task_pool);
if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc ==
URI_ERRNO_NO_HOST_SLASH) {
@@ -787,7 +787,7 @@ check_phishing (struct rspamd_task *task,
else {
msg_info ("extract of url '%s' failed: %s",
url_str,
- url_strerror (rc));
+ rspamd_url_strerror (rc));
}
}
}
@@ -804,7 +804,7 @@ parse_tag_url (struct rspamd_task *task,
{
gchar *c = NULL, *p, *url_text;
gint len, rc;
- struct uri *url;
+ struct rspamd_url *url;
gboolean got_single_quote = FALSE, got_double_quote = FALSE;
/* For A tags search for href= and for IMG tags search for src= */
@@ -885,8 +885,8 @@ parse_tag_url (struct rspamd_task *task,
return;
}
- url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri));
- rc = parse_uri (url, url_text, task->task_pool);
+ url = rspamd_mempool_alloc (task->task_pool, sizeof (struct rspamd_url));
+ rc = rspamd_url_parse (url, url_text, task->task_pool);
if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen !=
0) {
diff --git a/src/libserver/protocol.c b/src/libserver/protocol.c
index 25dcee1c0..e702bfc14 100644
--- a/src/libserver/protocol.c
+++ b/src/libserver/protocol.c
@@ -492,7 +492,7 @@ static gboolean
urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
{
struct tree_cb_data *cb = ud;
- struct uri *url = value;
+ struct rspamd_url *url = value;
ucl_object_t *obj, *elt;
if (!cb->task->extended_urls) {
@@ -550,7 +550,7 @@ static gboolean
emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
{
struct tree_cb_data *cb = ud;
- struct uri *url = value;
+ struct rspamd_url *url = value;
ucl_object_t *obj;
obj = ucl_object_fromlstring (url->user, url->userlen + url->hostlen + 1);
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 3e4ccc827..22cb15759 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -29,6 +29,7 @@
#include "main.h"
#include "message.h"
#include "trie.h"
+#include "http.h"
#define POST_CHAR 1
#define POST_CHAR_S "\001"
@@ -695,28 +696,6 @@ struct url_match_scanner {
struct url_match_scanner *url_scanner = NULL;
-static const struct _proto protocol_backends[] = {
- {"file", 0, NULL, 1, 0, 0, 0},
- {"ftp", 21, NULL, 1, 0, 0, 0},
- {"http", 80, NULL, 1, 0, 0, 0},
- {"https", 443, NULL, 1, 0, 0, 1},
- {"mailto", 25, NULL, 1, 0, 0, 0},
- /* Keep these last! */
- {NULL, 0, NULL, 0, 0, 1, 0}
-};
-
-/* Convert an ASCII hex digit to the corresponding number between 0
- and 15. H should be a hexadecimal digit that satisfies isxdigit;
- otherwise, the result is undefined. */
-#define XDIGIT_TO_NUM(h) ((h) < 'A' ? (h) - '0' : g_ascii_toupper (h) - 'A' + \
- 10)
-#define X2DIGITS_TO_NUM(h1, h2) ((XDIGIT_TO_NUM (h1) << 4) + XDIGIT_TO_NUM (h2))
-/* The reverse of the above: convert a number in the [0, 16) range to
- the ASCII representation of the corresponding hexadecimal digit.
- `+ 0' is there so you can't accidentally use it as an lvalue. */
-#define XNUM_TO_DIGIT(x) ("0123456789ABCDEF"[x] + 0)
-#define XNUM_TO_digit(x) ("0123456789abcdef"[x] + 0)
-
static guchar url_scanner_table[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -759,7 +738,7 @@ enum {
const gchar *
-url_strerror (enum uri_errno err)
+rspamd_url_strerror (enum uri_errno err)
{
switch (err) {
case URI_ERRNO_OK:
@@ -768,37 +747,17 @@ url_strerror (enum uri_errno err)
return "The URI string was empty";
case URI_ERRNO_INVALID_PROTOCOL:
return "No protocol was found";
- case URI_ERRNO_NO_SLASHES:
- return "Slashes after protocol missing";
- case URI_ERRNO_TOO_MANY_SLASHES:
- return "Too many slashes after protocol";
- case URI_ERRNO_TRAILING_DOTS:
- return "'.' after host";
- case URI_ERRNO_NO_HOST:
- return "Host part is missing";
- case URI_ERRNO_NO_PORT_COLON:
- return "':' after host without port";
- case URI_ERRNO_NO_HOST_SLASH:
- return "Slash after host missing";
- case URI_ERRNO_IPV6_SECURITY:
- return "IPv6 security bug detected";
+ case URI_ERRNO_BAD_FORMAT:
+ return "Bad URL format";
+ case URI_ERRNO_BAD_ENCODING:
+ return "Invalid symbols encoded";
case URI_ERRNO_INVALID_PORT:
return "Port number is bad";
- case URI_ERRNO_INVALID_PORT_RANGE:
- return "Port number is not within 0-65535";
}
return NULL;
}
static gint
-check_uri_file (gchar *name)
-{
- static const gchar chars[] = POST_CHAR_S "#?";
-
- return strcspn (name, chars);
-}
-
-static gint
url_init (void)
{
guint i;
@@ -843,590 +802,129 @@ url_init (void)
return 0;
}
-enum protocol
-get_protocol (gchar *name, gint namelen)
-{
- /* These are really enum protocol values but can take on negative
- * values and since 0 <= -1 for enum values it's better to use clean
- * integer type. */
- gint start, end;
- enum protocol protocol;
- guchar *pname;
- gint pnamelen, minlen, compare;
-
- /* Almost dichotomic search is used here */
- /* Starting at the HTTP entry which is the most common that will make
- * file and NNTP the next entries checked and amongst the third checks
- * are proxy and FTP. */
- start = 0;
- end = PROTOCOL_UNKNOWN - 1;
- protocol = PROTOCOL_HTTP;
-
- while (start <= end) {
- pname = protocol_backends[protocol].name;
- pnamelen = strlen (pname);
- minlen = MIN (pnamelen, namelen);
- compare = g_ascii_strncasecmp (pname, name, minlen);
-
- if (compare == 0) {
- if (pnamelen == namelen)
- return protocol;
-
- /* If the current protocol name is longer than the
- * protocol name being searched for move @end else move
- * @start. */
- compare = pnamelen > namelen ? 1 : -1;
- }
-
- if (compare > 0)
- end = protocol - 1;
- else
- start = protocol + 1;
-
- protocol = (start + end) / 2;
- }
-
- return PROTOCOL_UNKNOWN;
-}
-
-
-gint
-get_protocol_port (enum protocol protocol)
-{
- return protocol_backends[protocol].port;
-}
-
-gint
-get_protocol_need_slashes (enum protocol protocol)
-{
- return protocol_backends[protocol].need_slashes;
-}
-
-gint
-get_protocol_need_slash_after_host (enum protocol protocol)
-{
- return protocol_backends[protocol].need_slash_after_host;
-}
-
-gint
-get_protocol_free_syntax (enum protocol protocol)
-{
- return protocol_backends[protocol].free_syntax;
-}
-
-static gint
-get_protocol_length (const gchar *url)
-{
- gchar *end = (gchar *)url;
-
- /* Seek the end of the protocol name if any. */
- /* RFC1738:
- * scheme = 1*[ lowalpha | digit | "+" | "-" | "." ]
- * (but per its recommendations we accept "upalpha" too) */
- while (*end && (g_ascii_isalnum (*end) || *end == '+'
- || *end == '-' || *end == '.')) {
- end++;
- }
-
- /* Also return 0 if there's no protocol name (@end == @url). */
- return (*end == ':') ? end - url : 0;
-}
-
-
-/*
- * Calcualte new length of unescaped hostlen
- */
-static guint
-url_calculate_escaped_hostlen (gchar *host, guint hostlen)
-{
- guint i, result = hostlen;
- gchar *p = host, c;
-
- for (i = 0; i < hostlen; i++, p++) {
- if (*p == '%' && g_ascii_isxdigit (*(p + 1)) &&
- g_ascii_isxdigit (*(p + 2)) && i < hostlen - 2) {
- c = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));
- if (c != '\0') {
- result -= 2;
- }
- }
- }
-
- return result;
-}
-
-void
-rspamd_url_unescape (gchar *s)
-{
- gchar *t = s; /* t - tortoise */
- gchar *h = s; /* h - hare */
-
- for (; *h; h++, t++) {
- if (*h != '%') {
- *t = *h;
- }
- else {
- gchar c;
- if (!h[1] || !h[2] ||
- !(g_ascii_isxdigit (h[1]) && g_ascii_isxdigit (h[2]))) {
- *t = *h;
- }
- else {
- c = X2DIGITS_TO_NUM (h[1], h[2]);
- if (c != '\0') {
- *t = c;
- h += 2;
- }
- else {
- *t = *h;
- }
- }
- }
- }
- *t = '\0';
-}
-
-static void
-url_strip (gchar *s)
-{
- gchar *t = s; /* t - tortoise */
- gchar *h = s; /* h - hare */
-
- while (*h) {
- if (g_ascii_isgraph (*h)) {
- *t = *h;
- t++;
- }
- h++;
- }
- *t = '\0';
-}
-
-static gchar *
-url_escape_1 (const gchar *s, gint allow_passthrough, rspamd_mempool_t * pool)
-{
- const gchar *p1;
- gchar *p2, *newstr;
- gint newlen;
- gint addition = 0;
-
- for (p1 = s; *p1; p1++)
- if (!is_urlsafe (*p1)) {
- addition += 2; /* Two more characters (hex digits) */
- }
-
- if (!addition) {
- if (allow_passthrough) {
- return (gchar *)s;
- }
- else {
- return rspamd_mempool_strdup (pool, s);
- }
- }
-
- newlen = (p1 - s) + addition;
- newstr = (gchar *)rspamd_mempool_alloc (pool, newlen + 1);
-
- p1 = s;
- p2 = newstr;
- while (*p1) {
- /* Quote the characters that match the test mask. */
- if (!is_urlsafe (*p1)) {
- guchar c = *p1++;
- *p2++ = '%';
- *p2++ = XNUM_TO_DIGIT (c >> 4);
- *p2++ = XNUM_TO_DIGIT (c & 0xf);
- }
- else
- *p2++ = *p1++;
- }
- *p2 = '\0';
-
- return newstr;
-}
-
-/* URL-escape the unsafe characters (see urlchr_table) in a given
- string, returning a freshly allocated string. */
-
-gchar *
-url_escape (const gchar *s, rspamd_mempool_t * pool)
-{
- return url_escape_1 (s, 0, pool);
-}
-
-/* Decide whether the gchar at position P needs to be encoded. (It is
- not enough to pass a single gchar *P because the function may need
- to inspect the surrounding context.)
-
- Return 1 if the gchar should be escaped as %XX, 0 otherwise. */
-
-static inline gboolean
-char_needs_escaping (const gchar *p)
-{
- if (*p == '%') {
- if (g_ascii_isxdigit (*(p + 1)) && g_ascii_isxdigit (*(p + 2))) {
- return FALSE;
- }
- else {
- return TRUE;
- }
- }
- else if (!is_urlsafe (*p)) {
- return TRUE;
- }
- return FALSE;
-}
-
-static gchar *
-rspamd_url_reencode_escapes (gchar *s, rspamd_mempool_t * pool)
-{
- const gchar *p1;
- gchar *newstr, *p2;
- gint oldlen, newlen;
-
- gint encode_count = 0;
-
- /* First pass: inspect the string to see if there's anything to do,
- and to calculate the new length. */
- for (p1 = s; *p1; p1++) {
- if (char_needs_escaping (p1)) {
- ++encode_count;
- }
- }
-
- if (!encode_count) {
- /* The string is good as it is. */
- return s;
- }
-
- oldlen = p1 - s;
- /* Each encoding adds two characters (hex digits). */
- newlen = oldlen + 2 * encode_count;
- newstr = rspamd_mempool_alloc (pool, newlen + 1);
-
- /* Second pass: copy the string to the destination address, encoding
- chars when needed. */
- p1 = s;
- p2 = newstr;
-
- while (*p1) {
- if (char_needs_escaping (p1)) {
- guchar c = *p1++;
- *p2++ = '%';
- *p2++ = XNUM_TO_DIGIT (c >> 4);
- *p2++ = XNUM_TO_DIGIT (c & 0xf);
- }
- else {
- *p2++ = *p1++;
- }
- }
-
- *p2 = '\0';
- return newstr;
-}
-
-/*
- * Resolve "." and ".." elements of PATH by destructively modifying
- * PATH and return non-zero if PATH has been modified, zero otherwise.
- */
-
-static gboolean
-path_simplify (gchar *path)
-{
- gchar *h = path; /* hare */
- gchar *t = path; /* tortoise */
- gchar *beg = path; /* boundary for backing the tortoise */
- gchar *end = path + strlen (path);
-
- while (h < end) {
- /* Hare should be at the beginning of a path element. */
- if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) {
- /* Ignore "./". */
- h += 2;
- }
- else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) {
- /* Handle "../" by retreating the tortoise by one path
- element -- but not past beginning. */
- if (t > beg) {
- /* Move backwards until T hits the beginning of the
- previous path element or the beginning of path. */
- for (--t; t > beg && t[-1] != '/'; t--) ;
- }
- else {
- /* If we're at the beginning, copy the "../" literally
- move the beginning so a later ".." doesn't remove
- it. */
- beg = t + 3;
- goto regular;
- }
- h += 3;
- }
- else {
-regular:
- /* A regular path element. If H hasn't advanced past T,
- simply skip to the next path element. Otherwise, copy
- the path element until the next slash. */
- if (t == h) {
- /* Skip the path element, including the slash. */
- while (h < end && *h != '/')
- t++, h++;
- if (h < end)
- t++, h++;
- }
- else {
- /* Copy the path element, including the final slash. */
- while (h < end && *h != '/')
- *t++ = *h++;
- if (h < end)
- *t++ = *h++;
- }
- }
- }
-
- if (t != h)
- *t = '\0';
-
- return t != h;
-}
enum uri_errno
-parse_uri (struct uri *uri, gchar *uristring, rspamd_mempool_t * pool)
+rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
+ rspamd_mempool_t *pool)
{
- guchar *prefix_end, *host_end, *p;
- guchar *lbracket, *rbracket;
- gint datalen, n, addrlen;
- guchar *frag_or_post, *user_end, *port_end;
+ struct http_parser_url u;
+ gchar *p, *comp;
+ gint i, complen;
+
+ const struct {
+ enum rspamd_url_protocol proto;
+ const gchar *name;
+ gsize len;
+ } protocols[] = {
+ {
+ .proto = PROTOCOL_FILE,
+ .name = "file",
+ .len = 4
+ },
+ {
+ .proto = PROTOCOL_FTP,
+ .name = "ftp",
+ .len = 3
+ },
+ {
+ .proto = PROTOCOL_HTTP,
+ .name = "http",
+ .len = 4
+ },
+ {
+ .proto = PROTOCOL_HTTPS,
+ .name = "https",
+ .len = 5
+ },
+ {
+ .proto = PROTOCOL_MAILTO,
+ .name = "mailto",
+ .len = 6
+ },
+ {
+ .proto = PROTOCOL_UNKNOWN,
+ .name = NULL,
+ .len = 0
+ }
+ };
memset (uri, 0, sizeof (*uri));
- if (!*uristring) {
+ if (*uristring == '\0') {
return URI_ERRNO_EMPTY;
}
- uri->string = rspamd_url_reencode_escapes (uristring, pool);
- msg_debug ("reencoding escapes in original url: '%s'", struri (uri));
- uri->protocollen = get_protocol_length (struri (uri));
-
- /* Assume http as default protocol */
- if (!uri->protocollen ||
- (uri->protocol =
- get_protocol (struri (uri), uri->protocollen)) == PROTOCOL_UNKNOWN) {
- /* Make exception for numeric urls */
- p = uri->string;
- while (*p && (g_ascii_isalnum (*p) || *p == ':')) {
- p++;
- }
- if (*p == '\0') {
- return URI_ERRNO_INVALID_PROTOCOL;
- }
- p = g_strconcat ("http://", uri->string, NULL);
- uri->string = rspamd_mempool_strdup (pool, p);
- g_free (p);
- uri->protocol = PROTOCOL_HTTP;
- prefix_end = struri (uri) + 7;
+ p = g_uri_unescape_string (uristring, NULL);
+ if (p == NULL) {
+ return URI_ERRNO_BAD_ENCODING;
}
- else {
- /* Figure out whether the protocol is known */
- msg_debug ("getting protocol from url: %d", uri->protocol);
-
- prefix_end = struri (uri) + uri->protocollen; /* ':' */
- /* Check if there's a digit after the protocol name. */
- if (g_ascii_isdigit (*prefix_end)) {
- p = struri (uri);
- uri->ip_family = p[uri->protocollen] - '0';
- prefix_end++;
- }
- if (*prefix_end != ':') {
- msg_debug ("invalid protocol in uri");
- return URI_ERRNO_INVALID_PROTOCOL;
- }
- prefix_end++;
+ uri->string = p;
- /* Skip slashes */
+ rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t)g_free, p);
- if (prefix_end[0] == '/' && prefix_end[1] == '/') {
- if (prefix_end[2] == '/') {
- msg_debug ("too many '/' in uri");
- return URI_ERRNO_TOO_MANY_SLASHES;
- }
-
- prefix_end += 2;
-
- }
- else {
- msg_debug ("no '/' in uri");
- return URI_ERRNO_NO_SLASHES;
- }
+ /*
+ * We assume here that urls has the sane scheme
+ */
+ if (http_parser_parse_url (p, len, 0, &u) != 0) {
+ return URI_ERRNO_BAD_FORMAT;
}
- if (get_protocol_free_syntax (uri->protocol)) {
- uri->data = prefix_end;
- uri->datalen = strlen (prefix_end);
- return URI_ERRNO_OK;
-
- }
- else if (uri->protocol == PROTOCOL_FILE) {
- datalen = check_uri_file (prefix_end);
- frag_or_post = prefix_end + datalen;
-
- /* Extract the fragment part. */
- if (datalen >= 0) {
- if (*frag_or_post == '#') {
- uri->fragment = frag_or_post + 1;
- uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
- frag_or_post = uri->fragment + uri->fragmentlen;
- }
- if (*frag_or_post == POST_CHAR) {
- uri->post = frag_or_post + 1;
+ for (i = 0; i < UF_MAX; i ++) {
+ if (u.field_set & (1 << i)) {
+ comp = p + u.field_data[i].off;
+ complen = u.field_data[i].len;
+ switch (i) {
+ case UF_SCHEMA:
+ uri->protocollen = u.field_data[i].len;
+ break;
+ case UF_HOST:
+ uri->host = comp;
+ uri->hostlen = complen;
+ break;
+ case UF_PATH:
+ uri->data = comp;
+ uri->datalen = complen;
+ break;
+ case UF_QUERY:
+ uri->query = comp;
+ uri->querylen = complen;
+ break;
+ case UF_FRAGMENT:
+ uri->fragment = comp;
+ uri->fragmentlen = complen;
+ break;
+ case UF_USERINFO:
+ uri->user = comp;
+ uri->userlen = complen;
+ break;
+ default:
+ break;
}
}
- else {
- datalen = strlen (prefix_end);
- }
-
- uri->data = prefix_end;
- uri->datalen = datalen;
-
- return URI_ERRNO_OK;
- }
-
- /* Isolate host */
-
- /* Get brackets enclosing IPv6 address */
- lbracket = strchr (prefix_end, '[');
- if (lbracket) {
- rbracket = strchr (lbracket, ']');
- /* [address] is handled only inside of hostname part (surprisingly). */
- if (rbracket && rbracket < prefix_end + strcspn (prefix_end, "/"))
- uri->ipv6 = 1;
- else
- lbracket = rbracket = NULL;
- }
- else {
- rbracket = NULL;
- }
-
- /* Possibly skip auth part */
- host_end = prefix_end + strcspn (prefix_end, "@");
-
- if (prefix_end + strcspn (prefix_end, "/?") > host_end && *host_end) { /* we have auth info here */
-
- /* Allow '@' in the password component */
- while (strcspn (host_end + 1, "@") < strcspn (host_end + 1, "/?"))
- host_end = host_end + 1 + strcspn (host_end + 1, "@");
-
- user_end = strchr (prefix_end, ':');
-
- if (!user_end || user_end > host_end) {
- uri->user = prefix_end;
- uri->userlen = host_end - prefix_end;
- }
- else {
- uri->user = prefix_end;
- uri->userlen = user_end - prefix_end;
- uri->password = user_end + 1;
- uri->passwordlen = host_end - user_end - 1;
- }
- prefix_end = host_end + 1;
- }
-
- if (uri->ipv6 && rbracket != NULL) {
- host_end = rbracket + strcspn (rbracket, ":/?");
- }
- else {
- host_end = prefix_end + strcspn (prefix_end, ":/?");
- }
-
- if (uri->ipv6) {
- addrlen = rbracket - lbracket - 1;
-
-
- uri->host = lbracket + 1;
- uri->hostlen = addrlen;
}
- else {
- uri->host = prefix_end;
- uri->hostlen = host_end - prefix_end;
- /* Trim trailing '.'s */
- if (uri->hostlen && uri->host[uri->hostlen - 1] == '.')
- return URI_ERRNO_TRAILING_DOTS;
+ if (!uri->hostlen) {
+ return URI_ERRNO_BAD_FORMAT;
}
- if (*host_end == ':') { /* we have port here */
- port_end = host_end + 1 + strcspn (host_end + 1, "/");
-
- host_end++;
-
- uri->port = host_end;
- uri->portlen = port_end - host_end;
-
- if (uri->portlen == 0)
- return URI_ERRNO_NO_PORT_COLON;
-
- /* We only use 8 bits for portlen so better check */
- if ((gint)uri->portlen != port_end - host_end)
- return URI_ERRNO_INVALID_PORT;
-
- /* test if port is number */
- for (; host_end < port_end; host_end++)
- if (!g_ascii_isdigit (*host_end))
- return URI_ERRNO_INVALID_PORT;
+ rspamd_str_lc (uri->string, uri->protocollen);
+ rspamd_str_lc (uri->host, uri->hostlen);
- /* Check valid port value, and let show an error message
- * about invalid url syntax. */
- if (uri->port && uri->portlen) {
+ uri->protocol = PROTOCOL_UNKNOWN;
- errno = 0;
- n = strtol (uri->port, NULL, 10);
- if (errno || !uri_port_is_valid (n))
- return URI_ERRNO_INVALID_PORT;
+ for (i = 0; i < G_N_ELEMENTS (protocols); i ++) {
+ if (uri->protocollen == protocols[i].len) {
+ if (memcmp (uri->string, protocols[i].name, uri->protocollen) == 0) {
+ uri->protocol = i;
+ break;
+ }
}
}
- if (*host_end == '/') {
- host_end++;
+ if (uri->protocol == PROTOCOL_UNKNOWN) {
+ return URI_ERRNO_INVALID_PROTOCOL;
}
- else if (get_protocol_need_slash_after_host (uri->protocol) && *host_end !=
- '?') {
- /* The need for slash after the host component depends on the
- * need for a host component. -- The dangerous mind of Jonah */
- if (!uri->hostlen)
- return URI_ERRNO_NO_HOST;
-
- return URI_ERRNO_NO_HOST_SLASH;
- }
-
- /* Look for #fragment or POST_CHAR */
- prefix_end = host_end + strcspn (host_end, "#" POST_CHAR_S);
- uri->data = host_end;
- uri->datalen = prefix_end - host_end;
-
- if (*prefix_end == '#') {
- uri->fragment = prefix_end + 1;
- uri->fragmentlen = strcspn (uri->fragment, POST_CHAR_S);
- prefix_end = uri->fragment + uri->fragmentlen;
- }
-
- if (*prefix_end == POST_CHAR) {
- uri->post = prefix_end + 1;
- }
-
- rspamd_str_lc (uri->string, uri->protocollen);
- rspamd_str_lc (uri->host, uri->hostlen);
- /* Decode %HH sequences in host name. This is important not so much
- to support %HH sequences in host names (which other browser
- don't), but to support binary characters (which will have been
- converted to %HH by reencode_escapes). */
- if (strchr (uri->host, '%')) {
- uri->hostlen = url_calculate_escaped_hostlen (uri->host, uri->hostlen);
- }
-
- url_strip (struri (uri));
- rspamd_url_unescape (uri->host);
-
- path_simplify (uri->data);
return URI_ERRNO_OK;
}
@@ -1821,14 +1319,14 @@ url_email_end (const gchar *begin,
}
void
-url_parse_text (rspamd_mempool_t * pool,
+rspamd_url_text_extract (rspamd_mempool_t * pool,
struct rspamd_task *task,
struct mime_text_part *part,
gboolean is_html)
{
gint rc;
gchar *url_str = NULL, *url_start, *url_end;
- struct uri *new;
+ struct rspamd_url *new;
struct process_exception *ex;
gchar *p, *end, *begin;
@@ -1843,18 +1341,17 @@ url_parse_text (rspamd_mempool_t * pool,
end = begin + part->content->len;
p = begin;
while (p < end) {
- if (url_try_text (pool, p, end - p, &url_start, &url_end, &url_str,
+ if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
is_html)) {
if (url_str != NULL) {
- new = rspamd_mempool_alloc0 (pool, sizeof (struct uri));
+ new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
ex =
rspamd_mempool_alloc0 (pool,
sizeof (struct process_exception));
if (new != NULL) {
g_strstrip (url_str);
- rc = parse_uri (new, url_str, pool);
- if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES ||
- rc == URI_ERRNO_NO_HOST_SLASH) &&
+ rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
+ if (rc == URI_ERRNO_OK &&
new->hostlen > 0) {
ex->pos = url_start - begin;
ex->len = url_end - url_start;
@@ -1877,7 +1374,7 @@ url_parse_text (rspamd_mempool_t * pool,
else if (rc != URI_ERRNO_OK) {
msg_info ("extract of url '%s' failed: %s",
url_str,
- url_strerror (rc));
+ rspamd_url_strerror (rc));
}
}
}
@@ -1897,7 +1394,7 @@ url_parse_text (rspamd_mempool_t * pool,
}
gboolean
-url_try_text (rspamd_mempool_t *pool,
+rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin,
gsize len,
gchar **start,
diff --git a/src/libserver/url.h b/src/libserver/url.h
index c9700436b..db3a3472c 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -8,12 +8,9 @@
struct rspamd_task;
struct mime_text_part;
-struct uri {
- /* The start of the uri (and thus start of the protocol string). */
+struct rspamd_url {
gchar *string;
-
- /* The internal type of protocol. Can _never_ be PROTOCOL_UNKNOWN. */
- gint protocol; /* enum protocol */
+ gint protocol;
gint ip_family;
@@ -22,20 +19,20 @@ struct uri {
gchar *host;
gchar *port;
gchar *data;
+ gchar *query;
gchar *fragment;
gchar *post;
gchar *surbl;
- struct uri *phished_url;
+ struct rspamd_url *phished_url;
- /* @protocollen should only be usable if @protocol is either
- * PROTOCOL_USER or an uri string should be composed. */
guint protocollen;
guint userlen;
guint passwordlen;
guint hostlen;
guint portlen;
guint datalen;
+ guint querylen;
guint fragmentlen;
guint surbllen;
@@ -46,22 +43,16 @@ struct uri {
};
enum uri_errno {
- URI_ERRNO_OK, /* Parsing went well */
+ URI_ERRNO_OK = 0, /* Parsing went well */
URI_ERRNO_EMPTY, /* The URI string was empty */
URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
- URI_ERRNO_NO_SLASHES, /* Slashes after protocol missing */
- URI_ERRNO_TOO_MANY_SLASHES, /* Too many slashes after protocol */
- URI_ERRNO_TRAILING_DOTS, /* '.' after host */
- URI_ERRNO_NO_HOST, /* Host part is missing */
- URI_ERRNO_NO_PORT_COLON, /* ':' after host without port */
- URI_ERRNO_NO_HOST_SLASH, /* Slash after host missing */
- URI_ERRNO_IPV6_SECURITY, /* IPv6 security bug detected */
URI_ERRNO_INVALID_PORT, /* Port number is bad */
- URI_ERRNO_INVALID_PORT_RANGE /* Port number is not within 0-65535 */
+ URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */
+ URI_ERRNO_BAD_FORMAT
};
-enum protocol {
- PROTOCOL_FILE,
+enum rspamd_url_protocol {
+ PROTOCOL_FILE = 0,
PROTOCOL_FTP,
PROTOCOL_HTTP,
PROTOCOL_HTTPS,
@@ -78,7 +69,7 @@ enum protocol {
* @param part current text part
* @param is_html turn on html euristic
*/
-void url_parse_text (rspamd_mempool_t *pool,
+void rspamd_url_text_extract (rspamd_mempool_t *pool,
struct rspamd_task *task,
struct mime_text_part *part,
gboolean is_html);
@@ -89,8 +80,9 @@ void url_parse_text (rspamd_mempool_t *pool,
* @param uristring text form of url
* @param uri url object, must be pre allocated
*/
-enum uri_errno parse_uri (struct uri *uri,
+enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
gchar *uristring,
+ gsize len,
rspamd_mempool_t *pool);
/*
@@ -103,7 +95,7 @@ enum uri_errno parse_uri (struct uri *uri,
* @param url_str storage for url string(or NULL)
* @return TRUE if url is found in specified text
*/
-gboolean url_try_text (rspamd_mempool_t *pool,
+gboolean rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin,
gsize len,
gchar **start,
@@ -114,7 +106,7 @@ gboolean url_try_text (rspamd_mempool_t *pool,
/*
* Return text representation of url parsing error
*/
-const gchar * url_strerror (enum uri_errno err);
+const gchar * rspamd_url_strerror (enum uri_errno err);
/*
* URL unescape characters in the specified string
diff --git a/src/libutil/util.c b/src/libutil/util.c
index 6d5682f25..f88ed8e72 100644
--- a/src/libutil/util.c
+++ b/src/libutil/util.c
@@ -1427,7 +1427,7 @@ rspamd_strlcpy_tolower (gchar *dst, const gchar *src, gsize siz)
gint
rspamd_emails_cmp (gconstpointer a, gconstpointer b)
{
- const struct uri *u1 = a, *u2 = b;
+ const struct rspamd_url *u1 = a, *u2 = b;
gint r;
if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
@@ -1453,7 +1453,7 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
gint
rspamd_urls_cmp (gconstpointer a, gconstpointer b)
{
- const struct uri *u1 = a, *u2 = b;
+ const struct rspamd_url *u1 = a, *u2 = b;
int r;
if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index ef52b4544..6fee606c4 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -683,12 +683,12 @@ lua_check_image (lua_State * L)
return ud ? *((struct rspamd_image **)ud) : NULL;
}
-static struct uri *
+static struct rspamd_url *
lua_check_url (lua_State * L)
{
void *ud = luaL_checkudata (L, 1, "rspamd{url}");
luaL_argcheck (L, ud != NULL, 1, "'url' expected");
- return ud ? *((struct uri **)ud) : NULL;
+ return ud ? *((struct rspamd_url **)ud) : NULL;
}
static int
@@ -924,10 +924,10 @@ struct lua_tree_cb_data {
static gboolean
lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
{
- struct uri **purl;
+ struct rspamd_url **purl;
struct lua_tree_cb_data *cb = ud;
- purl = lua_newuserdata (cb->L, sizeof (struct uri *));
+ purl = lua_newuserdata (cb->L, sizeof (struct rspamd_url *));
rspamd_lua_setclass (cb->L, "rspamd{url}", -1);
*purl = value;
lua_rawseti (cb->L, -2, cb->i++);
@@ -2352,7 +2352,7 @@ lua_image_get_filename (lua_State *L)
static gint
lua_url_get_length (lua_State *L)
{
- struct uri *url = lua_check_url (L);
+ struct rspamd_url *url = lua_check_url (L);
if (url != NULL) {
lua_pushinteger (L, strlen (struri (url)));
@@ -2366,7 +2366,7 @@ lua_url_get_length (lua_State *L)
static gint
lua_url_get_host (lua_State *L)
{
- struct uri *url = lua_check_url (L);
+ struct rspamd_url *url = lua_check_url (L);
if (url != NULL) {
lua_pushlstring (L, url->host, url->hostlen);
@@ -2380,7 +2380,7 @@ lua_url_get_host (lua_State *L)
static gint
lua_url_get_user (lua_State *L)
{
- struct uri *url = lua_check_url (L);
+ struct rspamd_url *url = lua_check_url (L);
if (url != NULL && url->user != NULL) {
lua_pushlstring (L, url->user, url->userlen);
@@ -2395,7 +2395,7 @@ lua_url_get_user (lua_State *L)
static gint
lua_url_get_path (lua_State *L)
{
- struct uri *url = lua_check_url (L);
+ struct rspamd_url *url = lua_check_url (L);
if (url != NULL) {
lua_pushlstring (L, url->data, url->datalen);
@@ -2410,7 +2410,7 @@ lua_url_get_path (lua_State *L)
static gint
lua_url_get_text (lua_State *L)
{
- struct uri *url = lua_check_url (L);
+ struct rspamd_url *url = lua_check_url (L);
if (url != NULL) {
lua_pushstring (L, struri (url));
@@ -2425,7 +2425,7 @@ lua_url_get_text (lua_State *L)
static gint
lua_url_is_phished (lua_State *L)
{
- struct uri *url = lua_check_url (L);
+ struct rspamd_url *url = lua_check_url (L);
if (url != NULL) {
lua_pushboolean (L, url->is_phished);
@@ -2440,11 +2440,11 @@ lua_url_is_phished (lua_State *L)
static gint
lua_url_get_phished (lua_State *L)
{
- struct uri **purl, *url = lua_check_url (L);
+ struct rspamd_url **purl, *url = lua_check_url (L);
if (url) {
if (url->is_phished && url->phished_url != NULL) {
- purl = lua_newuserdata (L, sizeof (struct uri *));
+ purl = lua_newuserdata (L, sizeof (struct rspamd_url *));
rspamd_lua_setclass (L, "rspamd{url}", -1);
*purl = url->phished_url;
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 74ab46ab5..15eed1674 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -378,7 +378,7 @@ static gboolean
tree_url_callback (gpointer key, gpointer value, void *data)
{
struct url_regexp_param *param = data;
- struct uri *url = value;
+ struct rspamd_url *url = value;
GError *err = NULL;
if (g_regex_match_full (param->regexp, struri (url), -1, 0, 0, NULL,
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index b7a3a8337..df9227c08 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -576,7 +576,7 @@ format_surbl_request (rspamd_mempool_t * pool,
GError ** err,
gboolean forced,
GTree *tree,
- struct uri *url)
+ struct rspamd_url *url)
{
GHashTable *t;
gchar *result = NULL, *dots[MAX_LEVELS],
@@ -753,7 +753,7 @@ format_surbl_request (rspamd_mempool_t * pool,
}
static void
-make_surbl_requests (struct uri *url, struct rspamd_task *task,
+make_surbl_requests (struct rspamd_url *url, struct rspamd_task *task,
struct suffix_item *suffix, gboolean forced, GTree *tree)
{
gchar *surbl_req;
@@ -953,7 +953,7 @@ redirector_callback (gint fd, short what, void *arg)
struri (param->url),
c);
r =
- parse_uri (param->url,
+ rspamd_url_parse (param->url,
rspamd_mempool_strdup (param->task->task_pool,
c), param->task->task_pool);
if (r == URI_ERRNO_OK || r == URI_ERRNO_NO_SLASHES || r ==
@@ -985,7 +985,7 @@ redirector_callback (gint fd, short what, void *arg)
static void
-register_redirector_call (struct uri *url, struct rspamd_task *task,
+register_redirector_call (struct rspamd_url *url, struct rspamd_task *task,
struct suffix_item *suffix, const gchar *rule, GTree *tree)
{
gint s = -1;
@@ -1042,7 +1042,7 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
{
struct redirector_param *param = data;
struct rspamd_task *task;
- struct uri *url = value;
+ struct rspamd_url *url = value;
gchar *red_domain;
const gchar *pos;
GRegex *re;
@@ -1134,7 +1134,7 @@ static gboolean
calculate_buflen_cb (gpointer key, gpointer value, gpointer cbdata)
{
struct urls_tree_cb_data *cb = cbdata;
- struct uri *url = value;
+ struct rspamd_url *url = value;
cb->len += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1;
@@ -1145,7 +1145,7 @@ static gboolean
write_urls_buffer (gpointer key, gpointer value, gpointer cbdata)
{
struct urls_tree_cb_data *cb = cbdata;
- struct uri *url = value;
+ struct rspamd_url *url = value;
rspamd_fstring_t f;
gchar *urlstr;
gsize len;
diff --git a/src/plugins/surbl.h b/src/plugins/surbl.h
index 7701c8304..959a730de 100644
--- a/src/plugins/surbl.h
+++ b/src/plugins/surbl.h
@@ -46,14 +46,14 @@ struct suffix_item {
};
struct dns_param {
- struct uri *url;
+ struct rspamd_url *url;
struct rspamd_task *task;
gchar *host_resolve;
struct suffix_item *suffix;
};
struct redirector_param {
- struct uri *url;
+ struct rspamd_url *url;
struct rspamd_task *task;
struct upstream *redirector;
enum {