From d050686aee0ad85364ceae6185c1e2698feb3e6e Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 23 Jan 2017 13:27:45 +0000 Subject: [PATCH] [Feature] Add url encoding function --- src/libserver/task.c | 4 +- src/libserver/url.c | 240 ++++++++++++++++++++++++++++++++++++++++- src/libserver/url.h | 27 +++++ src/libutil/http.c | 7 +- src/libutil/str_util.c | 149 ------------------------- src/libutil/str_util.h | 17 --- src/lua/lua_util.c | 2 +- 7 files changed, 269 insertions(+), 177 deletions(-) diff --git a/src/libserver/task.c b/src/libserver/task.c index 75c44f21a..f02665afd 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -326,7 +326,7 @@ rspamd_task_load_message (struct rspamd_task *task, r = rspamd_strlcpy (filepath, tok->begin, MIN (sizeof (filepath), tok->len + 1)); - rspamd_decode_url (filepath, filepath, r + 1); + rspamd_url_decode (filepath, filepath, r + 1); flen = strlen (filepath); if (filepath[0] == '"' && flen > 2) { @@ -424,7 +424,7 @@ rspamd_task_load_message (struct rspamd_task *task, r = rspamd_strlcpy (filepath, tok->begin, MIN (sizeof (filepath), tok->len + 1)); - rspamd_decode_url (filepath, filepath, r + 1); + rspamd_url_decode (filepath, filepath, r + 1); flen = strlen (filepath); if (filepath[0] == '"' && flen > 2) { diff --git a/src/libserver/url.c b/src/libserver/url.c index 4252d5ac1..4c7e643e7 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1569,28 +1569,28 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len, uri->string = p; uri->urllen = len; - unquoted_len = rspamd_decode_url (uri->string, + unquoted_len = rspamd_url_decode (uri->string, uri->string, uri->protocollen); rspamd_url_shift (uri, unquoted_len, UF_SCHEMA); - unquoted_len = rspamd_decode_url (uri->host, uri->host, uri->hostlen); + unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen); rspamd_url_shift (uri, unquoted_len, UF_HOST); if (uri->datalen) { - unquoted_len = rspamd_decode_url (uri->data, 
uri->data, uri->datalen); + unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen); rspamd_url_shift (uri, unquoted_len, UF_PATH); /* We now normalize path */ rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len); rspamd_url_shift (uri, unquoted_len, UF_PATH); } if (uri->querylen) { - unquoted_len = rspamd_decode_url (uri->query, + unquoted_len = rspamd_url_decode (uri->query, uri->query, uri->querylen); rspamd_url_shift (uri, unquoted_len, UF_QUERY); } if (uri->fragmentlen) { - unquoted_len = rspamd_decode_url (uri->fragment, + unquoted_len = rspamd_url_decode (uri->fragment, uri->fragment, uri->fragmentlen); rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT); @@ -2569,3 +2569,233 @@ rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag, DL_APPEND (found, ntag); } + +guint +rspamd_url_hash (gconstpointer u) +{ + const struct rspamd_url *url = u; + rspamd_cryptobox_fast_hash_state_t st; + + rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ()); + + if (url->urllen > 0) { + rspamd_cryptobox_fast_hash_update (&st, url->string, url->urllen); + } + + rspamd_cryptobox_fast_hash_update (&st, &url->flags, sizeof (url->flags)); + + return rspamd_cryptobox_fast_hash_final (&st); +} + +/* Compare two emails for building emails tree */ +gboolean +rspamd_emails_cmp (gconstpointer a, gconstpointer b) +{ + const struct rspamd_url *u1 = a, *u2 = b; + gint r; + + if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { + return FALSE; + } + else { + if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) { + if (u1->userlen != u2->userlen || u1->userlen == 0) { + return FALSE; + } + else { + return rspamd_lc_cmp (u1->user, u2->user, u1->userlen) == + 0; + } + } + else { + return r == 0; + } + } + + return FALSE; +} + +gboolean +rspamd_urls_cmp (gconstpointer a, gconstpointer b) +{ + const struct rspamd_url *u1 = a, *u2 = b; + int r; + + if (u1->urllen != u2->urllen) { + return FALSE; + } + else { + r = memcmp (u1->string, 
u2->string, u1->urllen); + if (r == 0 && u1->flags != u2->flags) { + /* Always insert phished urls to the tree */ + return FALSE; + } + } + + return r == 0; +} + +gsize +rspamd_url_decode (gchar *dst, const gchar *src, gsize size) +{ + gchar *d, ch, c, decoded; + const gchar *s; + enum { + sw_usual = 0, + sw_quoted, + sw_quoted_second + } state; + + d = dst; + s = src; + + state = 0; + decoded = 0; + + while (size--) { + + ch = *s++; + + switch (state) { + case sw_usual: + + if (ch == '%') { + state = sw_quoted; + break; + } + else if (ch == '+') { + *d++ = ' '; + } + else { + *d++ = ch; + } + break; + + case sw_quoted: + + if (ch >= '0' && ch <= '9') { + decoded = (ch - '0'); + state = sw_quoted_second; + break; + } + + c = (ch | 0x20); + if (c >= 'a' && c <= 'f') { + decoded = (c - 'a' + 10); + state = sw_quoted_second; + break; + } + + /* the invalid quoted character */ + + state = sw_usual; + + *d++ = ch; + + break; + + case sw_quoted_second: + + state = sw_usual; + + if (ch >= '0' && ch <= '9') { + ch = ((decoded << 4) + ch - '0'); + *d++ = ch; + + break; + } + + c = (u_char) (ch | 0x20); + if (c >= 'a' && c <= 'f') { + ch = ((decoded << 4) + c - 'a' + 10); + + *d++ = ch; + break; + } + + /* the invalid quoted character */ + break; + } + } + + return (d - dst); +} + +#define CHECK_URL_COMPONENT(beg, len) do { \ + for (i = 0; i < (len); i ++) { \ + if ((beg)[i] >= 0x80 || !is_urlsafe ((beg)[i])) { /* each escaped byte grows the output by 2 ("%XX" vs 1 byte) */ \ + dlen += 2; \ + } \ + } \ +} while (0) + +#define ENCODE_URL_COMPONENT(beg, len) do { \ + for (i = 0; i < (len) && dend > d; i ++) { \ + if ((beg)[i] >= 0x80 || !is_urlsafe ((beg)[i])) { /* must match the test in CHECK_URL_COMPONENT */ \ + *d++ = '%'; \ + *d++ = hexdigests[((beg)[i] >> 4) & 0xf]; \ + *d++ = hexdigests[(beg)[i] & 0xf]; \ + } \ + else { \ + *d++ = (beg)[i]; \ + } \ + } \ +} while (0) +/* Percent-encode url components and re-assemble the url in pool memory; returns url->string untouched when nothing needs escaping */ +const gchar * +rspamd_url_encode (struct rspamd_url *url, gsize *pdlen, + rspamd_mempool_t *pool) +{ + guchar *dest, *d, *dend; + static const gchar hexdigests[16] = "0123456789abcdef"; + guint i; + gsize dlen = 0; + 
+ g_assert (pdlen != NULL && url != NULL && pool != NULL); + + CHECK_URL_COMPONENT ((guchar *)url->host, url->hostlen); + CHECK_URL_COMPONENT ((guchar *)url->user, url->userlen); + CHECK_URL_COMPONENT ((guchar *)url->data, url->datalen); + CHECK_URL_COMPONENT ((guchar *)url->query, url->querylen); + CHECK_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen); + + if (dlen == 0) { + *pdlen = url->urllen; + + return url->string; + } + + /* Need to encode */ + dlen += url->urllen; + dest = rspamd_mempool_alloc (pool, dlen + 1); + d = dest; + dend = d + dlen; + d += rspamd_snprintf ((gchar *)d, dend - d, + "%*s://", url->protocollen, url->protocol); + + if (url->userlen > 0) { + ENCODE_URL_COMPONENT ((guchar *)url->user, url->userlen); + *d++ = '@'; /* userinfo is terminated by '@'; ':' would turn the host into a port */ + } + + ENCODE_URL_COMPONENT ((guchar *)url->host, url->hostlen); + + if (url->datalen > 0) { + *d++ = '/'; + ENCODE_URL_COMPONENT ((guchar *)url->data, url->datalen); + } + + if (url->querylen > 0) { + *d++ = '?'; /* query delimiter */ + ENCODE_URL_COMPONENT ((guchar *)url->query, url->querylen); + } + + if (url->fragmentlen > 0) { + *d++ = '#'; /* fragment delimiter */ + ENCODE_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen); + } + + *pdlen = (d - dest); + + return (const gchar *)dest; +} diff --git a/src/libserver/url.h b/src/libserver/url.h index dbe3eb00b..f56649558 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -177,4 +177,31 @@ void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag, const gchar *value, rspamd_mempool_t *pool); +guint rspamd_url_hash (gconstpointer u); + +/* Compare two emails for building emails hash */ +gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b); + +/* Compare two urls for building urls hash */ +gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b); + +/** + * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated + * @param dst + * @param src + * @param size + * @return + */ +gsize rspamd_url_decode (gchar *dst, const gchar *src, 
gsize size); + +/** + * Encode url if needed. In this case, memory is allocated from the specific pool. + * Returns pointer to begin and encoded length in `dlen` + * @param url + * @param pool + * @return + */ +const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen, + rspamd_mempool_t *pool); + #endif diff --git a/src/libutil/http.c b/src/libutil/http.c index eec53b515..9a33b1a90 100644 --- a/src/libutil/http.c +++ b/src/libutil/http.c @@ -27,6 +27,7 @@ #include "unix-std.h" #include "libutil/ssl_util.h" #include "libutil/regexp.h" +#include "libserver/url.h" #define ENCRYPTED_VERSION " HTTP/1.0" @@ -3376,7 +3377,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg) /* We have a single parameter without a value */ key = rspamd_fstring_new_init (c, p - c); key_tok = rspamd_ftok_map (key); - key_tok->len = rspamd_decode_url (key->str, key->str, + key_tok->len = rspamd_url_decode (key->str, key->str, key->len); value = rspamd_fstring_new_init ("", 0); @@ -3389,7 +3390,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg) /* We have something like key=value */ key = rspamd_fstring_new_init (c, p - c); key_tok = rspamd_ftok_map (key); - key_tok->len = rspamd_decode_url (key->str, key->str, + key_tok->len = rspamd_url_decode (key->str, key->str, key->len); state = parse_eqsign; @@ -3415,7 +3416,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg) if (p > c) { value = rspamd_fstring_new_init (c, p - c); value_tok = rspamd_ftok_map (value); - value_tok->len = rspamd_decode_url (value->str, + value_tok->len = rspamd_url_decode (value->str, value->str, value->len); /* Detect quotes for value */ diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 3b3dc06b7..10f5d54e3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -897,91 +897,7 @@ rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len, return rspamd_encode_base64_common (in, inlen, str_len, outlen, TRUE, how); 
} -gsize -rspamd_decode_url (gchar *dst, const gchar *src, gsize size) -{ - gchar *d, ch, c, decoded; - const gchar *s; - enum { - sw_usual = 0, - sw_quoted, - sw_quoted_second - } state; - - d = dst; - s = src; - - state = 0; - decoded = 0; - - while (size--) { - - ch = *s++; - - switch (state) { - case sw_usual: - - if (ch == '%') { - state = sw_quoted; - break; - } - else if (ch == '+') { - *d++ = ' '; - } - else { - *d++ = ch; - } - break; - - case sw_quoted: - - if (ch >= '0' && ch <= '9') { - decoded = (ch - '0'); - state = sw_quoted_second; - break; - } - - c = (ch | 0x20); - if (c >= 'a' && c <= 'f') { - decoded = (c - 'a' + 10); - state = sw_quoted_second; - break; - } - - /* the invalid quoted character */ - - state = sw_usual; - - *d++ = ch; - - break; - - case sw_quoted_second: - - state = sw_usual; - - if (ch >= '0' && ch <= '9') { - ch = ((decoded << 4) + ch - '0'); - *d++ = ch; - - break; - } - - c = (u_char) (ch | 0x20); - if (c >= 'a' && c <= 'f') { - ch = ((decoded << 4) + c - 'a' + 10); - - *d++ = ch; - break; - } - - /* the invalid quoted character */ - break; - } - } - return (d - dst); -} #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? 
(b) : (c))) gint @@ -2143,71 +2059,6 @@ rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj, ucl_object_emit_full (obj, emit_type, &func, comments); } -guint -rspamd_url_hash (gconstpointer u) -{ - const struct rspamd_url *url = u; - rspamd_cryptobox_fast_hash_state_t st; - - rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ()); - - if (url->urllen > 0) { - rspamd_cryptobox_fast_hash_update (&st, url->string, url->urllen); - } - - rspamd_cryptobox_fast_hash_update (&st, &url->flags, sizeof (url->flags)); - - return rspamd_cryptobox_fast_hash_final (&st); -} - -/* Compare two emails for building emails tree */ -gboolean -rspamd_emails_cmp (gconstpointer a, gconstpointer b) -{ - const struct rspamd_url *u1 = a, *u2 = b; - gint r; - - if (u1->hostlen != u2->hostlen || u1->hostlen == 0) { - return FALSE; - } - else { - if ((r = rspamd_lc_cmp (u1->host, u2->host, u1->hostlen)) == 0) { - if (u1->userlen != u2->userlen || u1->userlen == 0) { - return FALSE; - } - else { - return rspamd_lc_cmp (u1->user, u2->user, u1->userlen) == - 0; - } - } - else { - return r == 0; - } - } - - return FALSE; -} - -gboolean -rspamd_urls_cmp (gconstpointer a, gconstpointer b) -{ - const struct rspamd_url *u1 = a, *u2 = b; - int r; - - if (u1->urllen != u2->urllen) { - return FALSE; - } - else { - r = memcmp (u1->string, u2->string, u1->urllen); - if (r == 0 && u1->flags != u2->flags) { - /* Always insert phished urls to the tree */ - return FALSE; - } - } - - return r == 0; -} - const void * rspamd_memrchr (const void *m, gint c, gsize len) { diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 941d141b4..ea3d97278 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -204,15 +204,6 @@ gchar * rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len, gchar * rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len, gsize *outlen, enum rspamd_newlines_type how); -/** - * Decode URL encoded string in-place and return new length 
of a string, src and dst are NULL terminated - * @param dst - * @param src - * @param size - * @return - */ -gsize rspamd_decode_url (gchar *dst, const gchar *src, gsize size); - /** * Decode quoted-printable encoded buffer, input and output must not overlap * @param in input @@ -343,14 +334,6 @@ void rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj, rspamd_fstring_t **target, const ucl_object_t *comments); -guint rspamd_url_hash (gconstpointer u); - -/* Compare two emails for building emails hash */ -gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b); - -/* Compare two urls for building emails hash */ -gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b); - extern const guchar lc_map[256]; /** diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index 95471601b..b1bfdce28 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -857,7 +857,7 @@ lua_util_decode_url (lua_State *L) rspamd_lua_setclass (L, "rspamd{text}", -1); t->start = g_malloc (inlen); memcpy ((char *)t->start, s, inlen); - t->len = rspamd_decode_url ((char *)t->start, s, inlen); + t->len = rspamd_url_decode ((char *)t->start, s, inlen); t->flags = RSPAMD_TEXT_FLAG_OWN; } else { -- 2.39.5