Browse Source

[Feature] Add url encoding function

tags/1.5.0
Vsevolod Stakhov 7 years ago
parent
commit
d050686aee
7 changed files with 269 additions and 177 deletions
  1. 2
    2
      src/libserver/task.c
  2. 235
    5
      src/libserver/url.c
  3. 27
    0
      src/libserver/url.h
  4. 4
    3
      src/libutil/http.c
  5. 0
    149
      src/libutil/str_util.c
  6. 0
    17
      src/libutil/str_util.h
  7. 1
    1
      src/lua/lua_util.c

+ 2
- 2
src/libserver/task.c View File

@@ -326,7 +326,7 @@ rspamd_task_load_message (struct rspamd_task *task,
r = rspamd_strlcpy (filepath, tok->begin,
MIN (sizeof (filepath), tok->len + 1));

rspamd_decode_url (filepath, filepath, r + 1);
rspamd_url_decode (filepath, filepath, r + 1);
flen = strlen (filepath);

if (filepath[0] == '"' && flen > 2) {
@@ -424,7 +424,7 @@ rspamd_task_load_message (struct rspamd_task *task,
r = rspamd_strlcpy (filepath, tok->begin,
MIN (sizeof (filepath), tok->len + 1));

rspamd_decode_url (filepath, filepath, r + 1);
rspamd_url_decode (filepath, filepath, r + 1);
flen = strlen (filepath);

if (filepath[0] == '"' && flen > 2) {

+ 235
- 5
src/libserver/url.c View File

@@ -1569,28 +1569,28 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
uri->string = p;
uri->urllen = len;

unquoted_len = rspamd_decode_url (uri->string,
unquoted_len = rspamd_url_decode (uri->string,
uri->string,
uri->protocollen);
rspamd_url_shift (uri, unquoted_len, UF_SCHEMA);
unquoted_len = rspamd_decode_url (uri->host, uri->host, uri->hostlen);
unquoted_len = rspamd_url_decode (uri->host, uri->host, uri->hostlen);
rspamd_url_shift (uri, unquoted_len, UF_HOST);

if (uri->datalen) {
unquoted_len = rspamd_decode_url (uri->data, uri->data, uri->datalen);
unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen);
rspamd_url_shift (uri, unquoted_len, UF_PATH);
/* We now normalize path */
rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len);
rspamd_url_shift (uri, unquoted_len, UF_PATH);
}
if (uri->querylen) {
unquoted_len = rspamd_decode_url (uri->query,
unquoted_len = rspamd_url_decode (uri->query,
uri->query,
uri->querylen);
rspamd_url_shift (uri, unquoted_len, UF_QUERY);
}
if (uri->fragmentlen) {
unquoted_len = rspamd_decode_url (uri->fragment,
unquoted_len = rspamd_url_decode (uri->fragment,
uri->fragment,
uri->fragmentlen);
rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT);
@@ -2569,3 +2569,233 @@ rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,

DL_APPEND (found, ntag);
}

/**
 * Hash function for url objects: feeds the url string (when it is not empty)
 * and then the raw bytes of the url flags into the fast hash.
 * @param u pointer to a `struct rspamd_url`
 * @return resulting hash value
 */
guint
rspamd_url_hash (gconstpointer u)
{
	rspamd_cryptobox_fast_hash_state_t hash_state;
	const struct rspamd_url *entry = u;
	const gboolean has_text = (entry->urllen > 0);

	rspamd_cryptobox_fast_hash_init (&hash_state, rspamd_hash_seed ());

	/* An empty string contributes nothing, only the flags are hashed then */
	if (has_text) {
		rspamd_cryptobox_fast_hash_update (&hash_state,
				entry->string,
				entry->urllen);
	}

	rspamd_cryptobox_fast_hash_update (&hash_state,
			&entry->flags,
			sizeof (entry->flags));

	return rspamd_cryptobox_fast_hash_final (&hash_state);
}

/**
 * Equality callback for email addresses stored as urls.
 * Two entries are equal only when both have a non-empty host and a non-empty
 * user part, the lengths match, and both parts are equal according to
 * rspamd_lc_cmp.
 * @param a first `struct rspamd_url`
 * @param b second `struct rspamd_url`
 * @return TRUE when the entries are considered equal, FALSE otherwise
 */
gboolean
rspamd_emails_cmp (gconstpointer a, gconstpointer b)
{
	const struct rspamd_url *lhs = a;
	const struct rspamd_url *rhs = b;

	/* Hosts: empty or differently sized hosts never match */
	if (lhs->hostlen == 0 || lhs->hostlen != rhs->hostlen) {
		return FALSE;
	}

	if (rspamd_lc_cmp (lhs->host, rhs->host, lhs->hostlen) != 0) {
		return FALSE;
	}

	/* Users: the same rule as for hosts */
	if (lhs->userlen == 0 || lhs->userlen != rhs->userlen) {
		return FALSE;
	}

	return rspamd_lc_cmp (lhs->user, rhs->user, lhs->userlen) == 0;
}

/**
 * Equality callback for urls: the url strings must have the same length and
 * the same bytes, and the flags must be equal as well.
 * @param a first `struct rspamd_url`
 * @param b second `struct rspamd_url`
 * @return TRUE when the urls are considered equal, FALSE otherwise
 */
gboolean
rspamd_urls_cmp (gconstpointer a, gconstpointer b)
{
	const struct rspamd_url *lhs = a;
	const struct rspamd_url *rhs = b;

	if (lhs->urllen != rhs->urllen) {
		return FALSE;
	}

	if (memcmp (lhs->string, rhs->string, lhs->urllen) != 0) {
		return FALSE;
	}

	/*
	 * Same text but different flags: treat the entries as distinct so that
	 * phished urls are always inserted
	 */
	return lhs->flags == rhs->flags;
}

/**
 * Decode a percent-encoded buffer.
 *
 * Exactly `size` bytes of `src` are consumed. `+` is turned into a space and
 * `%XX` (hex digits in any case) into the corresponding byte. Each input byte
 * is read before anything is written for it and the output never grows, so
 * `dst` may be the same buffer as `src`.
 *
 * Malformed escapes are handled as follows:
 *  - `%` followed by a non-hex character: the `%` is dropped, the character
 *    is copied;
 *  - `%X` followed by a non-hex character: all three bytes are dropped;
 *  - an escape cut by the end of input produces no output.
 *
 * @param dst output buffer
 * @param src input buffer
 * @param size number of input bytes
 * @return number of bytes written to `dst`
 */
gsize
rspamd_url_decode (gchar *dst, const gchar *src, gsize size)
{
	enum {
		st_plain = 0,
		st_first_digit,
		st_second_digit
	} mode = st_plain;
	gchar *out = dst;
	gchar cur, lower, high_nibble = 0;
	gsize pos;

	for (pos = 0; pos < size; pos ++) {
		cur = src[pos];

		if (mode == st_plain) {
			if (cur == '%') {
				mode = st_first_digit;
			}
			else {
				*out++ = (cur == '+') ? ' ' : cur;
			}
		}
		else if (mode == st_first_digit) {
			/* Setting bit 0x20 folds 'A'..'F' onto 'a'..'f' */
			lower = (cur | 0x20);

			if (cur >= '0' && cur <= '9') {
				high_nibble = (cur - '0');
				mode = st_second_digit;
			}
			else if (lower >= 'a' && lower <= 'f') {
				high_nibble = (lower - 'a' + 10);
				mode = st_second_digit;
			}
			else {
				/* Not a hex digit: forget the '%', keep the character */
				mode = st_plain;
				*out++ = cur;
			}
		}
		else {
			lower = (cur | 0x20);
			mode = st_plain;

			if (cur >= '0' && cur <= '9') {
				*out++ = ((high_nibble << 4) + cur - '0');
			}
			else if (lower >= 'a' && lower <= 'f') {
				*out++ = ((high_nibble << 4) + lower - 'a' + 10);
			}
			/* Anything else: the broken escape is discarded entirely */
		}
	}

	return (out - dst);
}

/*
 * Helpers for rspamd_url_encode. Both macros expect the following names in
 * the enclosing scope: `i` (loop index), `dlen` (extra length accumulator),
 * `d`/`dend` (write cursor and end of the output buffer) and `hexdigests`
 * (table of 16 hex characters). `beg` must point to unsigned bytes, otherwise
 * the high-bit comparison does not work; it is evaluated more than once, so
 * it must be free of side effects.
 *
 * A byte is escaped when it is outside of ASCII (>= 0x80, so that 0x80 itself
 * is included) or when is_urlsafe rejects it. Both macros must use the very
 * same predicate: CHECK sizes the buffer that ENCODE fills.
 */

/* Count the growth of a component: each escaped byte takes 3 bytes, not 1 */
#define CHECK_URL_COMPONENT(beg, len) do { \
	for (i = 0; i < (len); i ++) { \
		if ((beg)[i] >= 0x80 || !is_urlsafe ((beg)[i])) { \
			dlen += 2; \
		} \
	} \
} while (0)

/*
 * Append a component to `d`, escaping bytes as `%xx`. Writing stops as soon as
 * the next item (3 bytes for an escape, 1 byte otherwise) does not fit before
 * `dend`, so an escape is never written partially or past the buffer end.
 */
#define ENCODE_URL_COMPONENT(beg, len) do { \
	for (i = 0; i < (len); i ++) { \
		if ((beg)[i] >= 0x80 || !is_urlsafe ((beg)[i])) { \
			if (dend - d < 3) { \
				break; \
			} \
			*d++ = '%'; \
			*d++ = hexdigests[((beg)[i] >> 4) & 0xf]; \
			*d++ = hexdigests[(beg)[i] & 0xf]; \
		} \
		else { \
			if (dend - d < 1) { \
				break; \
			} \
			*d++ = (beg)[i]; \
		} \
	} \
} while (0)

/**
 * Build a percent-encoded representation of a parsed url.
 *
 * When no byte of the host, user, path, query or fragment needs escaping, the
 * original `url->string` is returned and nothing is allocated. Otherwise a
 * buffer is taken from `pool` and the url is re-assembled as
 * `scheme://[user@]host[/path][?query][#fragment]`.
 *
 * The delimiters follow the generic URI syntax: `@` terminates the user part,
 * `?` introduces the query and `#` introduces the fragment. Using other
 * characters there (e.g. `/` before the query) would yield a different url.
 *
 * NOTE(review): only the components listed above are emitted; if a url
 * carries other parts (for instance a port) they are not reproduced in the
 * encoded form - confirm that this is intended.
 *
 * @param url parsed url, must not be NULL
 * @param pdlen receives the length of the returned string, must not be NULL
 * @param pool pool used for the new buffer when encoding is required, must
 * not be NULL
 * @return `url->string` or a pool allocated buffer; use `*pdlen` as its
 * length, no NUL terminator is appended after the last component
 */
const gchar *
rspamd_url_encode (struct rspamd_url *url, gsize *pdlen,
		rspamd_mempool_t *pool)
{
	guchar *dest, *d, *dend;
	static const gchar hexdigests[16] = "0123456789abcdef";
	guint i;
	gsize dlen = 0;

	g_assert (pdlen != NULL && url != NULL && pool != NULL);

	/* First pass: find out how many extra bytes escaping requires */
	CHECK_URL_COMPONENT ((guchar *)url->host, url->hostlen);
	CHECK_URL_COMPONENT ((guchar *)url->user, url->userlen);
	CHECK_URL_COMPONENT ((guchar *)url->data, url->datalen);
	CHECK_URL_COMPONENT ((guchar *)url->query, url->querylen);
	CHECK_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen);

	if (dlen == 0) {
		/* Nothing to escape: hand out the original string */
		*pdlen = url->urllen;

		return url->string;
	}

	/* Need to encode */
	dlen += url->urllen;
	dest = rspamd_mempool_alloc (pool, dlen + 1);
	d = dest;
	dend = d + dlen;
	d += rspamd_snprintf ((gchar *)d, dend - d,
			"%*s://", url->protocollen, url->protocol);

	if (url->userlen > 0) {
		ENCODE_URL_COMPONENT ((guchar *)url->user, url->userlen);
		/* userinfo is separated from the host by '@' */
		*d++ = '@';
	}

	ENCODE_URL_COMPONENT ((guchar *)url->host, url->hostlen);

	if (url->datalen > 0) {
		*d++ = '/';
		ENCODE_URL_COMPONENT ((guchar *)url->data, url->datalen);
	}

	if (url->querylen > 0) {
		/* the query starts after '?' */
		*d++ = '?';
		ENCODE_URL_COMPONENT ((guchar *)url->query, url->querylen);
	}

	if (url->fragmentlen > 0) {
		/* the fragment starts after '#' */
		*d++ = '#';
		ENCODE_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen);
	}

	*pdlen = (d - dest);

	return (const gchar *)dest;
}

+ 27
- 0
src/libserver/url.h View File

@@ -177,4 +177,31 @@ void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
const gchar *value,
rspamd_mempool_t *pool);

guint rspamd_url_hash (gconstpointer u);

/* Compare two emails for building emails hash */
gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);

/* Compare two urls for building urls hash */
gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);

/**
* Decode URL encoded string (dst may be the same buffer as src) and return the length of the decoded data; exactly `size` bytes of src are processed and dst is not NUL terminated by this function
* @param dst
* @param src
* @param size
* @return
*/
gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);

/**
* Encode url if needed. In this case, memory is allocated from the specific pool.
* Returns pointer to begin and encoded length in `dlen`
* @param url
* @param pool
* @return
*/
const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
rspamd_mempool_t *pool);

#endif

+ 4
- 3
src/libutil/http.c View File

@@ -27,6 +27,7 @@
#include "unix-std.h"
#include "libutil/ssl_util.h"
#include "libutil/regexp.h"
#include "libserver/url.h"

#define ENCRYPTED_VERSION " HTTP/1.0"

@@ -3376,7 +3377,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg)
/* We have a single parameter without a value */
key = rspamd_fstring_new_init (c, p - c);
key_tok = rspamd_ftok_map (key);
key_tok->len = rspamd_decode_url (key->str, key->str,
key_tok->len = rspamd_url_decode (key->str, key->str,
key->len);

value = rspamd_fstring_new_init ("", 0);
@@ -3389,7 +3390,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg)
/* We have something like key=value */
key = rspamd_fstring_new_init (c, p - c);
key_tok = rspamd_ftok_map (key);
key_tok->len = rspamd_decode_url (key->str, key->str,
key_tok->len = rspamd_url_decode (key->str, key->str,
key->len);

state = parse_eqsign;
@@ -3415,7 +3416,7 @@ rspamd_http_message_parse_query (struct rspamd_http_message *msg)
if (p > c) {
value = rspamd_fstring_new_init (c, p - c);
value_tok = rspamd_ftok_map (value);
value_tok->len = rspamd_decode_url (value->str,
value_tok->len = rspamd_url_decode (value->str,
value->str,
value->len);
/* Detect quotes for value */

+ 0
- 149
src/libutil/str_util.c View File

@@ -897,91 +897,7 @@ rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
return rspamd_encode_base64_common (in, inlen, str_len, outlen, TRUE, how);
}

/**
 * Decode a percent-encoded buffer.
 *
 * Exactly `size` bytes of `src` are consumed. `+` is turned into a space and
 * `%XX` (hex digits in any case) into the corresponding byte. Each input byte
 * is read before anything is written for it and the output never grows, so
 * `dst` may be the same buffer as `src`.
 *
 * Malformed escapes are handled as follows:
 *  - `%` followed by a non-hex character: the `%` is dropped, the character
 *    is copied;
 *  - `%X` followed by a non-hex character: all three bytes are dropped;
 *  - an escape cut by the end of input produces no output.
 *
 * @param dst output buffer
 * @param src input buffer
 * @param size number of input bytes
 * @return number of bytes written to `dst`
 */
gsize
rspamd_decode_url (gchar *dst, const gchar *src, gsize size)
{
	enum {
		st_plain = 0,
		st_first_digit,
		st_second_digit
	} mode = st_plain;
	gchar *out = dst;
	gchar cur, lower, high_nibble = 0;
	gsize pos;

	for (pos = 0; pos < size; pos ++) {
		cur = src[pos];

		if (mode == st_plain) {
			if (cur == '%') {
				mode = st_first_digit;
			}
			else {
				*out++ = (cur == '+') ? ' ' : cur;
			}
		}
		else if (mode == st_first_digit) {
			/* Setting bit 0x20 folds 'A'..'F' onto 'a'..'f' */
			lower = (cur | 0x20);

			if (cur >= '0' && cur <= '9') {
				high_nibble = (cur - '0');
				mode = st_second_digit;
			}
			else if (lower >= 'a' && lower <= 'f') {
				high_nibble = (lower - 'a' + 10);
				mode = st_second_digit;
			}
			else {
				/* Not a hex digit: forget the '%', keep the character */
				mode = st_plain;
				*out++ = cur;
			}
		}
		else {
			lower = (cur | 0x20);
			mode = st_plain;

			if (cur >= '0' && cur <= '9') {
				*out++ = ((high_nibble << 4) + cur - '0');
			}
			else if (lower >= 'a' && lower <= 'f') {
				*out++ = ((high_nibble << 4) + lower - 'a' + 10);
			}
			/* Anything else: the broken escape is discarded entirely */
		}
	}

	return (out - dst);
}
/*
 * Smallest of three values. Every argument may be evaluated more than once,
 * so do not pass expressions with side effects.
 */
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))

gint
@@ -2143,71 +2059,6 @@ rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
ucl_object_emit_full (obj, emit_type, &func, comments);
}

/**
 * Hash function for url objects: feeds the url string (when it is not empty)
 * and then the raw bytes of the url flags into the fast hash.
 * @param u pointer to a `struct rspamd_url`
 * @return resulting hash value
 */
guint
rspamd_url_hash (gconstpointer u)
{
	rspamd_cryptobox_fast_hash_state_t hash_state;
	const struct rspamd_url *entry = u;
	const gboolean has_text = (entry->urllen > 0);

	rspamd_cryptobox_fast_hash_init (&hash_state, rspamd_hash_seed ());

	/* An empty string contributes nothing, only the flags are hashed then */
	if (has_text) {
		rspamd_cryptobox_fast_hash_update (&hash_state,
				entry->string,
				entry->urllen);
	}

	rspamd_cryptobox_fast_hash_update (&hash_state,
			&entry->flags,
			sizeof (entry->flags));

	return rspamd_cryptobox_fast_hash_final (&hash_state);
}

/**
 * Equality callback for email addresses stored as urls.
 * Two entries are equal only when both have a non-empty host and a non-empty
 * user part, the lengths match, and both parts are equal according to
 * rspamd_lc_cmp.
 * @param a first `struct rspamd_url`
 * @param b second `struct rspamd_url`
 * @return TRUE when the entries are considered equal, FALSE otherwise
 */
gboolean
rspamd_emails_cmp (gconstpointer a, gconstpointer b)
{
	const struct rspamd_url *lhs = a;
	const struct rspamd_url *rhs = b;

	/* Hosts: empty or differently sized hosts never match */
	if (lhs->hostlen == 0 || lhs->hostlen != rhs->hostlen) {
		return FALSE;
	}

	if (rspamd_lc_cmp (lhs->host, rhs->host, lhs->hostlen) != 0) {
		return FALSE;
	}

	/* Users: the same rule as for hosts */
	if (lhs->userlen == 0 || lhs->userlen != rhs->userlen) {
		return FALSE;
	}

	return rspamd_lc_cmp (lhs->user, rhs->user, lhs->userlen) == 0;
}

/**
 * Equality callback for urls: the url strings must have the same length and
 * the same bytes, and the flags must be equal as well.
 * @param a first `struct rspamd_url`
 * @param b second `struct rspamd_url`
 * @return TRUE when the urls are considered equal, FALSE otherwise
 */
gboolean
rspamd_urls_cmp (gconstpointer a, gconstpointer b)
{
	const struct rspamd_url *lhs = a;
	const struct rspamd_url *rhs = b;

	if (lhs->urllen != rhs->urllen) {
		return FALSE;
	}

	if (memcmp (lhs->string, rhs->string, lhs->urllen) != 0) {
		return FALSE;
	}

	/*
	 * Same text but different flags: treat the entries as distinct so that
	 * phished urls are always inserted
	 */
	return lhs->flags == rhs->flags;
}

const void *
rspamd_memrchr (const void *m, gint c, gsize len)
{

+ 0
- 17
src/libutil/str_util.h View File

@@ -204,15 +204,6 @@ gchar * rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
gchar * rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
gsize *outlen, enum rspamd_newlines_type how);

/**
* Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
* @param dst
* @param src
* @param size
* @return
*/
gsize rspamd_decode_url (gchar *dst, const gchar *src, gsize size);

/**
* Decode quoted-printable encoded buffer, input and output must not overlap
* @param in input
@@ -343,14 +334,6 @@ void rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
rspamd_fstring_t **target,
const ucl_object_t *comments);

guint rspamd_url_hash (gconstpointer u);

/* Compare two emails for building emails hash */
gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);

/* Compare two urls for building emails hash */
gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);

extern const guchar lc_map[256];

/**

+ 1
- 1
src/lua/lua_util.c View File

@@ -857,7 +857,7 @@ lua_util_decode_url (lua_State *L)
rspamd_lua_setclass (L, "rspamd{text}", -1);
t->start = g_malloc (inlen);
memcpy ((char *)t->start, s, inlen);
t->len = rspamd_decode_url ((char *)t->start, s, inlen);
t->len = rspamd_url_decode ((char *)t->start, s, inlen);
t->flags = RSPAMD_TEXT_FLAG_OWN;
}
else {

Loading…
Cancel
Save