diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-03-06 13:14:41 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-03-09 10:46:11 +0000 |
commit | 3e3b94276f03f520bcd1756876c1077f250127d9 (patch) | |
tree | 950caa40276702d2bf2532deb87f4f2acda01f51 | |
parent | cd270c51b2ccd814804e4f17eb31dc7d91a69980 (diff) | |
download | rspamd-3e3b94276f03f520bcd1756876c1077f250127d9.tar.gz rspamd-3e3b94276f03f520bcd1756876c1077f250127d9.zip |
[Rework] Rework URL structure: more structure optimisations
-rw-r--r-- | src/libserver/html.c | 4 | ||||
-rw-r--r-- | src/libserver/url.c | 114 | ||||
-rw-r--r-- | src/libserver/url.h | 37 | ||||
-rw-r--r-- | src/lua/lua_url.c | 12 |
4 files changed, 98 insertions, 69 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index 7dca72453..e1a211d2c 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1631,7 +1631,7 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, if (url->querylen > 0) { - if (rspamd_url_find (pool, url->query, url->querylen, &url_str, + if (rspamd_url_find (pool, rspamd_url_query_unsafe (url), url->querylen, &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (pool, @@ -1646,7 +1646,7 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url, if (rc == URI_ERRNO_OK && query_url->hostlen > 0) { msg_debug_html ("found url %s in query of url" - " %*s", url_str, url->querylen, url->query); + " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url)); if (query_url->protocol == PROTOCOL_MAILTO) { target_tbl = tbl_emails; diff --git a/src/libserver/url.c b/src/libserver/url.c index ac4c11916..7e85a460e 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1573,6 +1573,7 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a rspamd_mempool_t *pool) { gchar *strbuf, *p; + const gchar *start_offset; gsize slen = uri->urllen - uri->hostlen; goffset r = 0; @@ -1589,39 +1590,46 @@ rspamd_url_regen_from_inet_addr (struct rspamd_url *uri, const void *addr, int a (gint)(uri->hostshift), uri->string); uri->hostshift = r; + start_offset = strbuf + r; inet_ntop (af, addr, strbuf + r, slen - r + 1); - uri->hostlen = strlen (rspamd_url_host_unsafe (uri)); + uri->hostlen = strlen (start_offset); r += uri->hostlen; - uri->tld = rspamd_url_host_unsafe (uri); + uri->tld = (const gchar *)start_offset; uri->tldlen = uri->hostlen; uri->flags |= RSPAMD_URL_FLAG_NUMERIC; /* Reconstruct URL */ if (uri->datalen > 0) { - p = strbuf + r + 1; + p = strbuf + r; + start_offset = p + 1; r += rspamd_snprintf (strbuf + r, slen - r, "/%*s", (gint)uri->datalen, - uri->data); - uri->data = p; + rspamd_url_data_unsafe (uri)); + uri->datashift = start_offset - strbuf; } else { /* Add trailing slash if needed */ - r += rspamd_snprintf (strbuf + r, slen - r, "/"); + if (uri->hostlen + uri->hostshift < uri->urllen && + *(rspamd_url_host_unsafe (uri) + uri->hostlen) == '/') { + r += rspamd_snprintf (strbuf + r, slen - r, "/"); + } } if (uri->querylen > 0) { - p = strbuf + r + 1; + p = strbuf + r; + start_offset = p + 1; r += rspamd_snprintf (strbuf + r, slen - r, "?%*s", (gint)uri->querylen, - uri->query); - uri->query = p; + rspamd_url_query_unsafe (uri)); + uri->queryshift = start_offset - strbuf; } if (uri->fragmentlen > 0) { - p = strbuf + r + 1; + p = strbuf + r; + start_offset = p + 1; r += rspamd_snprintf (strbuf + r, slen - r, "#%*s", (gint)uri->fragmentlen, - uri->fragment); - uri->fragment = p; + rspamd_url_fragment_unsafe (uri)); + uri->fragmentshift = start_offset - strbuf; } uri->string = strbuf; @@ -1832,9 +1840,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen, old_shift = uri->datalen; uri->datalen -= shift; - remain = (uri->urllen - (uri->data - uri->string)) - old_shift; + remain = (uri->urllen - (uri->datashift)) - old_shift; g_assert (remain >= 0); - memmove (uri->data + uri->datalen, uri->data + old_shift, + memmove (rspamd_url_data_unsafe (uri) + uri->datalen, + rspamd_url_data_unsafe (uri) + old_shift, remain); uri->urllen -= shift; uri->flags |= RSPAMD_URL_FLAG_PATHENCODED; @@ -1849,9 +1858,10 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen, old_shift = uri->querylen; uri->querylen -= shift; - remain = (uri->urllen - (uri->query - uri->string)) - old_shift; + remain = (uri->urllen - (uri->queryshift)) - old_shift; g_assert (remain >= 0); - memmove (uri->query + uri->querylen, uri->query + old_shift, + memmove (rspamd_url_query_unsafe (uri) + uri->querylen, + rspamd_url_query_unsafe (uri) + old_shift, remain); uri->urllen -= shift; uri->flags |= RSPAMD_URL_FLAG_QUERYENCODED; @@ -1881,21 +1891,25 @@ rspamd_url_shift (struct rspamd_url *uri, gsize nlen, uri->hostshift -= shift; } /* Go forward */ + /* FALLTHRU */ case UF_HOST: if (uri->datalen > 0) { - uri->data -= shift; + uri->datashift -= shift; } /* Go forward */ + /* FALLTHRU */ case UF_PATH: if (uri->querylen > 0) { - uri->query -= shift; + uri->queryshift -= shift; } /* Go forward */ + /* FALLTHRU */ case UF_QUERY: if (uri->fragmentlen > 0) { - uri->fragment -= shift; + uri->fragmentshift -= shift; } /* Go forward */ + /* FALLTHRU */ case UF_FRAGMENT: default: break; @@ -1943,7 +1957,7 @@ rspamd_url_parse (struct rspamd_url *uri, enum rspamd_url_parse_flags parse_flags) { struct http_parser_url u; - gchar *p, *comp; + gchar *p; const gchar *end; guint i, complen, ret, flags = 0; guint unquoted_len = 0; @@ -2015,31 +2029,36 @@ rspamd_url_parse (struct rspamd_url *uri, for (i = 0; i < UF_MAX; i++) { if (u.field_set & (1 << i)) { - comp = uri->string + u.field_data[i].off; + guint shift = u.field_data[i].off; complen = u.field_data[i].len; + if (complen >= G_MAXUINT16) { + /* Too large component length */ + return URI_ERRNO_BAD_FORMAT; + } + switch (i) { case UF_SCHEMA: uri->protocollen = u.field_data[i].len; break; case UF_HOST: - uri->hostshift = u.field_data[i].off; + uri->hostshift = shift; uri->hostlen = complen; break; case UF_PATH: - uri->data = comp; + uri->datashift = shift; uri->datalen = complen; break; case UF_QUERY: - uri->query = comp; + uri->queryshift = shift; uri->querylen = complen; break; case UF_FRAGMENT: - uri->fragment = comp; + uri->fragmentshift = shift; uri->fragmentlen = complen; break; case UF_USERINFO: - uri->usershift = u.field_data[i].off; + uri->usershift = shift; uri->userlen = complen; break; default: @@ -2129,31 +2148,36 @@ rspamd_url_parse (struct rspamd_url *uri, /* Process data part */ if (uri->datalen) { - unquoted_len = rspamd_url_decode (uri->data, uri->data, uri->datalen); - if (rspamd_normalise_unicode_inplace (pool, uri->data, &unquoted_len)) { + unquoted_len = rspamd_url_decode (rspamd_url_data_unsafe (uri), + rspamd_url_data_unsafe (uri), uri->datalen); + if (rspamd_normalise_unicode_inplace (pool, rspamd_url_data_unsafe (uri), + &unquoted_len)) { uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; } rspamd_url_shift (uri, unquoted_len, UF_PATH); /* We now normalize path */ - rspamd_http_normalize_path_inplace (uri->data, uri->datalen, &unquoted_len); + rspamd_http_normalize_path_inplace (rspamd_url_data_unsafe (uri), + uri->datalen, &unquoted_len); rspamd_url_shift (uri, unquoted_len, UF_PATH); } if (uri->querylen) { - unquoted_len = rspamd_url_decode (uri->query, - uri->query, + unquoted_len = rspamd_url_decode (rspamd_url_query_unsafe (uri), + rspamd_url_query_unsafe (uri), uri->querylen); - if (rspamd_normalise_unicode_inplace (pool, uri->query, &unquoted_len)) { + if (rspamd_normalise_unicode_inplace (pool, rspamd_url_query_unsafe (uri), + &unquoted_len)) { uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; } rspamd_url_shift (uri, unquoted_len, UF_QUERY); } if (uri->fragmentlen) { - unquoted_len = rspamd_url_decode (uri->fragment, - uri->fragment, + unquoted_len = rspamd_url_decode (rspamd_url_fragment_unsafe (uri), + rspamd_url_fragment_unsafe (uri), uri->fragmentlen); - if (rspamd_normalise_unicode_inplace (pool, uri->fragment, &unquoted_len)) { + if (rspamd_normalise_unicode_inplace (pool, rspamd_url_fragment_unsafe (uri), + &unquoted_len)) { uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; } rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT); @@ -3148,7 +3172,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, /* We also search the query for additional url inside */ if (url->querylen > 0) { - if (rspamd_url_find (task->task_pool, url->query, url->querylen, + if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen, &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url)); @@ -3161,7 +3185,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, if (rc == URI_ERRNO_OK && query_url->hostlen > 0) { msg_debug_task ("found url %s in query of url" - " %*s", url_str, url->querylen, url->query); + " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url)); if (prefix_added) { query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; @@ -3314,7 +3338,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, /* We also search the query for additional url inside */ if (url->querylen > 0) { - if (rspamd_url_find (task->task_pool, url->query, url->querylen, + if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen, &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) { query_url = rspamd_mempool_alloc0 (task->task_pool, @@ -3328,7 +3352,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, if (rc == URI_ERRNO_OK && url->hostlen > 0) { msg_debug_task ("found url %s in query of url" - " %*s", url_str, url->querylen, url->query); + " %*s", url_str, url->querylen, rspamd_url_query_unsafe (url)); if (prefix_added) { query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; @@ -3651,11 +3675,11 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen, RSPAMD_URL_FLAGS_HOSTSAFE); CHECK_URL_COMPONENT (rspamd_url_user_unsafe(url), url->userlen, RSPAMD_URL_FLAGS_USERSAFE); - CHECK_URL_COMPONENT ((guchar *)url->data, url->datalen, + CHECK_URL_COMPONENT (rspamd_url_data_unsafe (url), url->datalen, RSPAMD_URL_FLAGS_PATHSAFE); - CHECK_URL_COMPONENT ((guchar *)url->query, url->querylen, + CHECK_URL_COMPONENT (rspamd_url_query_unsafe (url), url->querylen, RSPAMD_URL_FLAGS_QUERYSAFE); - CHECK_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen, + CHECK_URL_COMPONENT (rspamd_url_fragment_unsafe (url), url->fragmentlen, RSPAMD_URL_FLAGS_FRAGMENTSAFE); if (dlen == 0) { @@ -3698,19 +3722,19 @@ rspamd_url_encode (struct rspamd_url *url, gsize *pdlen, if (url->datalen > 0) { *d++ = '/'; - ENCODE_URL_COMPONENT ((guchar *)url->data, url->datalen, + ENCODE_URL_COMPONENT (rspamd_url_data_unsafe (url), url->datalen, RSPAMD_URL_FLAGS_PATHSAFE); } if (url->querylen > 0) { *d++ = '?'; - ENCODE_URL_COMPONENT ((guchar *)url->query, url->querylen, + ENCODE_URL_COMPONENT (rspamd_url_query_unsafe (url), url->querylen, RSPAMD_URL_FLAGS_QUERYSAFE); } if (url->fragmentlen > 0) { *d++ = '#'; - ENCODE_URL_COMPONENT ((guchar *)url->fragment, url->fragmentlen, + ENCODE_URL_COMPONENT (rspamd_url_fragment_unsafe (url), url->fragmentlen, RSPAMD_URL_FLAGS_FRAGMENTSAFE); } diff --git a/src/libserver/url.h b/src/libserver/url.h index 080f005c3..87766c4e6 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -44,33 +44,34 @@ struct rspamd_url_tag { struct rspamd_url { gchar *raw; gchar *string; - guint protocol; - guint port; - guint usershift; - guint userlen; + guint16 protocol; + guint16 port; + guint usershift; guint hostshift; - guint hostlen; + guint datashift; + guint queryshift; + guint fragmentshift; - gchar *data; - gchar *query; - gchar *fragment; gchar *tld; gchar *visible_part; struct rspamd_url *phished_url; - guint protocollen; - guint datalen; - guint querylen; - guint fragmentlen; - guint tldlen; guint urllen; guint rawlen; + guint32 flags; - enum rspamd_url_flags flags; - guint count; + guint16 protocollen; + guint16 userlen; + guint16 hostlen; + guint16 datalen; + guint16 querylen; + guint16 fragmentlen; + guint16 tldlen; + + guint16 count; }; #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL) @@ -79,6 +80,10 @@ struct rspamd_url { #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL) #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift) +#define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift) +#define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift) +#define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift) + enum uri_errno { URI_ERRNO_OK = 0, /* Parsing went well */ URI_ERRNO_EMPTY, /* The URI string was empty */ @@ -97,7 +102,7 @@ enum rspamd_url_protocol { PROTOCOL_HTTPS = 1u << 3u, PROTOCOL_MAILTO = 1u << 4u, PROTOCOL_TELEPHONE = 1u << 5u, - PROTOCOL_UNKNOWN = 1u << 31u, + PROTOCOL_UNKNOWN = 1u << 15u, }; enum rspamd_url_parse_flags { diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c index bd94120e2..cb54a694c 100644 --- a/src/lua/lua_url.c +++ b/src/lua/lua_url.c @@ -220,7 +220,7 @@ lua_url_get_path (lua_State *L) struct rspamd_lua_url *url = lua_check_url (L, 1); if (url != NULL && url->url->datalen > 0) { - lua_pushlstring (L, url->url->data, url->url->datalen); + lua_pushlstring (L, rspamd_url_data_unsafe (url->url), url->url->datalen); } else { lua_pushnil (L); @@ -241,7 +241,7 @@ lua_url_get_query (lua_State *L) struct rspamd_lua_url *url = lua_check_url (L, 1); if (url != NULL && url->url->querylen > 0) { - lua_pushlstring (L, url->url->query, url->url->querylen); + lua_pushlstring (L, rspamd_url_query_unsafe (url->url), url->url->querylen); } else { lua_pushnil (L); @@ -262,7 +262,7 @@ lua_url_get_fragment (lua_State *L) struct rspamd_lua_url *url = lua_check_url (L, 1); if (url != NULL && url->url->fragmentlen > 0) { - lua_pushlstring (L, url->url->fragment, url->url->fragmentlen); + lua_pushlstring (L, rspamd_url_fragment_unsafe (url->url), url->url->fragmentlen); } else { lua_pushnil (L); @@ -684,19 +684,19 @@ lua_url_to_table (lua_State *L) if (u->datalen > 0) { lua_pushstring (L, "path"); - lua_pushlstring (L, u->data, u->datalen); + lua_pushlstring (L, rspamd_url_data_unsafe (u), u->datalen); lua_settable (L, -3); } if (u->querylen > 0) { lua_pushstring (L, "query"); - lua_pushlstring (L, u->query, u->querylen); + lua_pushlstring (L, rspamd_url_query_unsafe (u), u->querylen); lua_settable (L, -3); } if (u->fragmentlen > 0) { lua_pushstring (L, "fragment"); - lua_pushlstring (L, u->fragment, u->fragmentlen); + lua_pushlstring (L, rspamd_url_fragment_unsafe (u), u->fragmentlen); lua_settable (L, -3); } |