From c62f291c138c795eb6f4ec8ce0e59204f5de3ca2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 11 May 2021 15:13:15 +0100 Subject: [Fix] Fix normalisation flags propagation --- src/libserver/html.c | 19 ++------ src/libserver/url.c | 30 ++++++------- src/libserver/url.h | 125 ++++++++++++++++++++++++++++++--------------------- 3 files changed, 91 insertions(+), 83 deletions(-) (limited to 'src/libserver') diff --git a/src/libserver/html.c b/src/libserver/html.c index 4cb46445f..c373bb115 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1593,21 +1593,7 @@ rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len, url = rspamd_mempool_alloc0 (pool, sizeof (*url)); - enum rspamd_normalise_result norm_res; - - norm_res = rspamd_normalise_unicode_inplace (pool, decoded, &dlen); - - if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { - saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED; - } - - if (norm_res & (RSPAMD_UNICODE_NORM_ZERO_SPACES|RSPAMD_UNICODE_NORM_ERROR)) { - saved_flags |= RSPAMD_URL_FLAG_OBSCURED; - - if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { - saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES; - } - } + rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags); rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); @@ -2644,6 +2630,9 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { saved_flags |= RSPAMD_URL_FLAG_UNNORMALISED; } + if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { + saved_flags |= RSPAMD_URL_FLAG_ZW_SPACES; + } rspamd_html_url_is_phished (pool, url, url->visible_part, diff --git a/src/libserver/url.c b/src/libserver/url.c index d36704e73..eb663519d 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1339,7 +1339,7 @@ rspamd_web_parse (struct http_parser_url *u, const gchar *str, gsize len, if (!u_isalnum (uc)) { /* Bad symbol */ if (IS_ZERO_WIDTH_SPACE (uc)) { - (*flags) |= RSPAMD_URL_FLAG_OBSCURED; + (*flags) |= RSPAMD_URL_FLAG_OBSCURED|RSPAMD_URL_FLAG_ZW_SPACES; } else { if (!u_isgraph (uc)) { @@ -2308,10 +2308,8 @@ rspamd_url_parse (struct rspamd_url *uri, unquoted_len = rspamd_url_decode (rspamd_url_host_unsafe (uri), rspamd_url_host_unsafe (uri), uri->hostlen); - if (rspamd_normalise_unicode_inplace (pool, - rspamd_url_host_unsafe (uri), &unquoted_len)) { - uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; - } + rspamd_url_normalise_propagate_flags (pool, rspamd_url_host_unsafe (uri), + &unquoted_len, uri->flags); rspamd_url_shift (uri, unquoted_len, UF_HOST); @@ -2380,10 +2378,10 @@ rspamd_url_parse (struct rspamd_url *uri, if (uri->datalen) { unquoted_len = rspamd_url_decode (rspamd_url_data_unsafe (uri), rspamd_url_data_unsafe (uri), uri->datalen); - if (rspamd_normalise_unicode_inplace (pool, rspamd_url_data_unsafe (uri), - &unquoted_len)) { - uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; - } + + rspamd_url_normalise_propagate_flags (pool, rspamd_url_data_unsafe (uri), + &unquoted_len, uri->flags); + rspamd_url_shift (uri, unquoted_len, UF_PATH); /* We now normalize path */ rspamd_http_normalize_path_inplace (rspamd_url_data_unsafe (uri), @@ -2395,10 +2393,9 @@ rspamd_url_parse (struct rspamd_url *uri, unquoted_len = rspamd_url_decode (rspamd_url_query_unsafe (uri), rspamd_url_query_unsafe (uri), uri->querylen); - if (rspamd_normalise_unicode_inplace (pool, rspamd_url_query_unsafe (uri), - &unquoted_len)) { - uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; - } + + rspamd_url_normalise_propagate_flags (pool, rspamd_url_query_unsafe (uri), + &unquoted_len, uri->flags); rspamd_url_shift (uri, unquoted_len, UF_QUERY); } @@ -2406,10 +2403,9 @@ rspamd_url_parse (struct rspamd_url *uri, unquoted_len = rspamd_url_decode (rspamd_url_fragment_unsafe (uri), rspamd_url_fragment_unsafe (uri), uri->fragmentlen); - if (rspamd_normalise_unicode_inplace (pool, rspamd_url_fragment_unsafe (uri), - &unquoted_len)) { - uri->flags |= RSPAMD_URL_FLAG_UNNORMALISED; - } + + rspamd_url_normalise_propagate_flags (pool, rspamd_url_fragment_unsafe (uri), + &unquoted_len, uri->flags); rspamd_url_shift (uri, unquoted_len, UF_FRAGMENT); } diff --git a/src/libserver/url.h b/src/libserver/url.h index 249c316e4..72fce5f9e 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -127,9 +127,9 @@ enum rspamd_url_find_type { * Initialize url library * @param cfg */ -void rspamd_url_init (const gchar *tld_file); +void rspamd_url_init(const gchar *tld_file); -void rspamd_url_deinit (void); +void rspamd_url_deinit(void); /* * Parse urls inside text @@ -138,10 +138,10 @@ void rspamd_url_deinit (void); * @param part current text part * @param is_html turn on html euristic */ -void rspamd_url_text_extract (rspamd_mempool_t *pool, - struct rspamd_task *task, - struct rspamd_mime_text_part *part, - enum rspamd_url_find_type how); +void rspamd_url_text_extract(rspamd_mempool_t *pool, + struct rspamd_task *task, + struct rspamd_mime_text_part *part, + enum rspamd_url_find_type how); /* * Parse a single url into an uri structure @@ -149,11 +149,11 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool, * @param uristring text form of url * @param uri url object, must be pre allocated */ -enum uri_errno rspamd_url_parse (struct rspamd_url *uri, - gchar *uristring, - gsize len, - rspamd_mempool_t *pool, - enum rspamd_url_parse_flags flags); +enum uri_errno rspamd_url_parse(struct rspamd_url *uri, + gchar *uristring, + gsize len, + rspamd_mempool_t *pool, + enum rspamd_url_parse_flags flags); /* * Try to extract url from a text @@ -165,17 +165,17 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri, * @param url_str storage for url string(or NULL) * @return TRUE if url is found in specified text */ -gboolean rspamd_url_find (rspamd_mempool_t *pool, - const gchar *begin, gsize len, - gchar **url_str, - enum rspamd_url_find_type how, - goffset *url_pos, - gboolean *prefix_added); +gboolean rspamd_url_find(rspamd_mempool_t *pool, + const gchar *begin, gsize len, + gchar **url_str, + enum rspamd_url_find_type how, + goffset *url_pos, + gboolean *prefix_added); /* * Return text representation of url parsing error */ -const gchar *rspamd_url_strerror (int err); +const gchar *rspamd_url_strerror(int err); /** @@ -185,10 +185,10 @@ const gchar *rspamd_url_strerror (int err); * @param out output rspamd_ftok_t with tld position * @return TRUE if tld has been found */ -gboolean rspamd_url_find_tld (const gchar *in, gsize inlen, rspamd_ftok_t *out); +gboolean rspamd_url_find_tld(const gchar *in, gsize inlen, rspamd_ftok_t *out); -typedef gboolean (*url_insert_function) (struct rspamd_url *url, - gsize start_offset, gsize end_offset, void *ud); +typedef gboolean (*url_insert_function)(struct rspamd_url *url, + gsize start_offset, gsize end_offset, void *ud); /** * Search for multiple urls in text and call `func` for each url found @@ -199,12 +199,12 @@ typedef gboolean (*url_insert_function) (struct rspamd_url *url, * @param func * @param ud */ -void rspamd_url_find_multiple (rspamd_mempool_t *pool, - const gchar *in, gsize inlen, - enum rspamd_url_find_type how, - GPtrArray *nlines, - url_insert_function func, - gpointer ud); +void rspamd_url_find_multiple(rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + GPtrArray *nlines, + url_insert_function func, + gpointer ud); /** * Search for a single url in text and call `func` for each url found @@ -215,11 +215,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool, * @param func * @param ud */ -void rspamd_url_find_single (rspamd_mempool_t *pool, - const gchar *in, gsize inlen, - enum rspamd_url_find_type how, - url_insert_function func, - gpointer ud); +void rspamd_url_find_single(rspamd_mempool_t *pool, + const gchar *in, gsize inlen, + enum rspamd_url_find_type how, + url_insert_function func, + gpointer ud); /** * Generic callback to insert URLs into rspamd_task @@ -228,9 +228,9 @@ void rspamd_url_find_single (rspamd_mempool_t *pool, * @param end_offset * @param ud */ -gboolean rspamd_url_task_subject_callback (struct rspamd_url *url, - gsize start_offset, - gsize end_offset, gpointer ud); +gboolean rspamd_url_task_subject_callback(struct rspamd_url *url, + gsize start_offset, + gsize end_offset, gpointer ud); /** * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated @@ -239,7 +239,7 @@ gboolean rspamd_url_task_subject_callback (struct rspamd_url *url, * @param size * @return */ -gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size); +gsize rspamd_url_decode(gchar *dst, const gchar *src, gsize size); /** * Encode url if needed. In this case, memory is allocated from the specific pool. @@ -248,8 +248,8 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size); * @param pool * @return */ -const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen, - rspamd_mempool_t *pool); +const gchar *rspamd_url_encode(struct rspamd_url *url, gsize *dlen, + rspamd_mempool_t *pool); /** @@ -257,14 +257,14 @@ const gchar *rspamd_url_encode (struct rspamd_url *url, gsize *dlen, * @param c * @return */ -gboolean rspamd_url_is_domain (int c); +gboolean rspamd_url_is_domain(int c); /** * Returns symbolic name for protocol * @param proto * @return */ -const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto); +const gchar *rspamd_url_protocol_name(enum rspamd_url_protocol proto); /** @@ -272,7 +272,7 @@ const gchar *rspamd_url_protocol_name (enum rspamd_url_protocol proto); * @param str * @return */ -enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str); +enum rspamd_url_protocol rspamd_url_protocol_from_string(const gchar *str); /** * Converts string to a url flag @@ -280,14 +280,14 @@ enum rspamd_url_protocol rspamd_url_protocol_from_string (const gchar *str); * @param flag * @return */ -bool rspamd_url_flag_from_string (const gchar *str, gint *flag); +bool rspamd_url_flag_from_string(const gchar *str, gint *flag); /** * Converts url flag to a string * @param flag * @return */ -const gchar * rspamd_url_flag_to_string (int flag); +const gchar *rspamd_url_flag_to_string(int flag); /* Defines sets of urls indexed by url as is */ KHASH_DECLARE (rspamd_url_hash, struct rspamd_url *, char); @@ -310,24 +310,25 @@ bool rspamd_url_set_add_or_increase(khash_t (rspamd_url_hash) *set, * @param u * @return */ -struct rspamd_url * rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set, - struct rspamd_url *u); +struct rspamd_url *rspamd_url_set_add_or_return(khash_t (rspamd_url_hash) *set, + struct rspamd_url *u); /** * Helper for url host set * @param set * @param u * @return */ -bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set, - struct rspamd_url *u); +bool rspamd_url_host_set_add(khash_t (rspamd_url_host_hash) *set, + struct rspamd_url *u); /** * Checks if a url is in set * @param set * @param u * @return */ -bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u); -bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u); +bool rspamd_url_set_has(khash_t (rspamd_url_hash) *set, struct rspamd_url *u); + +bool rspamd_url_host_set_has(khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u); /** * Compares two urls (similar to C comparison functions) lexicographically @@ -335,15 +336,37 @@ bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd * @param u2 * @return */ -int rspamd_url_cmp (const struct rspamd_url *u1, const struct rspamd_url *u2); +int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2); + /** * Same but used for qsort to sort `struct rspamd_url *[]` array * @param u1 * @param u2 * @return */ -int rspamd_url_cmp_qsort (const void *u1, const void *u2); +int rspamd_url_cmp_qsort(const void *u1, const void *u2); +/** + * Normalize unicode input and set out url flags as appropriate + * @param pool + * @param input + * @param len_out (must be &var) + * @param url_flags_out (must be just a var with no dereference) + */ +#define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \ + do { \ + enum rspamd_normalise_result norm_res; \ + norm_res = rspamd_normalise_unicode_inplace((pool), (input), (len_out)); \ + if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \ + url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \ + } \ + if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { \ + url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES; \ + } \ + if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) { \ + url_flags_out |= RSPAMD_URL_FLAG_OBSCURED; \ + } \ + } while(0) #ifdef __cplusplus } #endif -- cgit v1.2.3