/* * Copyright 2024 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* URL check functions */ #ifndef URL_H #define URL_H #include "config.h" #include "mem_pool.h" #include "khash.h" #include "fstring.h" #include "libutil/cxx/utf8_util.h" #ifdef __cplusplus extern "C" { #endif struct rspamd_task; struct rspamd_mime_text_part; enum rspamd_url_flags { RSPAMD_URL_FLAG_PHISHED = 1u << 0u, RSPAMD_URL_FLAG_NUMERIC = 1u << 1u, RSPAMD_URL_FLAG_OBSCURED = 1u << 2u, RSPAMD_URL_FLAG_REDIRECTED = 1u << 3u, RSPAMD_URL_FLAG_HTML_DISPLAYED = 1u << 4u, RSPAMD_URL_FLAG_FROM_TEXT = 1u << 5u, RSPAMD_URL_FLAG_SUBJECT = 1u << 6u, RSPAMD_URL_FLAG_HOSTENCODED = 1u << 7u, RSPAMD_URL_FLAG_SCHEMAENCODED = 1u << 8u, RSPAMD_URL_FLAG_PATHENCODED = 1u << 9u, RSPAMD_URL_FLAG_QUERYENCODED = 1u << 10u, RSPAMD_URL_FLAG_MISSINGSLASHES = 1u << 11u, RSPAMD_URL_FLAG_IDN = 1u << 12u, RSPAMD_URL_FLAG_HAS_PORT = 1u << 13u, RSPAMD_URL_FLAG_HAS_USER = 1u << 14u, RSPAMD_URL_FLAG_SCHEMALESS = 1u << 15u, RSPAMD_URL_FLAG_UNNORMALISED = 1u << 16u, RSPAMD_URL_FLAG_ZW_SPACES = 1u << 17u, RSPAMD_URL_FLAG_DISPLAY_URL = 1u << 18u, RSPAMD_URL_FLAG_IMAGE = 1u << 19u, RSPAMD_URL_FLAG_QUERY = 1u << 20u, RSPAMD_URL_FLAG_CONTENT = 1u << 21u, RSPAMD_URL_FLAG_NO_TLD = 1u << 22u, RSPAMD_URL_FLAG_TRUNCATED = 1u << 23u, RSPAMD_URL_FLAG_REDIRECT_TARGET = 1u << 24u, RSPAMD_URL_FLAG_INVISIBLE = 1u << 25u, RSPAMD_URL_FLAG_SPECIAL = 1u << 26u, }; #define RSPAMD_URL_MAX_FLAG_SHIFT (26u) struct rspamd_url_tag { const char *data; struct rspamd_url_tag *prev, *next; }; struct rspamd_url_ext; /** * URL structure */ struct rspamd_url { char *string; char *raw; struct rspamd_url_ext *ext; uint32_t flags; uint8_t protocol; uint8_t protocollen; uint16_t hostshift; uint16_t datashift; uint16_t queryshift; uint16_t fragmentshift; uint16_t tldshift; uint16_t usershift; uint16_t userlen; uint16_t hostlen; uint16_t datalen; uint16_t querylen; uint16_t fragmentlen; uint16_t tldlen; uint16_t count; uint16_t urllen; uint16_t rawlen; /* Absolute order of the URL in a message */ uint16_t order; /* Order of the URL in a specific part of message */ uint16_t part_order; }; /** * Rarely used url fields */ struct rspamd_url_ext { char *visible_part; struct rspamd_url *linked_url; uint16_t port; }; #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL) #define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift) #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL) #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift) #define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift) #define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift) #define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift) #define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift) enum uri_errno { URI_ERRNO_OK = 0, /* Parsing went well */ URI_ERRNO_EMPTY, /* The URI string was empty */ URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */ URI_ERRNO_INVALID_PORT, /* Port number is bad */ URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */ URI_ERRNO_BAD_FORMAT, URI_ERRNO_TLD_MISSING, URI_ERRNO_HOST_MISSING, URI_ERRNO_TOO_LONG, }; enum rspamd_url_protocol { PROTOCOL_FILE = 1u << 0u, PROTOCOL_FTP = 1u << 1u, PROTOCOL_HTTP = 1u << 2u, PROTOCOL_HTTPS = 1u << 3u, PROTOCOL_MAILTO = 1u << 4u, PROTOCOL_TELEPHONE = 1u << 5u, PROTOCOL_UNKNOWN = 1u << 7u, }; enum rspamd_url_parse_flags { RSPAMD_URL_PARSE_TEXT = 0u, RSPAMD_URL_PARSE_HREF = (1u << 0u), RSPAMD_URL_PARSE_CHECK = (1u << 1u), }; enum rspamd_url_find_type { RSPAMD_URL_FIND_ALL = 0, RSPAMD_URL_FIND_STRICT, }; /** * Initialize url library * @param cfg */ void rspamd_url_init(const char *tld_file); void rspamd_url_deinit(void); /* * Parse urls inside text * @param pool memory pool * @param task task object * @param part current text part * @param is_html turn on html heuristic */ void rspamd_url_text_extract(rspamd_mempool_t *pool, struct rspamd_task *task, struct rspamd_mime_text_part *part, uint16_t *cur_order, enum rspamd_url_find_type how); /* * Parse a single url into an uri structure * @param pool memory pool * @param uristring text form of url * @param uri url object, must be pre allocated */ enum uri_errno rspamd_url_parse(struct rspamd_url *uri, char *uristring, gsize len, rspamd_mempool_t *pool, enum rspamd_url_parse_flags flags); /* * Try to extract url from a text * @param pool memory pool * @param begin begin of text * @param len length of text * @param start storage for start position of url found (or NULL) * @param end storage for end position of url found (or NULL) * @param url_str storage for url string(or NULL) * @return TRUE if url is found in specified text */ gboolean rspamd_url_find(rspamd_mempool_t *pool, const char *begin, gsize len, char **url_str, enum rspamd_url_find_type how, goffset *url_pos, gboolean *prefix_added); /* * Return text representation of url parsing error */ const char *rspamd_url_strerror(int err); /** * Find TLD for a specified host string * @param in input host * @param inlen length of input * @param out output rspamd_ftok_t with tld position * @return TRUE if tld has been found */ gboolean rspamd_url_find_tld(const char *in, gsize inlen, rspamd_ftok_t *out); typedef gboolean (*url_insert_function)(struct rspamd_url *url, gsize start_offset, gsize end_offset, void *ud); /** * Search for multiple urls in text and call `func` for each url found * @param pool * @param in * @param inlen * @param is_html * @param func * @param ud */ void rspamd_url_find_multiple(rspamd_mempool_t *pool, const char *in, gsize inlen, enum rspamd_url_find_type how, GPtrArray *nlines, url_insert_function func, gpointer ud); /** * Search for a single url in text and call `func` for each url found * @param pool * @param in * @param inlen * @param is_html * @param func * @param ud */ void rspamd_url_find_single(rspamd_mempool_t *pool, const char *in, gsize inlen, enum rspamd_url_find_type how, url_insert_function func, gpointer ud); /** * Generic callback to insert URLs into rspamd_task * @param url * @param start_offset * @param end_offset * @param ud */ gboolean rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset, gsize end_offset, gpointer ud); /** * Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated * @param dst * @param src * @param size * @return */ gsize rspamd_url_decode(char *dst, const char *src, gsize size); /** * Encode url if needed. In this case, memory is allocated from the specific pool. * Returns pointer to begin and encoded length in `dlen` * @param url * @param pool * @return */ const char *rspamd_url_encode(struct rspamd_url *url, gsize *dlen, rspamd_mempool_t *pool); /** * Returns if a character is domain character * @param c * @return */ gboolean rspamd_url_is_domain(int c); /** * Returns symbolic name for protocol * @param proto * @return */ const char *rspamd_url_protocol_name(enum rspamd_url_protocol proto); /** * Converts string to a numeric protocol * @param str * @return */ enum rspamd_url_protocol rspamd_url_protocol_from_string(const char *str); /** * Converts string to a url flag * @param str * @param flag * @return */ bool rspamd_url_flag_from_string(const char *str, int *flag); /** * Converts url flag to a string * @param flag * @return */ const char *rspamd_url_flag_to_string(int flag); /* Defines sets of urls indexed by url as is */ KHASH_DECLARE(rspamd_url_hash, struct rspamd_url *, char); KHASH_DECLARE(rspamd_url_host_hash, struct rspamd_url *, char); /* Convenience functions for url sets */ /** * Add an url to set or increase the existing url count * @param set * @param u * @return true if a new url has been added */ bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set, struct rspamd_url *u, bool enforce_replace); /** * Same as rspamd_url_set_add_or_increase but returns the existing url if found * @param set * @param u * @return */ struct rspamd_url *rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set, struct rspamd_url *u); /** * Helper for url host set * @param set * @param u * @return */ bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u); /** * Checks if a url is in set * @param set * @param u * @return */ bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u); bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u); /** * Compares two urls (similar to C comparison functions) lexicographically * @param u1 * @param u2 * @return */ int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2); /** * Same but used for qsort to sort `struct rspamd_url *[]` array * @param u1 * @param u2 * @return */ int rspamd_url_cmp_qsort(const void *u1, const void *u2); /** * Returns a port for some url * @param u * @return */ static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port(struct rspamd_url *u) { if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) { return u->ext->port; } else { /* Assume standard port */ if (u->protocol == PROTOCOL_HTTPS) { return 443; } else { return 80; } } } /** * Returns a port for some url if it is set * @param u * @return */ static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port_if_special(struct rspamd_url *u) { if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) { return u->ext->port; } return 0; } /** * Normalize unicode input and set out url flags as appropriate * @param pool * @param input * @param len_out (must be &var) * @param url_flags_out (must be just a var with no dereference) */ #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \ do { \ enum rspamd_utf8_normalise_result norm_res; \ norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \ if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \ url_flags_out |= RSPAMD_URL_FLAG_UNNORMALISED; \ } \ if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { \ url_flags_out |= RSPAMD_URL_FLAG_ZW_SPACES; \ } \ if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) { \ url_flags_out |= RSPAMD_URL_FLAG_OBSCURED; \ } \ } while (0) #ifdef __cplusplus } #endif #endif