123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446 |
- /*
- * Copyright 2024 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /* URL check functions */
- #ifndef URL_H
- #define URL_H
-
- #include "config.h"
- #include "mem_pool.h"
- #include "khash.h"
- #include "fstring.h"
- #include "libutil/cxx/utf8_util.h"
-
- #ifdef __cplusplus
- extern "C" {
- #endif
-
- struct rspamd_task;
- struct rspamd_mime_text_part;
-
- enum rspamd_url_flags { /* single-bit values; tested with `&` against rspamd_url.flags */
- RSPAMD_URL_FLAG_PHISHED = 1u << 0u,
- RSPAMD_URL_FLAG_NUMERIC = 1u << 1u,
- RSPAMD_URL_FLAG_OBSCURED = 1u << 2u, /* also set by rspamd_url_normalise_propagate_flags on a normalisation error */
- RSPAMD_URL_FLAG_REDIRECTED = 1u << 3u,
- RSPAMD_URL_FLAG_HTML_DISPLAYED = 1u << 4u,
- RSPAMD_URL_FLAG_FROM_TEXT = 1u << 5u,
- RSPAMD_URL_FLAG_SUBJECT = 1u << 6u,
- RSPAMD_URL_FLAG_HOSTENCODED = 1u << 7u,
- RSPAMD_URL_FLAG_SCHEMAENCODED = 1u << 8u,
- RSPAMD_URL_FLAG_PATHENCODED = 1u << 9u,
- RSPAMD_URL_FLAG_QUERYENCODED = 1u << 10u,
- RSPAMD_URL_FLAG_MISSINGSLASHES = 1u << 11u,
- RSPAMD_URL_FLAG_IDN = 1u << 12u,
- RSPAMD_URL_FLAG_HAS_PORT = 1u << 13u, /* checked by rspamd_url_get_port() before ext->port is read */
- RSPAMD_URL_FLAG_HAS_USER = 1u << 14u,
- RSPAMD_URL_FLAG_SCHEMALESS = 1u << 15u,
- RSPAMD_URL_FLAG_UNNORMALISED = 1u << 16u, /* set by rspamd_url_normalise_propagate_flags */
- RSPAMD_URL_FLAG_ZW_SPACES = 1u << 17u, /* set by rspamd_url_normalise_propagate_flags */
- RSPAMD_URL_FLAG_DISPLAY_URL = 1u << 18u,
- RSPAMD_URL_FLAG_IMAGE = 1u << 19u,
- RSPAMD_URL_FLAG_QUERY = 1u << 20u,
- RSPAMD_URL_FLAG_CONTENT = 1u << 21u,
- RSPAMD_URL_FLAG_NO_TLD = 1u << 22u,
- RSPAMD_URL_FLAG_TRUNCATED = 1u << 23u,
- RSPAMD_URL_FLAG_REDIRECT_TARGET = 1u << 24u,
- RSPAMD_URL_FLAG_INVISIBLE = 1u << 25u,
- RSPAMD_URL_FLAG_SPECIAL = 1u << 26u, /* highest bit in use; keep RSPAMD_URL_MAX_FLAG_SHIFT equal to this shift */
-
- };
- #define RSPAMD_URL_MAX_FLAG_SHIFT (26u) /* shift of the highest flag in enum rspamd_url_flags */
-
- struct rspamd_url_tag { /* list node: carries prev/next links to other tags */
- const char *data; /* NOTE(review): payload meaning is not visible in this header */
- struct rspamd_url_tag *prev, *next;
- };
-
- struct rspamd_url_ext;
- /**
- * URL structure
- *
- * Components are not stored as separate strings. Each one is described by a
- * `<name>shift` offset into `string` plus a `<name>len` length; the
- * rspamd_url_*() accessor macros in this header compute `string + shift`.
- * All offsets and lengths are 16 bit wide.
- */
- struct rspamd_url {
- char *string; /* base pointer for every *shift offset below */
- char *raw; /* NOTE(review): presumably the unprocessed form, sized by rawlen - confirm */
- struct rspamd_url_ext *ext; /* rarely used fields; NULL-checked by rspamd_url_get_port() */
-
- uint32_t flags; /* tested against enum rspamd_url_flags bits */
-
- uint8_t protocol; /* compared against enum rspamd_url_protocol values */
- uint8_t protocollen;
-
- uint16_t hostshift;
- uint16_t datashift;
- uint16_t queryshift;
- uint16_t fragmentshift;
- uint16_t tldshift;
- uint16_t usershift;
- uint16_t userlen; /* 0 makes rspamd_url_user() yield NULL */
-
- uint16_t hostlen; /* 0 makes rspamd_url_host() yield NULL */
- uint16_t datalen;
- uint16_t querylen;
- uint16_t fragmentlen;
- uint16_t tldlen;
- uint16_t count; /* NOTE(review): presumably the counter bumped by rspamd_url_set_add_or_increase() - confirm */
- uint16_t urllen; /* NOTE(review): presumably the length of `string` - confirm */
- uint16_t rawlen;
-
- /* Absolute order of the URL in a message */
- uint16_t order;
- /* Order of the URL in a specific part of message */
- uint16_t part_order;
- };
-
- /**
- * Rarely used url fields, reached through rspamd_url.ext
- * (that pointer is NULL-checked before use by the port helpers in this header)
- */
- struct rspamd_url_ext {
- char *visible_part; /* NOTE(review): presumably the text shown to the user for this link - confirm */
- struct rspamd_url *linked_url;
-
- uint16_t port; /* returned by rspamd_url_get_port() only when RSPAMD_URL_FLAG_HAS_PORT is set */
- };
-
- #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL) /* NULL when userlen == 0; expands `u` more than once */
- #define rspamd_url_user_unsafe(u) ((u)->string + (u)->usershift) /* no length check */
-
- #define rspamd_url_host(u) ((u)->hostlen > 0 ? (u)->string + (u)->hostshift : NULL) /* NULL when hostlen == 0; expands `u` more than once */
- #define rspamd_url_host_unsafe(u) ((u)->string + (u)->hostshift) /* no length check */
- #define rspamd_url_tld_unsafe(u) ((u)->string + (u)->tldshift) /* no length check; pair with tldlen */
-
- #define rspamd_url_data_unsafe(u) ((u)->string + (u)->datashift) /* no length check; pair with datalen */
- #define rspamd_url_query_unsafe(u) ((u)->string + (u)->queryshift) /* no length check; pair with querylen */
- #define rspamd_url_fragment_unsafe(u) ((u)->string + (u)->fragmentshift) /* no length check; pair with fragmentlen */
-
- enum uri_errno { /* result type of rspamd_url_parse() */
- URI_ERRNO_OK = 0, /* Parsing went well */
- URI_ERRNO_EMPTY, /* The URI string was empty */
- URI_ERRNO_INVALID_PROTOCOL, /* No protocol was found */
- URI_ERRNO_INVALID_PORT, /* Port number is bad */
- URI_ERRNO_BAD_ENCODING, /* Bad characters encoding */
- URI_ERRNO_BAD_FORMAT, /* NOTE(review): undocumented; presumably a generic malformed-URL error - confirm */
- URI_ERRNO_TLD_MISSING, /* NOTE(review): undocumented; presumably no known TLD in the host - confirm */
- URI_ERRNO_HOST_MISSING, /* NOTE(review): undocumented; presumably the host part is absent - confirm */
- URI_ERRNO_TOO_LONG, /* NOTE(review): undocumented; possibly tied to the 16 bit offsets in struct rspamd_url - confirm */
- };
-
- enum rspamd_url_protocol { /* one bit per scheme; every value fits the uint8_t rspamd_url.protocol field */
- PROTOCOL_FILE = 1u << 0u,
- PROTOCOL_FTP = 1u << 1u,
- PROTOCOL_HTTP = 1u << 2u,
- PROTOCOL_HTTPS = 1u << 3u, /* the only scheme rspamd_url_get_port() maps to 443 */
- PROTOCOL_MAILTO = 1u << 4u,
- PROTOCOL_TELEPHONE = 1u << 5u,
- PROTOCOL_UNKNOWN = 1u << 7u, /* bit 6 is not assigned */
- };
-
- enum rspamd_url_parse_flags { /* `flags` argument of rspamd_url_parse() */
- RSPAMD_URL_PARSE_TEXT = 0u, /* zero value, i.e. no other parse flag set */
- RSPAMD_URL_PARSE_HREF = (1u << 0u), /* NOTE(review): presumably for urls taken from HTML href attributes - confirm */
- RSPAMD_URL_PARSE_CHECK = (1u << 1u),
- };
-
- enum rspamd_url_find_type { /* `how` argument of the rspamd_url_find*() and rspamd_url_text_extract() functions */
- RSPAMD_URL_FIND_ALL = 0,
- RSPAMD_URL_FIND_STRICT, /* NOTE(review): presumably a narrower match set than FIND_ALL - confirm in url.c */
- };
-
- /**
- * Initialize url library
- * @param tld_file NOTE(review): presumably the path of the TLD list to load - confirm
- */
- void rspamd_url_init(const char *tld_file);
-
- void rspamd_url_deinit(void); /* NOTE(review): presumably releases what rspamd_url_init() set up - confirm */
-
- /*
- * Parse urls inside text
- * @param pool memory pool
- * @param task task object
- * @param part current text part
- * @param cur_order pointer to a url order counter (NOTE(review): presumably
- *        advanced for every url found, feeding rspamd_url.order - confirm)
- * @param how search mode, see enum rspamd_url_find_type
- */
- void rspamd_url_text_extract(rspamd_mempool_t *pool,
- struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- uint16_t *cur_order,
- enum rspamd_url_find_type how);
-
- /*
- * Parse a single url into an uri structure
- * @param uri url object, must be pre allocated
- * @param uristring text form of url (non-const; NOTE(review): may be modified
- *        by the parser - confirm)
- * @param len length of uristring
- * @param pool memory pool
- * @param flags parse mode, see enum rspamd_url_parse_flags
- * @return URI_ERRNO_OK (0) when parsing went well, another enum uri_errno
- *         value otherwise
- */
- enum uri_errno rspamd_url_parse(struct rspamd_url *uri,
- char *uristring,
- gsize len,
- rspamd_mempool_t *pool,
- enum rspamd_url_parse_flags flags);
-
- /*
- * Try to extract url from a text
- * @param pool memory pool
- * @param begin begin of text
- * @param len length of text
- * @param url_str storage for url string (or NULL)
- * @param how search mode, see enum rspamd_url_find_type
- * @param url_pos storage for the position of the url found (NOTE(review):
- *        presumably an offset from `begin`; confirm whether NULL is accepted)
- * @param prefix_added storage for a flag (NOTE(review): presumably set when a
- *        scheme prefix had to be added to the match - confirm)
- * @return TRUE if url is found in specified text
- */
- gboolean rspamd_url_find(rspamd_mempool_t *pool,
- const char *begin, gsize len,
- char **url_str,
- enum rspamd_url_find_type how,
- goffset *url_pos,
- gboolean *prefix_added);
-
- /*
- * Return text representation of url parsing error
- * @param err error code (NOTE(review): presumably an enum uri_errno value
- *        as returned by rspamd_url_parse() - confirm)
- */
- const char *rspamd_url_strerror(int err);
-
-
- /**
- * Find TLD for a specified host string
- * @param in input host
- * @param inlen length of input in bytes
- * @param out output rspamd_ftok_t with tld position (NOTE(review): presumably
- *        points into `in` rather than owning a copy - confirm)
- * @return TRUE if tld has been found
- */
- gboolean rspamd_url_find_tld(const char *in, gsize inlen, rspamd_ftok_t *out);
-
- typedef gboolean (*url_insert_function)(struct rspamd_url *url, /* callback type taken by rspamd_url_find_multiple() and rspamd_url_find_single() */
- gsize start_offset, gsize end_offset, void *ud); /* ud: caller's opaque pointer; NOTE(review): meaning of the return value is not visible here */
-
- /**
- * Search for multiple urls in text and call `func` for each url found
- * @param pool memory pool
- * @param in text to scan
- * @param inlen length of `in`
- * @param how search mode, see enum rspamd_url_find_type
- * @param nlines NOTE(review): presumably newline positions inside `in`;
- *        confirm the element type and whether NULL is accepted
- * @param func callback invoked for each url found
- * @param ud opaque pointer passed to `func`
- */
- void rspamd_url_find_multiple(rspamd_mempool_t *pool,
- const char *in, gsize inlen,
- enum rspamd_url_find_type how,
- GPtrArray *nlines,
- url_insert_function func,
- gpointer ud);
-
- /**
- * Search for a single url in text and pass it to `func`
- * (NOTE(review): confirm that `func` is invoked at most once per call)
- * @param pool memory pool
- * @param in text to scan
- * @param inlen length of `in`
- * @param how search mode, see enum rspamd_url_find_type
- * @param func callback invoked for the url found
- * @param ud opaque pointer passed to `func`
- */
- void rspamd_url_find_single(rspamd_mempool_t *pool,
- const char *in, gsize inlen,
- enum rspamd_url_find_type how,
- url_insert_function func,
- gpointer ud);
-
- /**
- * Generic callback to insert URLs into rspamd_task
- * Its parameter list mirrors url_insert_function, so it can be handed to the
- * rspamd_url_find_*() functions as `func`.
- * @param url url that has been found
- * @param start_offset start of the match
- * @param end_offset end of the match
- * @param ud opaque pointer (NOTE(review): presumably the struct rspamd_task
- *        to insert into - confirm)
- */
- gboolean rspamd_url_task_subject_callback(struct rspamd_url *url,
- gsize start_offset,
- gsize end_offset, gpointer ud);
-
- /**
- * Decode URL encoded string in-place and return new length of a string, src and dst are NUL terminated
- * @param dst output buffer (NOTE(review): "in-place" suggests dst may alias
- *        src - confirm)
- * @param src url-encoded input
- * @param size NOTE(review): presumably the number of bytes to read from src - confirm
- * @return length of the decoded string
- */
- gsize rspamd_url_decode(char *dst, const char *src, gsize size);
-
- /**
- * Encode url if needed. In this case, memory is allocated from the specific pool.
- * Returns pointer to begin and encoded length in `dlen`
- * @param url url to encode
- * @param dlen output: length of the returned string
- * @param pool memory pool used when an encoded copy has to be allocated
- * @return pointer to the beginning of the (possibly encoded) url text
- */
- const char *rspamd_url_encode(struct rspamd_url *url, gsize *dlen,
- rspamd_mempool_t *pool);
-
-
- /**
- * Returns if a character is domain character
- * @param c character to test
- * @return TRUE if `c` is a domain character
- */
- gboolean rspamd_url_is_domain(int c);
-
- /**
- * Returns symbolic name for protocol
- * @param proto protocol value, see enum rspamd_url_protocol
- * @return protocol name as a string
- */
- const char *rspamd_url_protocol_name(enum rspamd_url_protocol proto);
-
-
- /**
- * Converts string to a numeric protocol
- * @param str protocol name
- * @return matching enum rspamd_url_protocol value (NOTE(review): presumably
- *         PROTOCOL_UNKNOWN when nothing matches - confirm)
- */
- enum rspamd_url_protocol rspamd_url_protocol_from_string(const char *str);
-
- /**
- * Converts string to a url flag
- * @param str flag name
- * @param flag output: the flag value (NOTE(review): presumably an
- *        enum rspamd_url_flags bit, written only on success - confirm)
- * @return NOTE(review): presumably true when `str` names a known flag - confirm
- */
- bool rspamd_url_flag_from_string(const char *str, int *flag);
-
- /**
- * Converts url flag to a string
- * @param flag flag value (NOTE(review): presumably a single
- *        enum rspamd_url_flags bit - confirm)
- * @return flag name (NOTE(review): result for an unknown flag is not visible here)
- */
- const char *rspamd_url_flag_to_string(int flag);
-
- /* Declares two khash types keyed by `struct rspamd_url *`: rspamd_url_hash
- * (sets of urls indexed by url as is) and rspamd_url_host_hash
- * (NOTE(review): presumably indexed by the host part only - confirm) */
- KHASH_DECLARE(rspamd_url_hash, struct rspamd_url *, char);
- KHASH_DECLARE(rspamd_url_host_hash, struct rspamd_url *, char);
-
- /* Convenience functions for url sets */
- /**
- * Add an url to set or increase the existing url count
- * @param set url set
- * @param u url to add
- * @param enforce_replace NOTE(review): presumably makes `u` replace an
- *        already stored equal url - confirm
- * @return true if a new url has been added
- */
- bool rspamd_url_set_add_or_increase(khash_t(rspamd_url_hash) * set,
- struct rspamd_url *u,
- bool enforce_replace);
-
- /**
- * Same as rspamd_url_set_add_or_increase but returns the existing url if found
- * @param set url set
- * @param u url to add
- * @return the url already stored in the set if one was found (NOTE(review):
- *         presumably `u` itself when it was newly added - confirm)
- */
- struct rspamd_url *rspamd_url_set_add_or_return(khash_t(rspamd_url_hash) * set,
- struct rspamd_url *u);
- /**
- * Helper for url host set
- * @param set host set
- * @param u url to add
- * @return NOTE(review): presumably true if a new entry has been added - confirm
- */
- bool rspamd_url_host_set_add(khash_t(rspamd_url_host_hash) * set,
- struct rspamd_url *u);
- /**
- * Checks if a url is in set
- * @param set url set
- * @param u url to look up
- * @return true if the url is in the set
- */
- bool rspamd_url_set_has(khash_t(rspamd_url_hash) * set, struct rspamd_url *u);
-
- bool rspamd_url_host_set_has(khash_t(rspamd_url_host_hash) * set, struct rspamd_url *u); /* same lookup, but on a rspamd_url_host_hash set */
-
- /**
- * Compares two urls (similar to C comparison functions) lexicographically
- * @param u1 first url
- * @param u2 second url
- * @return negative, zero or positive value in the manner of C comparison functions
- */
- int rspamd_url_cmp(const struct rspamd_url *u1, const struct rspamd_url *u2);
-
- /**
- * Same but used for qsort to sort `struct rspamd_url *[]` array
- * @param u1 pointer to the first array element (i.e. to a `struct rspamd_url *`)
- * @param u2 pointer to the second array element
- * @return negative, zero or positive value, as qsort expects
- */
- int rspamd_url_cmp_qsort(const void *u1, const void *u2);
-
- /**
- * Resolves the port to use for a url.
- * An explicit port is reported only when RSPAMD_URL_FLAG_HAS_PORT is set and
- * the extension block is present; otherwise the default is 443 for
- * PROTOCOL_HTTPS and 80 for every other protocol.
- * @param u url to inspect
- * @return explicit port, or the 443/80 default
- */
- static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port(struct rspamd_url *u)
- {
- 	/* Explicit port wins, as long as there is an ext block to read it from */
- 	if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) {
- 		return u->ext->port;
- 	}
-
- 	/* Assume standard port */
- 	return (u->protocol == PROTOCOL_HTTPS) ? 443 : 80;
- }
-
- /**
- * Reports the explicit port of a url, without falling back to a default.
- * @param u url to inspect
- * @return ext->port when RSPAMD_URL_FLAG_HAS_PORT is set and the extension
- *         block is present, 0 otherwise
- */
- static RSPAMD_PURE_FUNCTION inline uint16_t rspamd_url_get_port_if_special(struct rspamd_url *u)
- {
- 	uint16_t port = 0; /* 0 stands for "no explicit port" */
-
- 	if ((u->flags & RSPAMD_URL_FLAG_HAS_PORT) && u->ext) {
- 		port = u->ext->port;
- 	}
-
- 	return port;
- }
-
- /**
- * Normalize unicode input and set out url flags as appropriate
- *
- * Runs rspamd_normalise_unicode_inplace() on the input and maps its result
- * bits onto url flags: RSPAMD_UNICODE_NORM_UNNORMAL -> RSPAMD_URL_FLAG_UNNORMALISED,
- * RSPAMD_UNICODE_NORM_ZERO_SPACES -> RSPAMD_URL_FLAG_ZW_SPACES,
- * RSPAMD_UNICODE_NORM_ERROR -> RSPAMD_URL_FLAG_OBSCURED.
- * Flags are only ever OR-ed in; existing bits are never cleared.
- *
- * @param pool not used by the expansion; kept so existing call sites compile
- * @param input passed as the first argument of rspamd_normalise_unicode_inplace
- * @param len_out passed as the second argument of
- *        rspamd_normalise_unicode_inplace (must be &var)
- * @param url_flags_out modifiable lvalue receiving the flags; it is
- *        parenthesised in the expansion, so an expression such as `*p` or
- *        `url->flags` works as well as a plain variable. It is expanded up to
- *        three times, so it must not have side effects.
- *
- * The expansion declares a local named `norm_res`; arguments must not refer
- * to a caller variable with that name.
- */
- #define rspamd_url_normalise_propagate_flags(pool, input, len_out, url_flags_out) \
- 	do { \
- 		enum rspamd_utf8_normalise_result norm_res; \
- 		norm_res = rspamd_normalise_unicode_inplace((input), (len_out)); \
- 		if (norm_res & RSPAMD_UNICODE_NORM_UNNORMAL) { \
- 			(url_flags_out) |= RSPAMD_URL_FLAG_UNNORMALISED; \
- 		} \
- 		if (norm_res & RSPAMD_UNICODE_NORM_ZERO_SPACES) { \
- 			(url_flags_out) |= RSPAMD_URL_FLAG_ZW_SPACES; \
- 		} \
- 		if (norm_res & (RSPAMD_UNICODE_NORM_ERROR)) { \
- 			(url_flags_out) |= RSPAMD_URL_FLAG_OBSCURED; \
- 		} \
- 	} while (0)
- #ifdef __cplusplus
- }
- #endif
-
- #endif
|