aboutsummaryrefslogtreecommitdiffstats
path: root/src/libserver/html/html.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/html/html.cxx')
-rw-r--r--src/libserver/html/html.cxx403
1 files changed, 71 insertions, 332 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index c167b004f..c384a9023 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -30,6 +30,7 @@
#include "html_tag_defs.hxx"
#include "html_entities.hxx"
#include "html_tag.hxx"
+#include "html_url.hxx"
#include <vector>
#include <frozen/unordered_map.h>
@@ -633,273 +634,76 @@ parse_tag_content(rspamd_mempool_t *pool,
parser_env.cur_state = state;
}
-}
-
-/* Unconverted C part */
-
-static struct rspamd_url *rspamd_html_process_url(rspamd_mempool_t *pool,
- const gchar *start, guint len,
- struct html_tag_component *comp);
-
-
-
-
-struct rspamd_url *
-rspamd_html_process_url(rspamd_mempool_t *pool, const gchar *start, guint len,
- struct html_tag_component *comp) {
- struct rspamd_url *url;
- guint saved_flags = 0;
- gchar *decoded;
- gint rc;
- gsize decoded_len;
- const gchar *p, *s, *prefix = "http://";
- gchar *d;
- guint i;
- gsize dlen;
- gboolean has_bad_chars = FALSE, no_prefix = FALSE;
- static const gchar hexdigests[] = "0123456789abcdef";
-
- p = start;
-
- /* Strip spaces from the url */
- /* Head spaces */
- while (p < start + len && g_ascii_isspace (*p)) {
- p++;
- start++;
- len--;
- }
-
- if (comp) {
- comp->start = (guchar *)p;
- comp->len = len;
- }
-
- /* Trailing spaces */
- p = start + len - 1;
-
- while (p >= start && g_ascii_isspace (*p)) {
- p--;
- len--;
-
- if (comp) {
- comp->len--;
- }
- }
+static auto
+html_process_url_tag(rspamd_mempool_t *pool,
+ struct html_tag *tag,
+ struct html_content *hc) -> std::optional<struct rspamd_url *>
+{
+ auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
- s = start;
- dlen = 0;
+ if (found_href_it != tag->parameters.end()) {
+ /* Check base url */
+ auto &href_value = found_href_it->second;
- for (i = 0; i < len; i++) {
- if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
- dlen += 3;
- }
- else {
- dlen++;
- }
- }
+ if (hc && hc->base_url && href_value.size() > 2) {
+ /*
+ * Relative url cannot start from the following:
+ * schema://
+ * data:
+ * slash
+ */
- if (rspamd_substring_search(start, len, "://", 3) == -1) {
- if (len >= sizeof("mailto:") &&
- (memcmp(start, "mailto:", sizeof("mailto:") - 1) == 0 ||
- memcmp(start, "tel:", sizeof("tel:") - 1) == 0 ||
- memcmp(start, "callto:", sizeof("callto:") - 1) == 0)) {
- /* Exclusion, has valid but 'strange' prefix */
- }
- else {
- for (i = 0; i < len; i++) {
- if (!((s[i] & 0x80) || g_ascii_isalnum (s[i]))) {
- if (i == 0 && len > 2 && s[i] == '/' && s[i + 1] == '/') {
- prefix = "http:";
- dlen += sizeof("http:") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == '@') {
- /* Likely email prefix */
- prefix = "mailto://";
- dlen += sizeof("mailto://") - 1;
- no_prefix = TRUE;
- }
- else if (s[i] == ':' && i != 0) {
- /* Special case */
- no_prefix = FALSE;
- }
- else {
- if (i == 0) {
- /* No valid data */
- return NULL;
- }
- else {
- no_prefix = TRUE;
- dlen += strlen(prefix);
- }
- }
+ if (rspamd_substring_search(href_value.data(), href_value.size(), "://", 3) == -1) {
- break;
+ if (href_value.size() >= sizeof("data:") &&
+ g_ascii_strncasecmp(href_value.data(), "data:", sizeof("data:") - 1) == 0) {
+ /* Image data url, never insert as url */
+ return std::nullopt;
}
- }
- }
- }
-
- decoded = (char *)rspamd_mempool_alloc (pool, dlen + 1);
- d = decoded;
-
- if (no_prefix) {
- gsize plen = strlen(prefix);
- memcpy(d, prefix, plen);
- d += plen;
- }
-
- /*
- * We also need to remove all internal newlines, spaces
- * and encode unsafe characters
- */
- for (i = 0; i < len; i++) {
- if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
- continue;
- }
- else if (G_UNLIKELY (((guint) s[i]) < 0x80 && !g_ascii_isgraph(s[i]))) {
- /* URL encode */
- *d++ = '%';
- *d++ = hexdigests[(s[i] >> 4) & 0xf];
- *d++ = hexdigests[s[i] & 0xf];
- has_bad_chars = TRUE;
- }
- else {
- *d++ = s[i];
- }
- }
-
- *d = '\0';
- dlen = d - decoded;
- url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
+ /* Assume relative url */
+ auto need_slash = false;
- rspamd_url_normalise_propagate_flags (pool, decoded, &dlen, saved_flags);
+ auto orig_len = href_value.size();
+ auto len = orig_len + hc->base_url->urllen;
- rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
-
- /* Filter some completely damaged urls */
- if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
- !((url->protocol & PROTOCOL_UNKNOWN))) {
- url->flags |= saved_flags;
-
- if (has_bad_chars) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
-
- if (no_prefix) {
- url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+ if (hc->base_url->datalen == 0) {
+ need_slash = true;
+ len++;
+ }
- if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
- /* Ignore urls with both no schema and no tld */
- return NULL;
+ auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+ auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1,
+ "%*s%s%*s",
+ hc->base_url->urllen, hc->base_url->string,
+ need_slash ? "/" : "",
+ (gint) orig_len, href_value.size());
+ href_value = {buf, nlen};
+ }
+ else if (href_value[0] == '/' && href_value[1] != '/') {
+ /* Relative to the hostname */
+ auto orig_len = href_value.size();
+ auto len = orig_len + hc->base_url->hostlen + hc->base_url->protocollen +
+ 3 /* for :// */;
+ auto *buf = rspamd_mempool_alloc_buffer(pool, len + 1);
+ auto nlen = (std::size_t)rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
+ hc->base_url->protocollen, hc->base_url->string,
+ hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
+ (gint)orig_len, href_value.data());
+ href_value = {buf, nlen};
}
}
- decoded = url->string;
- decoded_len = url->urllen;
+ auto url = html_process_url(pool, href_value);
- if (comp) {
- comp->start = (guchar *)decoded;
- comp->len = decoded_len;
- }
- /* Spaces in href usually mean an attempt to obfuscate URL */
- /* See https://github.com/vstakhov/rspamd/issues/593 */
-#if 0
- if (has_spaces) {
- url->flags |= RSPAMD_URL_FLAG_OBSCURED;
+ if (url && tag->extra == nullptr) {
+ tag->extra = url.value();
}
-#endif
return url;
}
- return NULL;
-}
-
-static struct rspamd_url *
-rspamd_html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc) {
- struct html_tag_component *comp;
- GList *cur;
- struct rspamd_url *url;
- const gchar *start;
- gsize len;
-
- cur = tag->params->head;
-
- while (cur) {
- comp = (struct html_tag_component *)cur->data;
-
- if (comp->type == RSPAMD_HTML_COMPONENT_HREF && comp->len > 0) {
- start = (char *)comp->start;
- len = comp->len;
-
- /* Check base url */
- if (hc && hc->base_url && comp->len > 2) {
- /*
- * Relative url cannot start from the following:
- * schema://
- * data:
- * slash
- */
- gchar *buf;
- gsize orig_len;
-
- if (rspamd_substring_search(start, len, "://", 3) == -1) {
-
- if (len >= sizeof("data:") &&
- g_ascii_strncasecmp(start, "data:", sizeof("data:") - 1) == 0) {
- /* Image data url, never insert as url */
- return NULL;
- }
-
- /* Assume relative url */
-
- gboolean need_slash = FALSE;
-
- orig_len = len;
- len += hc->base_url->urllen;
-
- if (hc->base_url->datalen == 0) {
- need_slash = TRUE;
- len++;
- }
-
- buf = (char *)rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf(buf, len + 1, "%*s%s%*s",
- hc->base_url->urllen, hc->base_url->string,
- need_slash ? "/" : "",
- (gint) orig_len, start);
- start = buf;
- }
- else if (start[0] == '/' && start[1] != '/') {
- /* Relative to the hostname */
- orig_len = len;
- len += hc->base_url->hostlen + hc->base_url->protocollen +
- 3 /* for :// */;
- buf = (char *)rspamd_mempool_alloc (pool, len + 1);
- rspamd_snprintf(buf, len + 1, "%*s://%*s/%*s",
- hc->base_url->protocollen, hc->base_url->string,
- hc->base_url->hostlen, rspamd_url_host_unsafe (hc->base_url),
- (gint) orig_len, start);
- start = buf;
- }
- }
-
- url = rspamd_html_process_url(pool, start, len, comp);
-
- if (url && tag->extra == NULL) {
- tag->extra = url;
- }
-
- return url;
- }
-
- cur = g_list_next (cur);
- }
-
- return NULL;
+ return std::nullopt;
}
struct rspamd_html_url_query_cbd {
@@ -910,8 +714,9 @@ struct rspamd_html_url_query_cbd {
};
static gboolean
-rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
- gsize end_offset, gpointer ud) {
+html_url_query_callback(struct rspamd_url *url, gsize start_offset,
+ gsize end_offset, gpointer ud)
+{
struct rspamd_html_url_query_cbd *cbd =
(struct rspamd_html_url_query_cbd *) ud;
rspamd_mempool_t *pool;
@@ -939,9 +744,10 @@ rspamd_html_url_query_callback(struct rspamd_url *url, gsize start_offset,
}
static void
-rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
- khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls) {
+process_html_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
+ khash_t (rspamd_url_hash) *url_set,
+ GPtrArray *part_urls)
+{
if (url->querylen > 0) {
struct rspamd_html_url_query_cbd qcbd;
@@ -953,7 +759,7 @@ rspamd_process_html_url(rspamd_mempool_t *pool, struct rspamd_url *url,
rspamd_url_find_multiple(pool,
rspamd_url_query_unsafe (url), url->querylen,
RSPAMD_URL_FIND_ALL, NULL,
- rspamd_html_url_query_callback, &qcbd);
+ html_url_query_callback, &qcbd);
}
if (part_urls) {
@@ -1013,10 +819,12 @@ rspamd_html_process_data_image(rspamd_mempool_t *pool,
}
static void
-rspamd_html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls,
- GByteArray *dest) {
+html_process_img_tag(rspamd_mempool_t *pool, struct html_tag *tag,
+ struct html_content *hc,
+ khash_t (rspamd_url_hash) *url_set,
+ GPtrArray *part_urls,
+ GByteArray *dest)
+{
struct html_tag_component *comp;
struct html_image *img;
rspamd_ftok_t fstr;
@@ -1205,6 +1013,10 @@ rspamd_html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
}
}
+}
+
+/* Unconverted C part */
+
static void
rspamd_html_process_color(const gchar *line, guint len, struct html_color *cl)
{
@@ -1764,80 +1576,7 @@ rspamd_html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
tag->extra = bl;
}
-static void
-rspamd_html_check_displayed_url(rspamd_mempool_t *pool,
- GList **exceptions,
- khash_t (rspamd_url_hash) *url_set,
- GByteArray *dest,
- gint href_offset,
- struct rspamd_url *url) {
- struct rspamd_url *displayed_url = NULL;
- struct rspamd_url *turl;
- gboolean url_found = FALSE;
- struct rspamd_process_exception *ex;
- guint saved_flags = 0;
- gsize dlen;
-
- if (href_offset < 0) {
- /* No dispalyed url, just some text within <a> tag */
- return;
- }
-
- url->visible_part = (gchar *)rspamd_mempool_alloc (pool, dest->len - href_offset + 1);
- rspamd_strlcpy(url->visible_part,
- reinterpret_cast<const gchar *>(dest->data + href_offset),
- dest->len - href_offset + 1);
- dlen = dest->len - href_offset;
-
- /* Strip unicode spaces from the start and the end */
- url->visible_part = rspamd_string_unicode_trim_inplace(url->visible_part,
- &dlen);
- rspamd_html_url_is_phished(pool, url,
- reinterpret_cast<const guchar *>(url->visible_part),
- dlen,
- &url_found, &displayed_url);
-
- if (url_found) {
- url->flags |= saved_flags | RSPAMD_URL_FLAG_DISPLAY_URL;
- }
-
- if (exceptions && url_found) {
- ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception);
- ex->pos = href_offset;
- ex->len = dest->len - href_offset;
- ex->type = RSPAMD_EXCEPTION_URL;
- ex->ptr = url;
-
- *exceptions = g_list_prepend(*exceptions,
- ex);
- }
-
- if (displayed_url && url_set) {
- turl = rspamd_url_set_add_or_return(url_set,
- displayed_url);
- if (turl != NULL) {
- /* Here, we assume the following:
- * if we have a URL in the text part which
- * is the same as displayed URL in the
- * HTML part, we assume that it is also
- * hint only.
- */
- if (turl->flags &
- RSPAMD_URL_FLAG_FROM_TEXT) {
- turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
- }
-
- turl->count++;
- }
- else {
- /* Already inserted by `rspamd_url_set_add_or_return` */
- }
- }
-
- rspamd_normalise_unicode_inplace(url->visible_part, &dlen);
-}
static gboolean
rspamd_html_propagate_lengths(GNode *node, gpointer _unused) {