static struct rspamd_url *rspamd_html_process_url (rspamd_mempool_t *pool,
const gchar *start, guint len,
- struct html_tag_component *comp,
- bool is_image);
+ struct html_tag_component *comp);
static void
rspamd_html_library_init (void)
struct rspamd_url *
rspamd_html_process_url (rspamd_mempool_t *pool, const gchar *start, guint len,
- struct html_tag_component *comp, bool is_image)
+ struct html_tag_component *comp)
{
struct rspamd_url *url;
guint saved_flags = 0;
}
}
- rc = rspamd_url_parse (url, decoded, dlen, pool,
- is_image ? RSPAMD_URL_PARSE_TEXT :RSPAMD_URL_PARSE_HREF);
+ rc = rspamd_url_parse (url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
/* Filter some completely damaged urls */
if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
if (no_prefix) {
url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
+
+ if (url->tldlen == 0 || (url->flags & RSPAMD_URL_FLAG_NO_TLD)) {
+ /* Ignore urls with both no schema and no tld */
+ return NULL;
+ }
}
decoded = url->string;
}
}
- url = rspamd_html_process_url (pool, start, len, comp, false);
+ url = rspamd_html_process_url (pool, start, len, comp);
if (url && tag->extra == NULL) {
tag->extra = url;
if (img->src) {
img->url = rspamd_html_process_url (pool,
- img->src, fstr.len, NULL, true);
+ img->src, fstr.len, NULL);
if (img->url) {
img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
{"url_displayed", RSPAMD_URL_FLAG_DISPLAY_URL, -1},
{"image", RSPAMD_URL_FLAG_IMAGE, -1},
{"query", RSPAMD_URL_FLAG_QUERY, -1},
- {"content", RSPAMD_URL_FLAG_CONTENT, -1}
+ {"content", RSPAMD_URL_FLAG_CONTENT, -1},
+ {"no_tld", RSPAMD_URL_FLAG_NO_TLD, -1},
};
uri->tldshift = uri->hostshift;
uri->tldlen = uri->hostlen;
}
+ else if (uri->flags & RSPAMD_URL_FLAG_SCHEMALESS) {
+ /* Ignore urls with both no schema and no tld */
+ return URI_ERRNO_TLD_MISSING;
+ }
+
+ uri->flags |= RSPAMD_URL_FLAG_NO_TLD;
}
}