static void
rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,
- GHashTable *tbl_urls, GHashTable *tbl_emails)
+ khash_t (rspamd_url_hash) *url_set)
{
- GHashTable *target_tbl;
- struct rspamd_url *query_url, *existing;
+ struct rspamd_url *query_url;
gchar *url_str;
gint rc;
gboolean prefix_added;
msg_debug_html ("found url %s in query of url"
" %*s", url_str, url->querylen, rspamd_url_query_unsafe (url));
- if (query_url->protocol == PROTOCOL_MAILTO) {
- target_tbl = tbl_emails;
- }
- else {
- target_tbl = tbl_urls;
- }
-
if (prefix_added) {
query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
}
query_url->flags |= RSPAMD_URL_FLAG_OBSCURED;
}
- if ((existing = g_hash_table_lookup (target_tbl,
- query_url)) == NULL) {
- g_hash_table_insert (target_tbl,
- query_url,
- query_url);
- }
- else {
- existing->count ++;
- }
+ rspamd_url_set_add_or_increase (url_set, query_url);
}
}
}
static void
rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
- struct html_content *hc, GHashTable *urls)
+ struct html_content *hc, khash_t (rspamd_url_hash) *url_set)
{
struct html_tag_component *comp;
struct html_image *img;
img->src, fstr.len, NULL);
if (img->url) {
- struct rspamd_url *turl = g_hash_table_lookup (urls,
- img->url);
-
img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
-
- if (turl == NULL) {
- g_hash_table_insert (urls, img->url, img->url);
- }
- else {
- turl->count++;
- }
+ rspamd_url_set_add_or_increase (url_set, img->url);
}
}
}
static void
rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
- GList **exceptions, GHashTable *urls, GHashTable *emails,
- GByteArray *dest, GHashTable *target_tbl,
- gint href_offset,
- struct rspamd_url *url)
+ GList **exceptions,
+ khash_t (rspamd_url_hash) *url_set,
+ GByteArray *dest,
+ gint href_offset,
+ struct rspamd_url *url)
{
struct rspamd_url *displayed_url = NULL;
struct rspamd_url *turl;
if (url_found) {
url->flags |= RSPAMD_URL_FLAG_DISPLAY_URL;
}
+
if (exceptions && url_found) {
ex = rspamd_mempool_alloc (pool,
sizeof (*ex));
ex);
}
- if (displayed_url) {
- if (displayed_url->protocol ==
- PROTOCOL_MAILTO) {
- target_tbl = emails;
- }
- else {
- target_tbl = urls;
- }
+ if (displayed_url && url_set) {
+ turl = rspamd_url_set_add_or_return (url_set,
+ displayed_url);
- if (target_tbl != NULL) {
- turl = g_hash_table_lookup (target_tbl,
- displayed_url);
-
- if (turl != NULL) {
- /* Here, we assume the following:
- * if we have a URL in the text part which
- * is the same as displayed URL in the
- * HTML part, we assume that it is also
- * hint only.
- */
- if (turl->flags &
- RSPAMD_URL_FLAG_FROM_TEXT) {
- turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
- turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
- }
-
- turl->count ++;
- }
- else {
- g_hash_table_insert (target_tbl,
- displayed_url,
- displayed_url);
+ if (turl != NULL) {
+ /* Here, we assume the following:
+ * if we have a URL in the text part which
+ * is the same as displayed URL in the
+ * HTML part, we assume that it is also
+ * hint only.
+ */
+ if (turl->flags &
+ RSPAMD_URL_FLAG_FROM_TEXT) {
+ turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
+ turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
}
+
+ turl->count ++;
+ }
+ else {
+ /* Already inserted by `rspamd_url_set_add_or_return` */
}
}
}
}
GByteArray*
-rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
- GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails)
+rspamd_html_process_part_full (rspamd_mempool_t *pool,
+ struct html_content *hc,
+ GByteArray *in,
+ GList **exceptions,
+ khash_t (rspamd_url_hash) *url_set)
{
const guchar *p, *c, *end, *savep = NULL;
guchar t;
gboolean closing = FALSE, need_decode = FALSE, save_space = FALSE,
balanced;
GByteArray *dest;
- GHashTable *target_tbl;
guint obrace = 0, ebrace = 0;
GNode *cur_level = NULL;
gint substate = 0, len, href_offset = -1;
struct html_tag *cur_tag = NULL, *content_tag = NULL;
- struct rspamd_url *url = NULL, *turl;
+ struct rspamd_url *url = NULL;
GQueue *styles_blocks;
enum {
if (url != NULL) {
- if (url->protocol == PROTOCOL_MAILTO) {
- target_tbl = emails;
- }
- else {
- target_tbl = urls;
- }
-
- if (target_tbl != NULL) {
- turl = g_hash_table_lookup (target_tbl, url);
-
- if (turl == NULL) {
- g_hash_table_insert (target_tbl, url, url);
- }
- else {
- turl->count ++;
- url = NULL;
- }
-
- if (turl == NULL && url != NULL) {
- rspamd_process_html_url (pool,
- url,
- urls, emails);
+ if (url_set != NULL) {
+ if (!rspamd_url_set_add_or_increase (url_set, url)) {
+ rspamd_process_html_url (pool, url, url_set);
}
}
prev_url = prev_tag->extra;
rspamd_html_check_displayed_url (pool,
- exceptions, urls, emails,
- dest, target_tbl, href_offset,
+ exceptions, url_set,
+ dest, href_offset,
prev_url);
}
}
/* Insert exception */
if (url != NULL && (gint) dest->len > href_offset) {
rspamd_html_check_displayed_url (pool,
- exceptions, urls, emails,
- dest, target_tbl, href_offset,
+ exceptions, url_set,
+ dest, href_offset,
url);
}
}
if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
- rspamd_html_process_img_tag (pool, cur_tag, hc, urls);
+ rspamd_html_process_img_tag (pool, cur_tag, hc, url_set);
}
else if (cur_tag->flags & FL_BLOCK) {
struct html_block *bl;
struct html_content *hc,
GByteArray *in)
{
- return rspamd_html_process_part_full (pool, hc, in, NULL, NULL, NULL);
+ return rspamd_html_process_part_full (pool, hc, in, NULL, NULL);
}
{
struct rspamd_task *task = ud;
gchar *url_str = NULL;
- struct rspamd_url *query_url, *existing;
+ struct rspamd_url *query_url;
gint rc;
gboolean prefix_added;
return true;
}
+struct rspamd_url * /* put u into set; yields the previously stored entry if one was found, otherwise NULL */
+rspamd_url_set_add_or_return (khash_t (rspamd_url_hash) *set,
+ struct rspamd_url *u)
+{
+ khiter_t k;
+ gint r; /* status written by kh_put; 0 is handled below as "key was already present" */
+
+ if (set) { /* a NULL set is tolerated: nothing is inserted and NULL is returned */
+ k = kh_put (rspamd_url_hash, set, u, &r);
+
+ if (r == 0) {
+ struct rspamd_url *ex = kh_key (set, k); /* the key stored in the slot kh_put resolved to */
+
+ return ex;
+ }
+ }
+
+ return NULL; /* reached when set is NULL or r != 0; callers cannot tell these two cases apart */
+}
+
bool
rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
struct rspamd_url *u)
khiter_t k;
gint r;
- k = kh_put (rspamd_url_host_hash, set, u, &r);
+ if (set) {
+ k = kh_put (rspamd_url_host_hash, set, u, &r);
- if (r == 0) {
- return false;
+ if (r == 0) {
+ return false;
+ }
+
+ return true;
}
- return true;
+ return false;
}
bool
{
khiter_t k;
- k = kh_get (rspamd_url_hash, set, u);
+ if (set) {
+ k = kh_get (rspamd_url_hash, set, u);
- if (k == kh_end (set)) {
- return false;
+ if (k == kh_end (set)) {
+ return false;
+ }
+
+ return true;
}
- return true;
+ return false;
}
bool
{
khiter_t k;
- k = kh_get (rspamd_url_hash, set, u);
+ if (set) {
+ k = kh_get (rspamd_url_host_hash, set, u);
- if (k == kh_end (set)) {
- return false;
+ if (k == kh_end (set)) {
+ return false;
+ }
+
+ return true;
}
- return true;
+ return false;
}
\ No newline at end of file