about | summary | refs | log | tree | commit | diff | stats
path: root/src/libserver/html.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-01-06 19:37:57 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-01-06 19:37:57 +0000
commit e151d66a0298613fa553f65f94699ec2ba46e56a (patch)
tree 496128b76a8b0373117f78f767ea153f5ee5557c /src/libserver/html.c
parent bfe48b659baf0e5007e5fd6b7804881cb92e32fe (diff)
download rspamd-e151d66a0298613fa553f65f94699ec2ba46e56a.tar.gz
download rspamd-e151d66a0298613fa553f65f94699ec2ba46e56a.zip
[Feature] Extract text from img alt attributes
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- src/libserver/html.c 23
1 file changed, 20 insertions, 3 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index a33e4e7fc..687970baa 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -983,6 +983,9 @@ rspamd_html_parse_tag_component (rspamd_mempool_t *pool,
else if (g_ascii_strncasecmp (p, "rel", len) == 0) {
NEW_COMPONENT (RSPAMD_HTML_COMPONENT_REL);
}
+ else if (g_ascii_strncasecmp (p, "alt", len) == 0) {
+ NEW_COMPONENT (RSPAMD_HTML_COMPONENT_ALT);
+ }
}
else if (len == 4) {
if (g_ascii_strncasecmp (p, "href", len) == 0) {
@@ -1817,7 +1820,8 @@ rspamd_html_process_data_image (rspamd_mempool_t *pool,
static void
rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
struct html_content *hc, khash_t (rspamd_url_hash) *url_set,
- GPtrArray *part_urls)
+ GPtrArray *part_urls,
+ GByteArray *dest)
{
struct html_tag_component *comp;
struct html_image *img;
@@ -1930,6 +1934,19 @@ rspamd_html_process_img_tag (rspamd_mempool_t *pool, struct html_tag *tag,
}
}
}
+ else if (comp->type == RSPAMD_HTML_COMPONENT_ALT && comp->len > 0 && dest != NULL) {
+ if (dest->len > 0 && !g_ascii_isspace (dest->data[dest->len - 1])) {
+ /* Add a space */
+ g_byte_array_append (dest, " ", 1);
+ }
+
+ g_byte_array_append (dest, comp->start, comp->len);
+
+ if (!g_ascii_isspace (dest->data[dest->len - 1])) {
+ /* Add a space */
+ g_byte_array_append (dest, " ", 1);
+ }
+ }
cur = g_list_next (cur);
}
@@ -1971,7 +1988,7 @@ rspamd_html_process_link_tag (rspamd_mempool_t *pool, struct html_tag *tag,
if (comp->len == sizeof ("icon") - 1 &&
rspamd_lc_cmp (comp->start, "icon", sizeof ("icon") - 1) == 0) {
- rspamd_html_process_img_tag (pool, tag, hc, url_set, part_urls);
+ rspamd_html_process_img_tag (pool, tag, hc, url_set, part_urls, NULL);
}
}
@@ -3248,7 +3265,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool,
if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
rspamd_html_process_img_tag (pool, cur_tag, hc, url_set,
- part_urls);
+ part_urls, dest);
}
else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) {
rspamd_html_process_link_tag (pool, cur_tag, hc, url_set,