summary refs log tree commit diff stats
path: root/src/libserver
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-08-05 11:11:12 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-08-05 11:11:12 +0100
commit 832e08e53f2364bda26afc6498ce968874d0fb33 (patch)
tree 922dbeaffa99190b192d03b49fcb357b70f49462 /src/libserver
parent a976f102cda0df5d85f46d471273d9b5a7a231a5 (diff)
download rspamd-832e08e53f2364bda26afc6498ce968874d0fb33.tar.gz
rspamd-832e08e53f2364bda26afc6498ce968874d0fb33.zip
[Fix] Further fixes in HTML tags parsing
Diffstat (limited to 'src/libserver')
-rw-r--r-- src/libserver/html.c 54
-rw-r--r-- src/libserver/html.h 19
-rw-r--r-- src/libserver/html_tags.h 10
3 files changed, 59 insertions, 24 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index a65dd9a95..f897acd8e 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -539,23 +539,37 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
return FALSE;
}
-gboolean
-rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
+gint
+rspamd_html_tag_by_name (const gchar *name)
{
struct html_tag tag;
struct html_tag_def *found;
- g_assert (hc != NULL);
- g_assert (hc->tags_seen != NULL);
-
- tag.name.start = tagname;
- tag.name.len = strlen (tagname);
+ tag.name.start = name;
+ tag.name.len = strlen (name);
found = bsearch (&tag, tag_defs, G_N_ELEMENTS (tag_defs),
sizeof (tag_defs[0]), tag_find);
if (found) {
- return isset (hc->tags_seen, found->id);
+ return found->id;
+ }
+
+ return -1;
+}
+
+gboolean
+rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
+{
+ gint id;
+
+ g_assert (hc != NULL);
+ g_assert (hc->tags_seen != NULL);
+
+ id = rspamd_html_tag_by_name (tagname);
+
+ if (id != -1) {
+ return isset (hc->tags_seen, id);
}
return FALSE;
@@ -775,8 +789,12 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
else {
parent = (*cur_level)->data;
- if (parent && (parent->flags & FL_IGNORE)) {
- tag->flags |= FL_IGNORE;
+ if (parent) {
+ if ((parent->flags & FL_IGNORE)) {
+ tag->flags |= FL_IGNORE;
+ }
+
+ parent->content_length += tag->content_length;
}
g_node_append (*cur_level, nnode);
@@ -1605,7 +1623,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
guint obrace = 0, ebrace = 0;
GNode *cur_level = NULL;
gint substate = 0, len, href_offset = -1;
- struct html_tag *cur_tag = NULL;
+ struct html_tag *cur_tag = NULL, *content_tag = NULL;
struct rspamd_url *url = NULL, *turl;
struct rspamd_process_exception *ex;
enum {
@@ -1830,10 +1848,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
save_space = FALSE;
}
}
-
- if (cur_tag) {
- cur_tag->content_length ++;
- }
}
else {
if (c != p) {
@@ -1847,6 +1861,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
}
g_byte_array_append (dest, c, len);
+
+ if (content_tag) {
+ content_tag->content_length = len;
+ content_tag->content = c;
+ content_tag = NULL;
+ }
}
state = tag_begin;
@@ -1922,6 +1942,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
setbit (hc->tags_seen, cur_tag->id);
}
+ if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
+ content_tag = cur_tag;
+ }
+
/* Handle newlines */
if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
diff --git a/src/libserver/html.h b/src/libserver/html.h
index eb8ca8070..35af8f05c 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h
@@ -77,10 +77,22 @@ struct html_block {
gchar *class;
};
+/* Public tags flags */
+/* XML tag */
+#define FL_XML (1 << 23)
+/* Closing tag */
+#define FL_CLOSING (1 << 24)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED (1 << 25)
+#define FL_BROKEN (1 << 26)
+#define FL_IGNORE (1 << 27)
+#define FL_BLOCK (1 << 28)
+
struct html_tag {
gint id;
gint flags;
gsize content_length;
+ const gchar *content;
struct html_tag_component name;
GQueue *params;
gpointer extra; /** Additional data associated with tag (e.g. image) */
@@ -124,6 +136,13 @@ gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
const gchar* rspamd_html_tag_by_id (gint id);
/**
+ * Returns HTML tag id by name
+ * @param name NUL-terminated tag name to look up
+ * @return tag id, or -1 if no tag with this name is known
+ */
+gint rspamd_html_tag_by_name (const gchar *name);
+
+/**
* Extract URL from HTML tag component and sets component elements if needed
* @param pool
* @param start
diff --git a/src/libserver/html_tags.h b/src/libserver/html_tags.h
index 1b82832f1..8809975e3 100644
--- a/src/libserver/html_tags.h
+++ b/src/libserver/html_tags.h
@@ -195,14 +195,6 @@ typedef enum
#define CM_OMITST (1 << 21)
/* Unique elements */
#define CM_UNIQUE (1 << 22)
-/* XML tag */
-#define FL_XML (1 << 23)
-/* Closing tag */
-#define FL_CLOSING (1 << 24)
-/* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED (1 << 25)
-#define FL_BROKEN (1 << 26)
-#define FL_IGNORE (1 << 27)
-#define FL_BLOCK (1 << 28)
+
#endif /* SRC_LIBSERVER_HTML_TAGS_H_ */