source.dussan.org Git - rspamd.git/commitdiff
Fix content saving in html parser.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jul 2015 12:01:09 +0000 (13:01 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 16 Jul 2015 12:01:09 +0000 (13:01 +0100)
src/libserver/html.c

index 5bf042153acf7fdf7eb1bfeb61a70a810c3f9cf0..b979b3f8c26c127f6ded591225609a5b9c509516 100644 (file)
@@ -1391,6 +1391,7 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
                xml_tag_end,
                content_ignore,
                content_write,
+               content_ignore_sp
        } state = parse_start;
 
        g_assert (in != NULL);
@@ -1558,11 +1559,26 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
 
                case content_write:
                        if (t != '<') {
-                               p ++;
-
                                if (t == '&') {
                                        need_decode = TRUE;
                                }
+                               else if (g_ascii_isspace (t)) {
+
+                                       if (c != p) {
+                                               if (need_decode) {
+                                                       len = rspamd_html_decode_entitles_inplace ((gchar *)c,
+                                                                       p - c);
+                                               }
+                                               else {
+                                                       len = p - c;
+                                               }
+
+                                               g_byte_array_append (dest, c, len);
+                                       }
+
+                                       c = p;
+                                       state = content_ignore_sp;
+                               }
                        }
                        else {
                                if (c != p) {
@@ -1579,7 +1595,28 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
                                }
 
                                state = tag_begin;
+                               continue;
                        }
+
+                       p ++;
+                       break;
+
+               case content_ignore_sp:
+                       if (!g_ascii_isspace (t)) {
+                               c = p;
+                               state = content_write;
+
+                               if (t != '<') {
+                                       /* Append one space if needed */
+                                       if (dest->len > 0 &&
+                                                       !g_ascii_isspace (dest->data[dest->len - 1])) {
+                                               g_byte_array_append (dest, " ", 1);
+                                       }
+                               }
+                               continue;
+                       }
+
+                       p ++;
                        break;
 
                case sgml_content:
@@ -1627,9 +1664,11 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc,
                                }
                        }
                        else {
-                               /* Do not save content of SGML/XML tags */
-                               state = content_ignore;
+                               state = content_write;
                        }
+
+                       p++;
+                       c = p;
                        cur_tag = NULL;
                        break;
                }