You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.h 3.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  /*
   * Functions for simple HTML parsing
   */
  4. #ifndef RSPAMD_HTML_H
  5. #define RSPAMD_HTML_H
  6. #include "config.h"
  7. #include "libutil/mem_pool.h"
  8. #include "libserver/url.h"
  9. #ifdef __cplusplus
  10. extern "C" {
  11. #endif
  /*
   * HTML content flags.
   *
   * Every flag is a distinct single bit (bits 0..7), so any combination can be
   * OR-ed together into one integer and tested with a bitwise AND.
   * NOTE(review): presumably stored in `html_content.flags` -- confirm against
   * the parser implementation.
   */
  #define RSPAMD_HTML_FLAG_BAD_START (1 << 0)
  #define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1)
  #define RSPAMD_HTML_FLAG_XML (1 << 2)
  #define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3)
  #define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4)
  #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
  #define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6)
  #define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7)
  /*
   * Image flags.
   *
   * Distinct single bits (bits 0..2) that may be OR-ed together. Their numeric
   * values coincide with the first three HTML content flags, so the two flag
   * sets must not be mixed in the same integer.
   * NOTE(review): presumably stored in `html_image.flags` -- confirm.
   */
  #define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0)
  #define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1)
  #define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2)
  /**
   * Kind of a tag component; used as the `type` field of
   * `struct html_tag_component`.
   *
   * Enumerators are numbered consecutively starting from 0.
   * NOTE(review): judging by the names, each value corresponds to an HTML
   * attribute (href, color, style, ...) or to the tag name -- confirm.
   */
  enum html_component_type {
      RSPAMD_HTML_COMPONENT_NAME = 0,
      RSPAMD_HTML_COMPONENT_HREF,
      RSPAMD_HTML_COMPONENT_COLOR,
      RSPAMD_HTML_COMPONENT_BGCOLOR,
      RSPAMD_HTML_COMPONENT_STYLE,
      RSPAMD_HTML_COMPONENT_CLASS,
      RSPAMD_HTML_COMPONENT_WIDTH,
      RSPAMD_HTML_COMPONENT_HEIGHT,
      RSPAMD_HTML_COMPONENT_SIZE,
      RSPAMD_HTML_COMPONENT_REL,
      RSPAMD_HTML_COMPONENT_ALT,
  };
  42. struct html_tag_component {
  43. enum html_component_type type;
  44. guint len;
  45. const guchar *start;
  46. };
  47. struct rspamd_image;
  48. struct html_image {
  49. guint height;
  50. guint width;
  51. guint flags;
  52. gchar *src;
  53. struct rspamd_url *url;
  54. struct rspamd_image *embedded_image;
  55. struct html_tag *tag;
  56. };
  57. struct html_color {
  58. union {
  59. struct {
  60. #if !defined(BYTE_ORDER) || BYTE_ORDER == LITTLE_ENDIAN
  61. guint8 b;
  62. guint8 g;
  63. guint8 r;
  64. guint8 alpha;
  65. #else
  66. guint8 alpha;
  67. guint8 r;
  68. guint8 g;
  69. guint8 b;
  70. #endif
  71. } comp;
  72. guint32 val;
  73. } d;
  74. gboolean valid;
  75. };
  76. struct html_block {
  77. struct html_tag *tag;
  78. struct html_color font_color;
  79. struct html_color background_color;
  80. struct html_tag_component style;
  81. guint font_size;
  82. gboolean visible;
  83. gchar *html_class;
  84. };
  /*
   * Public tag flags.
   *
   * Distinct single bits occupying bits 23..30, so they fit in a signed 32-bit
   * integer without touching the sign bit.
   * NOTE(review): presumably stored in `html_tag.flags`, with the lower bits
   * reserved for flags private to the implementation -- confirm.
   */
  /* XML tag */
  #define FL_XML (1 << 23)
  /* Closing tag */
  #define FL_CLOSING (1 << 24)
  /* Fully closed tag (e.g. <a attrs />) */
  #define FL_CLOSED (1 << 25)
  #define FL_BROKEN (1 << 26)
  #define FL_IGNORE (1 << 27)
  #define FL_BLOCK (1 << 28)
  #define FL_HREF (1 << 29)
  #define FL_IMAGE (1 << 30)
  97. struct html_tag {
  98. gint id;
  99. gint flags;
  100. struct html_tag_component name;
  101. guint content_length;
  102. goffset content_offset;
  103. GQueue *params;
  104. gpointer extra; /** Additional data associated with tag (e.g. image) */
  105. GNode *parent;
  106. };
  107. /* Forwarded declaration */
  108. struct rspamd_task;
  109. struct html_content {
  110. struct rspamd_url *base_url;
  111. GNode *html_tags;
  112. gint flags;
  113. guint total_tags;
  114. struct html_color bgcolor;
  115. guchar *tags_seen;
  116. GPtrArray *images;
  117. GPtrArray *blocks;
  118. GByteArray *parsed;
  119. };
  120. /*
  121. * Decode HTML entitles in text. Text is modified in place.
  122. */
  123. guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
  124. GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,
  125. struct html_content *hc,
  126. GByteArray *in);
  127. GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
  128. struct html_content *hc,
  129. GByteArray *in, GList **exceptions,
  130. khash_t (rspamd_url_hash) *url_set,
  131. GPtrArray *part_urls,
  132. bool allow_css);
  133. /*
  134. * Returns true if a specified tag has been seen in a part
  135. */
  136. gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
  137. /**
  138. * Returns name for the specified tag id
  139. * @param id
  140. * @return
  141. */
  142. const gchar *rspamd_html_tag_by_id (gint id);
  143. /**
  144. * Returns HTML tag id by name
  145. * @param name
  146. * @return
  147. */
  148. gint rspamd_html_tag_by_name (const gchar *name);
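  /**
   * Extracts a URL from an HTML tag component and sets the component elements
   * if needed
   * @param pool
   * @param start
   * @param len
   * @param comp
   * @return
   *
   * NOTE(review): no declaration follows this comment in this header; the
   * documented function appears to have been removed or moved. Confirm and
   * either re-attach the comment to its declaration or drop it.
   */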
  157. #ifdef __cplusplus
  158. }
  159. #endif
  160. #endif