Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

html.h 3.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. /*
  2. * Functions for simple html parsing
  3. */
  4. #ifndef RSPAMD_HTML_H
  5. #define RSPAMD_HTML_H
  6. #include "config.h"
  7. #include "mem_pool.h"
  8. /*
  9. * HTML content flags
     *
     * Each flag occupies a distinct bit (bits 0..7), so any combination can be
     * stored in one integer and tested with bitwise AND.
     * NOTE(review): presumably these are kept in html_content.flags - confirm
     * against the implementation file.
  10. */
  11. #define RSPAMD_HTML_FLAG_BAD_START (1 << 0)
  12. #define RSPAMD_HTML_FLAG_BAD_ELEMENTS (1 << 1)
  13. #define RSPAMD_HTML_FLAG_XML (1 << 2)
  14. #define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3)
  15. #define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4)
  16. #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
  17. #define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6)
  18. #define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7)
  19. /*
  20. * Image flags
     *
     * These use bits 0..2, i.e. the same numeric values as the first three
     * HTML content flags (RSPAMD_HTML_FLAG_BAD_START and so on), so image
     * flags and content flags must not be mixed in the same flags word.
     * NOTE(review): presumably these are kept in html_image.flags - confirm.
  21. */
  22. #define RSPAMD_HTML_FLAG_IMAGE_EMBEDDED (1 << 0)
  23. #define RSPAMD_HTML_FLAG_IMAGE_EXTERNAL (1 << 1)
  24. #define RSPAMD_HTML_FLAG_IMAGE_DATA (1 << 2)
  /*
   * Kind of a tag component, used as the `type` member of
   * struct html_tag_component. Values are consecutive, starting at 0 for
   * RSPAMD_HTML_COMPONENT_NAME and ending at 8 for RSPAMD_HTML_COMPONENT_SIZE.
   * NOTE(review): judging by the names, NAME is the tag name and the rest are
   * tag attributes (href, color, bgcolor, ...) - confirm against the parser.
   */
  25. enum html_component_type {
  26. RSPAMD_HTML_COMPONENT_NAME = 0,
  27. RSPAMD_HTML_COMPONENT_HREF,
  28. RSPAMD_HTML_COMPONENT_COLOR,
  29. RSPAMD_HTML_COMPONENT_BGCOLOR,
  30. RSPAMD_HTML_COMPONENT_STYLE,
  31. RSPAMD_HTML_COMPONENT_CLASS,
  32. RSPAMD_HTML_COMPONENT_WIDTH,
  33. RSPAMD_HTML_COMPONENT_HEIGHT,
  34. RSPAMD_HTML_COMPONENT_SIZE,
  35. };
  /*
   * One typed piece of a tag, described as a (pointer, length) pair.
   * NOTE(review): presumably `start` points into a buffer owned elsewhere and
   * is not NUL-terminated, with `len` giving the byte count - confirm.
   */
  36. struct html_tag_component {
  37. enum html_component_type type; /* which kind of component this is */
  38. guint len; /* length of the data at `start` */
  39. const guchar *start; /* read-only pointer to the component data */
  40. };
  /* Forward declaration only: struct rspamd_image is defined outside this header */
  41. struct rspamd_image;
  /*
   * Description of an image found in HTML content.
   * NOTE(review): field meanings below are inferred from names and types;
   * confirm units and ownership against the implementation.
   */
  42. struct html_image {
  43. guint height; /* presumably the declared height - units not visible here */
  44. guint width; /* presumably the declared width - units not visible here */
  45. guint flags; /* presumably RSPAMD_HTML_FLAG_IMAGE_* bits - confirm */
  46. gchar *src; /* presumably the image source string - confirm */
  47. struct rspamd_url *url; /* URL object for the image, type defined elsewhere */
  48. struct rspamd_image *embedded_image; /* image object, type defined elsewhere */
  49. struct html_tag *tag; /* tag this image is associated with */
  50. };
  /*
   * Color that can be accessed either per channel (d.comp) or as one packed
   * 32-bit word (d.val).
   *
   * The member order inside d.comp is reversed depending on byte order so
   * that, assuming the four guint8 members are laid out without padding,
   * d.val reads as (alpha << 24) | (r << 16) | (g << 8) | b on both layouts.
   */
  51. struct html_color {
  52. union {
  53. struct {
  /* Little-endian member order; also selected when BYTE_ORDER is undefined */
  54. #if !defined(BYTE_ORDER) || BYTE_ORDER == LITTLE_ENDIAN
  55. guint8 b;
  56. guint8 g;
  57. guint8 r;
  58. guint8 alpha;
  /* Any other byte order gets the big-endian member order */
  59. #else
  60. guint8 alpha;
  61. guint8 r;
  62. guint8 g;
  63. guint8 b;
  64. #endif
  65. } comp;
  66. guint32 val; /* all four channels viewed as a single 32-bit value */
  67. } d;
  68. gboolean valid; /* NOTE(review): presumably TRUE once a color was parsed - confirm */
  69. };
  /*
   * Presentation properties attached to a tag: colors, style, font size,
   * visibility and class.
   * NOTE(review): how these values are derived is not visible in this header.
   */
  70. struct html_block {
  71. struct html_tag *tag; /* tag this block belongs to */
  72. struct html_color font_color;
  73. struct html_color background_color;
  74. struct html_tag_component style; /* presumably the raw style attribute - confirm */
  75. guint font_size; /* units not visible here */
  76. gboolean visible;
  /* `class` is a reserved word in C++, so this header is C-only as written */
  77. gchar *class;
  78. };
  /*
   * The public flags use bits 23..28 only.
   * NOTE(review): presumably they are stored in html_tag.flags and the lower
   * bits are reserved for flags private to the implementation - confirm.
   */
  79. /* Public tags flags */
  80. /* XML tag */
  81. #define FL_XML (1 << 23)
  82. /* Closing tag */
  83. #define FL_CLOSING (1 << 24)
  84. /* Fully closed tag (e.g. <a attrs />) */
  85. #define FL_CLOSED (1 << 25)
  86. #define FL_BROKEN (1 << 26)
  87. #define FL_IGNORE (1 << 27)
  88. #define FL_BLOCK (1 << 28)
  /*
   * A single parsed HTML tag.
   * NOTE(review): field meanings marked "presumably" are inferred from names
   * and types only; confirm against the parser.
   */
  89. struct html_tag {
  90. gint id; /* presumably the id used by rspamd_html_tag_by_id()/_by_name() */
  91. gint flags; /* presumably FL_* bits - confirm */
  92. guint content_length; /* presumably the length of `content` - confirm */
  93. struct html_tag_component name; /* tag name as a (pointer, length) component */
  94. const gchar *content; /* read-only pointer to the tag content */
  95. GQueue *params; /* presumably a queue of struct html_tag_component - confirm */
  96. gpointer extra; /** Additional data associated with tag (e.g. image) */
  97. GNode *parent; /* parent node in the tag tree */
  98. };
  99. /* Forward declaration; struct rspamd_task is not used by the declarations visible in this header */
  100. struct rspamd_task;
  /*
   * State for one HTML part; a pointer to it is taken by the
   * rspamd_html_process_part*() functions and rspamd_html_tag_seen().
   */
  101. struct html_content {
  102. struct rspamd_url *base_url; /* presumably from a <base> tag - confirm */
  103. GNode *html_tags; /* root of the tag tree */
  104. gint flags; /* presumably RSPAMD_HTML_FLAG_* content bits - confirm */
  105. guint total_tags;
  106. struct html_color bgcolor;
  107. guchar *tags_seen; /* presumably what rspamd_html_tag_seen() consults; layout not visible here */
  108. GPtrArray *images; /* presumably struct html_image pointers - confirm */
  109. GPtrArray *blocks; /* presumably struct html_block pointers - confirm */
  110. };
  111. /*
  112. * Decode HTML entities in text. Text is modified in place.
     *
     * s is the buffer to decode, overwritten with the decoded text; len is
     * its length.
     * NOTE(review): the guint return value is presumably the length of the
     * decoded text - confirm against the implementation. The function name
     * keeps the historical "entitles" spelling.
  113. */
  114. guint rspamd_html_decode_entitles_inplace (gchar *s, gsize len);
  /**
   * Process an HTML part.
   * Takes the same first three arguments as rspamd_html_process_part_full()
   * without the exceptions/urls/emails outputs.
   * NOTE(review): presumably a convenience wrapper around the _full variant -
   * confirm against the implementation.
   * @param pool memory pool
   * @param hc HTML content state for this part
   * @param in input bytes
   * @return a GByteArray; NOTE(review): presumably the text extracted from
   * the part, ownership not visible in this header
   */
  115. GByteArray* rspamd_html_process_part (rspamd_mempool_t *pool,
  116. struct html_content *hc,
  117. GByteArray *in);
  /**
   * Process an HTML part, with additional list and hash table arguments.
   * @param pool memory pool
   * @param hc HTML content state for this part
   * @param in input bytes
   * @param exceptions pointer to a GList pointer; NOTE(review): presumably
   * an output list updated during processing - confirm
   * @param urls hash table; NOTE(review): presumably receives URLs found in
   * the part - confirm
   * @param emails hash table; NOTE(review): presumably receives email
   * addresses found in the part - confirm
   * @return a GByteArray; NOTE(review): presumably the text extracted from
   * the part, ownership not visible in this header
   */
  118. GByteArray* rspamd_html_process_part_full (rspamd_mempool_t *pool,
  119. struct html_content *hc,
  120. GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails);
  121. /*
  122. * Returns true if a specified tag has been seen in a part
     *
     * hc is the HTML content state of the part; tagname is the tag name to
     * look up.
     * NOTE(review): the result for an unknown tag name or a NULL hc is not
     * visible in this header.
  123. */
  124. gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
  125. /**
  126. * Returns name for the specified tag id
  127. * @param id numeric tag id (NOTE(review): presumably the html_tag.id value - confirm)
  128. * @return read-only tag name; the result for an unknown id is not visible in this header
  129. */
  130. const gchar* rspamd_html_tag_by_id (gint id);
  131. /**
  132. * Returns HTML tag id by name
  133. * @param name tag name to look up
  134. * @return tag id; the result for an unknown name is not visible in this header
  135. */
  136. gint rspamd_html_tag_by_name (const gchar *name);
  137. /**
  138. * Extract URL from HTML tag component and sets component elements if needed
  139. * @param pool memory pool
  140. * @param start start of the URL text
  141. * @param len length of the text at start
  142. * @param comp component whose elements may be updated
  143. * @return URL object; the result on failure is not visible in this header
  144. */
  145. struct rspamd_url * rspamd_html_process_url (rspamd_mempool_t *pool,
  146. const gchar *start, guint len,
  147. struct html_tag_component *comp);
  148. #endif