You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

message.h 6.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  /**
   * @file message.h
   * Message processing functions and structures
   */
  5. #ifndef RSPAMD_MESSAGE_H
  6. #define RSPAMD_MESSAGE_H
  7. #include "config.h"
  8. #include "libmime/email_addr.h"
  9. #include "libutil/addr.h"
  10. #include "libcryptobox/cryptobox.h"
  11. #include "libmime/mime_headers.h"
  12. #include "libmime/content_type.h"
  13. #include "libserver/url.h"
  14. #include "libutil/ref.h"
  15. #include "libutil/str_util.h"
  16. #include <unicode/uchar.h>
  17. #include <unicode/utext.h>
  18. #ifdef __cplusplus
  19. extern "C" {
  20. #endif
  /*
   * Forward declarations of types defined elsewhere.  The declarations visible
   * in this header refer to rspamd_task, rspamd_image and rspamd_archive only
   * through pointers, so the full definitions are not needed here.
   * NOTE(review): controller_session is not referenced by any declaration
   * visible in this header - confirm whether it is still required.
   */
  struct rspamd_task;
  struct controller_session;
  struct rspamd_image;
  struct rspamd_archive;
  /**
   * Bit flags describing a MIME part.
   * Each constant is a single distinct bit (bits 1, 4, 5 and 6), so the values
   * can be combined with bitwise OR.
   * NOTE(review): presumably stored in rspamd_mime_part.flags - confirm.
   */
  enum rspamd_mime_part_flags {
      /* Spelling "ATTACHEMENT" is part of the public name; do not "fix" it. */
      RSPAMD_MIME_PART_ATTACHEMENT = (1u << 1u),
      /* presumably: the declared content-transfer-encoding is invalid - confirm */
      RSPAMD_MIME_PART_BAD_CTE = (1u << 4u),
      /* presumably: no content-transfer-encoding was declared - confirm */
      RSPAMD_MIME_PART_MISSING_CTE = (1u << 5u),
      /* presumably: text extraction must be skipped for this part - confirm */
      RSPAMD_MIME_PART_NO_TEXT_EXTRACTION = (1u << 6u),
  };
  /**
   * Kind of a MIME part, as stored in rspamd_mime_part.part_type.
   * Values are sequential starting from 0 (RSPAMD_MIME_PART_UNDEFINED).
   * NOTE(review): presumably selects the active member of
   * rspamd_mime_part.specific - confirm against the parser.
   */
  enum rspamd_mime_part_type {
      RSPAMD_MIME_PART_UNDEFINED = 0,
      RSPAMD_MIME_PART_MULTIPART,
      RSPAMD_MIME_PART_MESSAGE,
      RSPAMD_MIME_PART_TEXT,
      RSPAMD_MIME_PART_ARCHIVE,
      RSPAMD_MIME_PART_IMAGE,
      RSPAMD_MIME_PART_CUSTOM_LUA
  };
  /*
   * Part-type predicates.  Each evaluates to non-zero when `part` is a non-NULL
   * pointer whose part_type field equals the corresponding
   * RSPAMD_MIME_PART_* constant, and to 0 otherwise (including for NULL).
   * `part` is evaluated twice, so the argument must not have side effects.
   */
  #define IS_PART_MULTIPART(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_MULTIPART))
  #define IS_PART_TEXT(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_TEXT))
  #define IS_PART_MESSAGE(part) ((part) && ((part)->part_type == RSPAMD_MIME_PART_MESSAGE))
  /**
   * Content transfer encoding ("cte") of a MIME part.
   * Values are explicitly numbered; RSPAMD_CTE_UNKNOWN is 0.
   * NOTE(review): QP / B64 / UUE presumably stand for quoted-printable,
   * base64 and uuencode - confirm against rspamd_cte_from_string().
   */
  enum rspamd_cte {
      RSPAMD_CTE_UNKNOWN = 0,
      RSPAMD_CTE_7BIT = 1,
      RSPAMD_CTE_8BIT = 2,
      RSPAMD_CTE_QP = 3,
      RSPAMD_CTE_B64 = 4,
      RSPAMD_CTE_UUE = 5,
  };
  51. struct rspamd_mime_text_part;
  52. struct rspamd_mime_multipart {
  53. GPtrArray *children;
  54. rspamd_ftok_t boundary;
  55. };
  /**
   * Type tag used by struct rspamd_lua_specific_part.
   * Values are sequential starting from 0 (RSPAMD_LUA_PART_TEXT).
   * NOTE(review): presumably the Lua type of the object held for a
   * custom Lua part - confirm.
   */
  enum rspamd_lua_specific_type {
      RSPAMD_LUA_PART_TEXT,
      RSPAMD_LUA_PART_STRING,
      RSPAMD_LUA_PART_TABLE,
      RSPAMD_LUA_PART_FUNCTION,
      RSPAMD_LUA_PART_UNKNOWN,
  };
  63. struct rspamd_lua_specific_part {
  64. int cbref;
  65. enum rspamd_lua_specific_type type;
  66. };
  67. struct rspamd_mime_part {
  68. struct rspamd_content_type *ct;
  69. struct rspamd_content_type *detected_ct;
  70. char *detected_type;
  71. char *detected_ext;
  72. struct rspamd_content_disposition *cd;
  73. rspamd_ftok_t raw_data;
  74. rspamd_ftok_t parsed_data;
  75. struct rspamd_mime_part *parent_part;
  76. struct rspamd_mime_header *headers_order;
  77. struct rspamd_mime_headers_table *raw_headers;
  78. GPtrArray *urls;
  79. char *raw_headers_str;
  80. gsize raw_headers_len;
  81. enum rspamd_cte cte;
  82. unsigned int flags;
  83. enum rspamd_mime_part_type part_type;
  84. unsigned int part_number;
  85. union {
  86. struct rspamd_mime_multipart *mp;
  87. struct rspamd_mime_text_part *txt;
  88. struct rspamd_image *img;
  89. struct rspamd_archive *arch;
  90. struct rspamd_lua_specific_part lua_specific;
  91. } specific;
  92. unsigned char digest[rspamd_cryptobox_HASHBYTES];
  93. };
  /*
   * Bit flags for the `flags` field tested by the IS_TEXT_PART_* macros below
   * (struct rspamd_mime_text_part has such a field).  Each value is a single
   * distinct bit.  Note that RSPAMD_MIME_TEXT_PART_ATTACHMENT has no "_FLAG_"
   * infix, unlike its siblings; the name is public and is kept as is.
   */
  #define RSPAMD_MIME_TEXT_PART_FLAG_UTF (1 << 0)
  #define RSPAMD_MIME_TEXT_PART_FLAG_EMPTY (1 << 1)
  #define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 2)
  #define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW (1 << 3)
  #define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 4)
  #define RSPAMD_MIME_TEXT_PART_ATTACHMENT (1 << 5)

  /*
   * Flag predicates.  Each yields the masked flag bit (non-zero when set, 0
   * otherwise), not a normalised 0/1 value.  Unlike the IS_PART_* macros these
   * do not check `part` for NULL: the pointer is dereferenced unconditionally.
   */
  #define IS_TEXT_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)
  #define IS_TEXT_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)
  #define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
  #define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT)
  104. struct rspamd_mime_text_part {
  105. const char *language;
  106. GPtrArray *languages;
  107. const char *real_charset;
  108. /* Raw data in native encoding */
  109. rspamd_ftok_t raw;
  110. rspamd_ftok_t parsed; /* decoded from mime encodings */
  111. /* UTF8 content */
  112. rspamd_ftok_t utf_content; /* utf8 encoded processed content */
  113. GByteArray *utf_raw_content; /* utf raw content */
  114. GByteArray *utf_stripped_content; /* utf content with no newlines */
  115. GArray *normalized_hashes; /* Array of uint64_t */
  116. GArray *utf_words; /* Array of rspamd_stat_token_t */
  117. UText utf_stripped_text; /* Used by libicu to represent the utf8 content */
  118. GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
  119. void *html;
  120. GList *exceptions; /**< list of offsets of urls */
  121. struct rspamd_mime_part *mime_part;
  122. unsigned int flags;
  123. unsigned int nlines;
  124. unsigned int spaces;
  125. unsigned int nwords;
  126. unsigned int non_ascii_chars;
  127. unsigned int ascii_chars;
  128. unsigned int double_spaces;
  129. unsigned int non_spaces;
  130. unsigned int empty_lines;
  131. unsigned int capital_letters;
  132. unsigned int numeric_characters;
  133. unsigned int unicode_scripts;
  134. };
  135. struct rspamd_message_raw_headers_content {
  136. const char *begin;
  137. gsize len;
  138. const char *body_start;
  139. };
  140. struct rspamd_message {
  141. const char *message_id;
  142. char *subject;
  143. GPtrArray *parts; /**< list of parsed parts */
  144. GPtrArray *text_parts; /**< list of text parts */
  145. struct rspamd_message_raw_headers_content raw_headers_content;
  146. void *received_headers; /**< list of received headers */
  147. khash_t(rspamd_url_hash) * urls;
  148. struct rspamd_mime_headers_table *raw_headers; /**< list of raw headers */
  149. struct rspamd_mime_header *headers_order; /**< order of raw headers */
  150. struct rspamd_task *task;
  151. GPtrArray *rcpt_mime;
  152. GPtrArray *from_mime;
  153. unsigned char digest[16];
  154. enum rspamd_newlines_type nlines_type; /**< type of newlines (detected on most of headers */
  155. ref_entry_t ref;
  156. };
  /*
   * Accessors for fields of task->message.
   *
   * MESSAGE_FIELD dereferences task->message unconditionally.
   *
   * MESSAGE_FIELD_CHECK yields the field when task->message is non-NULL and
   * NULL cast to the field's type otherwise.  It relies on the __typeof__
   * compiler extension, and the NULL cast only compiles for pointer (or
   * scalar) fields - it cannot be used with struct-typed fields.
   * `task` is evaluated more than once; avoid arguments with side effects.
   */
  #define MESSAGE_FIELD(task, field) ((task)->message->field)
  #define MESSAGE_FIELD_CHECK(task, field) \
      ((task)->message ? (task)->message->field : (__typeof__((task)->message->field)) NULL)
  159. /**
  160. * Parse and pre-process mime message
  161. * @param task worker_task object
  162. * @return
  163. */
  164. gboolean rspamd_message_parse(struct rspamd_task *task);
  /**
   * Process content in task (e.g. HTML parsing)
   * @param task task whose content is processed
   */
  void rspamd_message_process(struct rspamd_task *task);
  /**
   * Converts string to cte
   * @param str textual name of the content transfer encoding
   * @return the matching enum rspamd_cte value
   *         (NOTE(review): presumably RSPAMD_CTE_UNKNOWN for unrecognised input - confirm)
   */
  enum rspamd_cte rspamd_cte_from_string(const char *str);
  /**
   * Converts cte to string
   * @param ct content transfer encoding value
   * @return textual name for `ct`
   *         (NOTE(review): returned as const char *, so presumably must not be
   *         freed or modified by the caller - confirm)
   */
  const char *rspamd_cte_to_string(enum rspamd_cte ct);
  /**
   * Creates a message object for a task
   * @param task task the message is created for
   * @return pointer to the message
   *         (NOTE(review): ownership and initial reference count are not
   *         visible from this header - confirm against the implementation)
   */
  struct rspamd_message *rspamd_message_new(struct rspamd_task *task);
  /**
   * Takes a reference on a message (struct rspamd_message carries a `ref` entry)
   * @param msg message to reference
   * @return pointer to a message
   *         (NOTE(review): presumably `msg` itself - confirm)
   */
  struct rspamd_message *rspamd_message_ref(struct rspamd_message *msg);
  /**
   * Releases a reference on a message
   * @param msg message to release
   *        (NOTE(review): presumably destroyed when the last reference is
   *        dropped - confirm against the implementation)
   */
  void rspamd_message_unref(struct rspamd_message *msg);
  185. /**
  186. * Updates digest of the message if modified
  187. * @param msg
  188. * @param input
  189. * @param len
  190. */
  191. void rspamd_message_update_digest(struct rspamd_message *msg,
  192. const void *input, gsize len);
  193. #ifdef __cplusplus
  194. }
  195. #endif
  196. #endif