You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

chartable.c 5.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /***MODULE:chartable
  17. * rspamd module that makes marks based on symbol chains
  18. *
  19. * Allowed options:
  20. * - symbol (string): symbol to insert (default: 'R_CHARSET_MIXED')
  21. * - threshold (double): value that would be used as threshold in expression characters_changed / total_characters
  22. * (e.g. if threshold is 0.1 then a charset change should occur more often than once in 10 symbols), default: 0.1
  23. */
  24. #include "config.h"
  25. #include "libmime/message.h"
  26. #include "rspamd.h"
  /* Symbol name used when the "symbol" option is not set for the chartable module */
  27. #define DEFAULT_SYMBOL "R_CHARSET_MIXED"
  /*
   * Value used for the "threshold" option when it is not set or is not a valid
   * number; check_part() compares its mark/total ratio against the threshold
   */
  28. #define DEFAULT_THRESHOLD 0.1
  29. /* Forward declarations of the module entry points stored in the chartable_module descriptor */
  30. gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
  31. gint chartable_module_config (struct rspamd_config *cfg);
  32. gint chartable_module_reconfig (struct rspamd_config *cfg);
  /*
   * Module descriptor, initialised positionally: the module name, then the
   * init, config and reconfig entry points declared above, then a NULL slot
   * and the RSPAMD_MODULE_VER macro.
   * NOTE(review): the meaning of the NULL slot is defined by module_t, which is
   * declared outside this file - confirm against its declaration.
   */
  33. module_t chartable_module = {
  34. "chartable",
  35. chartable_module_init,
  36. chartable_module_config,
  37. chartable_module_reconfig,
  38. NULL,
  39. RSPAMD_MODULE_VER
  40. };
  /* Module-wide state; one instance is allocated in chartable_module_init() */
  41. struct chartable_ctx {
  /* Kept as the first member: init hands this object out cast to struct module_ctx * */
  42. struct module_ctx ctx;
  /* Name of the symbol that is registered in config and inserted by the callback */
  43. const gchar *symbol;
  /* check_part() reports a part when its mark/total ratio is greater than this value */
  44. double threshold;
  /* Pool created in init and recreated in reconfig; no other use is visible in this file */
  45. rspamd_mempool_t *chartable_pool;
  46. };
  /* The module context; NULL until chartable_module_init() allocates it */
  47. static struct chartable_ctx *chartable_module_ctx = NULL;
  /* Callback registered for the module symbol in chartable_module_config(); defined further down */
  48. static void chartable_symbol_callback (struct rspamd_task *task, void *unused);
  49. gint
  50. chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
  51. {
  52. chartable_module_ctx = g_malloc (sizeof (struct chartable_ctx));
  53. chartable_module_ctx->chartable_pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL);
  54. *ctx = (struct module_ctx *)chartable_module_ctx;
  55. return 0;
  56. }
  57. gint
  58. chartable_module_config (struct rspamd_config *cfg)
  59. {
  60. const ucl_object_t *value;
  61. gint res = TRUE;
  62. if (!rspamd_config_is_module_enabled (cfg, "chartable")) {
  63. return TRUE;
  64. }
  65. if ((value =
  66. rspamd_config_get_module_opt (cfg, "chartable", "symbol")) != NULL) {
  67. chartable_module_ctx->symbol = ucl_obj_tostring (value);
  68. }
  69. else {
  70. chartable_module_ctx->symbol = DEFAULT_SYMBOL;
  71. }
  72. if ((value =
  73. rspamd_config_get_module_opt (cfg, "chartable", "threshold")) != NULL) {
  74. if (!ucl_obj_todouble_safe (value, &chartable_module_ctx->threshold)) {
  75. msg_warn_config ("invalid numeric value");
  76. chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
  77. }
  78. }
  79. else {
  80. chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
  81. }
  82. rspamd_symbols_cache_add_symbol (cfg->cache,
  83. chartable_module_ctx->symbol,
  84. 0,
  85. chartable_symbol_callback,
  86. NULL,
  87. SYMBOL_TYPE_NORMAL,
  88. -1);
  89. msg_info_config ("init internal chartable module");
  90. return res;
  91. }
  92. gint
  93. chartable_module_reconfig (struct rspamd_config *cfg)
  94. {
  95. rspamd_mempool_delete (chartable_module_ctx->chartable_pool);
  96. chartable_module_ctx->chartable_pool = rspamd_mempool_new (1024, NULL);
  97. return chartable_module_config (cfg);
  98. }
  99. static gboolean
  100. check_part (struct mime_text_part *part, gboolean raw_mode)
  101. {
  102. guchar *p, *p1;
  103. gunichar c, t;
  104. GUnicodeScript scc, sct;
  105. guint32 mark = 0, total = 0, max = 0, i;
  106. guint32 remain = part->content->len;
  107. guint32 scripts[G_UNICODE_SCRIPT_NKO];
  108. GUnicodeScript sel = 0;
  109. p = part->content->data;
  110. if (IS_PART_UTF (part) || raw_mode) {
  111. while (remain > 1) {
  112. if ((g_ascii_isalpha (*p) &&
  113. (*(p + 1) & 0x80)) ||
  114. ((*p & 0x80) && g_ascii_isalpha (*(p + 1)))) {
  115. mark++;
  116. total++;
  117. }
  118. /* Current and next symbols are of one class */
  119. else if (((*p & 0x80) &&
  120. (*(p + 1) & 0x80)) ||
  121. (g_ascii_isalpha (*p) && g_ascii_isalpha (*(p + 1)))) {
  122. total++;
  123. }
  124. p++;
  125. remain--;
  126. }
  127. }
  128. else {
  129. memset (&scripts, 0, sizeof (scripts));
  130. while (remain > 0) {
  131. c = g_utf8_get_char_validated (p, remain);
  132. if (c == (gunichar) - 2 || c == (gunichar) - 1) {
  133. /* Invalid characters detected, stop processing */
  134. return FALSE;
  135. }
  136. scc = g_unichar_get_script (c);
  137. if (scc < (gint)G_N_ELEMENTS (scripts)) {
  138. scripts[scc]++;
  139. }
  140. p1 = g_utf8_next_char (p);
  141. remain -= p1 - p;
  142. p = p1;
  143. if (remain > 0) {
  144. t = g_utf8_get_char_validated (p, remain);
  145. if (t == (gunichar) - 2 || t == (gunichar) - 1) {
  146. /* Invalid characters detected, stop processing */
  147. return FALSE;
  148. }
  149. sct = g_unichar_get_script (t);
  150. if (g_unichar_isalpha (c) && g_unichar_isalpha (t)) {
  151. /* We have two unicode alphanumeric characters, so we can check its script */
  152. if (sct != scc) {
  153. mark++;
  154. }
  155. total++;
  156. }
  157. p1 = g_utf8_next_char (p);
  158. remain -= p1 - p;
  159. p = p1;
  160. }
  161. }
  162. /* Detect the mostly charset of this part */
  163. for (i = 0; i < G_N_ELEMENTS (scripts); i++) {
  164. if (scripts[i] > max) {
  165. max = scripts[i];
  166. sel = i;
  167. }
  168. }
  169. part->script = sel;
  170. }
  171. if (total == 0) {
  172. return 0;
  173. }
  174. return ((double)mark / (double)total) > chartable_module_ctx->threshold;
  175. }
  176. static void
  177. chartable_symbol_callback (struct rspamd_task *task, void *unused)
  178. {
  179. guint i;
  180. struct mime_text_part *part;
  181. for (i = 0; i < task->text_parts->len; i ++) {
  182. part = g_ptr_array_index (task->text_parts, i);
  183. if (!IS_PART_EMPTY (part) && check_part (part, task->cfg->raw_mode)) {
  184. rspamd_task_insert_result (task, chartable_module_ctx->symbol, 1, NULL);
  185. }
  186. }
  187. }