You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tokenizers.c 23KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*
  17. * Common tokenization functions
  18. */
  19. #include "rspamd.h"
  20. #include "tokenizers.h"
  21. #include "stat_internal.h"
  22. #include "contrib/mumhash/mum.h"
  23. #include "libmime/lang_detection.h"
  24. #include "libstemmer.h"
  25. #include <unicode/utf8.h>
  26. #include <unicode/uchar.h>
  27. #include <unicode/uiter.h>
  28. #include <unicode/ubrk.h>
  29. #include <unicode/ucnv.h>
  30. #if U_ICU_VERSION_MAJOR_NUM >= 44
  31. #include <unicode/unorm2.h>
  32. #endif
  33. #include <math.h>
/*
 * Pluggable word-extractor signature: pulls the next token from `buf`,
 * advancing `*pos`; `exceptions` is a forward-only list of
 * rspamd_process_exception regions to skip/substitute, and `*rl` receives
 * the raw token length.  Returns FALSE when the buffer is exhausted.
 * NOTE(review): `check_signature` is unused by the only implementation
 * visible in this file (rspamd_tokenizer_get_word_raw names it `unused`).
 */
typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, gchar const **pos,
                                       rspamd_stat_token_t *token,
                                       GList **exceptions, gsize *rl, gboolean check_signature);
/*
 * Per-byte delimiter lookup table for the raw (non-UTF) tokenizer:
 * a non-zero entry means the byte separates words.  Marked bytes are
 * TAB, LF, CR, space and the ASCII punctuation "#$%&()*+,-./:;<=>?[\]^_`{|}~
 * (apostrophe, '@' and digits are NOT delimiters, so they stay inside words).
 */
const gchar t_delimiters[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0};
/*
 * Get next word from specified f_str_t buf.
 *
 * Raw (byte-oriented) tokenizer: scans `buf` from `*cur` using the
 * t_delimiters table, storing the word boundaries into `token` and
 * advancing `*cur` past the word.  `exceptions` entries (sorted by
 * position) cause the scan to jump over the excepted region and return
 * early with the partial token.  `*rl` (if non-NULL) receives the raw
 * length.  Returns FALSE when the buffer is exhausted.
 */
static gboolean
rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
                              gchar const **cur, rspamd_stat_token_t *token,
                              GList **exceptions, gsize *rl, gboolean unused)
{
    gsize remain, pos;
    const gchar *p;
    struct rspamd_process_exception *ex = NULL;

    if (buf == NULL) {
        return FALSE;
    }

    g_assert(cur != NULL);

    if (exceptions != NULL && *exceptions != NULL) {
        ex = (*exceptions)->data;
    }

    /* First call: initialise the cursor from the buffer start (or the
     * first exception, if it sits at offset 0) */
    if (token->original.begin == NULL || *cur == NULL) {
        if (ex != NULL) {
            if (ex->pos == 0) {
                token->original.begin = buf->original.begin + ex->len;
                token->original.len = ex->len;
                token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
            }
            else {
                token->original.begin = buf->original.begin;
                token->original.len = 0;
            }
        }
        else {
            token->original.begin = buf->original.begin;
            token->original.len = 0;
        }
        *cur = token->original.begin;
    }

    token->original.len = 0;
    pos = *cur - buf->original.begin;
    if (pos >= buf->original.len) {
        return FALSE;
    }
    remain = buf->original.len - pos;
    p = *cur;

    /* Skip non delimiters symbols */
    /* NOTE(review): this do-while advances one byte before testing, so the
     * byte at the starting position is always consumed; on the very first
     * call that byte is the first byte of the buffer.  This matches
     * long-standing upstream behaviour — confirm before changing. */
    do {
        if (ex != NULL && ex->pos == pos) {
            /* Go to the next exception */
            *exceptions = g_list_next(*exceptions);
            *cur = p + ex->len;

            return TRUE;
        }
        pos++;
        p++;
        remain--;
    } while (remain > 0 && t_delimiters[(guchar) *p]);

    token->original.begin = p;

    /* Accumulate word bytes until the next delimiter or exception */
    while (remain > 0 && !t_delimiters[(guchar) *p]) {
        if (ex != NULL && ex->pos == pos) {
            *exceptions = g_list_next(*exceptions);
            *cur = p + ex->len;

            return TRUE;
        }
        token->original.len++;
        pos++;
        remain--;
        p++;
    }

    /* A word that runs to the exact end of the buffer is dropped here */
    if (remain == 0) {
        return FALSE;
    }

    if (rl) {
        *rl = token->original.len;
    }

    token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
    *cur = p;

    return TRUE;
}
  139. static inline gboolean
  140. rspamd_tokenize_check_limit(gboolean decay,
  141. guint word_decay,
  142. guint nwords,
  143. uint64_t *hv,
  144. uint64_t *prob,
  145. const rspamd_stat_token_t *token,
  146. gssize remain,
  147. gssize total)
  148. {
  149. static const gdouble avg_word_len = 6.0;
  150. if (!decay) {
  151. if (token->original.len >= sizeof(uint64_t)) {
  152. uint64_t tmp;
  153. memcpy(&tmp, token->original.begin, sizeof(tmp));
  154. *hv = mum_hash_step(*hv, tmp);
  155. }
  156. /* Check for decay */
  157. if (word_decay > 0 && nwords > word_decay && remain < (gssize) total) {
  158. /* Start decay */
  159. gdouble decay_prob;
  160. *hv = mum_hash_finish(*hv);
  161. /* We assume that word is 6 symbols length in average */
  162. decay_prob = (gdouble) word_decay / ((total - (remain)) / avg_word_len) * 10;
  163. decay_prob = floor(decay_prob) / 10.0;
  164. if (decay_prob >= 1.0) {
  165. *prob = G_MAXUINT64;
  166. }
  167. else {
  168. *prob = (uint64_t) (decay_prob * (double) G_MAXUINT64);
  169. }
  170. return TRUE;
  171. }
  172. }
  173. else {
  174. /* Decaying probability */
  175. /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
  176. *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
  177. if (*hv > *prob) {
  178. return TRUE;
  179. }
  180. }
  181. return FALSE;
  182. }
  183. static inline gboolean
  184. rspamd_utf_word_valid(const guchar *text, const guchar *end,
  185. int32_t start, int32_t finish)
  186. {
  187. const guchar *st = text + start, *fin = text + finish;
  188. UChar32 c;
  189. if (st >= end || fin > end || st >= fin) {
  190. return FALSE;
  191. }
  192. U8_NEXT(text, start, finish, c);
  193. if (u_isJavaIDPart(c)) {
  194. return TRUE;
  195. }
  196. return FALSE;
  197. }
/*
 * Advance `cur` to the next entry of the exceptions list and refresh `ex`
 * (ex becomes NULL when the list is exhausted).  Relies on `cur` and `ex`
 * being in scope at the expansion site; used by rspamd_tokenize_text and
 * #undef'd right after it.
 */
#define SHIFT_EX                                                \
    do {                                                        \
        cur = g_list_next(cur);                                 \
        if (cur) {                                              \
            ex = (struct rspamd_process_exception *) cur->data; \
        }                                                       \
        else {                                                  \
            ex = NULL;                                          \
        }                                                       \
    } while (0)
  208. static inline void
  209. rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
  210. {
  211. rspamd_stat_token_t token;
  212. memset(&token, 0, sizeof(token));
  213. if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
  214. token.original.begin = "!!EX!!";
  215. token.original.len = sizeof("!!EX!!") - 1;
  216. token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
  217. g_array_append_val(res, token);
  218. token.flags = 0;
  219. }
  220. else if (ex->type == RSPAMD_EXCEPTION_URL) {
  221. struct rspamd_url *uri;
  222. uri = ex->ptr;
  223. if (uri && uri->tldlen > 0) {
  224. token.original.begin = rspamd_url_tld_unsafe(uri);
  225. token.original.len = uri->tldlen;
  226. }
  227. else {
  228. token.original.begin = "!!EX!!";
  229. token.original.len = sizeof("!!EX!!") - 1;
  230. }
  231. token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
  232. g_array_append_val(res, token);
  233. token.flags = 0;
  234. }
  235. }
/*
 * Tokenize `text` (length `len`) into an array of rspamd_stat_token_t.
 *
 * If `how` is RSPAMD_TOKENIZE_RAW or no UText is supplied, the byte-wise
 * delimiter tokenizer is used; otherwise ICU word-break iteration over
 * `utxt` provides UTF-8 aware boundaries.  `exceptions` (sorted regions to
 * skip, e.g. URLs) produce placeholder tokens; `hash` (optional) receives a
 * mum hash of the words; `cur_words` (optional) is appended to instead of
 * allocating a new array.  Applies word-length limits and words_decay from
 * `cfg`, plus a 200 ms wall-clock cap for texts over 1 MiB.
 * Returns the (possibly newly allocated) token array.
 *
 * NOTE(review): the UBreakIterator `bi` is function-static and reused across
 * calls — this looks single-thread-only; confirm rspamd's threading model
 * before calling concurrently.
 */
GArray *
rspamd_tokenize_text(const gchar *text, gsize len,
                     const UText *utxt,
                     enum rspamd_tokenize_type how,
                     struct rspamd_config *cfg,
                     GList *exceptions,
                     uint64_t *hash,
                     GArray *cur_words,
                     rspamd_mempool_t *pool)
{
    rspamd_stat_token_t token, buf;
    const gchar *pos = NULL;
    gsize l = 0;
    GArray *res;
    GList *cur = exceptions;
    guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
    uint64_t hv = 0;
    gboolean decay = FALSE, long_text_mode = FALSE;
    uint64_t prob = 0;
    static UBreakIterator *bi = NULL;
    static const gsize long_text_limit = 1 * 1024 * 1024;
    static const ev_tstamp max_exec_time = 0.2; /* 200 ms */
    ev_tstamp start;

    if (text == NULL) {
        return cur_words;
    }

    if (len > long_text_limit) {
        /*
         * In this mode we do additional checks to avoid performance issues
         */
        long_text_mode = TRUE;
        start = ev_time();
    }

    buf.original.begin = text;
    buf.original.len = len;
    buf.flags = 0;

    memset(&token, 0, sizeof(token));

    if (cfg != NULL) {
        min_len = cfg->min_word_len;
        max_len = cfg->max_word_len;
        word_decay = cfg->words_decay;
        /* NOTE(review): if words_decay is 0 this overrides the default
         * initial_size of 128 with 0 — g_array grows anyway, so only a
         * minor inefficiency */
        initial_size = word_decay * 2;
    }

    if (!cur_words) {
        res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t),
                                initial_size);
    }
    else {
        res = cur_words;
    }

    if (G_UNLIKELY(how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
        /* Raw byte tokenizer path */
        while (rspamd_tokenizer_get_word_raw(&buf, &pos, &token, &cur, &l, FALSE)) {
            /* Drop words outside the configured length window */
            if (l == 0 || (min_len > 0 && l < min_len) ||
                (max_len > 0 && l > max_len)) {
                token.original.begin = pos;
                continue;
            }

            /* Hash the word and possibly enter/continue decay mode */
            if (token.original.len > 0 &&
                rspamd_tokenize_check_limit(decay, word_decay, res->len,
                                            &hv, &prob, &token, pos - text, len)) {
                if (!decay) {
                    decay = TRUE;
                }
                else {
                    /* In decay mode a TRUE result means: skip this word */
                    token.original.begin = pos;
                    continue;
                }
            }

            if (long_text_mode) {
                /* Check the time limit every 16 words */
                if ((res->len + 1) % 16 == 0) {
                    ev_tstamp now = ev_time();

                    if (now - start > max_exec_time) {
                        msg_warn_pool_check(
                            "too long time has been spent on tokenization:"
                            " %.1f ms, limit is %.1f ms; %d words added so far",
                            (now - start) * 1e3, max_exec_time * 1e3,
                            res->len);

                        goto end;
                    }
                }
            }

            g_array_append_val(res, token);

            if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
                /* Due to bug in glib ! */
                msg_err_pool_check(
                    "too many words found: %d, stop tokenization to avoid DoS",
                    res->len);

                goto end;
            }

            token.original.begin = pos;
        }
    }
    else {
        /* UTF8 boundaries */
        UErrorCode uc_err = U_ZERO_ERROR;
        int32_t last, p;
        struct rspamd_process_exception *ex = NULL;

        if (bi == NULL) {
            bi = ubrk_open(UBRK_WORD, NULL, NULL, 0, &uc_err);

            g_assert(U_SUCCESS(uc_err));
        }

        ubrk_setUText(bi, (UText *) utxt, &uc_err);
        last = ubrk_first(bi);
        p = last;

        if (cur) {
            ex = (struct rspamd_process_exception *) cur->data;
        }

        /* Walk word-break boundaries; [last, p) is the candidate word */
        while (p != UBRK_DONE) {
        start_over:
            token.original.len = 0;

            if (p > last) {
                if (ex && cur) {
                    /* Check exception */
                    if (ex->pos >= last && ex->pos <= p) {
                        /* We have an exception within boundary */
                        /* First, start to drain exceptions from the start */
                        while (cur && ex->pos <= last) {
                            /* We have an exception at the beginning, skip those */
                            last += ex->len;
                            rspamd_tokenize_exception(ex, res);

                            if (last > p) {
                                /* Exception spread over the boundaries */
                                while (last > p && p != UBRK_DONE) {
                                    int32_t old_p = p;
                                    p = ubrk_next(bi);

                                    if (p != UBRK_DONE && p <= old_p) {
                                        msg_warn_pool_check(
                                            "tokenization reversed back on position %d,"
                                            "%d new position (%d backward), likely libicu bug!",
                                            (gint) (p), (gint) (old_p), old_p - p);

                                        goto end;
                                    }
                                }

                                /* We need to reset our scan with new p and last */
                                SHIFT_EX;
                                goto start_over;
                            }

                            SHIFT_EX;
                        }

                        /* Now, we can have an exception within boundary again */
                        if (cur && ex->pos >= last && ex->pos <= p) {
                            /* Append the first part */
                            if (rspamd_utf_word_valid(text, text + len, last,
                                                      ex->pos)) {
                                token.original.begin = text + last;
                                token.original.len = ex->pos - last;
                                token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                              RSPAMD_STAT_TOKEN_FLAG_UTF;
                            }

                            /* Process the current exception */
                            last += ex->len + (ex->pos - last);

                            rspamd_tokenize_exception(ex, res);

                            if (last > p) {
                                /* Exception spread over the boundaries */
                                while (last > p && p != UBRK_DONE) {
                                    int32_t old_p = p;
                                    p = ubrk_next(bi);
                                    if (p != UBRK_DONE && p <= old_p) {
                                        msg_warn_pool_check(
                                            "tokenization reversed back on position %d,"
                                            "%d new position (%d backward), likely libicu bug!",
                                            (gint) (p), (gint) (old_p), old_p - p);

                                        goto end;
                                    }
                                }
                                /* We need to reset our scan with new p and last */
                                SHIFT_EX;
                                goto start_over;
                            }

                            SHIFT_EX;
                        }
                        else if (p > last) {
                            if (rspamd_utf_word_valid(text, text + len, last, p)) {
                                token.original.begin = text + last;
                                token.original.len = p - last;
                                token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                              RSPAMD_STAT_TOKEN_FLAG_UTF;
                            }
                        }
                    }
                    else if (ex->pos < last) {
                        /* Forward exceptions list */
                        while (cur && ex->pos <= last) {
                            /* We have an exception at the beginning, skip those */
                            SHIFT_EX;
                        }

                        if (rspamd_utf_word_valid(text, text + len, last, p)) {
                            token.original.begin = text + last;
                            token.original.len = p - last;
                            token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                          RSPAMD_STAT_TOKEN_FLAG_UTF;
                        }
                    }
                    else {
                        /* No exceptions within boundary */
                        if (rspamd_utf_word_valid(text, text + len, last, p)) {
                            token.original.begin = text + last;
                            token.original.len = p - last;
                            token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                          RSPAMD_STAT_TOKEN_FLAG_UTF;
                        }
                    }
                }
                else {
                    if (rspamd_utf_word_valid(text, text + len, last, p)) {
                        token.original.begin = text + last;
                        token.original.len = p - last;
                        token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
                                      RSPAMD_STAT_TOKEN_FLAG_UTF;
                    }
                }

                /* Unlike the raw path, decaying here only flags the token
                 * as skipped instead of dropping it */
                if (token.original.len > 0 &&
                    rspamd_tokenize_check_limit(decay, word_decay, res->len,
                                                &hv, &prob, &token, p, len)) {
                    if (!decay) {
                        decay = TRUE;
                    }
                    else {
                        token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
                    }
                }
            }

            if (token.original.len > 0) {
                /* Additional check for number of words */
                if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
                    /* Due to bug in glib ! */
                    msg_err("too many words found: %d, stop tokenization to avoid DoS",
                            res->len);

                    goto end;
                }

                g_array_append_val(res, token);
            }

            /* Also check for long text mode */
            if (long_text_mode) {
                /* Check time each 128 words added */
                const int words_check_mask = 0x7F;

                if ((res->len & words_check_mask) == words_check_mask) {
                    ev_tstamp now = ev_time();

                    if (now - start > max_exec_time) {
                        msg_warn_pool_check(
                            "too long time has been spent on tokenization:"
                            " %.1f ms, limit is %.1f ms; %d words added so far",
                            (now - start) * 1e3, max_exec_time * 1e3,
                            res->len);

                        goto end;
                    }
                }
            }

            last = p;
            p = ubrk_next(bi);

            if (p != UBRK_DONE && p <= last) {
                msg_warn_pool_check("tokenization reversed back on position %d,"
                                    "%d new position (%d backward), likely libicu bug!",
                                    (gint) (p), (gint) (last), last - p);

                goto end;
            }
        }
    }

end:
    if (!decay) {
        hv = mum_hash_finish(hv);
    }

    if (hash) {
        *hash = hv;
    }

    return res;
}
  503. #undef SHIFT_EX
/*
 * Tokenize the string [beg, beg+len) and append the resulting tokens to
 * task->meta_words.  The string is first scanned for UTF-8 validity: valid
 * input goes through the UTF tokenizer over an ICU UText, anything else
 * falls back to the raw byte tokenizer.
 */
static void
rspamd_add_metawords_from_str(const gchar *beg, gsize len,
                              struct rspamd_task *task)
{
    UText utxt = UTEXT_INITIALIZER;
    UErrorCode uc_err = U_ZERO_ERROR;
    guint i = 0;
    UChar32 uc;
    gboolean valid_utf = TRUE;

    /* Validate UTF-8: U8_NEXT yields a negative code point on bad input */
    while (i < len) {
        U8_NEXT(beg, i, len, uc);

        if (((int32_t) uc) < 0) {
            valid_utf = FALSE;
            break;
        }

#if U_ICU_VERSION_MAJOR_NUM < 50
        /* Old libicu mishandles Thai word breaking; force the raw path */
        if (u_isalpha(uc)) {
            int32_t sc = ublock_getCode(uc);

            if (sc == UBLOCK_THAI) {
                valid_utf = FALSE;
                msg_info_task("enable workaround for Thai characters for old libicu");
                break;
            }
        }
#endif
    }

    if (valid_utf) {
        utext_openUTF8(&utxt,
                       beg,
                       len,
                       &uc_err);

        task->meta_words = rspamd_tokenize_text(beg, len,
                                                &utxt, RSPAMD_TOKENIZE_UTF,
                                                task->cfg, NULL, NULL,
                                                task->meta_words,
                                                task->task_pool);

        utext_close(&utxt);
    }
    else {
        task->meta_words = rspamd_tokenize_text(beg, len,
                                                NULL, RSPAMD_TOKENIZE_RAW,
                                                task->cfg, NULL, NULL, task->meta_words,
                                                task->task_pool);
    }
}
  549. void rspamd_tokenize_meta_words(struct rspamd_task *task)
  550. {
  551. guint i = 0;
  552. rspamd_stat_token_t *tok;
  553. if (MESSAGE_FIELD(task, subject)) {
  554. rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject),
  555. strlen(MESSAGE_FIELD(task, subject)), task);
  556. }
  557. if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) {
  558. struct rspamd_email_address *addr;
  559. addr = g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0);
  560. if (addr->name) {
  561. rspamd_add_metawords_from_str(addr->name, strlen(addr->name), task);
  562. }
  563. }
  564. if (task->meta_words != NULL) {
  565. const gchar *language = NULL;
  566. if (MESSAGE_FIELD(task, text_parts) &&
  567. MESSAGE_FIELD(task, text_parts)->len > 0) {
  568. struct rspamd_mime_text_part *tp = g_ptr_array_index(
  569. MESSAGE_FIELD(task, text_parts), 0);
  570. if (tp->language) {
  571. language = tp->language;
  572. }
  573. }
  574. rspamd_normalize_words(task->meta_words, task->task_pool);
  575. rspamd_stem_words(task->meta_words, task->task_pool, language,
  576. task->lang_det);
  577. for (i = 0; i < task->meta_words->len; i++) {
  578. tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
  579. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
  580. }
  581. }
  582. }
/*
 * Convert a UTF-16 buffer into a lowercased UCS-32 array stored in
 * tok->unicode (allocated from `pool`).  Only letters, marks, numbers,
 * connector punctuation, math and currency symbols are kept; non-graphic
 * code points set the INVISIBLE_SPACES flag, and (on ICU >= 57) emoji set
 * the EMOJI flag.  The output can therefore be shorter than the input.
 */
static inline void
rspamd_uchars_to_ucs32(const UChar *src, gsize srclen,
                       rspamd_stat_token_t *tok,
                       rspamd_mempool_t *pool)
{
    UChar32 *dest, t, *d;
    int32_t i = 0;

    /* One UChar32 per UTF-16 code unit is always enough */
    dest = rspamd_mempool_alloc(pool, srclen * sizeof(UChar32));
    d = dest;

    while (i < srclen) {
        U16_NEXT_UNSAFE(src, i, t);

        if (u_isgraph(t)) {
            UCharCategory cat;

            cat = u_charType(t);
#if U_ICU_VERSION_MAJOR_NUM >= 57
            if (u_hasBinaryProperty(t, UCHAR_EMOJI)) {
                tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
            }
#endif

            /* Keep letters/marks/numbers and a few symbol classes,
             * lowercased */
            if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
                cat == U_CONNECTOR_PUNCTUATION ||
                cat == U_MATH_SYMBOL ||
                cat == U_CURRENCY_SYMBOL) {
                *d++ = u_tolower(t);
            }
        }
        else {
            /* Invisible spaces ! */
            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
        }
    }

    tok->unicode.begin = dest;
    tok->unicode.len = d - dest;
}
  617. static inline void
  618. rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
  619. rspamd_mempool_t *pool)
  620. {
  621. guint i, doff = 0;
  622. gsize utflen = 0;
  623. gchar *dest;
  624. UChar32 t;
  625. for (i = 0; i < tok->unicode.len; i++) {
  626. utflen += U8_LENGTH(tok->unicode.begin[i]);
  627. }
  628. dest = rspamd_mempool_alloc(pool, utflen + 1);
  629. for (i = 0; i < tok->unicode.len; i++) {
  630. t = tok->unicode.begin[i];
  631. U8_APPEND_UNSAFE(dest, doff, t);
  632. }
  633. g_assert(doff <= utflen);
  634. dest[doff] = '\0';
  635. tok->normalized.len = doff;
  636. tok->normalized.begin = dest;
  637. }
/*
 * Fill tok->unicode and tok->normalized from tok->original.
 *
 * UTF tokens are converted to UTF-16, NFKC-checked/normalised via unorm2
 * (ICU >= 44), lowercased and filtered through rspamd_uchars_to_ucs32,
 * then re-encoded to UTF-8; failures set BROKEN_UNICODE.  Plain TEXT
 * tokens are just copied and ASCII-lowercased.  All output memory comes
 * from `pool`.
 *
 * NOTE(review): words longer than 1024 UTF-16 units overflow tmpbuf and are
 * reported by ICU as an error, landing in the BROKEN_UNICODE path — confirm
 * this truncation behaviour is acceptable for pathological inputs.
 */
void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
{
    UErrorCode uc_err = U_ZERO_ERROR;
    UConverter *utf8_converter;
    UChar tmpbuf[1024]; /* Assume that we have no longer words... */
    gsize ulen;

    utf8_converter = rspamd_get_utf8_converter();

    if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
        ulen = ucnv_toUChars(utf8_converter,
                             tmpbuf,
                             G_N_ELEMENTS(tmpbuf),
                             tok->original.begin,
                             tok->original.len,
                             &uc_err);

        /* Now, we need to understand if we need to normalise the word */
        if (!U_SUCCESS(uc_err)) {
            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
            tok->unicode.begin = NULL;
            tok->unicode.len = 0;
            tok->normalized.begin = NULL;
            tok->normalized.len = 0;
        }
        else {
#if U_ICU_VERSION_MAJOR_NUM >= 44
            const UNormalizer2 *norm = rspamd_get_unicode_normalizer();
            int32_t end;

            /* We can now check if we need to decompose */
            end = unorm2_spanQuickCheckYes(norm, tmpbuf, ulen, &uc_err);

            if (!U_SUCCESS(uc_err)) {
                rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
                tok->normalized.begin = NULL;
                tok->normalized.len = 0;
                tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
            }
            else {
                if (end == ulen) {
                    /* Already normalised, just lowercase */
                    rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
                    rspamd_ucs32_to_normalised(tok, pool);
                }
                else {
                    /* Perform normalization */
                    UChar normbuf[1024];

                    g_assert(end < G_N_ELEMENTS(normbuf));
                    /* First part */
                    memcpy(normbuf, tmpbuf, end * sizeof(UChar));
                    /* Second part */
                    ulen = unorm2_normalizeSecondAndAppend(norm,
                                                           normbuf, end,
                                                           G_N_ELEMENTS(normbuf),
                                                           tmpbuf + end,
                                                           ulen - end,
                                                           &uc_err);

                    if (!U_SUCCESS(uc_err)) {
                        /* Buffer overflow is silently ignored here:
                         * the token simply keeps no normalised form */
                        if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
                            msg_warn_pool_check("cannot normalise text '%*s': %s",
                                                (gint) tok->original.len, tok->original.begin,
                                                u_errorName(uc_err));
                            rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
                            rspamd_ucs32_to_normalised(tok, pool);
                            tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
                        }
                    }
                    else {
                        /* Copy normalised back */
                        rspamd_uchars_to_ucs32(normbuf, ulen, tok, pool);
                        tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
                        rspamd_ucs32_to_normalised(tok, pool);
                    }
                }
            }
#else
            /* Legacy version with no unorm2 interface */
            rspamd_uchars_to_ucs32(tmpbuf, ulen, tok, pool);
            rspamd_ucs32_to_normalised(tok, pool);
#endif
        }
    }
    else {
        if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
            /* Simple lowercase */
            gchar *dest;

            dest = rspamd_mempool_alloc(pool, tok->original.len + 1);
            rspamd_strlcpy(dest, tok->original.begin, tok->original.len + 1);
            rspamd_str_lc(dest, tok->original.len);
            tok->normalized.len = tok->original.len;
            tok->normalized.begin = dest;
        }
    }
}
  728. void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
  729. {
  730. rspamd_stat_token_t *tok;
  731. guint i;
  732. for (i = 0; i < words->len; i++) {
  733. tok = &g_array_index(words, rspamd_stat_token_t, i);
  734. rspamd_normalize_single_word(tok, pool);
  735. }
  736. }
/*
 * Fill tok->stemmed for every token in `words` using a Snowball stemmer
 * for `language` (lazily created and cached in a process-wide hash table;
 * a failed creation is negative-cached as GINT_TO_POINTER(-1)).  Tokens
 * that cannot be stemmed fall back to their normalised form; stop words
 * detected via `lang_detector` get the STOP_WORD flag.  Stemmed copies are
 * allocated from `pool`.
 *
 * NOTE(review): the static `stemmers` cache is created and mutated without
 * locking — looks single-thread-only; confirm against rspamd's worker model.
 */
void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
                       const gchar *language,
                       struct rspamd_lang_detector *lang_detector)
{
    static GHashTable *stemmers = NULL;
    struct sb_stemmer *stem = NULL;
    guint i;
    rspamd_stat_token_t *tok;
    gchar *dest;
    gsize dlen;

    if (!stemmers) {
        stemmers = g_hash_table_new(rspamd_strcase_hash,
                                    rspamd_strcase_equal);
    }

    if (language && language[0] != '\0') {
        stem = g_hash_table_lookup(stemmers, language);

        if (stem == NULL) {
            stem = sb_stemmer_new(language, "UTF_8");

            if (stem == NULL) {
                msg_debug_pool(
                    "cannot create lemmatizer for %s language",
                    language);
                /* Negative-cache the failure so we do not retry */
                g_hash_table_insert(stemmers, g_strdup(language),
                                    GINT_TO_POINTER(-1));
            }
            else {
                g_hash_table_insert(stemmers, g_strdup(language),
                                    stem);
            }
        }
        else if (stem == GINT_TO_POINTER(-1)) {
            /* Negative cache */
            stem = NULL;
        }
    }
    for (i = 0; i < words->len; i++) {
        tok = &g_array_index(words, rspamd_stat_token_t, i);

        if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
            if (stem) {
                const gchar *stemmed = NULL;

                stemmed = sb_stemmer_stem(stem,
                                          tok->normalized.begin, tok->normalized.len);

                dlen = sb_stemmer_length(stem);

                if (stemmed != NULL && dlen > 0) {
                    /* Copy: the stemmer's buffer is reused on the next call */
                    dest = rspamd_mempool_alloc(pool, dlen);
                    memcpy(dest, stemmed, dlen);
                    tok->stemmed.len = dlen;
                    tok->stemmed.begin = dest;
                    tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
                }
                else {
                    /* Fallback */
                    tok->stemmed.len = tok->normalized.len;
                    tok->stemmed.begin = tok->normalized.begin;
                }
            }
            else {
                tok->stemmed.len = tok->normalized.len;
                tok->stemmed.begin = tok->normalized.begin;
            }

            if (tok->stemmed.len > 0 && lang_detector != NULL &&
                rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) {
                tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
            }
        }
        else {
            if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
                /* Raw text, lowercase */
                tok->stemmed.len = tok->normalized.len;
                tok->stemmed.begin = tok->normalized.begin;
            }
        }
    }
}