You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tokenizers.c 20KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871
  /*-
   * Copyright 2016 Vsevolod Stakhov
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *   http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  /*
   * Common tokenization functions
   */
  19. #include "rspamd.h"
  20. #include "tokenizers.h"
  21. #include "stat_internal.h"
  22. #include "contrib/mumhash/mum.h"
  23. #include "libmime/lang_detection.h"
  24. #include "libstemmer.h"
  25. #include <unicode/utf8.h>
  26. #include <unicode/uchar.h>
  27. #include <unicode/uiter.h>
  28. #include <unicode/ubrk.h>
  29. #include <unicode/ucnv.h>
  30. #if U_ICU_VERSION_MAJOR_NUM >= 44
  31. #include <unicode/unorm2.h>
  32. #endif
  33. #include <math.h>
  34. typedef gboolean (*token_get_function) (rspamd_stat_token_t * buf, gchar const **pos,
  35. rspamd_stat_token_t * token,
  36. GList **exceptions, gsize *rl, gboolean check_signature);
  37. const gchar t_delimiters[255] = {
  38. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
  39. 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
  40. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  41. 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
  42. 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
  43. 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
  44. 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
  45. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  46. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  47. 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
  48. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
  51. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  52. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  53. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  54. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  55. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  56. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  57. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  58. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  59. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  60. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  61. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  62. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  63. 0, 0, 0, 0, 0
  64. };
  65. /* Get next word from specified f_str_t buf */
  66. static gboolean
  67. rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf,
  68. gchar const **cur, rspamd_stat_token_t * token,
  69. GList **exceptions, gsize *rl, gboolean unused)
  70. {
  71. gsize remain, pos;
  72. const gchar *p;
  73. struct rspamd_process_exception *ex = NULL;
  74. if (buf == NULL) {
  75. return FALSE;
  76. }
  77. g_assert (cur != NULL);
  78. if (exceptions != NULL && *exceptions != NULL) {
  79. ex = (*exceptions)->data;
  80. }
  81. if (token->original.begin == NULL || *cur == NULL) {
  82. if (ex != NULL) {
  83. if (ex->pos == 0) {
  84. token->original.begin = buf->original.begin + ex->len;
  85. token->original.len = ex->len;
  86. token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
  87. }
  88. else {
  89. token->original.begin = buf->original.begin;
  90. token->original.len = 0;
  91. }
  92. }
  93. else {
  94. token->original.begin = buf->original.begin;
  95. token->original.len = 0;
  96. }
  97. *cur = token->original.begin;
  98. }
  99. token->original.len = 0;
  100. pos = *cur - buf->original.begin;
  101. if (pos >= buf->original.len) {
  102. return FALSE;
  103. }
  104. remain = buf->original.len - pos;
  105. p = *cur;
  106. /* Skip non delimiters symbols */
  107. do {
  108. if (ex != NULL && ex->pos == pos) {
  109. /* Go to the next exception */
  110. *exceptions = g_list_next (*exceptions);
  111. *cur = p + ex->len;
  112. return TRUE;
  113. }
  114. pos++;
  115. p++;
  116. remain--;
  117. } while (remain > 0 && t_delimiters[(guchar)*p]);
  118. token->original.begin = p;
  119. while (remain > 0 && !t_delimiters[(guchar)*p]) {
  120. if (ex != NULL && ex->pos == pos) {
  121. *exceptions = g_list_next (*exceptions);
  122. *cur = p + ex->len;
  123. return TRUE;
  124. }
  125. token->original.len++;
  126. pos++;
  127. remain--;
  128. p++;
  129. }
  130. if (remain == 0) {
  131. return FALSE;
  132. }
  133. if (rl) {
  134. *rl = token->original.len;
  135. }
  136. token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
  137. *cur = p;
  138. return TRUE;
  139. }
  140. static inline gboolean
  141. rspamd_tokenize_check_limit (gboolean decay,
  142. guint word_decay,
  143. guint nwords,
  144. guint64 *hv,
  145. guint64 *prob,
  146. const rspamd_stat_token_t *token,
  147. gssize remain,
  148. gssize total)
  149. {
  150. static const gdouble avg_word_len = 6.0;
  151. if (!decay) {
  152. if (token->original.len >= sizeof (guint64)) {
  153. #ifdef _MUM_UNALIGNED_ACCESS
  154. *hv = mum_hash_step (*hv, *(guint64 *)token->original.begin);
  155. #else
  156. guint64 tmp;
  157. memcpy (&tmp, token->original.begin, sizeof (tmp));
  158. *hv = mum_hash_step (*hv, tmp);
  159. #endif
  160. }
  161. /* Check for decay */
  162. if (word_decay > 0 && nwords > word_decay && remain < (gssize)total) {
  163. /* Start decay */
  164. gdouble decay_prob;
  165. *hv = mum_hash_finish (*hv);
  166. /* We assume that word is 6 symbols length in average */
  167. decay_prob = (gdouble)word_decay / ((total - (remain)) / avg_word_len) * 10;
  168. decay_prob = floor (decay_prob) / 10.0;
  169. if (decay_prob >= 1.0) {
  170. *prob = G_MAXUINT64;
  171. }
  172. else {
  173. *prob = decay_prob * G_MAXUINT64;
  174. }
  175. return TRUE;
  176. }
  177. }
  178. else {
  179. /* Decaying probability */
  180. /* LCG64 x[n] = a x[n - 1] + b mod 2^64 */
  181. *hv = (*hv) * 2862933555777941757ULL + 3037000493ULL;
  182. if (*hv > *prob) {
  183. return TRUE;
  184. }
  185. }
  186. return FALSE;
  187. }
  188. static inline gboolean
  189. rspamd_utf_word_valid (const guchar *text, const guchar *end,
  190. gint32 start, gint32 finish)
  191. {
  192. const guchar *st = text + start, *fin = text + finish;
  193. UChar32 c;
  194. if (st >= end || fin > end || st >= fin) {
  195. return FALSE;
  196. }
  197. U8_NEXT (text, start, finish, c);
  198. if (u_isJavaIDPart (c)) {
  199. return TRUE;
  200. }
  201. return FALSE;
  202. }
  /*
   * Advance the local list cursor `cur` by one element and reload `ex` from
   * it; `ex` becomes NULL once the list is exhausted.
   */
  #define SHIFT_EX do { \
      cur = g_list_next (cur); \
      ex = (cur) ? (struct rspamd_process_exception *) cur->data : NULL; \
  } while(0)
  212. static inline void
  213. rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res)
  214. {
  215. rspamd_stat_token_t token;
  216. memset (&token, 0, sizeof (token));
  217. if (ex->type == RSPAMD_EXCEPTION_GENERIC) {
  218. token.original.begin = "!!EX!!";
  219. token.original.len = sizeof ("!!EX!!") - 1;
  220. token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
  221. g_array_append_val (res, token);
  222. token.flags = 0;
  223. }
  224. else if (ex->type == RSPAMD_EXCEPTION_URL) {
  225. struct rspamd_url *uri;
  226. uri = ex->ptr;
  227. if (uri && uri->tldlen > 0) {
  228. token.original.begin = uri->tld;
  229. token.original.len = uri->tldlen;
  230. }
  231. else {
  232. token.original.begin = "!!EX!!";
  233. token.original.len = sizeof ("!!EX!!") - 1;
  234. }
  235. token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
  236. g_array_append_val (res, token);
  237. token.flags = 0;
  238. }
  239. }
  240. GArray *
  241. rspamd_tokenize_text (const gchar *text, gsize len,
  242. const UText *utxt,
  243. enum rspamd_tokenize_type how,
  244. struct rspamd_config *cfg,
  245. GList *exceptions,
  246. guint64 *hash,
  247. GArray *cur_words)
  248. {
  249. rspamd_stat_token_t token, buf;
  250. const gchar *pos = NULL;
  251. gsize l = 0;
  252. GArray *res;
  253. GList *cur = exceptions;
  254. guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
  255. guint64 hv = 0;
  256. gboolean decay = FALSE;
  257. guint64 prob = 0;
  258. static UBreakIterator* bi = NULL;
  259. if (text == NULL) {
  260. return cur_words;
  261. }
  262. buf.original.begin = text;
  263. buf.original.len = len;
  264. buf.flags = 0;
  265. memset (&token, 0, sizeof (token));
  266. if (cfg != NULL) {
  267. min_len = cfg->min_word_len;
  268. max_len = cfg->max_word_len;
  269. word_decay = cfg->words_decay;
  270. initial_size = word_decay * 2;
  271. }
  272. if (!cur_words) {
  273. res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_stat_token_t),
  274. initial_size);
  275. }
  276. else {
  277. res = cur_words;
  278. }
  279. if (G_UNLIKELY (how == RSPAMD_TOKENIZE_RAW || utxt == NULL)) {
  280. while (rspamd_tokenizer_get_word_raw (&buf, &pos, &token, &cur, &l, FALSE)) {
  281. if (l == 0 || (min_len > 0 && l < min_len) ||
  282. (max_len > 0 && l > max_len)) {
  283. token.original.begin = pos;
  284. continue;
  285. }
  286. if (token.original.len > 0 &&
  287. rspamd_tokenize_check_limit (decay, word_decay, res->len,
  288. &hv, &prob, &token, pos - text, len)) {
  289. if (!decay) {
  290. decay = TRUE;
  291. }
  292. else {
  293. token.original.begin = pos;
  294. continue;
  295. }
  296. }
  297. g_array_append_val (res, token);
  298. token.original.begin = pos;
  299. }
  300. }
  301. else {
  302. /* UTF8 boundaries */
  303. UErrorCode uc_err = U_ZERO_ERROR;
  304. int32_t last, p;
  305. struct rspamd_process_exception *ex = NULL;
  306. if (bi == NULL) {
  307. bi = ubrk_open (UBRK_WORD, NULL, NULL, 0, &uc_err);
  308. g_assert (U_SUCCESS (uc_err));
  309. }
  310. ubrk_setUText (bi, (UText*)utxt, &uc_err);
  311. last = ubrk_first (bi);
  312. p = last;
  313. if (cur) {
  314. ex = (struct rspamd_process_exception *)cur->data;
  315. }
  316. while (p != UBRK_DONE) {
  317. start_over:
  318. token.original.len = 0;
  319. if (p > last) {
  320. if (ex && cur) {
  321. /* Check exception */
  322. if (ex->pos >= last && ex->pos <= p) {
  323. /* We have an exception within boundary */
  324. /* First, start to drain exceptions from the start */
  325. while (cur && ex->pos <= last) {
  326. /* We have an exception at the beginning, skip those */
  327. last += ex->len;
  328. rspamd_tokenize_exception (ex, res);
  329. if (last > p) {
  330. /* Exception spread over the boundaries */
  331. while (last > p && p != UBRK_DONE) {
  332. p = ubrk_next (bi);
  333. }
  334. /* We need to reset our scan with new p and last */
  335. SHIFT_EX;
  336. goto start_over;
  337. }
  338. SHIFT_EX;
  339. }
  340. /* Now, we can have an exception within boundary again */
  341. if (cur && ex->pos >= last && ex->pos <= p) {
  342. /* Append the first part */
  343. if (rspamd_utf_word_valid (text, text + len, last,
  344. ex->pos)) {
  345. token.original.begin = text + last;
  346. token.original.len = ex->pos - last;
  347. token.flags = 0;
  348. g_array_append_val (res, token);
  349. }
  350. /* Process the current exception */
  351. last += ex->len + (ex->pos - last);
  352. rspamd_tokenize_exception (ex, res);
  353. if (last > p) {
  354. /* Exception spread over the boundaries */
  355. while (last > p && p != UBRK_DONE) {
  356. p = ubrk_next (bi);
  357. }
  358. /* We need to reset our scan with new p and last */
  359. SHIFT_EX;
  360. goto start_over;
  361. }
  362. SHIFT_EX;
  363. }
  364. else if (p > last) {
  365. if (rspamd_utf_word_valid (text, text + len, last, p)) {
  366. token.original.begin = text + last;
  367. token.original.len = p - last;
  368. token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
  369. RSPAMD_STAT_TOKEN_FLAG_UTF;
  370. }
  371. }
  372. }
  373. else if (ex->pos < last) {
  374. /* Forward exceptions list */
  375. while (cur && ex->pos <= last) {
  376. /* We have an exception at the beginning, skip those */
  377. SHIFT_EX;
  378. }
  379. if (rspamd_utf_word_valid (text, text + len, last, p)) {
  380. token.original.begin = text + last;
  381. token.original.len = p - last;
  382. token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
  383. RSPAMD_STAT_TOKEN_FLAG_UTF;
  384. }
  385. }
  386. else {
  387. /* No exceptions within boundary */
  388. if (rspamd_utf_word_valid (text, text + len, last, p)) {
  389. token.original.begin = text + last;
  390. token.original.len = p - last;
  391. token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
  392. RSPAMD_STAT_TOKEN_FLAG_UTF;
  393. }
  394. }
  395. }
  396. else {
  397. if (rspamd_utf_word_valid (text, text + len, last, p)) {
  398. token.original.begin = text + last;
  399. token.original.len = p - last;
  400. token.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT |
  401. RSPAMD_STAT_TOKEN_FLAG_UTF;
  402. }
  403. }
  404. if (token.original.len > 0 &&
  405. rspamd_tokenize_check_limit (decay, word_decay, res->len,
  406. &hv, &prob, &token, p, len)) {
  407. if (!decay) {
  408. decay = TRUE;
  409. } else {
  410. token.flags |= RSPAMD_STAT_TOKEN_FLAG_SKIPPED;
  411. }
  412. }
  413. }
  414. if (token.original.len > 0) {
  415. if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
  416. /* Due to bug in glib ! */
  417. msg_err ("too many words found: %d, stop tokenization to avoid DoS",
  418. res->len);
  419. goto end;
  420. }
  421. g_array_append_val (res, token);
  422. }
  423. last = p;
  424. p = ubrk_next (bi);
  425. }
  426. }
  427. end:
  428. if (!decay) {
  429. hv = mum_hash_finish (hv);
  430. }
  431. if (hash) {
  432. *hash = hv;
  433. }
  434. return res;
  435. }
  436. #undef SHIFT_EX
  437. static void
  438. rspamd_add_metawords_from_str (const gchar *beg, gsize len,
  439. struct rspamd_task *task)
  440. {
  441. UText utxt = UTEXT_INITIALIZER;
  442. UErrorCode uc_err = U_ZERO_ERROR;
  443. guint i = 0;
  444. UChar32 uc;
  445. gboolean valid_utf = TRUE;
  446. while (i < len) {
  447. U8_NEXT (beg, i, len, uc);
  448. if (((gint32) uc) < 0) {
  449. valid_utf = FALSE;
  450. break;
  451. }
  452. #if U_ICU_VERSION_MAJOR_NUM < 50
  453. if (u_isalpha (uc)) {
  454. gint32 sc = ublock_getCode (uc);
  455. if (sc == UBLOCK_THAI) {
  456. valid_utf = FALSE;
  457. msg_info_task ("enable workaround for Thai characters for old libicu");
  458. break;
  459. }
  460. }
  461. #endif
  462. }
  463. if (valid_utf) {
  464. utext_openUTF8 (&utxt,
  465. beg,
  466. len,
  467. &uc_err);
  468. task->meta_words = rspamd_tokenize_text (beg, len,
  469. &utxt, RSPAMD_TOKENIZE_UTF,
  470. task->cfg, NULL, NULL, task->meta_words);
  471. utext_close (&utxt);
  472. }
  473. else {
  474. task->meta_words = rspamd_tokenize_text (beg, len,
  475. NULL, RSPAMD_TOKENIZE_RAW,
  476. task->cfg, NULL, NULL, task->meta_words);
  477. }
  478. }
  479. void
  480. rspamd_tokenize_meta_words (struct rspamd_task *task)
  481. {
  482. guint i = 0;
  483. rspamd_stat_token_t *tok;
  484. if (task->subject) {
  485. rspamd_add_metawords_from_str (task->subject, strlen (task->subject), task);
  486. }
  487. if (task->from_mime && task->from_mime->len > 0) {
  488. struct rspamd_email_address *addr;
  489. addr = g_ptr_array_index (task->from_mime, 0);
  490. if (addr->name) {
  491. rspamd_add_metawords_from_str (addr->name, strlen (addr->name), task);
  492. }
  493. }
  494. if (task->meta_words != NULL) {
  495. const gchar *language = NULL;
  496. if (task->text_parts && task->text_parts->len > 0) {
  497. struct rspamd_mime_text_part *tp = g_ptr_array_index (task->text_parts, 0);
  498. if (tp->language) {
  499. language = tp->language;
  500. }
  501. }
  502. rspamd_normalize_words (task->meta_words, task->task_pool);
  503. rspamd_stem_words (task->meta_words, task->task_pool, language,
  504. task->lang_det);
  505. for (i = 0; i < task->meta_words->len; i++) {
  506. tok = &g_array_index (task->meta_words, rspamd_stat_token_t, i);
  507. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
  508. }
  509. }
  510. }
  511. static inline void
  512. rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
  513. rspamd_stat_token_t *tok,
  514. rspamd_mempool_t *pool)
  515. {
  516. UChar32 *dest, t, *d;
  517. gint32 i = 0;
  518. dest = rspamd_mempool_alloc (pool, srclen * sizeof (UChar32));
  519. d = dest;
  520. while (i < srclen) {
  521. U16_NEXT_UNSAFE (src, i, t);
  522. if (u_isgraph (t)) {
  523. UCharCategory cat;
  524. cat = u_charType (t);
  525. #if U_ICU_VERSION_MAJOR_NUM >= 57
  526. if (u_hasBinaryProperty (t, UCHAR_EMOJI)) {
  527. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI;
  528. }
  529. #endif
  530. if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) ||
  531. cat == U_CONNECTOR_PUNCTUATION ||
  532. cat == U_MATH_SYMBOL ||
  533. cat == U_CURRENCY_SYMBOL) {
  534. *d++ = u_tolower (t);
  535. }
  536. }
  537. else {
  538. /* Invisible spaces ! */
  539. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
  540. }
  541. }
  542. tok->unicode.begin = dest;
  543. tok->unicode.len = d - dest;
  544. }
  545. static inline void
  546. rspamd_ucs32_to_normalised (rspamd_stat_token_t *tok,
  547. rspamd_mempool_t *pool)
  548. {
  549. guint i, doff = 0;
  550. gsize utflen = 0;
  551. gchar *dest;
  552. UChar32 t;
  553. for (i = 0; i < tok->unicode.len; i ++) {
  554. utflen += U8_LENGTH (tok->unicode.begin[i]);
  555. }
  556. dest = rspamd_mempool_alloc (pool, utflen + 1);
  557. for (i = 0; i < tok->unicode.len; i ++) {
  558. t = tok->unicode.begin[i];
  559. U8_APPEND_UNSAFE (dest, doff, t);
  560. }
  561. g_assert (doff <= utflen);
  562. dest[doff] = '\0';
  563. tok->normalized.len = doff;
  564. tok->normalized.begin = dest;
  565. }
  566. void
  567. rspamd_normalize_single_word (rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
  568. {
  569. UErrorCode uc_err = U_ZERO_ERROR;
  570. UConverter *utf8_converter;
  571. UChar tmpbuf[1024]; /* Assume that we have no longer words... */
  572. gsize ulen;
  573. utf8_converter = rspamd_get_utf8_converter ();
  574. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
  575. ulen = ucnv_toUChars (utf8_converter,
  576. tmpbuf,
  577. G_N_ELEMENTS (tmpbuf),
  578. tok->original.begin,
  579. tok->original.len,
  580. &uc_err);
  581. /* Now, we need to understand if we need to normalise the word */
  582. if (!U_SUCCESS (uc_err)) {
  583. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
  584. tok->unicode.begin = NULL;
  585. tok->unicode.len = 0;
  586. tok->normalized.begin = NULL;
  587. tok->normalized.len = 0;
  588. }
  589. else {
  590. #if U_ICU_VERSION_MAJOR_NUM >= 44
  591. const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
  592. gint32 end;
  593. /* We can now check if we need to decompose */
  594. end = unorm2_spanQuickCheckYes (norm, tmpbuf, ulen, &uc_err);
  595. if (!U_SUCCESS (uc_err)) {
  596. rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
  597. tok->normalized.begin = NULL;
  598. tok->normalized.len = 0;
  599. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
  600. }
  601. else {
  602. if (end == ulen) {
  603. /* Already normalised, just lowercase */
  604. rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
  605. rspamd_ucs32_to_normalised (tok, pool);
  606. }
  607. else {
  608. /* Perform normalization */
  609. UChar normbuf[1024];
  610. g_assert (end < G_N_ELEMENTS (normbuf));
  611. /* First part */
  612. memcpy (normbuf, tmpbuf, end * sizeof (UChar));
  613. /* Second part */
  614. ulen = unorm2_normalizeSecondAndAppend (norm,
  615. normbuf, end,
  616. G_N_ELEMENTS (normbuf),
  617. tmpbuf + end,
  618. ulen - end,
  619. &uc_err);
  620. if (!U_SUCCESS (uc_err)) {
  621. if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
  622. msg_warn_pool_check ("cannot normalise text '%*s': %s",
  623. (gint)tok->original.len, tok->original.begin,
  624. u_errorName (uc_err));
  625. rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
  626. rspamd_ucs32_to_normalised (tok, pool);
  627. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE;
  628. }
  629. }
  630. else {
  631. /* Copy normalised back */
  632. rspamd_uchars_to_ucs32 (normbuf, ulen, tok, pool);
  633. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_NORMALISED;
  634. rspamd_ucs32_to_normalised (tok, pool);
  635. }
  636. }
  637. }
  638. #else
  639. /* Legacy version with no unorm2 interface */
  640. rspamd_uchars_to_ucs32 (tmpbuf, ulen, tok, pool);
  641. rspamd_ucs32_to_normalised (tok, pool);
  642. #endif
  643. }
  644. }
  645. else {
  646. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
  647. /* Simple lowercase */
  648. gchar *dest;
  649. dest = rspamd_mempool_alloc (pool, tok->original.len + 1);
  650. rspamd_strlcpy (dest, tok->original.begin, tok->original.len + 1);
  651. rspamd_str_lc (dest, tok->original.len);
  652. tok->normalized.len = tok->original.len;
  653. tok->normalized.begin = dest;
  654. }
  655. }
  656. }
  657. void
  658. rspamd_normalize_words (GArray *words, rspamd_mempool_t *pool)
  659. {
  660. rspamd_stat_token_t *tok;
  661. guint i;
  662. for (i = 0; i < words->len; i++) {
  663. tok = &g_array_index (words, rspamd_stat_token_t, i);
  664. rspamd_normalize_single_word (tok, pool);
  665. }
  666. }
  667. void
  668. rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
  669. const gchar *language,
  670. struct rspamd_lang_detector *d)
  671. {
  672. static GHashTable *stemmers = NULL;
  673. struct sb_stemmer *stem = NULL;
  674. guint i;
  675. rspamd_stat_token_t *tok;
  676. gchar *dest;
  677. gsize dlen;
  678. if (!stemmers) {
  679. stemmers = g_hash_table_new (rspamd_strcase_hash,
  680. rspamd_strcase_equal);
  681. }
  682. if (language && language[0] != '\0') {
  683. stem = g_hash_table_lookup (stemmers, language);
  684. if (stem == NULL) {
  685. stem = sb_stemmer_new (language, "UTF_8");
  686. if (stem == NULL) {
  687. msg_debug_pool (
  688. "<%s> cannot create lemmatizer for %s language",
  689. language);
  690. g_hash_table_insert (stemmers, g_strdup (language),
  691. GINT_TO_POINTER (-1));
  692. }
  693. else {
  694. g_hash_table_insert (stemmers, g_strdup (language),
  695. stem);
  696. }
  697. }
  698. else if (stem == GINT_TO_POINTER (-1)) {
  699. /* Negative cache */
  700. stem = NULL;
  701. }
  702. }
  703. for (i = 0; i < words->len; i++) {
  704. tok = &g_array_index (words, rspamd_stat_token_t, i);
  705. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
  706. if (stem) {
  707. const gchar *stemmed = NULL;
  708. stemmed = sb_stemmer_stem (stem,
  709. tok->normalized.begin, tok->normalized.len);
  710. dlen = stemmed ? strlen (stemmed) : 0;
  711. if (dlen > 0) {
  712. dest = rspamd_mempool_alloc (pool, dlen + 1);
  713. memcpy (dest, stemmed, dlen);
  714. dest[dlen] = '\0';
  715. tok->stemmed.len = dlen;
  716. tok->stemmed.begin = dest;
  717. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
  718. }
  719. else {
  720. /* Fallback */
  721. tok->stemmed.len = tok->normalized.len;
  722. tok->stemmed.begin = tok->normalized.begin;
  723. }
  724. }
  725. else {
  726. tok->stemmed.len = tok->normalized.len;
  727. tok->stemmed.begin = tok->normalized.begin;
  728. }
  729. if (tok->stemmed.len > 0 && d != NULL &&
  730. rspamd_language_detector_is_stop_word (d, tok->stemmed.begin, tok->stemmed.len)) {
  731. tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
  732. }
  733. }
  734. else {
  735. if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
  736. /* Raw text, lowercase */
  737. tok->stemmed.len = tok->normalized.len;
  738. tok->stemmed.begin = tok->normalized.begin;
  739. }
  740. }
  741. }
  742. }