You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lang_detection.c 43KB


  1. /*-
  2. * Copyright 2017 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "lang_detection.h"
  17. #include "libutil/logger.h"
  18. #include "libcryptobox/cryptobox.h"
  19. #include "libutil/multipattern.h"
  20. #include "ucl.h"
  21. #include "khash.h"
  22. #include "libstemmer.h"
  23. #include <glob.h>
  24. #include <unicode/utf8.h>
  25. #include <unicode/utf16.h>
  26. #include <unicode/ucnv.h>
  27. #include <unicode/uchar.h>
  28. #include <unicode/ustring.h>
  29. #include <math.h>
  30. static const gsize default_short_text_limit = 20;
  31. static const gsize default_words = 80;
  32. static const gdouble update_prob = 0.6;
  33. static const gchar *default_languages_path = RSPAMD_SHAREDIR "/languages";
  34. #undef EXTRA_LANGDET_DEBUG
  35. struct rspamd_language_unicode_match {
  36. const gchar *lang;
  37. gint unicode_code;
  38. };
  39. /*
  40. * List of languages detected by unicode scripts
  41. */
  42. static const struct rspamd_language_unicode_match unicode_langs[] = {
  43. {"el", RSPAMD_UNICODE_GREEK},
  44. {"ml", RSPAMD_UNICODE_MALAYALAM},
  45. {"te", RSPAMD_UNICODE_TELUGU},
  46. {"ta", RSPAMD_UNICODE_TAMIL},
  47. {"gu", RSPAMD_UNICODE_GUJARATI},
  48. {"th", RSPAMD_UNICODE_THAI},
  49. {"ka", RSPAMD_UNICODE_GEORGIAN},
  50. {"si", RSPAMD_UNICODE_SINHALA},
  51. {"hy", RSPAMD_UNICODE_ARMENIAN},
  52. {"ja", RSPAMD_UNICODE_JP},
  53. {"ko", RSPAMD_UNICODE_HANGUL},
  54. };
  55. /*
  56. * Top languages
  57. */
  58. static const gchar *tier0_langs[] = {
  59. "en",
  60. };
  61. static const gchar *tier1_langs[] = {
  62. "fr", "it", "de", "es", "nl",
  63. "pt", "ru", "pl", "tk", "th", "ar"
  64. };
  /* Bit flags stored in rspamd_language_elt.flags (bits 1 and 2 are not used here). */
  enum rspamd_language_elt_flags {
      RS_LANGUAGE_DEFAULT = 0,
      RS_LANGUAGE_LATIN = (1 << 0),   /* at least a third of the trigramms are ASCII */
      RS_LANGUAGE_TIER1 = (1 << 3),   /* name is listed in tier1_langs */
      RS_LANGUAGE_TIER0 = (1 << 4),   /* name is listed in tier0_langs */
  };
  /*
   * Script family of a language; used as the index into the per-category
   * trigramm tables and stop word matchers of the detector.
   */
  enum rspamd_language_category {
      RSPAMD_LANGUAGE_LATIN = 0,
      RSPAMD_LANGUAGE_CYRILLIC,
      RSPAMD_LANGUAGE_DEVANAGARI,
      RSPAMD_LANGUAGE_ARAB,
      RSPAMD_LANGUAGE_MAX,   /* number of categories, not a category itself */
  };
  78. struct rspamd_language_elt {
  79. const gchar *name; /* e.g. "en" or "ru" */
  80. enum rspamd_language_elt_flags flags;
  81. enum rspamd_language_category category;
  82. guint trigramms_words;
  83. guint stop_words;
  84. gdouble mean;
  85. gdouble std;
  86. guint occurencies; /* total number of parts with this language */
  87. };
  88. struct rspamd_ngramm_elt {
  89. struct rspamd_language_elt *elt;
  90. gdouble prob;
  91. };
  92. struct rspamd_ngramm_chain {
  93. GPtrArray *languages;
  94. gdouble mean;
  95. gdouble std;
  96. gchar *utf;
  97. };
  98. struct rspamd_stop_word_range {
  99. guint start;
  100. guint stop;
  101. struct rspamd_language_elt *elt;
  102. };
  103. struct rspamd_stop_word_elt {
  104. struct rspamd_multipattern *mp;
  105. GArray *ranges; /* of rspamd_stop_word_range */
  106. };
  107. #define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \
  108. rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
  109. G_STRFUNC, \
  110. __VA_ARGS__)
  111. #define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast (NULL, NULL, \
  112. rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
  113. G_STRFUNC, \
  114. __VA_ARGS__)
  115. INIT_LOG_MODULE(langdet)
  116. static const struct rspamd_language_unicode_match *
  117. rspamd_language_search_unicode_match (const gchar *key,
  118. const struct rspamd_language_unicode_match *elts, size_t nelts)
  119. {
  120. size_t i;
  121. for (i = 0; i < nelts; i++) {
  122. if (strcmp (elts[i].lang, key) == 0) {
  123. return &elts[i];
  124. }
  125. }
  126. return NULL;
  127. }
  128. static gboolean
  129. rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts)
  130. {
  131. size_t i;
  132. for (i = 0; i < nelts; i++) {
  133. if (strcmp (elts[i], key) == 0) {
  134. return TRUE;
  135. }
  136. }
  137. return FALSE;
  138. }
  139. static guint
  140. rspamd_trigram_hash_func (gconstpointer key)
  141. {
  142. return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar32),
  143. rspamd_hash_seed ());
  144. }
  145. static gboolean
  146. rspamd_trigram_equal_func (gconstpointer v, gconstpointer v2)
  147. {
  148. return memcmp (v, v2, 3 * sizeof (UChar32)) == 0;
  149. }
  150. KHASH_INIT (rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true,
  151. rspamd_trigram_hash_func, rspamd_trigram_equal_func);
  152. KHASH_INIT (rspamd_candidates_hash, const gchar *,
  153. struct rspamd_lang_detector_res *, true,
  154. rspamd_str_hash, rspamd_str_equal);
  155. KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *,
  156. char, false,
  157. rspamd_ftok_hash, rspamd_ftok_equal);
  158. struct rspamd_lang_detector {
  159. GPtrArray *languages;
  160. khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */
  161. struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
  162. khash_t(rspamd_stopwords_hash) *stop_words_norm;
  163. UConverter *uchar_converter;
  164. gsize short_text_limit;
  165. gsize total_occurencies; /* number of all languages found */
  166. ref_entry_t ref;
  167. };
  168. static void
  169. rspamd_language_detector_ucs_lowercase (UChar32 *s, gsize len)
  170. {
  171. gsize i;
  172. for (i = 0; i < len; i ++) {
  173. s[i] = u_tolower (s[i]);
  174. }
  175. }
  176. static gboolean
  177. rspamd_language_detector_ucs_is_latin (const UChar32 *s, gsize len)
  178. {
  179. gsize i;
  180. gboolean ret = TRUE;
  181. for (i = 0; i < len; i ++) {
  182. if (s[i] >= 128 || !(g_ascii_isalnum (s[i]) || s[i] == ' ')) {
  183. ret = FALSE;
  184. break;
  185. }
  186. }
  187. return ret;
  188. }
  189. struct rspamd_language_ucs_elt {
  190. guint freq;
  191. const gchar *utf;
  192. UChar32 s[0];
  193. };
  194. static void
  195. rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
  196. struct rspamd_lang_detector *d,
  197. struct rspamd_language_elt *lelt,
  198. struct rspamd_language_ucs_elt *ucs,
  199. guint len,
  200. guint freq,
  201. guint total,
  202. khash_t (rspamd_trigram_hash) *htb)
  203. {
  204. struct rspamd_ngramm_chain *chain = NULL, st_chain;
  205. struct rspamd_ngramm_elt *elt;
  206. khiter_t k;
  207. guint i;
  208. gboolean found;
  209. switch (len) {
  210. case 1:
  211. case 2:
  212. g_assert_not_reached ();
  213. break;
  214. case 3:
  215. k = kh_get (rspamd_trigram_hash, htb, ucs->s);
  216. if (k != kh_end (htb)) {
  217. chain = &kh_value (htb, k);
  218. }
  219. break;
  220. default:
  221. g_assert_not_reached ();
  222. break;
  223. }
  224. if (chain == NULL) {
  225. /* New element */
  226. chain = &st_chain;
  227. memset (chain, 0, sizeof (st_chain));
  228. chain->languages = g_ptr_array_sized_new (32);
  229. rspamd_mempool_add_destructor (cfg->cfg_pool, rspamd_ptr_array_free_hard,
  230. chain->languages);
  231. chain->utf = rspamd_mempool_strdup (cfg->cfg_pool, ucs->utf);
  232. elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt));
  233. elt->elt = lelt;
  234. elt->prob = ((gdouble)freq) / ((gdouble)total);
  235. g_ptr_array_add (chain->languages, elt);
  236. k = kh_put (rspamd_trigram_hash, htb, ucs->s, &i);
  237. kh_value (htb, k) = *chain;
  238. }
  239. else {
  240. /* Check sanity */
  241. found = FALSE;
  242. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  243. if (strcmp (elt->elt->name, lelt->name) == 0) {
  244. found = TRUE;
  245. elt->prob += ((gdouble)freq) / ((gdouble)total);
  246. break;
  247. }
  248. }
  249. if (!found) {
  250. elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt));
  251. elt->elt = lelt;
  252. elt->prob = ((gdouble)freq) / ((gdouble)total);
  253. g_ptr_array_add (chain->languages, elt);
  254. }
  255. }
  256. }
  257. static inline enum rspamd_language_category
  258. rspamd_language_detector_get_category (guint uflags)
  259. {
  260. enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN;
  261. if (uflags & RSPAMD_UNICODE_CYRILLIC) {
  262. cat = RSPAMD_LANGUAGE_CYRILLIC;
  263. }
  264. else if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
  265. cat = RSPAMD_LANGUAGE_DEVANAGARI;
  266. }
  267. else if (uflags & RSPAMD_UNICODE_ARABIC) {
  268. cat = RSPAMD_LANGUAGE_ARAB;
  269. }
  270. return cat;
  271. }
  272. static const gchar *
  273. rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
  274. {
  275. static gchar flags_buf[256];
  276. goffset r = 0;
  277. if (elt->flags & RS_LANGUAGE_TIER1) {
  278. r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier1,");
  279. }
  280. if (elt->flags & RS_LANGUAGE_TIER0) {
  281. r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier0,");
  282. }
  283. if (elt->flags & RS_LANGUAGE_LATIN) {
  284. r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "latin,");
  285. }
  286. if (r > 0) {
  287. flags_buf[r - 1] = '\0';
  288. }
  289. else {
  290. flags_buf[r] = '\0';
  291. }
  292. return flags_buf;
  293. }
  294. static gint
  295. rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b)
  296. {
  297. struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **)a;
  298. struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **)b;
  299. return (gint)e2->freq - (gint)e1->freq;
  300. }
  301. static void
  302. rspamd_language_detector_read_file (struct rspamd_config *cfg,
  303. struct rspamd_lang_detector *d,
  304. const gchar *path,
  305. const ucl_object_t *stop_words)
  306. {
  307. struct ucl_parser *parser;
  308. ucl_object_t *top;
  309. const ucl_object_t *freqs, *n_words, *cur, *type;
  310. ucl_object_iter_t it = NULL;
  311. UErrorCode uc_err = U_ZERO_ERROR;
  312. struct rspamd_language_elt *nelt;
  313. struct rspamd_language_ucs_elt *ucs_elt;
  314. khash_t (rspamd_trigram_hash) *htb = NULL;
  315. gchar *pos;
  316. guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
  317. loaded, nstop = 0;
  318. gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
  319. enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
  320. parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
  321. if (!ucl_parser_add_file (parser, path)) {
  322. msg_warn_config ("cannot parse file %s: %s", path,
  323. ucl_parser_get_error (parser));
  324. ucl_parser_free (parser);
  325. return;
  326. }
  327. top = ucl_parser_get_object (parser);
  328. ucl_parser_free (parser);
  329. freqs = ucl_object_lookup (top, "freq");
  330. if (freqs == NULL) {
  331. msg_warn_config ("file %s has no 'freq' key", path);
  332. ucl_object_unref (top);
  333. return;
  334. }
  335. pos = strrchr (path, '/');
  336. g_assert (pos != NULL);
  337. nelt = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*nelt));
  338. nelt->name = rspamd_mempool_strdup (cfg->cfg_pool, pos + 1);
  339. /* Remove extension */
  340. pos = strchr (nelt->name, '.');
  341. g_assert (pos != NULL);
  342. *pos = '\0';
  343. n_words = ucl_object_lookup (top, "n_words");
  344. if (n_words == NULL || ucl_object_type (n_words) != UCL_ARRAY ||
  345. n_words->len != 3) {
  346. msg_warn_config ("cannot find n_words in language %s", nelt->name);
  347. ucl_object_unref (top);
  348. return;
  349. }
  350. else {
  351. nelt->trigramms_words = ucl_object_toint (ucl_array_find_index (n_words,
  352. 2));
  353. }
  354. type = ucl_object_lookup (top, "type");
  355. if (type == NULL || ucl_object_type (type) != UCL_STRING) {
  356. msg_debug_config ("cannot find type in language %s", nelt->name);
  357. ucl_object_unref (top);
  358. return;
  359. }
  360. else {
  361. const gchar *stype = ucl_object_tostring (type);
  362. if (strcmp (stype, "latin") == 0) {
  363. cat = RSPAMD_LANGUAGE_LATIN;
  364. }
  365. else if (strcmp (stype, "cyrillic") == 0) {
  366. cat = RSPAMD_LANGUAGE_CYRILLIC;
  367. }
  368. else if (strcmp (stype, "arab") == 0) {
  369. cat = RSPAMD_LANGUAGE_ARAB;
  370. }
  371. else if (strcmp (stype, "devanagari") == 0) {
  372. cat = RSPAMD_LANGUAGE_DEVANAGARI;
  373. }
  374. else {
  375. msg_debug_config ("unknown type %s of language %s", stype, nelt->name);
  376. ucl_object_unref (top);
  377. return;
  378. }
  379. }
  380. if (stop_words) {
  381. const ucl_object_t *specific_stop_words;
  382. specific_stop_words = ucl_object_lookup (stop_words, nelt->name);
  383. if (specific_stop_words) {
  384. struct sb_stemmer *stem = NULL;
  385. it = NULL;
  386. const ucl_object_t *w;
  387. guint start, stop;
  388. stem = sb_stemmer_new (nelt->name, "UTF_8");
  389. start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
  390. while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) {
  391. gsize wlen;
  392. const char *word = ucl_object_tolstring (w, &wlen);
  393. const char *saved;
  394. rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp,
  395. word, wlen,
  396. RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
  397. nelt->stop_words ++;
  398. nstop ++;
  399. /* Also lemmatise and store normalised */
  400. if (stem) {
  401. const char *nw = sb_stemmer_stem (stem, word, wlen);
  402. if (nw) {
  403. saved = nw;
  404. wlen = strlen (nw);
  405. }
  406. else {
  407. saved = word;
  408. }
  409. }
  410. else {
  411. saved = word;
  412. }
  413. if (saved) {
  414. gint rc;
  415. rspamd_ftok_t *tok;
  416. gchar *dst;
  417. tok = g_malloc (sizeof (*tok) + wlen + 1);
  418. dst = ((gchar *)tok) + sizeof (*tok);
  419. rspamd_strlcpy (dst, saved, wlen + 1);
  420. tok->begin = dst;
  421. tok->len = wlen;
  422. kh_put (rspamd_stopwords_hash, d->stop_words_norm,
  423. tok, &rc);
  424. }
  425. }
  426. if (stem) {
  427. sb_stemmer_delete (stem);
  428. }
  429. stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
  430. struct rspamd_stop_word_range r;
  431. r.start = start;
  432. r.stop = stop;
  433. r.elt = nelt;
  434. g_array_append_val (d->stop_words[cat].ranges, r);
  435. it = NULL;
  436. }
  437. }
  438. nelt->category = cat;
  439. htb = d->trigramms[cat];
  440. GPtrArray *ngramms;
  441. guint nsym;
  442. if (rspamd_language_search_str (nelt->name, tier1_langs,
  443. G_N_ELEMENTS (tier1_langs))) {
  444. nelt->flags |= RS_LANGUAGE_TIER1;
  445. }
  446. if (rspamd_language_search_str (nelt->name, tier0_langs,
  447. G_N_ELEMENTS (tier0_langs))) {
  448. nelt->flags |= RS_LANGUAGE_TIER0;
  449. }
  450. it = NULL;
  451. ngramms = g_ptr_array_sized_new (freqs->len);
  452. i = 0;
  453. skipped = 0;
  454. loaded = 0;
  455. while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
  456. const gchar *key;
  457. gsize keylen;
  458. guint freq;
  459. key = ucl_object_keyl (cur, &keylen);
  460. freq = ucl_object_toint (cur);
  461. i ++;
  462. delta = freq - mean;
  463. mean += delta / i;
  464. delta2 = freq - mean;
  465. m2 += delta * delta2;
  466. if (key != NULL) {
  467. UChar32 *cur_ucs;
  468. const char *end = key + keylen, *cur_utf = key;
  469. ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
  470. sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar32));
  471. cur_ucs = ucs_elt->s;
  472. nsym = 0;
  473. uc_err = U_ZERO_ERROR;
  474. while (cur_utf < end) {
  475. *cur_ucs++ = ucnv_getNextUChar (d->uchar_converter, &cur_utf,
  476. end, &uc_err);
  477. if (!U_SUCCESS (uc_err)) {
  478. break;
  479. }
  480. nsym ++;
  481. }
  482. if (!U_SUCCESS (uc_err)) {
  483. msg_warn_config ("cannot convert key %*s to unicode: %s",
  484. (gint)keylen, key, u_errorName (uc_err));
  485. continue;
  486. }
  487. ucs_elt->utf = key;
  488. rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
  489. if (nsym == 3) {
  490. g_ptr_array_add (ngramms, ucs_elt);
  491. }
  492. else {
  493. continue;
  494. }
  495. if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
  496. total_latin++;
  497. }
  498. ucs_elt->freq = freq;
  499. total_ngramms++;
  500. }
  501. }
  502. std = sqrt (m2 / (i - 1));
  503. if (total_latin >= total_ngramms / 3) {
  504. nelt->flags |= RS_LANGUAGE_LATIN;
  505. }
  506. nsym = 3;
  507. total = 0;
  508. PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
  509. if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
  510. rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
  511. ucs_elt->freq = 0;
  512. /* Skip latin ngramm for non-latin language to avoid garbadge */
  513. skipped ++;
  514. continue;
  515. }
  516. /* Now, discriminate low frequency ngramms */
  517. total += ucs_elt->freq;
  518. loaded ++;
  519. }
  520. g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
  521. PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
  522. if (ucs_elt->freq > 0) {
  523. rspamd_language_detector_init_ngramm (cfg, d,
  524. nelt, ucs_elt, nsym,
  525. ucs_elt->freq, total, htb);
  526. }
  527. }
  528. #ifdef EXTRA_LANGDET_DEBUG
  529. /* Useful for debug */
  530. for (i = 0; i < 10; i ++) {
  531. ucs_elt = g_ptr_array_index (ngramms, i);
  532. msg_debug_lang_det_cfg ("%s -> %s: %d", nelt->name,
  533. ucs_elt->utf, ucs_elt->freq);
  534. }
  535. #endif
  536. g_ptr_array_free (ngramms, TRUE);
  537. nelt->mean = mean;
  538. nelt->std = std;
  539. msg_debug_lang_det_cfg ("loaded %s language, %d trigramms, "
  540. "%d ngramms loaded; "
  541. "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
  542. "(%s)",
  543. nelt->name,
  544. (gint)nelt->trigramms_words,
  545. total,
  546. std, mean,
  547. skipped, loaded, nelt->stop_words,
  548. rspamd_language_detector_print_flags (nelt));
  549. g_ptr_array_add (d->languages, nelt);
  550. ucl_object_unref (top);
  551. }
  552. static gboolean
  553. rspamd_ucl_array_find_str (const gchar *str, const ucl_object_t *ar)
  554. {
  555. ucl_object_iter_t it = NULL;
  556. const ucl_object_t *cur;
  557. if (ar == NULL || ar->len == 0) {
  558. return FALSE;
  559. }
  560. while ((cur = ucl_object_iterate (ar, &it, true)) != NULL) {
  561. if (ucl_object_type (cur) == UCL_STRING && rspamd_strcase_equal (
  562. ucl_object_tostring (cur), str)) {
  563. return TRUE;
  564. }
  565. }
  566. return FALSE;
  567. }
  568. static void
  569. rspamd_language_detector_process_chain (struct rspamd_config *cfg,
  570. struct rspamd_ngramm_chain *chain)
  571. {
  572. struct rspamd_ngramm_elt *elt;
  573. guint i;
  574. gdouble delta, mean = 0, delta2, m2 = 0, std;
  575. if (chain->languages->len > 3) {
  576. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  577. delta = elt->prob - mean;
  578. mean += delta / (i + 1);
  579. delta2 = elt->prob - mean;
  580. m2 += delta * delta2;
  581. }
  582. std = sqrt (m2 / (i - 1));
  583. chain->mean = mean;
  584. chain->std = std;
  585. /* Now, filter elements that are lower than mean */
  586. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  587. if (elt->prob < mean) {
  588. g_ptr_array_remove_index_fast (chain->languages, i);
  589. #ifdef EXTRA_LANGDET_DEBUG
  590. msg_debug_lang_det_cfg ("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f",
  591. elt->elt->name, chain->utf, elt->prob, mean, std);
  592. #endif
  593. }
  594. }
  595. }
  596. else {
  597. /* We have a unique ngramm, increase its weight */
  598. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  599. elt->prob *= 4.0;
  600. #ifdef EXTRA_LANGDET_DEBUG
  601. msg_debug_lang_det_cfg ("increase weight of %s in %s; prob: %.4f",
  602. elt->elt->name, chain->utf, elt->prob);
  603. #endif
  604. }
  605. }
  606. }
  607. static void
  608. rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
  609. {
  610. if (d) {
  611. rspamd_ftok_t *tok;
  612. for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
  613. kh_destroy (rspamd_trigram_hash, d->trigramms[i]);
  614. rspamd_multipattern_destroy (d->stop_words[i].mp);
  615. g_array_free (d->stop_words[i].ranges, TRUE);
  616. }
  617. if (d->languages) {
  618. g_ptr_array_free (d->languages, TRUE);
  619. }
  620. kh_foreach_key (d->stop_words_norm, tok, {
  621. g_free (tok); /* String is embedded and freed automatically */
  622. });
  623. }
  624. }
  625. struct rspamd_lang_detector*
  626. rspamd_language_detector_init (struct rspamd_config *cfg)
  627. {
  628. const ucl_object_t *section, *elt, *languages_enable = NULL,
  629. *languages_disable = NULL;
  630. const gchar *languages_path = default_languages_path;
  631. glob_t gl;
  632. size_t i, short_text_limit = default_short_text_limit, total = 0;
  633. UErrorCode uc_err = U_ZERO_ERROR;
  634. GString *languages_pattern;
  635. struct rspamd_ngramm_chain *chain, schain;
  636. gchar *fname;
  637. struct rspamd_lang_detector *ret = NULL;
  638. struct ucl_parser *parser;
  639. ucl_object_t *stop_words;
  640. section = ucl_object_lookup (cfg->rcl_obj, "lang_detection");
  641. if (section != NULL) {
  642. elt = ucl_object_lookup (section, "languages");
  643. if (elt) {
  644. languages_path = ucl_object_tostring (elt);
  645. }
  646. elt = ucl_object_lookup (section, "short_text_limit");
  647. if (elt) {
  648. short_text_limit = ucl_object_toint (elt);
  649. }
  650. languages_enable = ucl_object_lookup (section, "languages_enable");
  651. languages_disable = ucl_object_lookup (section, "languages_disable");
  652. }
  653. languages_pattern = g_string_sized_new (PATH_MAX);
  654. rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path);
  655. parser = ucl_parser_new (UCL_PARSER_DEFAULT);
  656. if (ucl_parser_add_file (parser, languages_pattern->str)) {
  657. stop_words = ucl_parser_get_object (parser);
  658. }
  659. else {
  660. msg_err_config ("cannot read stop words from %s: %s",
  661. languages_pattern->str,
  662. ucl_parser_get_error (parser));
  663. stop_words = NULL;
  664. }
  665. ucl_parser_free (parser);
  666. languages_pattern->len = 0;
  667. rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path);
  668. memset (&gl, 0, sizeof (gl));
  669. if (glob (languages_pattern->str, 0, NULL, &gl) != 0) {
  670. msg_err_config ("cannot read any files matching %v", languages_pattern);
  671. goto end;
  672. }
  673. ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret));
  674. ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
  675. ret->uchar_converter = rspamd_get_utf8_converter ();
  676. ret->short_text_limit = short_text_limit;
  677. ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
  678. /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
  679. for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
  680. ret->trigramms[i] = kh_init (rspamd_trigram_hash);
  681. ret->stop_words[i].mp = rspamd_multipattern_create (
  682. RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
  683. ret->stop_words[i].ranges = g_array_new (FALSE, FALSE,
  684. sizeof (struct rspamd_stop_word_range));
  685. }
  686. g_assert (uc_err == U_ZERO_ERROR);
  687. for (i = 0; i < gl.gl_pathc; i ++) {
  688. fname = g_path_get_basename (gl.gl_pathv[i]);
  689. if (!rspamd_ucl_array_find_str (fname, languages_disable) ||
  690. (languages_enable == NULL ||
  691. rspamd_ucl_array_find_str (fname, languages_enable))) {
  692. rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i],
  693. stop_words);
  694. }
  695. else {
  696. msg_info_config ("skip language file %s: disabled", fname);
  697. }
  698. g_free (fname);
  699. }
  700. for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
  701. GError *err = NULL;
  702. kh_foreach_value (ret->trigramms[i], schain, {
  703. chain = &schain;
  704. rspamd_language_detector_process_chain (cfg, chain);
  705. });
  706. if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) {
  707. msg_err_config ("cannot compile stop words for %z language group: %e",
  708. i, err);
  709. g_error_free (err);
  710. }
  711. total += kh_size (ret->trigramms[i]);
  712. }
  713. msg_info_config ("loaded %d languages, "
  714. "%d trigramms",
  715. (gint)ret->languages->len,
  716. (gint)total);
  717. if (stop_words) {
  718. ucl_object_unref (stop_words);
  719. }
  720. REF_INIT_RETAIN (ret, rspamd_language_detector_dtor);
  721. rspamd_mempool_add_destructor (cfg->cfg_pool,
  722. (rspamd_mempool_destruct_t)rspamd_language_detector_unref,
  723. ret);
  724. end:
  725. if (gl.gl_pathc > 0) {
  726. globfree (&gl);
  727. }
  728. g_string_free (languages_pattern, TRUE);
  729. return ret;
  730. }
  731. static void
  732. rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
  733. goffset *offsets_out)
  734. {
  735. guint step_len, remainder, i, out_idx;
  736. guint64 coin, sel;
  737. rspamd_stat_token_t *tok;
  738. g_assert (nwords != 0);
  739. g_assert (offsets_out != NULL);
  740. g_assert (ucs_tokens->len >= nwords);
  741. /*
  742. * We split input array into `nwords` parts. For each part we randomly select
  743. * an element from this particular split. Here is an example:
  744. *
  745. * nwords=2, input_len=5
  746. *
  747. * w1 w2 w3 w4 w5
  748. * ^ ^
  749. * part1 part2
  750. * vv vv
  751. * w2 w5
  752. *
  753. * So we have 2 output words from 5 input words selected randomly within
  754. * their splits. It is not uniform distribution but it seems to be better
  755. * to include words from different text parts
  756. */
  757. step_len = ucs_tokens->len / nwords;
  758. remainder = ucs_tokens->len % nwords;
  759. out_idx = 0;
  760. coin = rspamd_random_uint64_fast ();
  761. sel = coin % (step_len + remainder);
  762. offsets_out[out_idx] = sel;
  763. for (i = step_len + remainder; i < ucs_tokens->len;
  764. i += step_len, out_idx ++) {
  765. guint ntries = 0;
  766. coin = rspamd_random_uint64_fast ();
  767. sel = (coin % step_len) + i;
  768. for (;;) {
  769. tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
  770. /* Filter bad tokens */
  771. if (tok->unicode.len >= 2 &&
  772. u_isalpha (tok->unicode.begin[0]) &&
  773. u_isalpha (tok->unicode.begin[tok->unicode.len - 1])) {
  774. offsets_out[out_idx] = sel;
  775. break;
  776. }
  777. else {
  778. ntries ++;
  779. coin = rspamd_random_uint64_fast ();
  780. if (ntries < step_len) {
  781. sel = (coin % step_len) + i;
  782. }
  783. else if (ntries < ucs_tokens->len) {
  784. sel = coin % ucs_tokens->len;
  785. }
  786. else {
  787. offsets_out[out_idx] = sel;
  788. break;
  789. }
  790. }
  791. }
  792. }
  793. /*
  794. * Fisher-Yates algorithm:
  795. * for i from 0 to n−2 do
  796. * j ← random integer such that i ≤ j < n
  797. * exchange a[i] and a[j]
  798. */
  799. #if 0
  800. if (out_idx > 2) {
  801. for (i = 0; i < out_idx - 2; i++) {
  802. coin = rspamd_random_uint64_fast ();
  803. sel = (coin % (out_idx - i)) + i;
  804. /* swap */
  805. tmp = offsets_out[i];
  806. offsets_out[i] = offsets_out[sel];
  807. offsets_out[sel] = tmp;
  808. }
  809. }
  810. #endif
  811. }
  812. static goffset
  813. rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar32 *window,
  814. guint wlen, goffset cur_off)
  815. {
  816. guint i;
  817. if (wlen > 1) {
  818. /* Deal with spaces at the beginning and ending */
  819. if (cur_off == 0) {
  820. window[0] = (UChar32)' ';
  821. for (i = 0; i < wlen - 1; i ++) {
  822. window[i + 1] = tok->unicode.begin[i];
  823. }
  824. }
  825. else if (cur_off + wlen == tok->unicode.len + 1) {
  826. /* Add trailing space */
  827. for (i = 0; i < wlen - 1; i ++) {
  828. window[i] = tok->unicode.begin[cur_off + i];
  829. }
  830. window[wlen - 1] = (UChar32)' ';
  831. }
  832. else if (cur_off + wlen > tok->unicode.len + 1) {
  833. /* No more fun */
  834. return -1;
  835. }
  836. else {
  837. /* Normal case */
  838. for (i = 0; i < wlen; i++) {
  839. window[i] = tok->unicode.begin[cur_off + i];
  840. }
  841. }
  842. }
  843. else {
  844. if (tok->normalized.len <= cur_off) {
  845. return -1;
  846. }
  847. window[0] = tok->unicode.begin[cur_off];
  848. }
  849. return cur_off + 1;
  850. }
  851. /*
  852. * Do full guess for a specific ngramm, checking all languages defined
  853. */
  854. static void
  855. rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
  856. struct rspamd_lang_detector *d,
  857. UChar32 *window,
  858. khash_t(rspamd_candidates_hash) *candidates,
  859. khash_t(rspamd_trigram_hash) *trigramms)
  860. {
  861. guint i;
  862. gint ret;
  863. struct rspamd_ngramm_chain *chain = NULL;
  864. struct rspamd_ngramm_elt *elt;
  865. struct rspamd_lang_detector_res *cand;
  866. khiter_t k;
  867. gdouble prob;
  868. k = kh_get (rspamd_trigram_hash, trigramms, window);
  869. if (k != kh_end (trigramms)) {
  870. chain = &kh_value (trigramms, k);
  871. }
  872. if (chain) {
  873. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  874. prob = elt->prob;
  875. if (prob < chain->mean) {
  876. continue;
  877. }
  878. k = kh_get (rspamd_candidates_hash, candidates, elt->elt->name);
  879. if (k != kh_end (candidates)) {
  880. cand = kh_value (candidates, k);
  881. }
  882. else {
  883. cand = NULL;
  884. }
  885. #ifdef NGRAMMS_DEBUG
  886. msg_err ("gramm: %s, lang: %s, prob: %.3f", chain->utf,
  887. elt->elt->name, log2 (elt->prob));
  888. #endif
  889. if (cand == NULL) {
  890. cand = rspamd_mempool_alloc (task->task_pool, sizeof (*cand));
  891. cand->elt = elt->elt;
  892. cand->lang = elt->elt->name;
  893. cand->prob = prob;
  894. k = kh_put (rspamd_candidates_hash, candidates, elt->elt->name,
  895. &ret);
  896. kh_value (candidates, k) = cand;
  897. } else {
  898. /* Update guess */
  899. cand->prob += prob;
  900. }
  901. }
  902. }
  903. }
  904. static void
  905. rspamd_language_detector_detect_word (struct rspamd_task *task,
  906. struct rspamd_lang_detector *d,
  907. rspamd_stat_token_t *tok,
  908. khash_t(rspamd_candidates_hash) *candidates,
  909. khash_t(rspamd_trigram_hash) *trigramms)
  910. {
  911. const guint wlen = 3;
  912. UChar32 window[3];
  913. goffset cur = 0;
  914. /* Split words */
  915. while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
  916. != -1) {
  917. rspamd_language_detector_process_ngramm_full (task,
  918. d, window, candidates, trigramms);
  919. }
  920. }
  921. static const gdouble cutoff_limit = -8.0;
  922. /*
  923. * Converts frequencies to log probabilities, filter those candidates who
  924. * has the lowest probabilities
  925. */
  926. static inline void
  927. rspamd_language_detector_filter_step1 (struct rspamd_task *task,
  928. struct rspamd_lang_detector_res *cand,
  929. gdouble *max_prob, guint *filtered)
  930. {
  931. if (!isnan (cand->prob)) {
  932. if (cand->prob == 0) {
  933. cand->prob = NAN;
  934. msg_debug_lang_det (
  935. "exclude language %s",
  936. cand->lang);
  937. (*filtered)++;
  938. }
  939. else {
  940. cand->prob = log2 (cand->prob);
  941. if (cand->prob < cutoff_limit) {
  942. msg_debug_lang_det (
  943. "exclude language %s: %.3f, cutoff limit: %.3f",
  944. cand->lang, cand->prob, cutoff_limit);
  945. cand->prob = NAN;
  946. (*filtered)++;
  947. }
  948. else if (cand->prob > *max_prob) {
  949. *max_prob = cand->prob;
  950. }
  951. }
  952. }
  953. }
  954. static inline void
  955. rspamd_language_detector_filter_step2 (struct rspamd_task *task,
  956. struct rspamd_lang_detector_res *cand,
  957. gdouble max_prob, guint *filtered)
  958. {
  959. /*
  960. * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
  961. * prob2 is 2^4 less than prob1
  962. */
  963. if (!isnan (cand->prob) && max_prob - cand->prob > 1) {
  964. msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)",
  965. cand->lang, cand->prob, max_prob);
  966. cand->prob = NAN;
  967. (*filtered) ++;
  968. }
  969. }
  970. static void
  971. rspamd_language_detector_filter_negligible (struct rspamd_task *task,
  972. khash_t(rspamd_candidates_hash) *candidates)
  973. {
  974. struct rspamd_lang_detector_res *cand;
  975. guint filtered = 0;
  976. gdouble max_prob = -(G_MAXDOUBLE);
  977. kh_foreach_value (candidates, cand,
  978. rspamd_language_detector_filter_step1 (task, cand, &max_prob, &filtered));
  979. kh_foreach_value (candidates, cand,
  980. rspamd_language_detector_filter_step2 (task, cand, max_prob, &filtered));
  981. msg_debug_lang_det ("removed %d languages", filtered);
  982. }
  983. static void
  984. rspamd_language_detector_detect_type (struct rspamd_task *task,
  985. guint nwords,
  986. struct rspamd_lang_detector *d,
  987. GArray *words,
  988. enum rspamd_language_category cat,
  989. khash_t(rspamd_candidates_hash) *candidates)
  990. {
  991. guint nparts = MIN (words->len, nwords);
  992. goffset *selected_words;
  993. rspamd_stat_token_t *tok;
  994. guint i;
  995. selected_words = g_new0 (goffset, nparts);
  996. rspamd_language_detector_random_select (words, nparts, selected_words);
  997. msg_debug_lang_det ("randomly selected %d words", nparts);
  998. for (i = 0; i < nparts; i++) {
  999. tok = &g_array_index (words, rspamd_stat_token_t,
  1000. selected_words[i]);
  1001. if (tok->unicode.len >= 3) {
  1002. rspamd_language_detector_detect_word (task, d, tok, candidates,
  1003. d->trigramms[cat]);
  1004. }
  1005. }
  1006. /* Filter negligible candidates */
  1007. rspamd_language_detector_filter_negligible (task, candidates);
  1008. g_free (selected_words);
  1009. }
  1010. static gint
  1011. rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
  1012. {
  1013. const struct rspamd_lang_detector_res
  1014. *canda = *(const struct rspamd_lang_detector_res **)a,
  1015. *candb = *(const struct rspamd_lang_detector_res **)b;
  1016. if (canda->prob > candb->prob) {
  1017. return -1;
  1018. }
  1019. else if (candb->prob > canda->prob) {
  1020. return 1;
  1021. }
  1022. return 0;
  1023. }
  /*
   * Outcome of the trigram stage, by the number of candidates that survived
   * filtering.
   */
  enum rspamd_language_detected_type {
      rs_detect_none = 0,     /* no candidate left */
      rs_detect_single = 1,   /* exactly one candidate left */
      rs_detect_multiple = 2, /* two or more candidates left */
  };
  1029. static enum rspamd_language_detected_type
  1030. rspamd_language_detector_try_ngramm (struct rspamd_task *task,
  1031. guint nwords,
  1032. struct rspamd_lang_detector *d,
  1033. GArray *ucs_tokens,
  1034. enum rspamd_language_category cat,
  1035. khash_t(rspamd_candidates_hash) *candidates)
  1036. {
  1037. guint cand_len = 0;
  1038. struct rspamd_lang_detector_res *cand;
  1039. rspamd_language_detector_detect_type (task,
  1040. nwords,
  1041. d,
  1042. ucs_tokens,
  1043. cat,
  1044. candidates);
  1045. kh_foreach_value (candidates, cand, {
  1046. if (!isnan (cand->prob)) {
  1047. cand_len ++;
  1048. }
  1049. });
  1050. if (cand_len == 0) {
  1051. return rs_detect_none;
  1052. }
  1053. else if (cand_len == 1) {
  1054. return rs_detect_single;
  1055. }
  1056. return rs_detect_multiple;
  1057. }
  /* Bit flags that tune the heuristic comparison of candidates */
  enum rspamd_language_sort_flags {
      RSPAMD_LANG_FLAG_DEFAULT = 0,   /* no special handling */
      RSPAMD_LANG_FLAG_SHORT = 1u << 0, /* text is short: tier bonuses are amplified */
  };
  1062. struct rspamd_frequency_sort_cbdata {
  1063. struct rspamd_lang_detector *d;
  1064. enum rspamd_language_sort_flags flags;
  1065. gdouble std;
  1066. gdouble mean;
  1067. };
  1068. static const gdouble tier0_adjustment = 1.2;
  1069. static const gdouble tier1_adjustment = 0.8;
  1070. static const gdouble frequency_adjustment = 0.8;
  1071. static gint
  1072. rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
  1073. gpointer ud)
  1074. {
  1075. struct rspamd_frequency_sort_cbdata *cbd = ud;
  1076. const struct rspamd_lang_detector_res
  1077. *canda = *(const struct rspamd_lang_detector_res **)a,
  1078. *candb = *(const struct rspamd_lang_detector_res **)b;
  1079. gdouble adj;
  1080. gdouble proba_adjusted, probb_adjusted, freqa, freqb;
  1081. freqa = ((gdouble)canda->elt->occurencies) /
  1082. (gdouble)cbd->d->total_occurencies;
  1083. freqb = ((gdouble)candb->elt->occurencies) /
  1084. (gdouble)cbd->d->total_occurencies;
  1085. proba_adjusted = canda->prob;
  1086. probb_adjusted = candb->prob;
  1087. if (isnormal (freqa) && isnormal (freqb)) {
  1088. proba_adjusted += cbd->std * (frequency_adjustment * freqa);
  1089. probb_adjusted += cbd->std * (frequency_adjustment * freqb);
  1090. }
  1091. if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
  1092. adj = tier1_adjustment * 2.0;
  1093. }
  1094. else {
  1095. adj = tier1_adjustment;
  1096. }
  1097. if (canda->elt->flags & RS_LANGUAGE_TIER1) {
  1098. proba_adjusted += cbd->std * adj;
  1099. }
  1100. if (candb->elt->flags & RS_LANGUAGE_TIER1) {
  1101. probb_adjusted += cbd->std * adj;
  1102. }
  1103. if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
  1104. adj = tier0_adjustment * 16.0;
  1105. }
  1106. else {
  1107. adj = tier0_adjustment;
  1108. }
  1109. if (canda->elt->flags & RS_LANGUAGE_TIER0) {
  1110. proba_adjusted += cbd->std * adj;
  1111. }
  1112. if (candb->elt->flags & RS_LANGUAGE_TIER0) {
  1113. probb_adjusted += cbd->std * adj;
  1114. }
  1115. if (proba_adjusted > probb_adjusted) {
  1116. return -1;
  1117. }
  1118. else if (probb_adjusted > proba_adjusted) {
  1119. return 1;
  1120. }
  1121. return 0;
  1122. }
  1123. static void
  1124. rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
  1125. struct rspamd_mime_text_part *part)
  1126. {
  1127. const gchar *p = part->utf_stripped_content->data, *end;
  1128. guint i = 0;
  1129. end = p + part->utf_stripped_content->len;
  1130. gint32 uc, sc;
  1131. guint nlatin = 0, nchinese = 0, nspecial = 0;
  1132. while (p + i < end) {
  1133. U8_NEXT (p, i, part->utf_stripped_content->len, uc);
  1134. if (((gint32) uc) < 0) {
  1135. break;
  1136. }
  1137. if (u_isalpha (uc)) {
  1138. sc = ublock_getCode (uc);
  1139. switch (sc) {
  1140. case UBLOCK_BASIC_LATIN:
  1141. case UBLOCK_LATIN_1_SUPPLEMENT:
  1142. part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
  1143. nlatin ++;
  1144. break;
  1145. case UBLOCK_HEBREW:
  1146. part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
  1147. nspecial ++;
  1148. break;
  1149. case UBLOCK_GREEK:
  1150. part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
  1151. nspecial ++;
  1152. break;
  1153. case UBLOCK_CYRILLIC:
  1154. part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
  1155. nspecial ++;
  1156. break;
  1157. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
  1158. case UBLOCK_CJK_COMPATIBILITY:
  1159. case UBLOCK_CJK_RADICALS_SUPPLEMENT:
  1160. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
  1161. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
  1162. part->unicode_scripts |= RSPAMD_UNICODE_CJK;
  1163. nchinese ++;
  1164. break;
  1165. case UBLOCK_HIRAGANA:
  1166. case UBLOCK_KATAKANA:
  1167. part->unicode_scripts |= RSPAMD_UNICODE_JP;
  1168. nspecial ++;
  1169. break;
  1170. case UBLOCK_HANGUL_JAMO:
  1171. case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
  1172. part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
  1173. nspecial ++;
  1174. break;
  1175. case UBLOCK_ARABIC:
  1176. part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
  1177. nspecial ++;
  1178. break;
  1179. case UBLOCK_DEVANAGARI:
  1180. part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
  1181. nspecial ++;
  1182. break;
  1183. case UBLOCK_ARMENIAN:
  1184. part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
  1185. nspecial ++;
  1186. break;
  1187. case UBLOCK_GEORGIAN:
  1188. part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
  1189. nspecial ++;
  1190. break;
  1191. case UBLOCK_GUJARATI:
  1192. part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
  1193. nspecial ++;
  1194. break;
  1195. case UBLOCK_TELUGU:
  1196. part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
  1197. nspecial ++;
  1198. break;
  1199. case UBLOCK_TAMIL:
  1200. part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
  1201. nspecial ++;
  1202. break;
  1203. case UBLOCK_THAI:
  1204. part->unicode_scripts |= RSPAMD_UNICODE_THAI;
  1205. nspecial ++;
  1206. break;
  1207. case RSPAMD_UNICODE_MALAYALAM:
  1208. part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
  1209. nspecial ++;
  1210. break;
  1211. case RSPAMD_UNICODE_SINHALA:
  1212. part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
  1213. nspecial ++;
  1214. break;
  1215. }
  1216. }
  1217. if (nspecial > 6 && nspecial > nlatin) {
  1218. break;
  1219. }
  1220. else if (nchinese > 6 && nchinese > nlatin) {
  1221. if (nspecial > 0) {
  1222. /* Likely japanese */
  1223. break;
  1224. }
  1225. }
  1226. }
  1227. msg_debug_lang_det ("stop after checking %d characters, "
  1228. "%d latin, %d special, %d chinese",
  1229. i, nlatin, nspecial, nchinese);
  1230. }
  1231. static inline void
  1232. rspamd_language_detector_set_language (struct rspamd_task *task,
  1233. struct rspamd_mime_text_part *part,
  1234. const gchar *code)
  1235. {
  1236. struct rspamd_lang_detector_res *r;
  1237. r = rspamd_mempool_alloc0 (task->task_pool, sizeof (*r));
  1238. r->prob = 1.0;
  1239. r->lang = code;
  1240. part->languages = g_ptr_array_sized_new (1);
  1241. g_ptr_array_add (part->languages, r);
  1242. part->language = code;
  1243. }
  1244. static gboolean
  1245. rspamd_language_detector_try_uniscript (struct rspamd_task *task,
  1246. struct rspamd_mime_text_part *part)
  1247. {
  1248. guint i;
  1249. for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) {
  1250. if (unicode_langs[i].unicode_code & part->unicode_scripts) {
  1251. msg_debug_lang_det ("set language based on unicode script %s",
  1252. unicode_langs[i].lang);
  1253. rspamd_language_detector_set_language (task, part,
  1254. unicode_langs[i].lang);
  1255. return TRUE;
  1256. }
  1257. }
  1258. if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
  1259. rspamd_language_detector_set_language (task, part,
  1260. "zh-CN");
  1261. return TRUE;
  1262. }
  1263. return FALSE;
  1264. }
  1265. static guint
  1266. rspamd_langelt_hash_func (gconstpointer key)
  1267. {
  1268. const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *)key;
  1269. return rspamd_cryptobox_fast_hash (elt->name, strlen (elt->name),
  1270. rspamd_hash_seed ());
  1271. }
  1272. static gboolean
  1273. rspamd_langelt_equal_func (gconstpointer v, gconstpointer v2)
  1274. {
  1275. const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *)v,
  1276. *elt2 = (const struct rspamd_language_elt *)v2;
  1277. return strcmp (elt1->name, elt2->name) == 0;
  1278. }
  1279. KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, int, 1,
  1280. rspamd_langelt_hash_func, rspamd_langelt_equal_func);
  1281. struct rspamd_sw_cbdata {
  1282. khash_t (rspamd_sw_hash) *res;
  1283. GArray *ranges;
  1284. };
  1285. static gint
  1286. rspamd_ranges_cmp (const void *k, const void *memb)
  1287. {
  1288. gint pos = GPOINTER_TO_INT (k);
  1289. const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb;
  1290. if (pos >= r->start && pos < r->stop) {
  1291. return 0;
  1292. }
  1293. else if (pos < r->start) {
  1294. return -1;
  1295. }
  1296. return 1;
  1297. }
  1298. static gint
  1299. rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
  1300. guint strnum,
  1301. gint match_start,
  1302. gint match_pos,
  1303. const gchar *text,
  1304. gsize len,
  1305. void *context)
  1306. {
  1307. /* Check if boundary */
  1308. const gchar *prev = text, *next = text + len;
  1309. struct rspamd_stop_word_range *r;
  1310. struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context;
  1311. khiter_t k;
  1312. if (match_start > 0) {
  1313. prev = text + match_start - 1;
  1314. if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) {
  1315. return 0;
  1316. }
  1317. }
  1318. if (match_pos < len) {
  1319. next = text + match_pos;
  1320. if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) {
  1321. return 0;
  1322. }
  1323. }
  1324. /* We have a word on the boundary, check range */
  1325. r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data,
  1326. cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp);
  1327. g_assert (r != NULL);
  1328. k = kh_get (rspamd_sw_hash, cbdata->res, r->elt);
  1329. if (k != kh_end (cbdata->res)) {
  1330. kh_value (cbdata->res, k) ++;
  1331. }
  1332. else {
  1333. gint tt;
  1334. k = kh_put (rspamd_sw_hash, cbdata->res, r->elt, &tt);
  1335. kh_value (cbdata->res, k) = 1;
  1336. }
  1337. return 0;
  1338. }
  1339. static gboolean
  1340. rspamd_language_detector_try_stop_words (struct rspamd_task *task,
  1341. struct rspamd_lang_detector *d,
  1342. struct rspamd_mime_text_part *part,
  1343. enum rspamd_language_category cat)
  1344. {
  1345. struct rspamd_stop_word_elt *elt;
  1346. struct rspamd_sw_cbdata cbdata;
  1347. gboolean ret = FALSE;
  1348. elt = &d->stop_words[cat];
  1349. cbdata.res = kh_init (rspamd_sw_hash);
  1350. cbdata.ranges = elt->ranges;
  1351. rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data,
  1352. part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
  1353. &cbdata, NULL);
  1354. if (kh_size (cbdata.res) > 0) {
  1355. gint cur_matches;
  1356. double max_rate = G_MINDOUBLE;
  1357. const gchar *sel = NULL;
  1358. struct rspamd_language_elt *cur_lang;
  1359. kh_foreach (cbdata.res, cur_lang, cur_matches, {
  1360. double rate = (double)cur_matches / (double)cur_lang->stop_words;
  1361. if (rate > max_rate) {
  1362. max_rate = rate;
  1363. sel = cur_lang->name;
  1364. }
  1365. msg_debug_lang_det ("found %d stop words from %s: %3f rate",
  1366. cur_matches, cur_lang->name, rate);
  1367. });
  1368. if (max_rate > 0 && sel) {
  1369. msg_debug_lang_det ("set language based on stop words script %s, %.3f found",
  1370. sel, max_rate);
  1371. rspamd_language_detector_set_language (task, part,
  1372. sel);
  1373. ret = TRUE;
  1374. }
  1375. }
  1376. kh_destroy (rspamd_sw_hash, cbdata.res);
  1377. return ret;
  1378. }
  1379. gboolean
  1380. rspamd_language_detector_detect (struct rspamd_task *task,
  1381. struct rspamd_lang_detector *d,
  1382. struct rspamd_mime_text_part *part)
  1383. {
  1384. khash_t(rspamd_candidates_hash) *candidates;
  1385. GPtrArray *result;
  1386. gdouble mean, std, start_ticks, end_ticks;
  1387. guint cand_len;
  1388. enum rspamd_language_category cat;
  1389. struct rspamd_lang_detector_res *cand;
  1390. enum rspamd_language_detected_type r;
  1391. struct rspamd_frequency_sort_cbdata cbd;
  1392. /* Check if we have sorted candidates based on frequency */
  1393. gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
  1394. if (!part->utf_stripped_content) {
  1395. return FALSE;
  1396. }
  1397. start_ticks = rspamd_get_ticks (TRUE);
  1398. rspamd_language_detector_unicode_scripts (task, part);
  1399. /* Apply unicode scripts heuristic */
  1400. if (rspamd_language_detector_try_uniscript (task, part)) {
  1401. ret = TRUE;
  1402. }
  1403. cat = rspamd_language_detector_get_category (part->unicode_scripts);
  1404. if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) {
  1405. ret = TRUE;
  1406. }
  1407. if (!ret) {
  1408. if (part->nwords < default_short_text_limit) {
  1409. r = rs_detect_none;
  1410. msg_debug_lang_det ("text is too short for trigramms detection: "
  1411. "%d words; at least %d words required",
  1412. (int)part->nwords,
  1413. (int)default_short_text_limit);
  1414. rspamd_language_detector_set_language (task, part, "en");
  1415. candidates = kh_init (rspamd_candidates_hash);
  1416. }
  1417. else {
  1418. candidates = kh_init (rspamd_candidates_hash);
  1419. kh_resize (rspamd_candidates_hash, candidates, 32);
  1420. r = rspamd_language_detector_try_ngramm (task,
  1421. default_words,
  1422. d,
  1423. part->utf_words,
  1424. cat,
  1425. candidates);
  1426. if (r == rs_detect_none) {
  1427. msg_debug_lang_det ("no trigramms found, fallback to english");
  1428. rspamd_language_detector_set_language (task, part, "en");
  1429. } else if (r == rs_detect_multiple) {
  1430. /* Check our guess */
  1431. mean = 0.0;
  1432. std = 0.0;
  1433. cand_len = 0;
  1434. /* Check distirbution */
  1435. kh_foreach_value (candidates, cand, {
  1436. if (!isnan (cand->prob)) {
  1437. mean += cand->prob;
  1438. cand_len++;
  1439. }
  1440. });
  1441. if (cand_len > 0) {
  1442. mean /= cand_len;
  1443. kh_foreach_value (candidates, cand, {
  1444. gdouble err;
  1445. if (!isnan (cand->prob)) {
  1446. err = cand->prob - mean;
  1447. std += fabs (err);
  1448. }
  1449. });
  1450. std /= cand_len;
  1451. }
  1452. msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
  1453. cand_len, mean, std);
  1454. if (cand_len > 0 && std / fabs (mean) < 0.25) {
  1455. msg_debug_lang_det ("apply frequency heuristic sorting");
  1456. frequency_heuristic_applied = TRUE;
  1457. cbd.d = d;
  1458. cbd.mean = mean;
  1459. cbd.std = std;
  1460. cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
  1461. if (part->nwords < default_words / 2) {
  1462. cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
  1463. }
  1464. }
  1465. }
  1466. }
  1467. /* Now, convert hash to array and sort it */
  1468. if (r != rs_detect_none && kh_size (candidates) > 0) {
  1469. result = g_ptr_array_sized_new (kh_size (candidates));
  1470. kh_foreach_value (candidates, cand, {
  1471. if (!isnan (cand->prob)) {
  1472. msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
  1473. cand->prob);
  1474. g_ptr_array_add (result, cand);
  1475. }
  1476. });
  1477. if (frequency_heuristic_applied) {
  1478. g_ptr_array_sort_with_data (result,
  1479. rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
  1480. } else {
  1481. g_ptr_array_sort (result, rspamd_language_detector_cmp);
  1482. }
  1483. if (result->len > 0 && !frequency_heuristic_applied) {
  1484. cand = g_ptr_array_index (result, 0);
  1485. cand->elt->occurencies++;
  1486. d->total_occurencies++;
  1487. }
  1488. part->languages = result;
  1489. ret = TRUE;
  1490. }
  1491. else if (part->languages == NULL) {
  1492. rspamd_language_detector_set_language (task, part, "en");
  1493. }
  1494. kh_destroy (rspamd_candidates_hash, candidates);
  1495. }
  1496. end_ticks = rspamd_get_ticks (TRUE);
  1497. msg_debug_lang_det ("detected languages in %.0f ticks",
  1498. (end_ticks - start_ticks));
  1499. return ret;
  1500. }
  /*
   * Retains the detector via REF_RETAIN and returns the same pointer, so the
   * call can be used inline in an assignment.
   */
  struct rspamd_lang_detector *
  rspamd_language_detector_ref (struct rspamd_lang_detector *d)
  {
      REF_RETAIN (d);

      return d;
  }
  /*
   * Releases one reference to the detector via REF_RELEASE.
   */
  void
  rspamd_language_detector_unref (struct rspamd_lang_detector *d)
  {
      REF_RELEASE (d);
  }
  1512. gboolean
  1513. rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
  1514. const gchar *word, gsize wlen)
  1515. {
  1516. khiter_t k;
  1517. rspamd_ftok_t search;
  1518. search.begin = word;
  1519. search.len = wlen;
  1520. k = kh_get (rspamd_stopwords_hash, d->stop_words_norm, &search);
  1521. if (k != kh_end (d->stop_words_norm)) {
  1522. return TRUE;
  1523. }
  1524. return FALSE;
  1525. }