You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

lang_detection.c 47KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972
  1. /*-
  2. * Copyright 2017 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "lang_detection.h"
  17. #include "libserver/logger.h"
  18. #include "libcryptobox/cryptobox.h"
  19. #include "libutil/multipattern.h"
  20. #include "ucl.h"
  21. #include "khash.h"
  22. #include "libstemmer.h"
  23. #include <glob.h>
  24. #include <unicode/utf8.h>
  25. #include <unicode/utf16.h>
  26. #include <unicode/ucnv.h>
  27. #include <unicode/uchar.h>
  28. #include <unicode/ustring.h>
  29. #include <math.h>
  /* Default for the detector's short_text_limit; replaced by the "short_text_limit" key of the "lang_detection" config section when present */
  30. static const gsize default_short_text_limit = 10;
  /* NOTE(review): not referenced in the visible part of this file; presumably the number of words sampled from a text - confirm */
  31. static const gsize default_words = 80;
  /* NOTE(review): not referenced in the visible part of this file; presumably a probability used when updating statistics - confirm */
  32. static const gdouble update_prob = 0.6;
  /* Default directory with language files; replaced by the "languages" key of the "lang_detection" config section when present */
  33. static const gchar *default_languages_path = RSPAMD_SHAREDIR "/languages";
  /* Extra per-ngramm debug output (guarded by #ifdef EXTRA_LANGDET_DEBUG below) is compiled out */
  34. #undef EXTRA_LANGDET_DEBUG
  /* Associates a language code with a unicode script constant */
  35. struct rspamd_language_unicode_match {
  36. const gchar *lang; /* language code, e.g. "el" */
  37. gint unicode_code; /* one of the RSPAMD_UNICODE_* constants */
  38. };
  39. /*
  40. * List of languages detected by unicode scripts
   *
   * Each entry pairs a language code with the RSPAMD_UNICODE_* constant
   * of the script associated with that language.
  41. */
  42. static const struct rspamd_language_unicode_match unicode_langs[] = {
  43. {"el", RSPAMD_UNICODE_GREEK},
  44. {"ml", RSPAMD_UNICODE_MALAYALAM},
  45. {"te", RSPAMD_UNICODE_TELUGU},
  46. {"ta", RSPAMD_UNICODE_TAMIL},
  47. {"gu", RSPAMD_UNICODE_GUJARATI},
  48. {"th", RSPAMD_UNICODE_THAI},
  49. {"ka", RSPAMD_UNICODE_GEORGIAN},
  50. {"si", RSPAMD_UNICODE_SINHALA},
  51. {"hy", RSPAMD_UNICODE_ARMENIAN},
  52. {"ja", RSPAMD_UNICODE_JP},
  53. {"ko", RSPAMD_UNICODE_HANGUL},
  54. };
  55. /*
  56. * Top languages
   *
   * Languages listed in tier0_langs get the RS_LANGUAGE_TIER0 flag and
   * languages listed in tier1_langs get the RS_LANGUAGE_TIER1 flag when
   * their files are loaded.
  57. */
  58. static const gchar *tier0_langs[] = {
  59. "en",
  60. };
  /* NOTE(review): "tk" is the ISO 639-1 code of Turkmen; confirm that "tr" (Turkish) was not intended */
  61. static const gchar *tier1_langs[] = {
  62. "fr", "it", "de", "es", "nl",
  63. "pt", "ru", "pl", "tk", "th", "ar"
  64. };
  /*
   * Script category of a language. It is used as an index into the
   * per-category trigram tables and stop word matchers of the detector.
   */
  65. enum rspamd_language_category {
  66. RSPAMD_LANGUAGE_LATIN = 0,
  67. RSPAMD_LANGUAGE_CYRILLIC,
  68. RSPAMD_LANGUAGE_DEVANAGARI,
  69. RSPAMD_LANGUAGE_ARAB,
  70. RSPAMD_LANGUAGE_MAX, /* number of categories, not a category itself */
  71. };
  /* Description of one loaded language */
  72. struct rspamd_language_elt {
  73. const gchar *name; /* e.g. "en" or "ru" */
  74. gint flags; /* enum rspamd_language_elt_flags */
  75. enum rspamd_language_category category; /* script category taken from the "type" key of the language file */
  76. guint trigrams_words; /* third element of the "n_words" array of the language file */
  77. guint stop_words; /* number of stop words registered for this language */
  78. gdouble mean; /* mean of the frequencies listed in the language file */
  79. gdouble std; /* standard deviation of the frequencies listed in the language file */
  80. guint occurrences; /* total number of parts with this language */
  81. };
  /* One language in which a given ngramm occurs */
  82. struct rspamd_ngramm_elt {
  83. struct rspamd_language_elt *elt; /* the language */
  84. gdouble prob; /* frequency of the ngramm divided by the total of the loaded frequencies of that language */
  85. };
  /* Value stored in the trigram table: all languages sharing one ngramm */
  86. struct rspamd_ngramm_chain {
  87. GPtrArray *languages; /* of struct rspamd_ngramm_elt * */
  88. gdouble mean; /* mean of the probabilities in `languages` */
  89. gdouble std; /* standard deviation of the probabilities in `languages` */
  90. gchar *utf; /* copy of the ngramm key as read from the language file */
  91. };
  /* Range of pattern numbers in a stop words matcher that belong to one language */
  92. struct rspamd_stop_word_range {
  93. guint start; /* number of patterns in the matcher before the language was added */
  94. guint stop; /* number of patterns in the matcher after the language was added */
  95. struct rspamd_language_elt *elt; /* owner of the range */
  96. };
  /* Stop words matcher of one script category */
  97. struct rspamd_stop_word_elt {
  98. struct rspamd_multipattern *mp; /* patterns of all stop words of the category */
  99. GArray *ranges; /* of rspamd_stop_word_range */
  100. };
  /*
   * Debug logging for the "langdet" module; expects a variable named `task`
   * in the calling scope (its task_pool tag is used as the log id).
   */
  101. #define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \
  102. rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
  103. G_STRFUNC, \
  104. __VA_ARGS__)
  /*
   * Same as above for configuration time code; expects a variable named
   * `cfg` in the calling scope (its cfg_pool tag is used as the log id).
   */
  105. #define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast (NULL, NULL, \
  106. rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
  107. G_STRFUNC, \
  108. __VA_ARGS__)
  /* NOTE(review): presumably defines rspamd_langdet_log_id used by the macros above - confirm against libserver/logger.h */
  109. INIT_LOG_MODULE(langdet)
  110. static const struct rspamd_language_unicode_match *
  111. rspamd_language_search_unicode_match (const gchar *key,
  112. const struct rspamd_language_unicode_match *elts, size_t nelts)
  113. {
  114. size_t i;
  115. for (i = 0; i < nelts; i++) {
  116. if (strcmp (elts[i].lang, key) == 0) {
  117. return &elts[i];
  118. }
  119. }
  120. return NULL;
  121. }
  122. static gboolean
  123. rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts)
  124. {
  125. size_t i;
  126. for (i = 0; i < nelts; i++) {
  127. if (strcmp (elts[i], key) == 0) {
  128. return TRUE;
  129. }
  130. }
  131. return FALSE;
  132. }
  133. static guint
  134. rspamd_trigram_hash_func (gconstpointer key)
  135. {
  136. return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar32),
  137. rspamd_hash_seed ());
  138. }
  139. static gboolean
  140. rspamd_trigram_equal_func (gconstpointer v, gconstpointer v2)
  141. {
  142. return memcmp (v, v2, 3 * sizeof (UChar32)) == 0;
  143. }
  /* Map: trigram (pointer to three UChar32 code points) -> chain of languages containing it */
  144. KHASH_INIT (rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true,
  145. rspamd_trigram_hash_func, rspamd_trigram_equal_func);
  /* Map: string key -> struct rspamd_lang_detector_res * (NOTE(review): not used in the visible part of this file; presumably keyed by language name - confirm) */
  146. KHASH_INIT (rspamd_candidates_hash, const gchar *,
  147. struct rspamd_lang_detector_res *, true,
  148. rspamd_str_hash, rspamd_str_equal);
  /* Set (no values are stored) of stop word tokens */
  149. KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *,
  150. char, false,
  151. rspamd_ftok_hash, rspamd_ftok_equal);
  /* State of the language detector built from the language files */
  152. struct rspamd_lang_detector {
  153. GPtrArray *languages; /* of struct rspamd_language_elt *, one per loaded language file */
  154. khash_t(rspamd_trigram_hash) *trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */
  155. struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; /* stop words matchers, one per script category */
  156. khash_t(rspamd_stopwords_hash) *stop_words_norm; /* stop words after stemming (or unchanged when no stemmer is available) */
  157. UConverter *uchar_converter; /* converter used to decode ngramm keys of the language files */
  158. gsize short_text_limit; /* value of the "short_text_limit" option or its default */
  159. gsize total_occurrences; /* number of all languages found */
  160. ref_entry_t ref; /* reference counter; the destructor is installed when the detector is initialised */
  161. };
  162. static void
  163. rspamd_language_detector_ucs_lowercase (UChar32 *s, gsize len)
  164. {
  165. gsize i;
  166. for (i = 0; i < len; i ++) {
  167. s[i] = u_tolower (s[i]);
  168. }
  169. }
  170. static gboolean
  171. rspamd_language_detector_ucs_is_latin (const UChar32 *s, gsize len)
  172. {
  173. gsize i;
  174. gboolean ret = TRUE;
  175. for (i = 0; i < len; i ++) {
  176. if (s[i] >= 128 || !(g_ascii_isalnum (s[i]) || s[i] == ' ')) {
  177. ret = FALSE;
  178. break;
  179. }
  180. }
  181. return ret;
  182. }
  183. struct rspamd_language_ucs_elt {
  184. guint freq;
  185. const gchar *utf;
  186. UChar32 s[0];
  187. };
  188. static void
  189. rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
  190. struct rspamd_lang_detector *d,
  191. struct rspamd_language_elt *lelt,
  192. struct rspamd_language_ucs_elt *ucs,
  193. guint len,
  194. guint freq,
  195. guint total,
  196. khash_t (rspamd_trigram_hash) *htb)
  197. {
  198. struct rspamd_ngramm_chain *chain = NULL, st_chain;
  199. struct rspamd_ngramm_elt *elt;
  200. khiter_t k;
  201. guint i;
  202. gboolean found;
  203. switch (len) {
  204. case 1:
  205. case 2:
  206. g_assert_not_reached ();
  207. break;
  208. case 3:
  209. k = kh_get (rspamd_trigram_hash, htb, ucs->s);
  210. if (k != kh_end (htb)) {
  211. chain = &kh_value (htb, k);
  212. }
  213. break;
  214. default:
  215. g_assert_not_reached ();
  216. break;
  217. }
  218. if (chain == NULL) {
  219. /* New element */
  220. chain = &st_chain;
  221. memset (chain, 0, sizeof (st_chain));
  222. chain->languages = g_ptr_array_sized_new (32);
  223. rspamd_mempool_add_destructor (cfg->cfg_pool, rspamd_ptr_array_free_hard,
  224. chain->languages);
  225. chain->utf = rspamd_mempool_strdup (cfg->cfg_pool, ucs->utf);
  226. elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt));
  227. elt->elt = lelt;
  228. elt->prob = ((gdouble)freq) / ((gdouble)total);
  229. g_ptr_array_add (chain->languages, elt);
  230. k = kh_put (rspamd_trigram_hash, htb, ucs->s, &i);
  231. kh_value (htb, k) = *chain;
  232. }
  233. else {
  234. /* Check sanity */
  235. found = FALSE;
  236. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  237. if (strcmp (elt->elt->name, lelt->name) == 0) {
  238. found = TRUE;
  239. elt->prob += ((gdouble)freq) / ((gdouble)total);
  240. break;
  241. }
  242. }
  243. if (!found) {
  244. elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt));
  245. elt->elt = lelt;
  246. elt->prob = ((gdouble)freq) / ((gdouble)total);
  247. g_ptr_array_add (chain->languages, elt);
  248. }
  249. }
  250. }
  251. static inline enum rspamd_language_category
  252. rspamd_language_detector_get_category (guint uflags)
  253. {
  254. enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN;
  255. if (uflags & RSPAMD_UNICODE_CYRILLIC) {
  256. cat = RSPAMD_LANGUAGE_CYRILLIC;
  257. }
  258. else if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
  259. cat = RSPAMD_LANGUAGE_DEVANAGARI;
  260. }
  261. else if (uflags & RSPAMD_UNICODE_ARABIC) {
  262. cat = RSPAMD_LANGUAGE_ARAB;
  263. }
  264. return cat;
  265. }
  266. static const gchar *
  267. rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
  268. {
  269. static gchar flags_buf[256];
  270. goffset r = 0;
  271. if (elt->flags & RS_LANGUAGE_TIER1) {
  272. r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier1,");
  273. }
  274. if (elt->flags & RS_LANGUAGE_TIER0) {
  275. r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier0,");
  276. }
  277. if (elt->flags & RS_LANGUAGE_LATIN) {
  278. r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "latin,");
  279. }
  280. if (r > 0) {
  281. flags_buf[r - 1] = '\0';
  282. }
  283. else {
  284. flags_buf[r] = '\0';
  285. }
  286. return flags_buf;
  287. }
  288. static gint
  289. rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b)
  290. {
  291. struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **)a;
  292. struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **)b;
  293. return (gint)e2->freq - (gint)e1->freq;
  294. }
  295. static void
  296. rspamd_language_detector_read_file (struct rspamd_config *cfg,
  297. struct rspamd_lang_detector *d,
  298. const gchar *path,
  299. const ucl_object_t *stop_words)
  300. {
  301. struct ucl_parser *parser;
  302. ucl_object_t *top;
  303. const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
  304. ucl_object_iter_t it = NULL;
  305. UErrorCode uc_err = U_ZERO_ERROR;
  306. struct rspamd_language_elt *nelt;
  307. struct rspamd_language_ucs_elt *ucs_elt;
  308. khash_t (rspamd_trigram_hash) *htb = NULL;
  309. gchar *pos;
  310. guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
  311. loaded, nstop = 0;
  312. gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
  313. enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
  314. parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS);
  315. if (!ucl_parser_add_file (parser, path)) {
  316. msg_warn_config ("cannot parse file %s: %s", path,
  317. ucl_parser_get_error (parser));
  318. ucl_parser_free (parser);
  319. return;
  320. }
  321. top = ucl_parser_get_object (parser);
  322. ucl_parser_free (parser);
  323. freqs = ucl_object_lookup (top, "freq");
  324. if (freqs == NULL) {
  325. msg_warn_config ("file %s has no 'freq' key", path);
  326. ucl_object_unref (top);
  327. return;
  328. }
  329. pos = strrchr (path, '/');
  330. g_assert (pos != NULL);
  331. nelt = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*nelt));
  332. nelt->name = rspamd_mempool_strdup (cfg->cfg_pool, pos + 1);
  333. /* Remove extension */
  334. pos = strchr (nelt->name, '.');
  335. g_assert (pos != NULL);
  336. *pos = '\0';
  337. n_words = ucl_object_lookup (top, "n_words");
  338. if (n_words == NULL || ucl_object_type (n_words) != UCL_ARRAY ||
  339. n_words->len != 3) {
  340. msg_warn_config ("cannot find n_words in language %s", nelt->name);
  341. ucl_object_unref (top);
  342. return;
  343. }
  344. else {
  345. nelt->trigrams_words = ucl_object_toint (ucl_array_find_index (n_words,
  346. 2));
  347. }
  348. type = ucl_object_lookup (top, "type");
  349. if (type == NULL || ucl_object_type (type) != UCL_STRING) {
  350. msg_debug_config ("cannot find type in language %s", nelt->name);
  351. ucl_object_unref (top);
  352. return;
  353. }
  354. else {
  355. const gchar *stype = ucl_object_tostring (type);
  356. if (strcmp (stype, "latin") == 0) {
  357. cat = RSPAMD_LANGUAGE_LATIN;
  358. }
  359. else if (strcmp (stype, "cyrillic") == 0) {
  360. cat = RSPAMD_LANGUAGE_CYRILLIC;
  361. }
  362. else if (strcmp (stype, "arab") == 0) {
  363. cat = RSPAMD_LANGUAGE_ARAB;
  364. }
  365. else if (strcmp (stype, "devanagari") == 0) {
  366. cat = RSPAMD_LANGUAGE_DEVANAGARI;
  367. }
  368. else {
  369. msg_debug_config ("unknown type %s of language %s", stype, nelt->name);
  370. ucl_object_unref (top);
  371. return;
  372. }
  373. }
  374. flags = ucl_object_lookup (top, "flags");
  375. if (flags != NULL && ucl_object_type (flags) == UCL_ARRAY) {
  376. ucl_object_iter_t it = NULL;
  377. const ucl_object_t *cur;
  378. while ((cur = ucl_object_iterate (flags, &it, true)) != NULL) {
  379. const gchar *fl = ucl_object_tostring (cur);
  380. if (cur) {
  381. if (strcmp (fl, "diacritics") == 0) {
  382. nelt->flags |= RS_LANGUAGE_DIACRITICS;
  383. }
  384. else if (strcmp (fl, "ascii") == 0) {
  385. nelt->flags |= RS_LANGUAGE_ASCII;
  386. }
  387. else {
  388. msg_debug_config ("unknown flag %s of language %s", fl, nelt->name);
  389. }
  390. }
  391. else {
  392. msg_debug_config ("unknown flags type of language %s", nelt->name);
  393. }
  394. }
  395. }
  396. if (stop_words) {
  397. const ucl_object_t *specific_stop_words;
  398. specific_stop_words = ucl_object_lookup (stop_words, nelt->name);
  399. if (specific_stop_words) {
  400. struct sb_stemmer *stem = NULL;
  401. it = NULL;
  402. const ucl_object_t *w;
  403. guint start, stop;
  404. stem = sb_stemmer_new (nelt->name, "UTF_8");
  405. start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
  406. while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) {
  407. gsize wlen;
  408. const char *word = ucl_object_tolstring (w, &wlen);
  409. const char *saved;
  410. guint mp_flags = RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8;
  411. if (rspamd_multipattern_has_hyperscan ()) {
  412. mp_flags |= RSPAMD_MULTIPATTERN_RE;
  413. }
  414. rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp,
  415. word, wlen,
  416. mp_flags);
  417. nelt->stop_words ++;
  418. nstop ++;
  419. /* Also lemmatise and store normalised */
  420. if (stem) {
  421. const char *nw = sb_stemmer_stem (stem, word, wlen);
  422. if (nw) {
  423. saved = nw;
  424. wlen = strlen (nw);
  425. }
  426. else {
  427. saved = word;
  428. }
  429. }
  430. else {
  431. saved = word;
  432. }
  433. if (saved) {
  434. gint rc;
  435. rspamd_ftok_t *tok;
  436. gchar *dst;
  437. tok = rspamd_mempool_alloc (cfg->cfg_pool,
  438. sizeof (*tok) + wlen + 1);
  439. dst = ((gchar *)tok) + sizeof (*tok);
  440. rspamd_strlcpy (dst, saved, wlen + 1);
  441. tok->begin = dst;
  442. tok->len = wlen;
  443. kh_put (rspamd_stopwords_hash, d->stop_words_norm,
  444. tok, &rc);
  445. }
  446. }
  447. if (stem) {
  448. sb_stemmer_delete (stem);
  449. }
  450. stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
  451. struct rspamd_stop_word_range r;
  452. r.start = start;
  453. r.stop = stop;
  454. r.elt = nelt;
  455. g_array_append_val (d->stop_words[cat].ranges, r);
  456. it = NULL;
  457. }
  458. }
  459. nelt->category = cat;
  460. htb = d->trigrams[cat];
  461. GPtrArray *ngramms;
  462. guint nsym;
  463. if (rspamd_language_search_str (nelt->name, tier1_langs,
  464. G_N_ELEMENTS (tier1_langs))) {
  465. nelt->flags |= RS_LANGUAGE_TIER1;
  466. }
  467. if (rspamd_language_search_str (nelt->name, tier0_langs,
  468. G_N_ELEMENTS (tier0_langs))) {
  469. nelt->flags |= RS_LANGUAGE_TIER0;
  470. }
  471. it = NULL;
  472. ngramms = g_ptr_array_sized_new (freqs->len);
  473. i = 0;
  474. skipped = 0;
  475. loaded = 0;
  476. while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) {
  477. const gchar *key;
  478. gsize keylen;
  479. guint freq;
  480. key = ucl_object_keyl (cur, &keylen);
  481. freq = ucl_object_toint (cur);
  482. i ++;
  483. delta = freq - mean;
  484. mean += delta / i;
  485. delta2 = freq - mean;
  486. m2 += delta * delta2;
  487. if (key != NULL) {
  488. UChar32 *cur_ucs;
  489. const char *end = key + keylen, *cur_utf = key;
  490. ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool,
  491. sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar32));
  492. cur_ucs = ucs_elt->s;
  493. nsym = 0;
  494. uc_err = U_ZERO_ERROR;
  495. while (cur_utf < end) {
  496. *cur_ucs++ = ucnv_getNextUChar (d->uchar_converter, &cur_utf,
  497. end, &uc_err);
  498. if (!U_SUCCESS (uc_err)) {
  499. break;
  500. }
  501. nsym ++;
  502. }
  503. if (!U_SUCCESS (uc_err)) {
  504. msg_warn_config ("cannot convert key %*s to unicode: %s",
  505. (gint)keylen, key, u_errorName (uc_err));
  506. continue;
  507. }
  508. ucs_elt->utf = key;
  509. rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym);
  510. if (nsym == 3) {
  511. g_ptr_array_add (ngramms, ucs_elt);
  512. }
  513. else {
  514. continue;
  515. }
  516. if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
  517. total_latin++;
  518. }
  519. ucs_elt->freq = freq;
  520. total_ngramms++;
  521. }
  522. }
  523. std = sqrt (m2 / (i - 1));
  524. if (total_latin >= total_ngramms / 3) {
  525. nelt->flags |= RS_LANGUAGE_LATIN;
  526. }
  527. nsym = 3;
  528. total = 0;
  529. PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
  530. if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
  531. rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) {
  532. ucs_elt->freq = 0;
  533. /* Skip latin ngramm for non-latin language to avoid garbage */
  534. skipped ++;
  535. continue;
  536. }
  537. /* Now, discriminate low frequency ngramms */
  538. total += ucs_elt->freq;
  539. loaded ++;
  540. }
  541. g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
  542. PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
  543. if (ucs_elt->freq > 0) {
  544. rspamd_language_detector_init_ngramm (cfg, d,
  545. nelt, ucs_elt, nsym,
  546. ucs_elt->freq, total, htb);
  547. }
  548. }
  549. #ifdef EXTRA_LANGDET_DEBUG
  550. /* Useful for debug */
  551. for (i = 0; i < 10; i ++) {
  552. ucs_elt = g_ptr_array_index (ngramms, i);
  553. msg_debug_lang_det_cfg ("%s -> %s: %d", nelt->name,
  554. ucs_elt->utf, ucs_elt->freq);
  555. }
  556. #endif
  557. g_ptr_array_free (ngramms, TRUE);
  558. nelt->mean = mean;
  559. nelt->std = std;
  560. msg_debug_lang_det_cfg ("loaded %s language, %d trigrams, "
  561. "%d ngramms loaded; "
  562. "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
  563. "(%s)",
  564. nelt->name,
  565. (gint)nelt->trigrams_words,
  566. total,
  567. std, mean,
  568. skipped, loaded, nelt->stop_words,
  569. rspamd_language_detector_print_flags (nelt));
  570. g_ptr_array_add (d->languages, nelt);
  571. ucl_object_unref (top);
  572. }
  573. static gboolean
  574. rspamd_ucl_array_find_str (const gchar *str, const ucl_object_t *ar)
  575. {
  576. ucl_object_iter_t it = NULL;
  577. const ucl_object_t *cur;
  578. if (ar == NULL || ar->len == 0) {
  579. return FALSE;
  580. }
  581. while ((cur = ucl_object_iterate (ar, &it, true)) != NULL) {
  582. if (ucl_object_type (cur) == UCL_STRING && rspamd_strcase_equal (
  583. ucl_object_tostring (cur), str)) {
  584. return TRUE;
  585. }
  586. }
  587. return FALSE;
  588. }
  589. static void
  590. rspamd_language_detector_process_chain (struct rspamd_config *cfg,
  591. struct rspamd_ngramm_chain *chain)
  592. {
  593. struct rspamd_ngramm_elt *elt;
  594. guint i;
  595. gdouble delta, mean = 0, delta2, m2 = 0, std;
  596. if (chain->languages->len > 3) {
  597. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  598. delta = elt->prob - mean;
  599. mean += delta / (i + 1);
  600. delta2 = elt->prob - mean;
  601. m2 += delta * delta2;
  602. }
  603. std = sqrt (m2 / (i - 1));
  604. chain->mean = mean;
  605. chain->std = std;
  606. /* Now, filter elements that are lower than mean */
  607. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  608. if (elt->prob < mean) {
  609. g_ptr_array_remove_index_fast (chain->languages, i);
  610. #ifdef EXTRA_LANGDET_DEBUG
  611. msg_debug_lang_det_cfg ("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f",
  612. elt->elt->name, chain->utf, elt->prob, mean, std);
  613. #endif
  614. }
  615. }
  616. }
  617. else {
  618. /* We have a unique ngramm, increase its weight */
  619. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  620. elt->prob *= 4.0;
  621. #ifdef EXTRA_LANGDET_DEBUG
  622. msg_debug_lang_det_cfg ("increase weight of %s in %s; prob: %.4f",
  623. elt->elt->name, chain->utf, elt->prob);
  624. #endif
  625. }
  626. }
  627. }
  628. static void
  629. rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
  630. {
  631. if (d) {
  632. for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
  633. kh_destroy (rspamd_trigram_hash, d->trigrams[i]);
  634. rspamd_multipattern_destroy (d->stop_words[i].mp);
  635. g_array_free (d->stop_words[i].ranges, TRUE);
  636. }
  637. if (d->languages) {
  638. g_ptr_array_free (d->languages, TRUE);
  639. }
  640. kh_destroy (rspamd_stopwords_hash, d->stop_words_norm);
  641. }
  642. }
  643. struct rspamd_lang_detector*
  644. rspamd_language_detector_init (struct rspamd_config *cfg)
  645. {
  646. const ucl_object_t *section, *elt, *languages_enable = NULL,
  647. *languages_disable = NULL;
  648. const gchar *languages_path = default_languages_path;
  649. glob_t gl;
  650. size_t i, short_text_limit = default_short_text_limit, total = 0;
  651. UErrorCode uc_err = U_ZERO_ERROR;
  652. GString *languages_pattern;
  653. struct rspamd_ngramm_chain *chain, schain;
  654. gchar *fname;
  655. struct rspamd_lang_detector *ret = NULL;
  656. struct ucl_parser *parser;
  657. ucl_object_t *stop_words;
  658. section = ucl_object_lookup (cfg->rcl_obj, "lang_detection");
  659. if (section != NULL) {
  660. elt = ucl_object_lookup (section, "languages");
  661. if (elt) {
  662. languages_path = ucl_object_tostring (elt);
  663. }
  664. elt = ucl_object_lookup (section, "short_text_limit");
  665. if (elt) {
  666. short_text_limit = ucl_object_toint (elt);
  667. }
  668. languages_enable = ucl_object_lookup (section, "languages_enable");
  669. languages_disable = ucl_object_lookup (section, "languages_disable");
  670. }
  671. languages_pattern = g_string_sized_new (PATH_MAX);
  672. rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path);
  673. parser = ucl_parser_new (UCL_PARSER_DEFAULT);
  674. if (ucl_parser_add_file (parser, languages_pattern->str)) {
  675. stop_words = ucl_parser_get_object (parser);
  676. }
  677. else {
  678. msg_err_config ("cannot read stop words from %s: %s",
  679. languages_pattern->str,
  680. ucl_parser_get_error (parser));
  681. stop_words = NULL;
  682. }
  683. ucl_parser_free (parser);
  684. languages_pattern->len = 0;
  685. rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path);
  686. memset (&gl, 0, sizeof (gl));
  687. if (glob (languages_pattern->str, 0, NULL, &gl) != 0) {
  688. msg_err_config ("cannot read any files matching %v", languages_pattern);
  689. goto end;
  690. }
  691. ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret));
  692. ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
  693. ret->uchar_converter = rspamd_get_utf8_converter ();
  694. ret->short_text_limit = short_text_limit;
  695. ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
  696. /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
  697. for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
  698. ret->trigrams[i] = kh_init (rspamd_trigram_hash);
  699. #ifdef WITH_HYPERSCAN
  700. ret->stop_words[i].mp = rspamd_multipattern_create (
  701. RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8|
  702. RSPAMD_MULTIPATTERN_RE);
  703. #else
  704. ret->stop_words[i].mp = rspamd_multipattern_create (
  705. RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8);
  706. #endif
  707. ret->stop_words[i].ranges = g_array_new (FALSE, FALSE,
  708. sizeof (struct rspamd_stop_word_range));
  709. }
  710. g_assert (uc_err == U_ZERO_ERROR);
  711. for (i = 0; i < gl.gl_pathc; i ++) {
  712. fname = g_path_get_basename (gl.gl_pathv[i]);
  713. if (!rspamd_ucl_array_find_str (fname, languages_disable) ||
  714. (languages_enable == NULL ||
  715. rspamd_ucl_array_find_str (fname, languages_enable))) {
  716. rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i],
  717. stop_words);
  718. }
  719. else {
  720. msg_info_config ("skip language file %s: disabled", fname);
  721. }
  722. g_free (fname);
  723. }
  724. for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
  725. GError *err = NULL;
  726. kh_foreach_value (ret->trigrams[i], schain, {
  727. chain = &schain;
  728. rspamd_language_detector_process_chain (cfg, chain);
  729. });
  730. if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) {
  731. msg_err_config ("cannot compile stop words for %z language group: %e",
  732. i, err);
  733. g_error_free (err);
  734. }
  735. total += kh_size (ret->trigrams[i]);
  736. }
  737. msg_info_config ("loaded %d languages, "
  738. "%d trigrams",
  739. (gint)ret->languages->len,
  740. (gint)total);
  741. if (stop_words) {
  742. ucl_object_unref (stop_words);
  743. }
  744. REF_INIT_RETAIN (ret, rspamd_language_detector_dtor);
  745. rspamd_mempool_add_destructor (cfg->cfg_pool,
  746. (rspamd_mempool_destruct_t)rspamd_language_detector_unref,
  747. ret);
  748. end:
  749. if (gl.gl_pathc > 0) {
  750. globfree (&gl);
  751. }
  752. g_string_free (languages_pattern, TRUE);
  753. return ret;
  754. }
  755. static void
  756. rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
  757. goffset *offsets_out)
  758. {
  759. guint step_len, remainder, i, out_idx;
  760. guint64 coin, sel;
  761. rspamd_stat_token_t *tok;
  762. g_assert (nwords != 0);
  763. g_assert (offsets_out != NULL);
  764. g_assert (ucs_tokens->len >= nwords);
  765. /*
  766. * We split input array into `nwords` parts. For each part we randomly select
  767. * an element from this particular split. Here is an example:
  768. *
  769. * nwords=2, input_len=5
  770. *
  771. * w1 w2 w3 w4 w5
  772. * ^ ^
  773. * part1 part2
  774. * vv vv
  775. * w2 w5
  776. *
  777. * So we have 2 output words from 5 input words selected randomly within
  778. * their splits. It is not uniform distribution but it seems to be better
  779. * to include words from different text parts
  780. */
  781. step_len = ucs_tokens->len / nwords;
  782. remainder = ucs_tokens->len % nwords;
  783. out_idx = 0;
  784. coin = rspamd_random_uint64_fast ();
  785. sel = coin % (step_len + remainder);
  786. offsets_out[out_idx] = sel;
  787. for (i = step_len + remainder; i < ucs_tokens->len;
  788. i += step_len, out_idx ++) {
  789. guint ntries = 0;
  790. coin = rspamd_random_uint64_fast ();
  791. sel = (coin % step_len) + i;
  792. for (;;) {
  793. tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel);
  794. /* Filter bad tokens */
  795. if (tok->unicode.len >= 2 &&
  796. !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) &&
  797. u_isalpha (tok->unicode.begin[0]) &&
  798. u_isalpha (tok->unicode.begin[tok->unicode.len - 1])) {
  799. offsets_out[out_idx] = sel;
  800. break;
  801. }
  802. else {
  803. ntries ++;
  804. coin = rspamd_random_uint64_fast ();
  805. if (ntries < step_len) {
  806. sel = (coin % step_len) + i;
  807. }
  808. else if (ntries < ucs_tokens->len) {
  809. sel = coin % ucs_tokens->len;
  810. }
  811. else {
  812. offsets_out[out_idx] = sel;
  813. break;
  814. }
  815. }
  816. }
  817. }
  818. /*
  819. * Fisher-Yates algorithm:
  820. * for i from 0 to n−2 do
  821. * j ← random integer such that i ≤ j < n
  822. * exchange a[i] and a[j]
  823. */
  824. #if 0
  825. if (out_idx > 2) {
  826. for (i = 0; i < out_idx - 2; i++) {
  827. coin = rspamd_random_uint64_fast ();
  828. sel = (coin % (out_idx - i)) + i;
  829. /* swap */
  830. tmp = offsets_out[i];
  831. offsets_out[i] = offsets_out[sel];
  832. offsets_out[sel] = tmp;
  833. }
  834. }
  835. #endif
  836. }
  837. static goffset
  838. rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar32 *window,
  839. guint wlen, goffset cur_off)
  840. {
  841. guint i;
  842. if (wlen > 1) {
  843. /* Deal with spaces at the beginning and ending */
  844. if (cur_off == 0) {
  845. window[0] = (UChar32)' ';
  846. for (i = 0; i < wlen - 1; i ++) {
  847. window[i + 1] = tok->unicode.begin[i];
  848. }
  849. }
  850. else if (cur_off + wlen == tok->unicode.len + 1) {
  851. /* Add trailing space */
  852. for (i = 0; i < wlen - 1; i ++) {
  853. window[i] = tok->unicode.begin[cur_off + i];
  854. }
  855. window[wlen - 1] = (UChar32)' ';
  856. }
  857. else if (cur_off + wlen > tok->unicode.len + 1) {
  858. /* No more fun */
  859. return -1;
  860. }
  861. else {
  862. /* Normal case */
  863. for (i = 0; i < wlen; i++) {
  864. window[i] = tok->unicode.begin[cur_off + i];
  865. }
  866. }
  867. }
  868. else {
  869. if (tok->normalized.len <= cur_off) {
  870. return -1;
  871. }
  872. window[0] = tok->unicode.begin[cur_off];
  873. }
  874. return cur_off + 1;
  875. }
  876. /*
  877. * Do full guess for a specific ngramm, checking all languages defined
  878. */
  879. static void
  880. rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
  881. struct rspamd_lang_detector *d,
  882. UChar32 *window,
  883. khash_t(rspamd_candidates_hash) *candidates,
  884. khash_t(rspamd_trigram_hash) *trigrams)
  885. {
  886. guint i;
  887. gint ret;
  888. struct rspamd_ngramm_chain *chain = NULL;
  889. struct rspamd_ngramm_elt *elt;
  890. struct rspamd_lang_detector_res *cand;
  891. khiter_t k;
  892. gdouble prob;
  893. k = kh_get (rspamd_trigram_hash, trigrams, window);
  894. if (k != kh_end (trigrams)) {
  895. chain = &kh_value (trigrams, k);
  896. }
  897. if (chain) {
  898. PTR_ARRAY_FOREACH (chain->languages, i, elt) {
  899. prob = elt->prob;
  900. if (prob < chain->mean) {
  901. continue;
  902. }
  903. k = kh_get (rspamd_candidates_hash, candidates, elt->elt->name);
  904. if (k != kh_end (candidates)) {
  905. cand = kh_value (candidates, k);
  906. }
  907. else {
  908. cand = NULL;
  909. }
  910. #ifdef NGRAMMS_DEBUG
  911. msg_err ("gramm: %s, lang: %s, prob: %.3f", chain->utf,
  912. elt->elt->name, log2 (elt->prob));
  913. #endif
  914. if (cand == NULL) {
  915. cand = rspamd_mempool_alloc (task->task_pool, sizeof (*cand));
  916. cand->elt = elt->elt;
  917. cand->lang = elt->elt->name;
  918. cand->prob = prob;
  919. k = kh_put (rspamd_candidates_hash, candidates, elt->elt->name,
  920. &ret);
  921. kh_value (candidates, k) = cand;
  922. } else {
  923. /* Update guess */
  924. cand->prob += prob;
  925. }
  926. }
  927. }
  928. }
  929. static void
  930. rspamd_language_detector_detect_word (struct rspamd_task *task,
  931. struct rspamd_lang_detector *d,
  932. rspamd_stat_token_t *tok,
  933. khash_t(rspamd_candidates_hash) *candidates,
  934. khash_t(rspamd_trigram_hash) *trigrams)
  935. {
  936. const guint wlen = 3;
  937. UChar32 window[3];
  938. goffset cur = 0;
  939. /* Split words */
  940. while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
  941. != -1) {
  942. rspamd_language_detector_process_ngramm_full (task,
  943. d, window, candidates, trigrams);
  944. }
  945. }
  946. static const gdouble cutoff_limit = -8.0;
  947. /*
  948. * Converts frequencies to log probabilities, filter those candidates who
  949. * has the lowest probabilities
  950. */
  951. static inline void
  952. rspamd_language_detector_filter_step1 (struct rspamd_task *task,
  953. struct rspamd_lang_detector_res *cand,
  954. gdouble *max_prob, guint *filtered)
  955. {
  956. if (!isnan (cand->prob)) {
  957. if (cand->prob == 0) {
  958. cand->prob = NAN;
  959. msg_debug_lang_det (
  960. "exclude language %s",
  961. cand->lang);
  962. (*filtered)++;
  963. }
  964. else {
  965. cand->prob = log2 (cand->prob);
  966. if (cand->prob < cutoff_limit) {
  967. msg_debug_lang_det (
  968. "exclude language %s: %.3f, cutoff limit: %.3f",
  969. cand->lang, cand->prob, cutoff_limit);
  970. cand->prob = NAN;
  971. (*filtered)++;
  972. }
  973. else if (cand->prob > *max_prob) {
  974. *max_prob = cand->prob;
  975. }
  976. }
  977. }
  978. }
  979. static inline void
  980. rspamd_language_detector_filter_step2 (struct rspamd_task *task,
  981. struct rspamd_lang_detector_res *cand,
  982. gdouble max_prob, guint *filtered)
  983. {
  984. /*
  985. * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
  986. * prob2 is 2^4 less than prob1
  987. */
  988. if (!isnan (cand->prob) && max_prob - cand->prob > 1) {
  989. msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)",
  990. cand->lang, cand->prob, max_prob);
  991. cand->prob = NAN;
  992. (*filtered) ++;
  993. }
  994. }
  995. static void
  996. rspamd_language_detector_filter_negligible (struct rspamd_task *task,
  997. khash_t(rspamd_candidates_hash) *candidates)
  998. {
  999. struct rspamd_lang_detector_res *cand;
  1000. guint filtered = 0;
  1001. gdouble max_prob = -(G_MAXDOUBLE);
  1002. kh_foreach_value (candidates, cand,
  1003. rspamd_language_detector_filter_step1 (task, cand, &max_prob, &filtered));
  1004. kh_foreach_value (candidates, cand,
  1005. rspamd_language_detector_filter_step2 (task, cand, max_prob, &filtered));
  1006. msg_debug_lang_det ("removed %d languages", filtered);
  1007. }
  1008. static void
  1009. rspamd_language_detector_detect_type (struct rspamd_task *task,
  1010. guint nwords,
  1011. struct rspamd_lang_detector *d,
  1012. GArray *words,
  1013. enum rspamd_language_category cat,
  1014. khash_t(rspamd_candidates_hash) *candidates)
  1015. {
  1016. guint nparts = MIN (words->len, nwords);
  1017. goffset *selected_words;
  1018. rspamd_stat_token_t *tok;
  1019. guint i;
  1020. selected_words = g_new0 (goffset, nparts);
  1021. rspamd_language_detector_random_select (words, nparts, selected_words);
  1022. msg_debug_lang_det ("randomly selected %d words", nparts);
  1023. for (i = 0; i < nparts; i++) {
  1024. tok = &g_array_index (words, rspamd_stat_token_t,
  1025. selected_words[i]);
  1026. if (tok->unicode.len >= 3) {
  1027. rspamd_language_detector_detect_word (task, d, tok, candidates,
  1028. d->trigrams[cat]);
  1029. }
  1030. }
  1031. /* Filter negligible candidates */
  1032. rspamd_language_detector_filter_negligible (task, candidates);
  1033. g_free (selected_words);
  1034. }
  1035. static gint
  1036. rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
  1037. {
  1038. const struct rspamd_lang_detector_res
  1039. *canda = *(const struct rspamd_lang_detector_res **)a,
  1040. *candb = *(const struct rspamd_lang_detector_res **)b;
  1041. if (canda->prob > candb->prob) {
  1042. return -1;
  1043. }
  1044. else if (candb->prob > canda->prob) {
  1045. return 1;
  1046. }
  1047. return 0;
  1048. }
  /* Outcome of a trigram detection attempt */
  enum rspamd_language_detected_type {
      rs_detect_none = 0,     /* no candidate survived filtering */
      rs_detect_single = 1,   /* exactly one candidate survived */
      rs_detect_multiple = 2, /* two or more candidates survived */
  };
  1054. static enum rspamd_language_detected_type
  1055. rspamd_language_detector_try_ngramm (struct rspamd_task *task,
  1056. guint nwords,
  1057. struct rspamd_lang_detector *d,
  1058. GArray *ucs_tokens,
  1059. enum rspamd_language_category cat,
  1060. khash_t(rspamd_candidates_hash) *candidates)
  1061. {
  1062. guint cand_len = 0;
  1063. struct rspamd_lang_detector_res *cand;
  1064. rspamd_language_detector_detect_type (task,
  1065. nwords,
  1066. d,
  1067. ucs_tokens,
  1068. cat,
  1069. candidates);
  1070. kh_foreach_value (candidates, cand, {
  1071. if (!isnan (cand->prob)) {
  1072. cand_len ++;
  1073. }
  1074. });
  1075. if (cand_len == 0) {
  1076. return rs_detect_none;
  1077. }
  1078. else if (cand_len == 1) {
  1079. return rs_detect_single;
  1080. }
  1081. return rs_detect_multiple;
  1082. }
  /* Flags that tune the heuristic comparator */
  enum rspamd_language_sort_flags {
      RSPAMD_LANG_FLAG_DEFAULT = 0,
      /* Short text: tier adjustments are amplified */
      RSPAMD_LANG_FLAG_SHORT = 1 << 0,
  };
  1087. struct rspamd_frequency_sort_cbdata {
  1088. struct rspamd_lang_detector *d;
  1089. enum rspamd_language_sort_flags flags;
  1090. gdouble std;
  1091. gdouble mean;
  1092. };
  1093. static const gdouble tier0_adjustment = 1.2;
  1094. static const gdouble tier1_adjustment = 0.8;
  1095. static const gdouble frequency_adjustment = 0.8;
  1096. static gint
  1097. rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
  1098. gpointer ud)
  1099. {
  1100. struct rspamd_frequency_sort_cbdata *cbd = ud;
  1101. const struct rspamd_lang_detector_res
  1102. *canda = *(const struct rspamd_lang_detector_res **)a,
  1103. *candb = *(const struct rspamd_lang_detector_res **)b;
  1104. gdouble adj;
  1105. gdouble proba_adjusted, probb_adjusted, freqa, freqb;
  1106. if (cbd->d->total_occurrences == 0) {
  1107. return 0;
  1108. }
  1109. freqa = ((gdouble)canda->elt->occurrences) /
  1110. (gdouble)cbd->d->total_occurrences;
  1111. freqb = ((gdouble)candb->elt->occurrences) /
  1112. (gdouble)cbd->d->total_occurrences;
  1113. proba_adjusted = canda->prob;
  1114. probb_adjusted = candb->prob;
  1115. if (isnormal (freqa) && isnormal (freqb)) {
  1116. proba_adjusted += cbd->std * (frequency_adjustment * freqa);
  1117. probb_adjusted += cbd->std * (frequency_adjustment * freqb);
  1118. }
  1119. if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
  1120. adj = tier1_adjustment * 2.0;
  1121. }
  1122. else {
  1123. adj = tier1_adjustment;
  1124. }
  1125. if (canda->elt->flags & RS_LANGUAGE_TIER1) {
  1126. proba_adjusted += cbd->std * adj;
  1127. }
  1128. if (candb->elt->flags & RS_LANGUAGE_TIER1) {
  1129. probb_adjusted += cbd->std * adj;
  1130. }
  1131. if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
  1132. adj = tier0_adjustment * 16.0;
  1133. }
  1134. else {
  1135. adj = tier0_adjustment;
  1136. }
  1137. if (canda->elt->flags & RS_LANGUAGE_TIER0) {
  1138. proba_adjusted += cbd->std * adj;
  1139. }
  1140. if (candb->elt->flags & RS_LANGUAGE_TIER0) {
  1141. probb_adjusted += cbd->std * adj;
  1142. }
  1143. if (proba_adjusted > probb_adjusted) {
  1144. return -1;
  1145. }
  1146. else if (probb_adjusted > proba_adjusted) {
  1147. return 1;
  1148. }
  1149. return 0;
  1150. }
  1151. static void
  1152. rspamd_language_detector_unicode_scripts (struct rspamd_task *task,
  1153. struct rspamd_mime_text_part *part,
  1154. guint *pchinese,
  1155. guint *pspecial)
  1156. {
  1157. const gchar *p = part->utf_stripped_content->data, *end;
  1158. guint i = 0, cnt = 0;
  1159. end = p + part->utf_stripped_content->len;
  1160. gint32 uc, sc;
  1161. guint nlatin = 0, nchinese = 0, nspecial = 0;
  1162. const guint cutoff_limit = 32;
  1163. while (p + i < end) {
  1164. U8_NEXT (p, i, part->utf_stripped_content->len, uc);
  1165. if (((gint32) uc) < 0) {
  1166. break;
  1167. }
  1168. if (u_isalpha (uc)) {
  1169. sc = ublock_getCode (uc);
  1170. cnt ++;
  1171. switch (sc) {
  1172. case UBLOCK_BASIC_LATIN:
  1173. case UBLOCK_LATIN_1_SUPPLEMENT:
  1174. part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
  1175. nlatin ++;
  1176. break;
  1177. case UBLOCK_HEBREW:
  1178. part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
  1179. nspecial ++;
  1180. break;
  1181. case UBLOCK_GREEK:
  1182. part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
  1183. nspecial ++;
  1184. break;
  1185. case UBLOCK_CYRILLIC:
  1186. part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
  1187. nspecial ++;
  1188. break;
  1189. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
  1190. case UBLOCK_CJK_COMPATIBILITY:
  1191. case UBLOCK_CJK_RADICALS_SUPPLEMENT:
  1192. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
  1193. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
  1194. part->unicode_scripts |= RSPAMD_UNICODE_CJK;
  1195. nchinese ++;
  1196. break;
  1197. case UBLOCK_HIRAGANA:
  1198. case UBLOCK_KATAKANA:
  1199. part->unicode_scripts |= RSPAMD_UNICODE_JP;
  1200. nspecial ++;
  1201. break;
  1202. case UBLOCK_HANGUL_JAMO:
  1203. case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
  1204. part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
  1205. nspecial ++;
  1206. break;
  1207. case UBLOCK_ARABIC:
  1208. part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
  1209. nspecial ++;
  1210. break;
  1211. case UBLOCK_DEVANAGARI:
  1212. part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
  1213. nspecial ++;
  1214. break;
  1215. case UBLOCK_ARMENIAN:
  1216. part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
  1217. nspecial ++;
  1218. break;
  1219. case UBLOCK_GEORGIAN:
  1220. part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
  1221. nspecial ++;
  1222. break;
  1223. case UBLOCK_GUJARATI:
  1224. part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
  1225. nspecial ++;
  1226. break;
  1227. case UBLOCK_TELUGU:
  1228. part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
  1229. nspecial ++;
  1230. break;
  1231. case UBLOCK_TAMIL:
  1232. part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
  1233. nspecial ++;
  1234. break;
  1235. case UBLOCK_THAI:
  1236. part->unicode_scripts |= RSPAMD_UNICODE_THAI;
  1237. nspecial ++;
  1238. break;
  1239. case RSPAMD_UNICODE_MALAYALAM:
  1240. part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
  1241. nspecial ++;
  1242. break;
  1243. case RSPAMD_UNICODE_SINHALA:
  1244. part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
  1245. nspecial ++;
  1246. break;
  1247. }
  1248. }
  1249. if (nspecial > cutoff_limit && nspecial > nlatin) {
  1250. break;
  1251. }
  1252. else if (nchinese > cutoff_limit && nchinese > nlatin) {
  1253. if (nspecial > 0) {
  1254. /* Likely japanese */
  1255. break;
  1256. }
  1257. }
  1258. }
  1259. msg_debug_lang_det ("stop after checking %d characters, "
  1260. "%d latin, %d special, %d chinese",
  1261. cnt, nlatin, nspecial, nchinese);
  1262. *pchinese = nchinese;
  1263. *pspecial = nspecial;
  1264. }
  1265. static inline void
  1266. rspamd_language_detector_set_language (struct rspamd_task *task,
  1267. struct rspamd_mime_text_part *part,
  1268. const gchar *code,
  1269. struct rspamd_language_elt *elt)
  1270. {
  1271. struct rspamd_lang_detector_res *r;
  1272. r = rspamd_mempool_alloc0 (task->task_pool, sizeof (*r));
  1273. r->prob = 1.0;
  1274. r->lang = code;
  1275. r->elt = elt;
  1276. if (part->languages == NULL) {
  1277. part->languages = g_ptr_array_sized_new (1);
  1278. }
  1279. g_ptr_array_add (part->languages, r);
  1280. part->language = code;
  1281. }
  1282. static gboolean
  1283. rspamd_language_detector_try_uniscript (struct rspamd_task *task,
  1284. struct rspamd_mime_text_part *part,
  1285. guint nchinese,
  1286. guint nspecial)
  1287. {
  1288. guint i;
  1289. for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) {
  1290. if (unicode_langs[i].unicode_code & part->unicode_scripts) {
  1291. if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) {
  1292. msg_debug_lang_det ("set language based on unicode script %s",
  1293. unicode_langs[i].lang);
  1294. rspamd_language_detector_set_language (task, part,
  1295. unicode_langs[i].lang, NULL);
  1296. return TRUE;
  1297. }
  1298. else {
  1299. /* Japanese <-> Chinese guess */
  1300. /*
  1301. * Typically there might be around 0-70% of kanji glyphs
  1302. * and the rest are Haragana/Katakana
  1303. *
  1304. * If we discover that Kanji is more than 80% then we consider
  1305. * it Chinese
  1306. */
  1307. if (nchinese <= 5 || nchinese < nspecial * 5) {
  1308. msg_debug_lang_det ("set language based on unicode script %s",
  1309. unicode_langs[i].lang);
  1310. rspamd_language_detector_set_language (task, part,
  1311. unicode_langs[i].lang, NULL);
  1312. return TRUE;
  1313. }
  1314. }
  1315. }
  1316. }
  1317. if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
  1318. msg_debug_lang_det ("guess chinese based on CJK characters: %d chinese, %d special",
  1319. nchinese, nspecial);
  1320. rspamd_language_detector_set_language (task, part,
  1321. "zh-CN", NULL);
  1322. return TRUE;
  1323. }
  1324. return FALSE;
  1325. }
  1326. static guint
  1327. rspamd_langelt_hash_func (gconstpointer key)
  1328. {
  1329. const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *)key;
  1330. return rspamd_cryptobox_fast_hash (elt->name, strlen (elt->name),
  1331. rspamd_hash_seed ());
  1332. }
  1333. static gboolean
  1334. rspamd_langelt_equal_func (gconstpointer v, gconstpointer v2)
  1335. {
  1336. const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *)v,
  1337. *elt2 = (const struct rspamd_language_elt *)v2;
  1338. return strcmp (elt1->name, elt2->name) == 0;
  1339. }
  1340. KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, int, 1,
  1341. rspamd_langelt_hash_func, rspamd_langelt_equal_func);
  1342. struct rspamd_sw_cbdata {
  1343. struct rspamd_task *task;
  1344. khash_t (rspamd_sw_hash) *res;
  1345. GArray *ranges;
  1346. };
  1347. static gint
  1348. rspamd_ranges_cmp (const void *k, const void *memb)
  1349. {
  1350. gint pos = GPOINTER_TO_INT (k);
  1351. const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb;
  1352. if (pos >= r->start && pos < r->stop) {
  1353. return 0;
  1354. }
  1355. else if (pos < r->start) {
  1356. return -1;
  1357. }
  1358. return 1;
  1359. }
  1360. static gint
  1361. rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
  1362. guint strnum,
  1363. gint match_start,
  1364. gint match_pos,
  1365. const gchar *text,
  1366. gsize len,
  1367. void *context)
  1368. {
  1369. /* Check if boundary */
  1370. const gchar *prev = text, *next = text + len;
  1371. struct rspamd_stop_word_range *r;
  1372. struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context;
  1373. khiter_t k;
  1374. static const gsize max_stop_words = 80;
  1375. struct rspamd_task *task;
  1376. if (match_start > 0) {
  1377. prev = text + match_start - 1;
  1378. if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) {
  1379. return 0;
  1380. }
  1381. }
  1382. if (match_pos < len) {
  1383. next = text + match_pos;
  1384. if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) {
  1385. return 0;
  1386. }
  1387. }
  1388. /* We have a word on the boundary, check range */
  1389. task = cbdata->task;
  1390. r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data,
  1391. cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp);
  1392. g_assert (r != NULL);
  1393. k = kh_get (rspamd_sw_hash, cbdata->res, r->elt);
  1394. gint nwords = 1;
  1395. if (k != kh_end (cbdata->res)) {
  1396. nwords = ++ kh_value (cbdata->res, k);
  1397. if (kh_value (cbdata->res, k) > max_stop_words) {
  1398. return 1;
  1399. }
  1400. }
  1401. else {
  1402. gint tt;
  1403. k = kh_put (rspamd_sw_hash, cbdata->res, r->elt, &tt);
  1404. kh_value (cbdata->res, k) = 1;
  1405. }
  1406. msg_debug_lang_det ("found word %*s from %s language (%d stop words found so far)",
  1407. (int)(next - prev - 1), prev + 1, r->elt->name, nwords);
  1408. return 0;
  1409. }
  1410. static gboolean
  1411. rspamd_language_detector_try_stop_words (struct rspamd_task *task,
  1412. struct rspamd_lang_detector *d,
  1413. struct rspamd_mime_text_part *part,
  1414. enum rspamd_language_category cat)
  1415. {
  1416. struct rspamd_stop_word_elt *elt;
  1417. struct rspamd_sw_cbdata cbdata;
  1418. gboolean ret = FALSE;
  1419. static const int stop_words_threshold = 4, /* minimum stop words count */
  1420. strong_confidence_threshold = 10 /* we are sure that this is enough */;
  1421. elt = &d->stop_words[cat];
  1422. cbdata.res = kh_init (rspamd_sw_hash);
  1423. cbdata.ranges = elt->ranges;
  1424. cbdata.task = task;
  1425. rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data,
  1426. part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
  1427. &cbdata, NULL);
  1428. if (kh_size (cbdata.res) > 0) {
  1429. gint cur_matches;
  1430. double max_rate = G_MINDOUBLE;
  1431. struct rspamd_language_elt *cur_lang, *sel = NULL;
  1432. gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
  1433. again:
  1434. kh_foreach (cbdata.res, cur_lang, cur_matches, {
  1435. if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
  1436. /* Restart matches */
  1437. ignore_ascii = TRUE;
  1438. sel = NULL;
  1439. max_rate = G_MINDOUBLE;
  1440. msg_debug_lang_det ("ignore ascii after finding %d stop words from %s",
  1441. cur_matches, cur_lang->name);
  1442. goto again;
  1443. }
  1444. if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) {
  1445. /* Restart matches */
  1446. ignore_latin = TRUE;
  1447. sel = NULL;
  1448. max_rate = G_MINDOUBLE;
  1449. msg_debug_lang_det ("ignore latin after finding stop %d words from %s",
  1450. cur_matches, cur_lang->name);
  1451. goto again;
  1452. }
  1453. if (cur_matches < stop_words_threshold) {
  1454. continue;
  1455. }
  1456. if (cur_matches < strong_confidence_threshold) {
  1457. /* Ignore mixed languages when not enough confidence */
  1458. if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) {
  1459. continue;
  1460. }
  1461. if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) {
  1462. continue;
  1463. }
  1464. }
  1465. double rate = (double)cur_matches / (double)cur_lang->stop_words;
  1466. if (rate > max_rate) {
  1467. max_rate = rate;
  1468. sel = cur_lang;
  1469. }
  1470. msg_debug_lang_det ("found %d stop words from %s: %3f rate",
  1471. cur_matches, cur_lang->name, rate);
  1472. });
  1473. if (max_rate > 0 && sel) {
  1474. msg_debug_lang_det ("set language based on stop words script %s, %.3f found",
  1475. sel->name, max_rate);
  1476. rspamd_language_detector_set_language (task, part,
  1477. sel->name, sel);
  1478. ret = TRUE;
  1479. }
  1480. }
  1481. else {
  1482. msg_debug_lang_det ("found no stop words in a text");
  1483. }
  1484. kh_destroy (rspamd_sw_hash, cbdata.res);
  1485. return ret;
  1486. }
  1487. gboolean
  1488. rspamd_language_detector_detect (struct rspamd_task *task,
  1489. struct rspamd_lang_detector *d,
  1490. struct rspamd_mime_text_part *part)
  1491. {
  1492. khash_t(rspamd_candidates_hash) *candidates;
  1493. GPtrArray *result;
  1494. gdouble mean, std, start_ticks, end_ticks;
  1495. guint cand_len;
  1496. enum rspamd_language_category cat;
  1497. struct rspamd_lang_detector_res *cand;
  1498. enum rspamd_language_detected_type r;
  1499. struct rspamd_frequency_sort_cbdata cbd;
  1500. /* Check if we have sorted candidates based on frequency */
  1501. gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
  1502. if (!part->utf_stripped_content) {
  1503. return FALSE;
  1504. }
  1505. start_ticks = rspamd_get_ticks (TRUE);
  1506. guint nchinese = 0, nspecial = 0;
  1507. rspamd_language_detector_unicode_scripts (task, part, &nchinese, &nspecial);
  1508. /* Apply unicode scripts heuristic */
  1509. if (rspamd_language_detector_try_uniscript (task, part, nchinese, nspecial)) {
  1510. ret = TRUE;
  1511. }
  1512. cat = rspamd_language_detector_get_category (part->unicode_scripts);
  1513. if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) {
  1514. ret = TRUE;
  1515. }
  1516. if (!ret) {
  1517. if (part->utf_words->len < default_short_text_limit) {
  1518. r = rs_detect_none;
  1519. msg_debug_lang_det ("text is too short for trigrams detection: "
  1520. "%d words; at least %d words required",
  1521. (int)part->utf_words->len,
  1522. (int)default_short_text_limit);
  1523. switch (cat) {
  1524. case RSPAMD_LANGUAGE_CYRILLIC:
  1525. rspamd_language_detector_set_language (task, part, "ru", NULL);
  1526. break;
  1527. case RSPAMD_LANGUAGE_DEVANAGARI:
  1528. rspamd_language_detector_set_language (task, part, "hi", NULL);
  1529. break;
  1530. case RSPAMD_LANGUAGE_ARAB:
  1531. rspamd_language_detector_set_language (task, part, "ar", NULL);
  1532. break;
  1533. default:
  1534. case RSPAMD_LANGUAGE_LATIN:
  1535. rspamd_language_detector_set_language (task, part, "en", NULL);
  1536. break;
  1537. }
  1538. msg_debug_lang_det ("set %s language based on symbols category",
  1539. part->language);
  1540. candidates = kh_init (rspamd_candidates_hash);
  1541. }
  1542. else {
  1543. candidates = kh_init (rspamd_candidates_hash);
  1544. kh_resize (rspamd_candidates_hash, candidates, 32);
  1545. r = rspamd_language_detector_try_ngramm (task,
  1546. default_words,
  1547. d,
  1548. part->utf_words,
  1549. cat,
  1550. candidates);
  1551. if (r == rs_detect_none) {
  1552. msg_debug_lang_det ("no trigrams found, fallback to english");
  1553. rspamd_language_detector_set_language (task, part, "en", NULL);
  1554. } else if (r == rs_detect_multiple) {
  1555. /* Check our guess */
  1556. mean = 0.0;
  1557. std = 0.0;
  1558. cand_len = 0;
  1559. /* Check distribution */
  1560. kh_foreach_value (candidates, cand, {
  1561. if (!isnan (cand->prob)) {
  1562. mean += cand->prob;
  1563. cand_len++;
  1564. }
  1565. });
  1566. if (cand_len > 0) {
  1567. mean /= cand_len;
  1568. kh_foreach_value (candidates, cand, {
  1569. gdouble err;
  1570. if (!isnan (cand->prob)) {
  1571. err = cand->prob - mean;
  1572. std += fabs (err);
  1573. }
  1574. });
  1575. std /= cand_len;
  1576. }
  1577. msg_debug_lang_det ("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
  1578. cand_len, mean, std);
  1579. if (cand_len > 0 && std / fabs (mean) < 0.25) {
  1580. msg_debug_lang_det ("apply frequency heuristic sorting");
  1581. frequency_heuristic_applied = TRUE;
  1582. cbd.d = d;
  1583. cbd.mean = mean;
  1584. cbd.std = std;
  1585. cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
  1586. if (part->nwords < default_words / 2) {
  1587. cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
  1588. }
  1589. }
  1590. }
  1591. }
  1592. /* Now, convert hash to array and sort it */
  1593. if (r != rs_detect_none && kh_size (candidates) > 0) {
  1594. result = g_ptr_array_sized_new (kh_size (candidates));
  1595. kh_foreach_value (candidates, cand, {
  1596. if (!isnan (cand->prob)) {
  1597. msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
  1598. cand->prob);
  1599. g_ptr_array_add (result, cand);
  1600. }
  1601. });
  1602. if (frequency_heuristic_applied) {
  1603. g_ptr_array_sort_with_data (result,
  1604. rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
  1605. } else {
  1606. g_ptr_array_sort (result, rspamd_language_detector_cmp);
  1607. }
  1608. if (result->len > 0 && !frequency_heuristic_applied) {
  1609. cand = g_ptr_array_index (result, 0);
  1610. cand->elt->occurrences++;
  1611. d->total_occurrences++;
  1612. }
  1613. if (part->languages != NULL) {
  1614. g_ptr_array_unref (part->languages);
  1615. }
  1616. part->languages = result;
  1617. ret = TRUE;
  1618. }
  1619. else if (part->languages == NULL) {
  1620. rspamd_language_detector_set_language (task, part, "en", NULL);
  1621. }
  1622. kh_destroy (rspamd_candidates_hash, candidates);
  1623. }
  1624. end_ticks = rspamd_get_ticks (TRUE);
  1625. msg_debug_lang_det ("detected languages in %.0f ticks",
  1626. (end_ticks - start_ticks));
  1627. return ret;
  1628. }
  /* Retains a reference to the detector and returns the same pointer */
  struct rspamd_lang_detector *
  rspamd_language_detector_ref (struct rspamd_lang_detector *d)
  {
      REF_RETAIN (d);

      return d;
  }
  /* Releases a reference previously taken on the detector */
  void
  rspamd_language_detector_unref (struct rspamd_lang_detector *d)
  {
      REF_RELEASE (d);
  }
  1640. gboolean
  1641. rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
  1642. const gchar *word, gsize wlen)
  1643. {
  1644. khiter_t k;
  1645. rspamd_ftok_t search;
  1646. search.begin = word;
  1647. search.len = wlen;
  1648. k = kh_get (rspamd_stopwords_hash, d->stop_words_norm, &search);
  1649. if (k != kh_end (d->stop_words_norm)) {
  1650. return TRUE;
  1651. }
  1652. return FALSE;
  1653. }
  1654. gint
  1655. rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt)
  1656. {
  1657. if (elt) {
  1658. return elt->flags;
  1659. }
  1660. return 0;
  1661. }