You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

lang_detection.c 53KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "lang_detection.h"
  17. #include "lang_detection_fasttext.h"
  18. #include "libserver/logger.h"
  19. #include "libcryptobox/cryptobox.h"
  20. #include "libutil/multipattern.h"
  21. #include "ucl.h"
  22. #include "khash.h"
  23. #include "libstemmer.h"
  24. #include <glob.h>
  25. #include <unicode/utf8.h>
  26. #include <unicode/utf16.h>
  27. #include <unicode/ucnv.h>
  28. #include <unicode/uchar.h>
  29. #include <unicode/ustring.h>
  30. #include <math.h>
  31. static const gsize default_short_text_limit = 10;
  32. static const gsize default_words = 80;
  33. static const double update_prob = 0.6;
  34. static const char *default_languages_path = RSPAMD_SHAREDIR "/languages";
  35. #undef EXTRA_LANGDET_DEBUG
  /* Pairs a language code with the unicode script flag that identifies it. */
  struct rspamd_language_unicode_match {
      const char *lang; /* language code, e.g. "el" */
      int unicode_code; /* one of the RSPAMD_UNICODE_* values */
  };
  40. /*
  41. * List of languages detected by unicode scripts
  42. */
  43. static const struct rspamd_language_unicode_match unicode_langs[] = {
  44. {"el", RSPAMD_UNICODE_GREEK},
  45. {"ml", RSPAMD_UNICODE_MALAYALAM},
  46. {"te", RSPAMD_UNICODE_TELUGU},
  47. {"ta", RSPAMD_UNICODE_TAMIL},
  48. {"gu", RSPAMD_UNICODE_GUJARATI},
  49. {"th", RSPAMD_UNICODE_THAI},
  50. {"ka", RSPAMD_UNICODE_GEORGIAN},
  51. {"si", RSPAMD_UNICODE_SINHALA},
  52. {"hy", RSPAMD_UNICODE_ARMENIAN},
  53. {"ja", RSPAMD_UNICODE_JP},
  54. {"ko", RSPAMD_UNICODE_HANGUL},
  55. };
  /*
   * Top languages: names listed here get the RS_LANGUAGE_TIER0 /
   * RS_LANGUAGE_TIER1 flag when their model file is loaded.
   */
  static const char *tier0_langs[] = {
      "en",
  };

  static const char *tier1_langs[] = {
      "fr",
      "it",
      "de",
      "es",
      "nl",
      "pt",
      "ru",
      "pl",
      "tk",
      "th",
      "ar",
  };
  /*
   * Script groups used to partition trigram tables and stop word matchers.
   * RSPAMD_LANGUAGE_MAX is not a real category: it is the number of
   * categories and is used to size the per-category arrays.
   */
  enum rspamd_language_category {
      RSPAMD_LANGUAGE_LATIN = 0,
      RSPAMD_LANGUAGE_CYRILLIC,
      RSPAMD_LANGUAGE_DEVANAGARI,
      RSPAMD_LANGUAGE_ARAB,
      RSPAMD_LANGUAGE_MAX,
  };
  72. struct rspamd_language_elt {
  73. const char *name; /* e.g. "en" or "ru" */
  74. int flags; /* enum rspamd_language_elt_flags */
  75. enum rspamd_language_category category;
  76. unsigned int trigrams_words;
  77. unsigned int stop_words;
  78. double mean;
  79. double std;
  80. unsigned int occurrences; /* total number of parts with this language */
  81. };
  /* One language entry of an ngram chain: the language and its probability. */
  struct rspamd_ngramm_elt {
      struct rspamd_language_elt *elt; /* language this entry belongs to */
      double prob;                     /* frequency of the ngram divided by the language total */
  };
  86. struct rspamd_ngramm_chain {
  87. GPtrArray *languages;
  88. double mean;
  89. double std;
  90. char *utf;
  91. };
  /*
   * Range of pattern indexes inside a stop words multipattern that belong
   * to one language: `start` is the pattern count before the language's
   * words were added, `stop` is the count afterwards.
   */
  struct rspamd_stop_word_range {
      unsigned int start;
      unsigned int stop;
      struct rspamd_language_elt *elt; /* owner of the range */
  };
  97. struct rspamd_stop_word_elt {
  98. struct rspamd_multipattern *mp;
  99. GArray *ranges; /* of rspamd_stop_word_range */
  100. };
  101. #define msg_debug_lang_det(...) rspamd_conditional_debug_fast(NULL, NULL, \
  102. rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
  103. G_STRFUNC, \
  104. __VA_ARGS__)
  105. #define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast(NULL, NULL, \
  106. rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
  107. G_STRFUNC, \
  108. __VA_ARGS__)
  109. INIT_LOG_MODULE_PUBLIC(langdet)
  110. static const struct rspamd_language_unicode_match *
  111. rspamd_language_search_unicode_match(const char *key,
  112. const struct rspamd_language_unicode_match *elts, size_t nelts)
  113. {
  114. size_t i;
  115. for (i = 0; i < nelts; i++) {
  116. if (strcmp(elts[i].lang, key) == 0) {
  117. return &elts[i];
  118. }
  119. }
  120. return NULL;
  121. }
  122. static gboolean
  123. rspamd_language_search_str(const char *key, const char *elts[], size_t nelts)
  124. {
  125. size_t i;
  126. for (i = 0; i < nelts; i++) {
  127. if (strcmp(elts[i], key) == 0) {
  128. return TRUE;
  129. }
  130. }
  131. return FALSE;
  132. }
  133. static unsigned int
  134. rspamd_trigram_hash_func(gconstpointer key)
  135. {
  136. return rspamd_cryptobox_fast_hash(key, 3 * sizeof(UChar32),
  137. rspamd_hash_seed());
  138. }
  139. static gboolean
  140. rspamd_trigram_equal_func(gconstpointer v, gconstpointer v2)
  141. {
  142. return memcmp(v, v2, 3 * sizeof(UChar32)) == 0;
  143. }
  144. KHASH_INIT(rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true,
  145. rspamd_trigram_hash_func, rspamd_trigram_equal_func);
  146. KHASH_INIT(rspamd_candidates_hash, const char *,
  147. struct rspamd_lang_detector_res *, true,
  148. rspamd_str_hash, rspamd_str_equal);
  149. KHASH_INIT(rspamd_stopwords_hash, rspamd_ftok_t *,
  150. char, false,
  151. rspamd_ftok_hash, rspamd_ftok_equal);
  152. KHASH_INIT(rspamd_languages_hash, const char *, struct rspamd_language_elt *, true,
  153. rspamd_str_hash, rspamd_str_equal);
  154. struct rspamd_lang_detector {
  155. khash_t(rspamd_languages_hash) * languages;
  156. khash_t(rspamd_trigram_hash) * trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */
  157. struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
  158. khash_t(rspamd_stopwords_hash) * stop_words_norm;
  159. UConverter *uchar_converter;
  160. gsize short_text_limit;
  161. bool prefer_fasttext;
  162. gsize total_occurrences; /* number of all languages found */
  163. gpointer fasttext_detector;
  164. ref_entry_t ref;
  165. };
  166. static void
  167. rspamd_language_detector_ucs_lowercase(UChar32 *s, gsize len)
  168. {
  169. gsize i;
  170. for (i = 0; i < len; i++) {
  171. s[i] = u_tolower(s[i]);
  172. }
  173. }
  174. static gboolean
  175. rspamd_language_detector_ucs_is_latin(const UChar32 *s, gsize len)
  176. {
  177. gsize i;
  178. gboolean ret = TRUE;
  179. for (i = 0; i < len; i++) {
  180. if (s[i] >= 128 || !(g_ascii_isalnum(s[i]) || s[i] == ' ')) {
  181. ret = FALSE;
  182. break;
  183. }
  184. }
  185. return ret;
  186. }
  187. struct rspamd_language_ucs_elt {
  188. unsigned int freq;
  189. const char *utf;
  190. UChar32 s[0];
  191. };
  192. static void
  193. rspamd_language_detector_init_ngramm(struct rspamd_config *cfg,
  194. struct rspamd_lang_detector *d,
  195. struct rspamd_language_elt *lelt,
  196. struct rspamd_language_ucs_elt *ucs,
  197. unsigned int len,
  198. unsigned int freq,
  199. unsigned int total,
  200. khash_t(rspamd_trigram_hash) * htb)
  201. {
  202. struct rspamd_ngramm_chain *chain = NULL, st_chain;
  203. struct rspamd_ngramm_elt *elt;
  204. khiter_t k;
  205. unsigned int i;
  206. gboolean found;
  207. switch (len) {
  208. case 1:
  209. case 2:
  210. g_assert_not_reached();
  211. break;
  212. case 3:
  213. k = kh_get(rspamd_trigram_hash, htb, ucs->s);
  214. if (k != kh_end(htb)) {
  215. chain = &kh_value(htb, k);
  216. }
  217. break;
  218. default:
  219. g_assert_not_reached();
  220. break;
  221. }
  222. if (chain == NULL) {
  223. /* New element */
  224. chain = &st_chain;
  225. memset(chain, 0, sizeof(st_chain));
  226. chain->languages = g_ptr_array_sized_new(32);
  227. rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard,
  228. chain->languages);
  229. chain->utf = rspamd_mempool_strdup(cfg->cfg_pool, ucs->utf);
  230. elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt));
  231. elt->elt = lelt;
  232. elt->prob = ((double) freq) / ((double) total);
  233. g_ptr_array_add(chain->languages, elt);
  234. k = kh_put(rspamd_trigram_hash, htb, ucs->s, &i);
  235. kh_value(htb, k) = *chain;
  236. }
  237. else {
  238. /* Check sanity */
  239. found = FALSE;
  240. PTR_ARRAY_FOREACH(chain->languages, i, elt)
  241. {
  242. if (strcmp(elt->elt->name, lelt->name) == 0) {
  243. found = TRUE;
  244. elt->prob += ((double) freq) / ((double) total);
  245. break;
  246. }
  247. }
  248. if (!found) {
  249. elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt));
  250. elt->elt = lelt;
  251. elt->prob = ((double) freq) / ((double) total);
  252. g_ptr_array_add(chain->languages, elt);
  253. }
  254. }
  255. }
  256. static inline enum rspamd_language_category
  257. rspamd_language_detector_get_category(unsigned int uflags)
  258. {
  259. enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN;
  260. if (uflags & RSPAMD_UNICODE_CYRILLIC) {
  261. cat = RSPAMD_LANGUAGE_CYRILLIC;
  262. }
  263. else if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
  264. cat = RSPAMD_LANGUAGE_DEVANAGARI;
  265. }
  266. else if (uflags & RSPAMD_UNICODE_ARABIC) {
  267. cat = RSPAMD_LANGUAGE_ARAB;
  268. }
  269. return cat;
  270. }
  271. static const char *
  272. rspamd_language_detector_print_flags(struct rspamd_language_elt *elt)
  273. {
  274. static char flags_buf[256];
  275. goffset r = 0;
  276. if (elt->flags & RS_LANGUAGE_TIER1) {
  277. r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier1,");
  278. }
  279. if (elt->flags & RS_LANGUAGE_TIER0) {
  280. r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier0,");
  281. }
  282. if (elt->flags & RS_LANGUAGE_LATIN) {
  283. r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "latin,");
  284. }
  285. if (r > 0) {
  286. flags_buf[r - 1] = '\0';
  287. }
  288. else {
  289. flags_buf[r] = '\0';
  290. }
  291. return flags_buf;
  292. }
  293. static int
  294. rspamd_language_detector_cmp_ngramm(gconstpointer a, gconstpointer b)
  295. {
  296. struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **) a;
  297. struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **) b;
  298. return (int) e2->freq - (int) e1->freq;
  299. }
  300. static void
  301. rspamd_language_detector_read_file(struct rspamd_config *cfg,
  302. struct rspamd_lang_detector *d,
  303. const char *path,
  304. const ucl_object_t *stop_words)
  305. {
  306. struct ucl_parser *parser;
  307. ucl_object_t *top;
  308. const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
  309. ucl_object_iter_t it = NULL;
  310. UErrorCode uc_err = U_ZERO_ERROR;
  311. struct rspamd_language_elt *nelt;
  312. struct rspamd_language_ucs_elt *ucs_elt;
  313. khash_t(rspamd_trigram_hash) *htb = NULL;
  314. char *pos;
  315. unsigned int total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
  316. loaded;
  317. double mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
  318. enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
  319. parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS);
  320. if (!ucl_parser_add_file(parser, path)) {
  321. msg_warn_config("cannot parse file %s: %s", path,
  322. ucl_parser_get_error(parser));
  323. ucl_parser_free(parser);
  324. return;
  325. }
  326. top = ucl_parser_get_object(parser);
  327. ucl_parser_free(parser);
  328. freqs = ucl_object_lookup(top, "freq");
  329. if (freqs == NULL) {
  330. msg_warn_config("file %s has no 'freq' key", path);
  331. ucl_object_unref(top);
  332. return;
  333. }
  334. pos = strrchr(path, '/');
  335. g_assert(pos != NULL);
  336. nelt = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*nelt));
  337. nelt->name = rspamd_mempool_strdup(cfg->cfg_pool, pos + 1);
  338. /* Remove extension */
  339. pos = strchr(nelt->name, '.');
  340. g_assert(pos != NULL);
  341. *pos = '\0';
  342. n_words = ucl_object_lookup(top, "n_words");
  343. if (n_words == NULL || ucl_object_type(n_words) != UCL_ARRAY ||
  344. n_words->len != 3) {
  345. msg_warn_config("cannot find n_words in language %s", nelt->name);
  346. ucl_object_unref(top);
  347. return;
  348. }
  349. else {
  350. nelt->trigrams_words = ucl_object_toint(ucl_array_find_index(n_words,
  351. 2));
  352. }
  353. type = ucl_object_lookup(top, "type");
  354. if (type == NULL || ucl_object_type(type) != UCL_STRING) {
  355. msg_debug_config("cannot find type in language %s", nelt->name);
  356. ucl_object_unref(top);
  357. return;
  358. }
  359. else {
  360. const char *stype = ucl_object_tostring(type);
  361. if (strcmp(stype, "latin") == 0) {
  362. cat = RSPAMD_LANGUAGE_LATIN;
  363. }
  364. else if (strcmp(stype, "cyrillic") == 0) {
  365. cat = RSPAMD_LANGUAGE_CYRILLIC;
  366. }
  367. else if (strcmp(stype, "arab") == 0) {
  368. cat = RSPAMD_LANGUAGE_ARAB;
  369. }
  370. else if (strcmp(stype, "devanagari") == 0) {
  371. cat = RSPAMD_LANGUAGE_DEVANAGARI;
  372. }
  373. else {
  374. msg_debug_config("unknown type %s of language %s", stype, nelt->name);
  375. ucl_object_unref(top);
  376. return;
  377. }
  378. }
  379. flags = ucl_object_lookup(top, "flags");
  380. if (flags != NULL && ucl_object_type(flags) == UCL_ARRAY) {
  381. ucl_object_iter_t it = NULL;
  382. const ucl_object_t *cur;
  383. while ((cur = ucl_object_iterate(flags, &it, true)) != NULL) {
  384. const char *fl = ucl_object_tostring(cur);
  385. if (cur) {
  386. if (strcmp(fl, "diacritics") == 0) {
  387. nelt->flags |= RS_LANGUAGE_DIACRITICS;
  388. }
  389. else if (strcmp(fl, "ascii") == 0) {
  390. nelt->flags |= RS_LANGUAGE_ASCII;
  391. }
  392. else {
  393. msg_debug_config("unknown flag %s of language %s", fl, nelt->name);
  394. }
  395. }
  396. else {
  397. msg_debug_config("unknown flags type of language %s", nelt->name);
  398. }
  399. }
  400. }
  401. if (stop_words) {
  402. const ucl_object_t *specific_stop_words;
  403. specific_stop_words = ucl_object_lookup(stop_words, nelt->name);
  404. if (specific_stop_words) {
  405. struct sb_stemmer *stem = NULL;
  406. it = NULL;
  407. const ucl_object_t *w;
  408. unsigned int start, stop;
  409. stem = sb_stemmer_new(nelt->name, "UTF_8");
  410. start = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp);
  411. while ((w = ucl_object_iterate(specific_stop_words, &it, true)) != NULL) {
  412. gsize wlen;
  413. const char *word = ucl_object_tolstring(w, &wlen);
  414. const char *saved;
  415. unsigned int mp_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8;
  416. if (rspamd_multipattern_has_hyperscan()) {
  417. mp_flags |= RSPAMD_MULTIPATTERN_RE;
  418. }
  419. rspamd_multipattern_add_pattern_len(d->stop_words[cat].mp,
  420. word, wlen,
  421. mp_flags);
  422. nelt->stop_words++;
  423. /* Also lemmatise and store normalised */
  424. if (stem) {
  425. const char *nw = sb_stemmer_stem(stem, word, wlen);
  426. if (nw) {
  427. saved = nw;
  428. wlen = strlen(nw);
  429. }
  430. else {
  431. saved = word;
  432. }
  433. }
  434. else {
  435. saved = word;
  436. }
  437. if (saved) {
  438. int rc;
  439. rspamd_ftok_t *tok;
  440. char *dst;
  441. tok = rspamd_mempool_alloc(cfg->cfg_pool,
  442. sizeof(*tok) + wlen + 1);
  443. dst = ((char *) tok) + sizeof(*tok);
  444. rspamd_strlcpy(dst, saved, wlen + 1);
  445. tok->begin = dst;
  446. tok->len = wlen;
  447. kh_put(rspamd_stopwords_hash, d->stop_words_norm,
  448. tok, &rc);
  449. }
  450. }
  451. if (stem) {
  452. sb_stemmer_delete(stem);
  453. }
  454. stop = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp);
  455. struct rspamd_stop_word_range r;
  456. r.start = start;
  457. r.stop = stop;
  458. r.elt = nelt;
  459. g_array_append_val(d->stop_words[cat].ranges, r);
  460. it = NULL;
  461. }
  462. }
  463. nelt->category = cat;
  464. htb = d->trigrams[cat];
  465. GPtrArray *ngramms;
  466. unsigned int nsym;
  467. if (rspamd_language_search_str(nelt->name, tier1_langs,
  468. G_N_ELEMENTS(tier1_langs))) {
  469. nelt->flags |= RS_LANGUAGE_TIER1;
  470. }
  471. if (rspamd_language_search_str(nelt->name, tier0_langs,
  472. G_N_ELEMENTS(tier0_langs))) {
  473. nelt->flags |= RS_LANGUAGE_TIER0;
  474. }
  475. it = NULL;
  476. ngramms = g_ptr_array_sized_new(freqs->len);
  477. i = 0;
  478. skipped = 0;
  479. loaded = 0;
  480. while ((cur = ucl_object_iterate(freqs, &it, true)) != NULL) {
  481. const char *key;
  482. gsize keylen;
  483. unsigned int freq;
  484. key = ucl_object_keyl(cur, &keylen);
  485. freq = ucl_object_toint(cur);
  486. i++;
  487. delta = freq - mean;
  488. mean += delta / i;
  489. delta2 = freq - mean;
  490. m2 += delta * delta2;
  491. if (key != NULL) {
  492. UChar32 *cur_ucs;
  493. const char *end = key + keylen, *cur_utf = key;
  494. ucs_elt = rspamd_mempool_alloc(cfg->cfg_pool,
  495. sizeof(*ucs_elt) + (keylen + 1) * sizeof(UChar32));
  496. cur_ucs = ucs_elt->s;
  497. nsym = 0;
  498. uc_err = U_ZERO_ERROR;
  499. while (cur_utf < end) {
  500. *cur_ucs++ = ucnv_getNextUChar(d->uchar_converter, &cur_utf,
  501. end, &uc_err);
  502. if (!U_SUCCESS(uc_err)) {
  503. break;
  504. }
  505. nsym++;
  506. }
  507. if (!U_SUCCESS(uc_err)) {
  508. msg_warn_config("cannot convert key %*s to unicode: %s",
  509. (int) keylen, key, u_errorName(uc_err));
  510. continue;
  511. }
  512. ucs_elt->utf = key;
  513. rspamd_language_detector_ucs_lowercase(ucs_elt->s, nsym);
  514. if (nsym == 3) {
  515. g_ptr_array_add(ngramms, ucs_elt);
  516. }
  517. else {
  518. continue;
  519. }
  520. if (rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) {
  521. total_latin++;
  522. }
  523. ucs_elt->freq = freq;
  524. total_ngramms++;
  525. }
  526. }
  527. std = sqrt(m2 / (i - 1));
  528. if (total_latin >= total_ngramms / 3) {
  529. nelt->flags |= RS_LANGUAGE_LATIN;
  530. }
  531. nsym = 3;
  532. total = 0;
  533. PTR_ARRAY_FOREACH(ngramms, i, ucs_elt)
  534. {
  535. if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
  536. rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) {
  537. ucs_elt->freq = 0;
  538. /* Skip latin ngramm for non-latin language to avoid garbage */
  539. skipped++;
  540. continue;
  541. }
  542. /* Now, discriminate low frequency ngramms */
  543. total += ucs_elt->freq;
  544. loaded++;
  545. }
  546. g_ptr_array_sort(ngramms, rspamd_language_detector_cmp_ngramm);
  547. PTR_ARRAY_FOREACH(ngramms, i, ucs_elt)
  548. {
  549. if (ucs_elt->freq > 0) {
  550. rspamd_language_detector_init_ngramm(cfg, d,
  551. nelt, ucs_elt, nsym,
  552. ucs_elt->freq, total, htb);
  553. }
  554. }
  555. #ifdef EXTRA_LANGDET_DEBUG
  556. /* Useful for debug */
  557. for (i = 0; i < 10; i++) {
  558. ucs_elt = g_ptr_array_index(ngramms, i);
  559. msg_debug_lang_det_cfg("%s -> %s: %d", nelt->name,
  560. ucs_elt->utf, ucs_elt->freq);
  561. }
  562. #endif
  563. g_ptr_array_free(ngramms, TRUE);
  564. nelt->mean = mean;
  565. nelt->std = std;
  566. msg_debug_lang_det_cfg("loaded %s language, %d trigrams, "
  567. "%d ngramms loaded; "
  568. "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
  569. "(%s)",
  570. nelt->name,
  571. (int) nelt->trigrams_words,
  572. total,
  573. std, mean,
  574. skipped, loaded, nelt->stop_words,
  575. rspamd_language_detector_print_flags(nelt));
  576. int ret;
  577. khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret);
  578. g_assert(ret > 0); /* must be unique */
  579. kh_value(d->languages, k) = nelt;
  580. ucl_object_unref(top);
  581. }
  582. static gboolean
  583. rspamd_ucl_array_find_str(const char *str, const ucl_object_t *ar)
  584. {
  585. ucl_object_iter_t it = NULL;
  586. const ucl_object_t *cur;
  587. if (ar == NULL || ar->len == 0) {
  588. return FALSE;
  589. }
  590. while ((cur = ucl_object_iterate(ar, &it, true)) != NULL) {
  591. if (ucl_object_type(cur) == UCL_STRING && rspamd_strcase_equal(
  592. ucl_object_tostring(cur), str)) {
  593. return TRUE;
  594. }
  595. }
  596. return FALSE;
  597. }
  598. static void
  599. rspamd_language_detector_process_chain(struct rspamd_config *cfg,
  600. struct rspamd_ngramm_chain *chain)
  601. {
  602. struct rspamd_ngramm_elt *elt;
  603. unsigned int i;
  604. double delta, mean = 0, delta2, m2 = 0, std;
  605. if (chain->languages->len > 3) {
  606. PTR_ARRAY_FOREACH(chain->languages, i, elt)
  607. {
  608. delta = elt->prob - mean;
  609. mean += delta / (i + 1);
  610. delta2 = elt->prob - mean;
  611. m2 += delta * delta2;
  612. }
  613. std = sqrt(m2 / (i - 1));
  614. chain->mean = mean;
  615. chain->std = std;
  616. /* Now, filter elements that are lower than mean */
  617. PTR_ARRAY_FOREACH(chain->languages, i, elt)
  618. {
  619. if (elt->prob < mean) {
  620. g_ptr_array_remove_index_fast(chain->languages, i);
  621. #ifdef EXTRA_LANGDET_DEBUG
  622. msg_debug_lang_det_cfg("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f",
  623. elt->elt->name, chain->utf, elt->prob, mean, std);
  624. #endif
  625. }
  626. }
  627. }
  628. else {
  629. /* We have a unique ngramm, increase its weight */
  630. PTR_ARRAY_FOREACH(chain->languages, i, elt)
  631. {
  632. elt->prob *= 4.0;
  633. #ifdef EXTRA_LANGDET_DEBUG
  634. msg_debug_lang_det_cfg("increase weight of %s in %s; prob: %.4f",
  635. elt->elt->name, chain->utf, elt->prob);
  636. #endif
  637. }
  638. }
  639. }
  640. static void
  641. rspamd_language_detector_dtor(struct rspamd_lang_detector *d)
  642. {
  643. if (d) {
  644. for (unsigned int i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
  645. kh_destroy(rspamd_trigram_hash, d->trigrams[i]);
  646. rspamd_multipattern_destroy(d->stop_words[i].mp);
  647. g_array_free(d->stop_words[i].ranges, TRUE);
  648. }
  649. if (d->languages) {
  650. kh_destroy(rspamd_languages_hash, d->languages);
  651. }
  652. kh_destroy(rspamd_stopwords_hash, d->stop_words_norm);
  653. rspamd_lang_detection_fasttext_destroy(d->fasttext_detector);
  654. }
  655. }
  656. struct rspamd_lang_detector *
  657. rspamd_language_detector_init(struct rspamd_config *cfg)
  658. {
  659. const ucl_object_t *section, *elt, *languages_enable = NULL,
  660. *languages_disable = NULL;
  661. const char *languages_path = default_languages_path;
  662. glob_t gl;
  663. size_t i, short_text_limit = default_short_text_limit, total = 0;
  664. UErrorCode uc_err = U_ZERO_ERROR;
  665. GString *languages_pattern;
  666. struct rspamd_ngramm_chain *chain, schain;
  667. char *fname;
  668. struct rspamd_lang_detector *ret = NULL;
  669. struct ucl_parser *parser;
  670. ucl_object_t *stop_words;
  671. bool prefer_fasttext = true;
  672. section = ucl_object_lookup(cfg->cfg_ucl_obj, "lang_detection");
  673. if (section != NULL) {
  674. elt = ucl_object_lookup(section, "languages");
  675. if (elt) {
  676. languages_path = ucl_object_tostring(elt);
  677. }
  678. elt = ucl_object_lookup(section, "short_text_limit");
  679. if (elt) {
  680. short_text_limit = ucl_object_toint(elt);
  681. }
  682. languages_enable = ucl_object_lookup(section, "languages_enable");
  683. languages_disable = ucl_object_lookup(section, "languages_disable");
  684. elt = ucl_object_lookup(section, "prefer_fasttext");
  685. if (elt) {
  686. prefer_fasttext = ucl_object_toboolean(elt);
  687. }
  688. }
  689. languages_pattern = g_string_sized_new(PATH_MAX);
  690. rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path);
  691. parser = ucl_parser_new(UCL_PARSER_DEFAULT);
  692. if (ucl_parser_add_file(parser, languages_pattern->str)) {
  693. stop_words = ucl_parser_get_object(parser);
  694. }
  695. else {
  696. msg_err_config("cannot read stop words from %s: %s",
  697. languages_pattern->str,
  698. ucl_parser_get_error(parser));
  699. stop_words = NULL;
  700. }
  701. ucl_parser_free(parser);
  702. languages_pattern->len = 0;
  703. rspamd_printf_gstring(languages_pattern, "%s/*.json", languages_path);
  704. memset(&gl, 0, sizeof(gl));
  705. if (glob(languages_pattern->str, 0, NULL, &gl) != 0) {
  706. msg_err_config("cannot read any files matching %v", languages_pattern);
  707. goto end;
  708. }
  709. ret = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*ret));
  710. ret->languages = kh_init(rspamd_languages_hash);
  711. kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc);
  712. ret->uchar_converter = rspamd_get_utf8_converter();
  713. ret->short_text_limit = short_text_limit;
  714. ret->stop_words_norm = kh_init(rspamd_stopwords_hash);
  715. ret->prefer_fasttext = prefer_fasttext;
  716. /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
  717. for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
  718. ret->trigrams[i] = kh_init(rspamd_trigram_hash);
  719. #ifdef WITH_HYPERSCAN
  720. ret->stop_words[i].mp = rspamd_multipattern_create(
  721. RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 |
  722. RSPAMD_MULTIPATTERN_RE);
  723. #else
  724. ret->stop_words[i].mp = rspamd_multipattern_create(
  725. RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8);
  726. #endif
  727. ret->stop_words[i].ranges = g_array_new(FALSE, FALSE,
  728. sizeof(struct rspamd_stop_word_range));
  729. }
  730. g_assert(uc_err == U_ZERO_ERROR);
  731. for (i = 0; i < gl.gl_pathc; i++) {
  732. fname = g_path_get_basename(gl.gl_pathv[i]);
  733. if (!rspamd_ucl_array_find_str(fname, languages_disable) ||
  734. (languages_enable == NULL ||
  735. rspamd_ucl_array_find_str(fname, languages_enable))) {
  736. rspamd_language_detector_read_file(cfg, ret, gl.gl_pathv[i],
  737. stop_words);
  738. }
  739. else {
  740. msg_info_config("skip language file %s: disabled", fname);
  741. }
  742. g_free(fname);
  743. }
  744. for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
  745. GError *err = NULL;
  746. kh_foreach_value(ret->trigrams[i], schain, {
  747. chain = &schain;
  748. rspamd_language_detector_process_chain(cfg, chain);
  749. });
  750. if (!rspamd_multipattern_compile(ret->stop_words[i].mp, 0, &err)) {
  751. msg_err_config("cannot compile stop words for %z language group: %e",
  752. i, err);
  753. g_error_free(err);
  754. }
  755. total += kh_size(ret->trigrams[i]);
  756. }
  757. ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg);
  758. char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector);
  759. msg_info_config("loaded %d languages, "
  760. "%d trigrams; %s",
  761. (int) kh_size(ret->languages),
  762. (int) total, fasttext_status);
  763. g_free(fasttext_status);
  764. if (stop_words) {
  765. ucl_object_unref(stop_words);
  766. }
  767. REF_INIT_RETAIN(ret, rspamd_language_detector_dtor);
  768. rspamd_mempool_add_destructor(cfg->cfg_pool,
  769. (rspamd_mempool_destruct_t) rspamd_language_detector_unref,
  770. ret);
  771. end:
  772. if (gl.gl_pathc > 0) {
  773. globfree(&gl);
  774. }
  775. g_string_free(languages_pattern, TRUE);
  776. return ret;
  777. }
  778. static void
  779. rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
  780. goffset *offsets_out,
  781. uint64_t *seed)
  782. {
  783. unsigned int step_len, remainder, i, out_idx;
  784. uint64_t coin, sel;
  785. rspamd_stat_token_t *tok;
  786. g_assert(nwords != 0);
  787. g_assert(offsets_out != NULL);
  788. g_assert(ucs_tokens->len >= nwords);
  789. /*
  790. * We split input array into `nwords` parts. For each part we randomly select
  791. * an element from this particular split. Here is an example:
  792. *
  793. * nwords=2, input_len=5
  794. *
  795. * w1 w2 w3 w4 w5
  796. * ^ ^
  797. * part1 part2
  798. * vv vv
  799. * w2 w5
  800. *
  801. * So we have 2 output words from 5 input words selected randomly within
  802. * their splits. It is not uniform distribution but it seems to be better
  803. * to include words from different text parts
  804. */
  805. step_len = ucs_tokens->len / nwords;
  806. remainder = ucs_tokens->len % nwords;
  807. out_idx = 0;
  808. coin = rspamd_random_uint64_fast_seed(seed);
  809. sel = coin % (step_len + remainder);
  810. offsets_out[out_idx] = sel;
  811. for (i = step_len + remainder; i < ucs_tokens->len;
  812. i += step_len, out_idx++) {
  813. unsigned int ntries = 0;
  814. coin = rspamd_random_uint64_fast_seed(seed);
  815. sel = (coin % step_len) + i;
  816. for (;;) {
  817. tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel);
  818. /* Filter bad tokens */
  819. if (tok->unicode.len >= 2 &&
  820. !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) &&
  821. u_isalpha(tok->unicode.begin[0]) &&
  822. u_isalpha(tok->unicode.begin[tok->unicode.len - 1])) {
  823. offsets_out[out_idx] = sel;
  824. break;
  825. }
  826. else {
  827. ntries++;
  828. coin = rspamd_random_uint64_fast_seed(seed);
  829. if (ntries < step_len) {
  830. sel = (coin % step_len) + i;
  831. }
  832. else if (ntries < ucs_tokens->len) {
  833. sel = coin % ucs_tokens->len;
  834. }
  835. else {
  836. offsets_out[out_idx] = sel;
  837. break;
  838. }
  839. }
  840. }
  841. }
  842. /*
  843. * Fisher-Yates algorithm:
  844. * for i from 0 to n−2 do
  845. * j ← random integer such that i ≤ j < n
  846. * exchange a[i] and a[j]
  847. */
  848. #if 0
  849. if (out_idx > 2) {
  850. for (i = 0; i < out_idx - 2; i++) {
  851. coin = rspamd_random_uint64_fast ();
  852. sel = (coin % (out_idx - i)) + i;
  853. /* swap */
  854. tmp = offsets_out[i];
  855. offsets_out[i] = offsets_out[sel];
  856. offsets_out[sel] = tmp;
  857. }
  858. }
  859. #endif
  860. }
  861. static goffset
  862. rspamd_language_detector_next_ngramm(rspamd_stat_token_t *tok, UChar32 *window,
  863. unsigned int wlen, goffset cur_off)
  864. {
  865. unsigned int i;
  866. if (wlen > 1) {
  867. /* Deal with spaces at the beginning and ending */
  868. if (cur_off == 0) {
  869. window[0] = (UChar32) ' ';
  870. for (i = 0; i < wlen - 1; i++) {
  871. window[i + 1] = tok->unicode.begin[i];
  872. }
  873. }
  874. else if (cur_off + wlen == tok->unicode.len + 1) {
  875. /* Add trailing space */
  876. for (i = 0; i < wlen - 1; i++) {
  877. window[i] = tok->unicode.begin[cur_off + i];
  878. }
  879. window[wlen - 1] = (UChar32) ' ';
  880. }
  881. else if (cur_off + wlen > tok->unicode.len + 1) {
  882. /* No more fun */
  883. return -1;
  884. }
  885. else {
  886. /* Normal case */
  887. for (i = 0; i < wlen; i++) {
  888. window[i] = tok->unicode.begin[cur_off + i];
  889. }
  890. }
  891. }
  892. else {
  893. if (tok->normalized.len <= cur_off) {
  894. return -1;
  895. }
  896. window[0] = tok->unicode.begin[cur_off];
  897. }
  898. return cur_off + 1;
  899. }
  900. /*
  901. * Do full guess for a specific ngramm, checking all languages defined
  902. */
  903. static void
  904. rspamd_language_detector_process_ngramm_full(struct rspamd_task *task,
  905. struct rspamd_lang_detector *d,
  906. UChar32 *window,
  907. khash_t(rspamd_candidates_hash) * candidates,
  908. khash_t(rspamd_trigram_hash) * trigrams)
  909. {
  910. unsigned int i;
  911. int ret;
  912. struct rspamd_ngramm_chain *chain = NULL;
  913. struct rspamd_ngramm_elt *elt;
  914. struct rspamd_lang_detector_res *cand;
  915. khiter_t k;
  916. double prob;
  917. k = kh_get(rspamd_trigram_hash, trigrams, window);
  918. if (k != kh_end(trigrams)) {
  919. chain = &kh_value(trigrams, k);
  920. }
  921. if (chain) {
  922. PTR_ARRAY_FOREACH(chain->languages, i, elt)
  923. {
  924. prob = elt->prob;
  925. if (prob < chain->mean) {
  926. continue;
  927. }
  928. k = kh_get(rspamd_candidates_hash, candidates, elt->elt->name);
  929. if (k != kh_end(candidates)) {
  930. cand = kh_value(candidates, k);
  931. }
  932. else {
  933. cand = NULL;
  934. }
  935. #ifdef NGRAMMS_DEBUG
  936. msg_err("gramm: %s, lang: %s, prob: %.3f", chain->utf,
  937. elt->elt->name, log2(elt->prob));
  938. #endif
  939. if (cand == NULL) {
  940. cand = rspamd_mempool_alloc(task->task_pool, sizeof(*cand));
  941. cand->elt = elt->elt;
  942. cand->lang = elt->elt->name;
  943. cand->prob = prob;
  944. k = kh_put(rspamd_candidates_hash, candidates, elt->elt->name,
  945. &ret);
  946. kh_value(candidates, k) = cand;
  947. }
  948. else {
  949. /* Update guess */
  950. cand->prob += prob;
  951. }
  952. }
  953. }
  954. }
  955. static void
  956. rspamd_language_detector_detect_word(struct rspamd_task *task,
  957. struct rspamd_lang_detector *d,
  958. rspamd_stat_token_t *tok,
  959. khash_t(rspamd_candidates_hash) * candidates,
  960. khash_t(rspamd_trigram_hash) * trigrams)
  961. {
  962. const unsigned int wlen = 3;
  963. UChar32 window[3];
  964. goffset cur = 0;
  965. /* Split words */
  966. while ((cur = rspamd_language_detector_next_ngramm(tok, window, wlen, cur)) != -1) {
  967. rspamd_language_detector_process_ngramm_full(task,
  968. d, window, candidates, trigrams);
  969. }
  970. }
  971. static const double cutoff_limit = -8.0;
  972. /*
  973. * Converts frequencies to log probabilities, filter those candidates who
  974. * has the lowest probabilities
  975. */
  976. static inline void
  977. rspamd_language_detector_filter_step1(struct rspamd_task *task,
  978. struct rspamd_lang_detector_res *cand,
  979. double *max_prob, unsigned int *filtered)
  980. {
  981. if (!isnan(cand->prob)) {
  982. if (cand->prob == 0) {
  983. cand->prob = NAN;
  984. msg_debug_lang_det(
  985. "exclude language %s",
  986. cand->lang);
  987. (*filtered)++;
  988. }
  989. else {
  990. cand->prob = log2(cand->prob);
  991. if (cand->prob < cutoff_limit) {
  992. msg_debug_lang_det(
  993. "exclude language %s: %.3f, cutoff limit: %.3f",
  994. cand->lang, cand->prob, cutoff_limit);
  995. cand->prob = NAN;
  996. (*filtered)++;
  997. }
  998. else if (cand->prob > *max_prob) {
  999. *max_prob = cand->prob;
  1000. }
  1001. }
  1002. }
  1003. }
  1004. static inline void
  1005. rspamd_language_detector_filter_step2(struct rspamd_task *task,
  1006. struct rspamd_lang_detector_res *cand,
  1007. double max_prob, unsigned int *filtered)
  1008. {
  1009. /*
  1010. * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
  1011. * prob2 is 2^4 less than prob1
  1012. */
  1013. if (!isnan(cand->prob) && max_prob - cand->prob > 1) {
  1014. msg_debug_lang_det("exclude language %s: %.3f (%.3f max)",
  1015. cand->lang, cand->prob, max_prob);
  1016. cand->prob = NAN;
  1017. (*filtered)++;
  1018. }
  1019. }
  1020. static void
  1021. rspamd_language_detector_filter_negligible(struct rspamd_task *task,
  1022. khash_t(rspamd_candidates_hash) * candidates)
  1023. {
  1024. struct rspamd_lang_detector_res *cand;
  1025. unsigned int filtered = 0;
  1026. double max_prob = -(G_MAXDOUBLE);
  1027. kh_foreach_value(candidates, cand,
  1028. rspamd_language_detector_filter_step1(task, cand, &max_prob, &filtered));
  1029. kh_foreach_value(candidates, cand,
  1030. rspamd_language_detector_filter_step2(task, cand, max_prob, &filtered));
  1031. msg_debug_lang_det("removed %d languages", filtered);
  1032. }
  1033. static void
  1034. rspamd_language_detector_detect_type(struct rspamd_task *task,
  1035. unsigned int nwords,
  1036. struct rspamd_lang_detector *d,
  1037. GArray *words,
  1038. enum rspamd_language_category cat,
  1039. khash_t(rspamd_candidates_hash) * candidates,
  1040. struct rspamd_mime_text_part *part)
  1041. {
  1042. unsigned int nparts = MIN(words->len, nwords);
  1043. goffset *selected_words;
  1044. rspamd_stat_token_t *tok;
  1045. unsigned int i;
  1046. uint64_t seed;
  1047. /* Seed PRNG with part digest to provide some sort of determinism */
  1048. memcpy(&seed, part->mime_part->digest, sizeof(seed));
  1049. selected_words = g_new0(goffset, nparts);
  1050. rspamd_language_detector_random_select(words, nparts, selected_words, &seed);
  1051. msg_debug_lang_det("randomly selected %d words", nparts);
  1052. for (i = 0; i < nparts; i++) {
  1053. tok = &g_array_index(words, rspamd_stat_token_t,
  1054. selected_words[i]);
  1055. if (tok->unicode.len >= 3) {
  1056. rspamd_language_detector_detect_word(task, d, tok, candidates,
  1057. d->trigrams[cat]);
  1058. }
  1059. }
  1060. /* Filter negligible candidates */
  1061. rspamd_language_detector_filter_negligible(task, candidates);
  1062. g_free(selected_words);
  1063. }
  1064. static int
  1065. rspamd_language_detector_cmp(gconstpointer a, gconstpointer b)
  1066. {
  1067. const struct rspamd_lang_detector_res
  1068. *canda = *(const struct rspamd_lang_detector_res **) a,
  1069. *candb = *(const struct rspamd_lang_detector_res **) b;
  1070. if (canda->prob > candb->prob) {
  1071. return -1;
  1072. }
  1073. else if (candb->prob > canda->prob) {
  1074. return 1;
  1075. }
  1076. return 0;
  1077. }
  1078. enum rspamd_language_detected_type {
  1079. rs_detect_none = 0,
  1080. rs_detect_single,
  1081. rs_detect_multiple,
  1082. };
  1083. static enum rspamd_language_detected_type
  1084. rspamd_language_detector_try_ngramm(struct rspamd_task *task,
  1085. unsigned int nwords,
  1086. struct rspamd_lang_detector *d,
  1087. GArray *ucs_tokens,
  1088. enum rspamd_language_category cat,
  1089. khash_t(rspamd_candidates_hash) * candidates,
  1090. struct rspamd_mime_text_part *part)
  1091. {
  1092. unsigned int cand_len = 0;
  1093. struct rspamd_lang_detector_res *cand;
  1094. rspamd_language_detector_detect_type(task,
  1095. nwords,
  1096. d,
  1097. ucs_tokens,
  1098. cat,
  1099. candidates,
  1100. part);
  1101. kh_foreach_value(candidates, cand, {
  1102. if (!isnan(cand->prob)) {
  1103. cand_len++;
  1104. }
  1105. });
  1106. if (cand_len == 0) {
  1107. return rs_detect_none;
  1108. }
  1109. else if (cand_len == 1) {
  1110. return rs_detect_single;
  1111. }
  1112. return rs_detect_multiple;
  1113. }
  1114. enum rspamd_language_sort_flags {
  1115. RSPAMD_LANG_FLAG_DEFAULT = 0,
  1116. RSPAMD_LANG_FLAG_SHORT = 1 << 0,
  1117. };
/* User data passed to rspamd_language_detector_cmp_heuristic() */
struct rspamd_frequency_sort_cbdata {
	/* Detector; its total_occurrences normalises per-language occurrence counts */
	struct rspamd_lang_detector *d;
	/* RSPAMD_LANG_FLAG_* bits selecting the size of the tier bonuses */
	enum rspamd_language_sort_flags flags;
	/* Scale factor applied to every frequency and tier adjustment */
	double std;
	/* Not read by the heuristic comparator itself */
	double mean;
};
/*
 * Weights used by rspamd_language_detector_cmp_heuristic(); each is
 * multiplied by the `std` value from the comparator's user data before
 * being added to a candidate's score.
 */
/* Bonus for languages flagged RS_LANGUAGE_TIER0 (x16 with RSPAMD_LANG_FLAG_SHORT) */
static const double tier0_adjustment = 1.2;
/* Bonus for languages flagged RS_LANGUAGE_TIER1 (x2 with RSPAMD_LANG_FLAG_SHORT) */
static const double tier1_adjustment = 0.8;
/* Weight of the relative occurrence frequency of a language */
static const double frequency_adjustment = 0.8;
  1127. static int
  1128. rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b,
  1129. gpointer ud)
  1130. {
  1131. struct rspamd_frequency_sort_cbdata *cbd = ud;
  1132. struct rspamd_lang_detector_res
  1133. *canda = *(struct rspamd_lang_detector_res **) a,
  1134. *candb = *(struct rspamd_lang_detector_res **) b;
  1135. double adj;
  1136. double proba_adjusted, probb_adjusted, freqa, freqb;
  1137. if (cbd->d->total_occurrences == 0) {
  1138. /* Not enough data, compare directly */
  1139. return rspamd_language_detector_cmp(a, b);
  1140. }
  1141. freqa = ((double) canda->elt->occurrences) /
  1142. (double) cbd->d->total_occurrences;
  1143. freqb = ((double) candb->elt->occurrences) /
  1144. (double) cbd->d->total_occurrences;
  1145. proba_adjusted = canda->prob;
  1146. probb_adjusted = candb->prob;
  1147. if (isnormal(freqa) && isnormal(freqb)) {
  1148. proba_adjusted += cbd->std * (frequency_adjustment * freqa);
  1149. probb_adjusted += cbd->std * (frequency_adjustment * freqb);
  1150. }
  1151. if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
  1152. adj = tier1_adjustment * 2.0;
  1153. }
  1154. else {
  1155. adj = tier1_adjustment;
  1156. }
  1157. if (canda->elt->flags & RS_LANGUAGE_TIER1) {
  1158. proba_adjusted += cbd->std * adj;
  1159. }
  1160. if (candb->elt->flags & RS_LANGUAGE_TIER1) {
  1161. probb_adjusted += cbd->std * adj;
  1162. }
  1163. if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
  1164. adj = tier0_adjustment * 16.0;
  1165. }
  1166. else {
  1167. adj = tier0_adjustment;
  1168. }
  1169. if (canda->elt->flags & RS_LANGUAGE_TIER0) {
  1170. proba_adjusted += cbd->std * adj;
  1171. }
  1172. if (candb->elt->flags & RS_LANGUAGE_TIER0) {
  1173. probb_adjusted += cbd->std * adj;
  1174. }
  1175. /* Hack: adjust probability directly */
  1176. canda->prob = proba_adjusted;
  1177. candb->prob = probb_adjusted;
  1178. if (proba_adjusted > probb_adjusted) {
  1179. return -1;
  1180. }
  1181. else if (probb_adjusted > proba_adjusted) {
  1182. return 1;
  1183. }
  1184. return 0;
  1185. }
  1186. static void
  1187. rspamd_language_detector_unicode_scripts(struct rspamd_task *task,
  1188. struct rspamd_mime_text_part *part,
  1189. unsigned int *pchinese,
  1190. unsigned int *pspecial)
  1191. {
  1192. const char *p = part->utf_stripped_content->data, *end;
  1193. unsigned int i = 0, cnt = 0;
  1194. end = p + part->utf_stripped_content->len;
  1195. int32_t uc, sc;
  1196. unsigned int nlatin = 0, nchinese = 0, nspecial = 0;
  1197. const unsigned int cutoff_limit = 32;
  1198. while (p + i < end) {
  1199. U8_NEXT(p, i, part->utf_stripped_content->len, uc);
  1200. if (((int32_t) uc) < 0) {
  1201. break;
  1202. }
  1203. if (u_isalpha(uc)) {
  1204. sc = ublock_getCode(uc);
  1205. cnt++;
  1206. switch (sc) {
  1207. case UBLOCK_BASIC_LATIN:
  1208. case UBLOCK_LATIN_1_SUPPLEMENT:
  1209. part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
  1210. nlatin++;
  1211. break;
  1212. case UBLOCK_HEBREW:
  1213. part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
  1214. nspecial++;
  1215. break;
  1216. case UBLOCK_GREEK:
  1217. part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
  1218. nspecial++;
  1219. break;
  1220. case UBLOCK_CYRILLIC:
  1221. part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
  1222. nspecial++;
  1223. break;
  1224. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
  1225. case UBLOCK_CJK_COMPATIBILITY:
  1226. case UBLOCK_CJK_RADICALS_SUPPLEMENT:
  1227. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
  1228. case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
  1229. part->unicode_scripts |= RSPAMD_UNICODE_CJK;
  1230. nchinese++;
  1231. break;
  1232. case UBLOCK_HIRAGANA:
  1233. case UBLOCK_KATAKANA:
  1234. part->unicode_scripts |= RSPAMD_UNICODE_JP;
  1235. nspecial++;
  1236. break;
  1237. case UBLOCK_HANGUL_JAMO:
  1238. case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
  1239. part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
  1240. nspecial++;
  1241. break;
  1242. case UBLOCK_ARABIC:
  1243. part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
  1244. nspecial++;
  1245. break;
  1246. case UBLOCK_DEVANAGARI:
  1247. part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
  1248. nspecial++;
  1249. break;
  1250. case UBLOCK_ARMENIAN:
  1251. part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
  1252. nspecial++;
  1253. break;
  1254. case UBLOCK_GEORGIAN:
  1255. part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
  1256. nspecial++;
  1257. break;
  1258. case UBLOCK_GUJARATI:
  1259. part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
  1260. nspecial++;
  1261. break;
  1262. case UBLOCK_TELUGU:
  1263. part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
  1264. nspecial++;
  1265. break;
  1266. case UBLOCK_TAMIL:
  1267. part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
  1268. nspecial++;
  1269. break;
  1270. case UBLOCK_THAI:
  1271. part->unicode_scripts |= RSPAMD_UNICODE_THAI;
  1272. nspecial++;
  1273. break;
  1274. case RSPAMD_UNICODE_MALAYALAM:
  1275. part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
  1276. nspecial++;
  1277. break;
  1278. case RSPAMD_UNICODE_SINHALA:
  1279. part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
  1280. nspecial++;
  1281. break;
  1282. }
  1283. }
  1284. if (nspecial > cutoff_limit && nspecial > nlatin) {
  1285. break;
  1286. }
  1287. else if (nchinese > cutoff_limit && nchinese > nlatin) {
  1288. if (nspecial > 0) {
  1289. /* Likely japanese */
  1290. break;
  1291. }
  1292. }
  1293. }
  1294. msg_debug_lang_det("stop after checking %d characters, "
  1295. "%d latin, %d special, %d chinese",
  1296. cnt, nlatin, nspecial, nchinese);
  1297. *pchinese = nchinese;
  1298. *pspecial = nspecial;
  1299. }
  1300. static inline void
  1301. rspamd_language_detector_set_language(struct rspamd_task *task,
  1302. struct rspamd_mime_text_part *part,
  1303. const char *code,
  1304. struct rspamd_language_elt *elt)
  1305. {
  1306. struct rspamd_lang_detector_res *r;
  1307. r = rspamd_mempool_alloc0(task->task_pool, sizeof(*r));
  1308. r->prob = 1.0;
  1309. r->lang = code;
  1310. r->elt = elt;
  1311. if (part->languages == NULL) {
  1312. part->languages = g_ptr_array_sized_new(1);
  1313. }
  1314. g_ptr_array_add(part->languages, r);
  1315. part->language = code;
  1316. }
  1317. static gboolean
  1318. rspamd_language_detector_try_uniscript(struct rspamd_task *task,
  1319. struct rspamd_mime_text_part *part,
  1320. unsigned int nchinese,
  1321. unsigned int nspecial)
  1322. {
  1323. unsigned int i;
  1324. for (i = 0; i < G_N_ELEMENTS(unicode_langs); i++) {
  1325. if (unicode_langs[i].unicode_code & part->unicode_scripts) {
  1326. if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) {
  1327. msg_debug_lang_det("set language based on unicode script %s",
  1328. unicode_langs[i].lang);
  1329. rspamd_language_detector_set_language(task, part,
  1330. unicode_langs[i].lang, NULL);
  1331. return TRUE;
  1332. }
  1333. else {
  1334. /* Japanese <-> Chinese guess */
  1335. /*
  1336. * Typically there might be around 0-70% of kanji glyphs
  1337. * and the rest are Haragana/Katakana
  1338. *
  1339. * If we discover that Kanji is more than 80% then we consider
  1340. * it Chinese
  1341. */
  1342. if (nchinese <= 5 || nchinese < nspecial * 5) {
  1343. msg_debug_lang_det("set language based on unicode script %s",
  1344. unicode_langs[i].lang);
  1345. rspamd_language_detector_set_language(task, part,
  1346. unicode_langs[i].lang, NULL);
  1347. return TRUE;
  1348. }
  1349. }
  1350. }
  1351. }
  1352. if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
  1353. msg_debug_lang_det("guess chinese based on CJK characters: %d chinese, %d special",
  1354. nchinese, nspecial);
  1355. rspamd_language_detector_set_language(task, part,
  1356. "zh-CN", NULL);
  1357. return TRUE;
  1358. }
  1359. return FALSE;
  1360. }
  1361. static unsigned int
  1362. rspamd_langelt_hash_func(gconstpointer key)
  1363. {
  1364. const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *) key;
  1365. return rspamd_cryptobox_fast_hash(elt->name, strlen(elt->name),
  1366. rspamd_hash_seed());
  1367. }
  1368. static gboolean
  1369. rspamd_langelt_equal_func(gconstpointer v, gconstpointer v2)
  1370. {
  1371. const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *) v,
  1372. *elt2 = (const struct rspamd_language_elt *) v2;
  1373. return strcmp(elt1->name, elt2->name) == 0;
  1374. }
/*
 * Set of stop word pattern indices already matched for one language; used
 * to avoid counting the same stop word twice.
 */
KHASH_INIT(rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal);
/*
 * Map from a language element (hashed and compared by name) to its set of
 * matched stop word indices.
 */
KHASH_INIT(rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1,
		   rspamd_langelt_hash_func, rspamd_langelt_equal_func);
/* Context passed to rspamd_language_detector_sw_cb() for one stop words scan */
struct rspamd_sw_cbdata {
	/* Task being processed; presumably needed by the callback's debug logging */
	struct rspamd_task *task;
	/* Per-language sets of distinct stop words matched so far */
	khash_t(rspamd_sw_hash) * res;
	/* Array of struct rspamd_stop_word_range, binary-searched by pattern index */
	GArray *ranges;
};
  1384. static int
  1385. rspamd_ranges_cmp(const void *k, const void *memb)
  1386. {
  1387. int pos = GPOINTER_TO_INT(k);
  1388. const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *) memb;
  1389. if (pos >= r->start && pos < r->stop) {
  1390. return 0;
  1391. }
  1392. else if (pos < r->start) {
  1393. return -1;
  1394. }
  1395. return 1;
  1396. }
  1397. static int
  1398. rspamd_language_detector_sw_cb(struct rspamd_multipattern *mp,
  1399. unsigned int strnum,
  1400. int match_start,
  1401. int match_pos,
  1402. const char *text,
  1403. gsize len,
  1404. void *context)
  1405. {
  1406. /* Check if boundary */
  1407. const char *prev = text, *next = text + len;
  1408. struct rspamd_stop_word_range *r;
  1409. struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *) context;
  1410. khiter_t k;
  1411. static const gsize max_stop_words = 80;
  1412. struct rspamd_task *task;
  1413. if (match_start > 0) {
  1414. prev = text + match_start - 1;
  1415. if (!(g_ascii_isspace(*prev) || g_ascii_ispunct(*prev))) {
  1416. return 0;
  1417. }
  1418. }
  1419. if (match_pos < len) {
  1420. next = text + match_pos;
  1421. if (!(g_ascii_isspace(*next) || g_ascii_ispunct(*next))) {
  1422. return 0;
  1423. }
  1424. }
  1425. /* We have a word on the boundary, check range */
  1426. task = cbdata->task;
  1427. r = bsearch(GINT_TO_POINTER(strnum), cbdata->ranges->data,
  1428. cbdata->ranges->len, sizeof(*r), rspamd_ranges_cmp);
  1429. g_assert(r != NULL);
  1430. k = kh_get(rspamd_sw_hash, cbdata->res, r->elt);
  1431. int nwords = 1;
  1432. if (k != kh_end(cbdata->res)) {
  1433. khiter_t set_k;
  1434. int tt;
  1435. set_k = kh_get(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum);
  1436. nwords = kh_size(kh_value(cbdata->res, k));
  1437. if (set_k == kh_end(kh_value(cbdata->res, k))) {
  1438. /* New word */
  1439. set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
  1440. msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)",
  1441. (int) (next - prev - 1), prev + 1, r->elt->name, nwords);
  1442. }
  1443. if (nwords > max_stop_words) {
  1444. return 1;
  1445. }
  1446. }
  1447. else {
  1448. int tt;
  1449. k = kh_put(rspamd_sw_hash, cbdata->res, r->elt, &tt);
  1450. kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set);
  1451. kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
  1452. msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)",
  1453. (int) (next - prev - 1), prev + 1, r->elt->name, nwords);
  1454. }
  1455. return 0;
  1456. }
  1457. static gboolean
  1458. rspamd_language_detector_try_stop_words(struct rspamd_task *task,
  1459. struct rspamd_lang_detector *d,
  1460. struct rspamd_mime_text_part *part,
  1461. enum rspamd_language_category cat)
  1462. {
  1463. struct rspamd_stop_word_elt *elt;
  1464. struct rspamd_sw_cbdata cbdata;
  1465. gboolean ret = FALSE;
  1466. static const int stop_words_threshold = 4, /* minimum stop words count */
  1467. strong_confidence_threshold = 10 /* we are sure that this is enough */;
  1468. elt = &d->stop_words[cat];
  1469. cbdata.res = kh_init(rspamd_sw_hash);
  1470. cbdata.ranges = elt->ranges;
  1471. cbdata.task = task;
  1472. rspamd_multipattern_lookup(elt->mp, part->utf_stripped_content->data,
  1473. part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
  1474. &cbdata, NULL);
  1475. if (kh_size(cbdata.res) > 0) {
  1476. khash_t(rspamd_sw_res_set) * cur_res;
  1477. double max_rate = G_MINDOUBLE;
  1478. struct rspamd_language_elt *cur_lang, *sel = NULL;
  1479. gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
  1480. again:
  1481. kh_foreach(cbdata.res, cur_lang, cur_res, {
  1482. int cur_matches = kh_size(cur_res);
  1483. if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
  1484. /* Restart matches */
  1485. ignore_ascii = TRUE;
  1486. sel = NULL;
  1487. max_rate = G_MINDOUBLE;
  1488. msg_debug_lang_det("ignore ascii after finding %d stop words from %s",
  1489. cur_matches, cur_lang->name);
  1490. goto again;
  1491. }
  1492. if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) {
  1493. /* Restart matches */
  1494. ignore_latin = TRUE;
  1495. sel = NULL;
  1496. max_rate = G_MINDOUBLE;
  1497. msg_debug_lang_det("ignore latin after finding stop %d words from %s",
  1498. cur_matches, cur_lang->name);
  1499. goto again;
  1500. }
  1501. if (cur_matches < stop_words_threshold) {
  1502. continue;
  1503. }
  1504. if (cur_matches < strong_confidence_threshold) {
  1505. /* Ignore mixed languages when not enough confidence */
  1506. if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) {
  1507. continue;
  1508. }
  1509. if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) {
  1510. continue;
  1511. }
  1512. }
  1513. double rate = (double) cur_matches / (double) cur_lang->stop_words;
  1514. if (rate > max_rate) {
  1515. max_rate = rate;
  1516. sel = cur_lang;
  1517. }
  1518. msg_debug_lang_det("found %d stop words from %s: %3f rate",
  1519. cur_matches, cur_lang->name, rate);
  1520. });
  1521. /* Cleanup */
  1522. kh_foreach(cbdata.res, cur_lang, cur_res, {
  1523. kh_destroy(rspamd_sw_res_set, cur_res);
  1524. });
  1525. if (max_rate > 0 && sel) {
  1526. msg_debug_lang_det("set language based on stop words script %s, %.3f found",
  1527. sel->name, max_rate);
  1528. rspamd_language_detector_set_language(task, part,
  1529. sel->name, sel);
  1530. ret = TRUE;
  1531. }
  1532. }
  1533. else {
  1534. msg_debug_lang_det("found no stop words in a text");
  1535. }
  1536. kh_destroy(rspamd_sw_hash, cbdata.res);
  1537. return ret;
  1538. }
  1539. gboolean
  1540. rspamd_language_detector_detect(struct rspamd_task *task,
  1541. struct rspamd_lang_detector *d,
  1542. struct rspamd_mime_text_part *part)
  1543. {
  1544. khash_t(rspamd_candidates_hash) * candidates;
  1545. GPtrArray *result;
  1546. double mean, std, start_ticks, end_ticks;
  1547. unsigned int cand_len;
  1548. enum rspamd_language_category cat;
  1549. struct rspamd_lang_detector_res *cand;
  1550. enum rspamd_language_detected_type r;
  1551. struct rspamd_frequency_sort_cbdata cbd;
  1552. /* Check if we have sorted candidates based on frequency */
  1553. gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
  1554. if (!part->utf_stripped_content) {
  1555. return FALSE;
  1556. }
  1557. start_ticks = rspamd_get_ticks(TRUE);
  1558. unsigned int nchinese = 0, nspecial = 0;
  1559. rspamd_language_detector_unicode_scripts(task, part, &nchinese, &nspecial);
  1560. /* Disable internal language detection heuristics if we have fasttext */
  1561. if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) {
  1562. /* Apply unicode scripts heuristic */
  1563. if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
  1564. ret = TRUE;
  1565. }
  1566. cat = rspamd_language_detector_get_category(part->unicode_scripts);
  1567. if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
  1568. ret = TRUE;
  1569. }
  1570. }
  1571. if (!ret) {
  1572. unsigned ndetected = 0;
  1573. if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
  1574. rspamd_fasttext_predict_result_t fasttext_predict_result =
  1575. rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task,
  1576. part->utf_words, 4);
  1577. ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
  1578. if (ndetected > 0) {
  1579. candidates = kh_init(rspamd_candidates_hash);
  1580. kh_resize(rspamd_candidates_hash, candidates, ndetected);
  1581. /* Now fill all results where probability is above threshold */
  1582. float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0);
  1583. for (unsigned int i = 0; i < ndetected; i++) {
  1584. float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
  1585. if (prob > max_prob * 0.75) {
  1586. char *lang = rspamd_mempool_strdup(task->task_pool,
  1587. rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i));
  1588. int tmp;
  1589. khiter_t k = kh_put(rspamd_candidates_hash, candidates, lang, &tmp);
  1590. kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand));
  1591. cand = kh_value(candidates, k);
  1592. cand->lang = lang;
  1593. cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
  1594. /* Find the corresponding language elt */
  1595. k = kh_get(rspamd_languages_hash, d->languages, lang);
  1596. if (k != kh_end(d->languages)) {
  1597. cand->elt = kh_value(d->languages, k);
  1598. }
  1599. }
  1600. }
  1601. if (kh_size(candidates) == 1) {
  1602. r = rs_detect_single;
  1603. }
  1604. else if (kh_size(candidates) > 1) {
  1605. r = rs_detect_multiple;
  1606. }
  1607. else {
  1608. r = rs_detect_none;
  1609. }
  1610. }
  1611. rspamd_fasttext_predict_result_destroy(fasttext_predict_result);
  1612. }
  1613. if (ndetected == 0) {
  1614. if (part->utf_words->len < default_short_text_limit) {
  1615. r = rs_detect_none;
  1616. msg_debug_lang_det("text is too short for trigrams detection: "
  1617. "%d words; at least %d words required",
  1618. (int) part->utf_words->len,
  1619. (int) default_short_text_limit);
  1620. switch (cat) {
  1621. case RSPAMD_LANGUAGE_CYRILLIC:
  1622. rspamd_language_detector_set_language(task, part, "ru", NULL);
  1623. break;
  1624. case RSPAMD_LANGUAGE_DEVANAGARI:
  1625. rspamd_language_detector_set_language(task, part, "hi", NULL);
  1626. break;
  1627. case RSPAMD_LANGUAGE_ARAB:
  1628. rspamd_language_detector_set_language(task, part, "ar", NULL);
  1629. break;
  1630. default:
  1631. case RSPAMD_LANGUAGE_LATIN:
  1632. rspamd_language_detector_set_language(task, part, "en", NULL);
  1633. break;
  1634. }
  1635. msg_debug_lang_det("set %s language based on symbols category",
  1636. part->language);
  1637. candidates = kh_init(rspamd_candidates_hash);
  1638. }
  1639. else {
  1640. candidates = kh_init(rspamd_candidates_hash);
  1641. kh_resize(rspamd_candidates_hash, candidates, 32);
  1642. r = rspamd_language_detector_try_ngramm(task,
  1643. default_words,
  1644. d,
  1645. part->utf_words,
  1646. cat,
  1647. candidates,
  1648. part);
  1649. if (r == rs_detect_none) {
  1650. msg_debug_lang_det("no trigrams found, fallback to english");
  1651. rspamd_language_detector_set_language(task, part, "en", NULL);
  1652. }
  1653. else if (r == rs_detect_multiple) {
  1654. /* Check our guess */
  1655. mean = 0.0;
  1656. std = 0.0;
  1657. cand_len = 0;
  1658. /* Check distribution */
  1659. kh_foreach_value(candidates, cand, {
  1660. if (!isnan(cand->prob)) {
  1661. mean += cand->prob;
  1662. cand_len++;
  1663. }
  1664. });
  1665. if (cand_len > 0) {
  1666. mean /= cand_len;
  1667. kh_foreach_value(candidates, cand, {
  1668. double err;
  1669. if (!isnan(cand->prob)) {
  1670. err = cand->prob - mean;
  1671. std += fabs(err);
  1672. }
  1673. });
  1674. std /= cand_len;
  1675. }
  1676. msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
  1677. cand_len, mean, std);
  1678. if (cand_len > 0 && std / fabs(mean) < 0.25) {
  1679. msg_debug_lang_det("apply frequency heuristic sorting");
  1680. frequency_heuristic_applied = TRUE;
  1681. cbd.d = d;
  1682. cbd.mean = mean;
  1683. cbd.std = std;
  1684. cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
  1685. if (part->nwords < default_words / 2) {
  1686. cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
  1687. }
  1688. }
  1689. }
  1690. }
  1691. }
  1692. /* Now, convert hash to array and sort it */
  1693. if (r != rs_detect_none && kh_size(candidates) > 0) {
  1694. result = g_ptr_array_sized_new(kh_size(candidates));
  1695. kh_foreach_value(candidates, cand, {
  1696. if (!isnan(cand->prob)) {
  1697. msg_debug_lang_det("pre-sorting probability %s -> %.2f", cand->lang,
  1698. cand->prob);
  1699. g_ptr_array_add(result, cand);
  1700. }
  1701. });
  1702. if (frequency_heuristic_applied) {
  1703. g_ptr_array_sort_with_data(result,
  1704. rspamd_language_detector_cmp_heuristic,
  1705. (gpointer) &cbd);
  1706. }
  1707. else {
  1708. g_ptr_array_sort(result, rspamd_language_detector_cmp);
  1709. }
  1710. int i;
  1711. PTR_ARRAY_FOREACH(result, i, cand)
  1712. {
  1713. msg_debug_lang_det("final probability %s -> %.2f", cand->lang,
  1714. cand->prob);
  1715. }
  1716. if (part->languages != NULL) {
  1717. g_ptr_array_unref(part->languages);
  1718. }
  1719. part->languages = result;
  1720. part->language = ((struct rspamd_lang_detector_res *) g_ptr_array_index(result, 0))->lang;
  1721. ret = TRUE;
  1722. }
  1723. else if (part->languages == NULL) {
  1724. rspamd_language_detector_set_language(task, part, "en", NULL);
  1725. }
  1726. kh_destroy(rspamd_candidates_hash, candidates);
  1727. }
  1728. /* Update internal stat */
  1729. if (part->languages != NULL && part->languages->len > 0 && !frequency_heuristic_applied) {
  1730. cand = g_ptr_array_index(part->languages, 0);
  1731. if (cand->elt) {
  1732. cand->elt->occurrences++;
  1733. d->total_occurrences++;
  1734. msg_debug_lang_det("updated stat for %s: %d occurrences, %z total detected",
  1735. cand->elt->name, cand->elt->occurrences,
  1736. d->total_occurrences);
  1737. }
  1738. }
  1739. end_ticks = rspamd_get_ticks(TRUE);
  1740. msg_debug_lang_det("detected languages in %.0f ticks",
  1741. (end_ticks - start_ticks));
  1742. return ret;
  1743. }
  /**
   * Take an additional reference on a language detector.
   *
   * Applies REF_RETAIN to `d` and hands the very same pointer back, so the
   * call can be used inline where a retained pointer is stored.
   *
   * @param d detector to retain
   * @return `d`, unchanged
   */
  struct rspamd_lang_detector *
  rspamd_language_detector_ref(struct rspamd_lang_detector *d)
  {
      REF_RETAIN(d);

      return d;
  }
  /**
   * Drop a reference on a language detector.
   *
   * Applies REF_RELEASE to `d`; whatever happens when the last reference
   * goes away is decided by that macro, not by this function.
   *
   * @param d detector to release
   */
  void
  rspamd_language_detector_unref(struct rspamd_lang_detector *d)
  {
      REF_RELEASE(d);
  }
  1754. gboolean
  1755. rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d,
  1756. const char *word, gsize wlen)
  1757. {
  1758. khiter_t k;
  1759. rspamd_ftok_t search;
  1760. search.begin = word;
  1761. search.len = wlen;
  1762. k = kh_get(rspamd_stopwords_hash, d->stop_words_norm, &search);
  1763. if (k != kh_end(d->stop_words_norm)) {
  1764. return TRUE;
  1765. }
  1766. return FALSE;
  1767. }
  1768. int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt)
  1769. {
  1770. if (elt) {
  1771. return elt->flags;
  1772. }
  1773. return 0;
  1774. }