12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103 |
- /*
- * Copyright 2024 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "lang_detection.h"
- #include "lang_detection_fasttext.h"
- #include "libserver/logger.h"
- #include "libcryptobox/cryptobox.h"
- #include "libutil/multipattern.h"
- #include "ucl.h"
- #include "khash.h"
- #include "libstemmer.h"
-
- #include <glob.h>
- #include <unicode/utf8.h>
- #include <unicode/utf16.h>
- #include <unicode/ucnv.h>
- #include <unicode/uchar.h>
- #include <unicode/ustring.h>
- #include <math.h>
-
- static const gsize default_short_text_limit = 10; /* initial short_text_limit in rspamd_language_detector_init(); the "short_text_limit" config key overrides it */
- static const gsize default_words = 80; /* NOTE(review): not referenced in the visible part of the file; presumably a cap on sampled words - confirm at the use site */
- static const double update_prob = 0.6; /* NOTE(review): not referenced in the visible part of the file; meaning to be confirmed at the use site */
- static const char *default_languages_path = RSPAMD_SHAREDIR "/languages"; /* directory scanned for "stop_words" and "*.json" unless the "languages" config key overrides it */
-
- #undef EXTRA_LANGDET_DEBUG /* when defined instead, the extra per-ngramm debug blocks guarded by #ifdef EXTRA_LANGDET_DEBUG are compiled in */
-
- struct rspamd_language_unicode_match { /* associates a language code with a unicode script constant */
- const char *lang; /* language code, e.g. "el" */
- int unicode_code; /* one of the RSPAMD_UNICODE_* constants */
- };
-
- /*
- * List of languages that are detected by their unicode script:
- * each entry maps a language code to its RSPAMD_UNICODE_* constant
- */
- static const struct rspamd_language_unicode_match unicode_langs[] = {
- {"el", RSPAMD_UNICODE_GREEK},
- {"ml", RSPAMD_UNICODE_MALAYALAM},
- {"te", RSPAMD_UNICODE_TELUGU},
- {"ta", RSPAMD_UNICODE_TAMIL},
- {"gu", RSPAMD_UNICODE_GUJARATI},
- {"th", RSPAMD_UNICODE_THAI},
- {"ka", RSPAMD_UNICODE_GEORGIAN},
- {"si", RSPAMD_UNICODE_SINHALA},
- {"hy", RSPAMD_UNICODE_ARMENIAN},
- {"ja", RSPAMD_UNICODE_JP},
- {"ko", RSPAMD_UNICODE_HANGUL},
- };
-
- /*
- * Top languages: rspamd_language_detector_read_file() sets RS_LANGUAGE_TIER0
- * on languages named in tier0_langs and RS_LANGUAGE_TIER1 on those named
- * in tier1_langs
- */
- static const char *tier0_langs[] = {
- "en",
- };
- static const char *tier1_langs[] = {
- "fr", "it", "de", "es", "nl",
- "pt", "ru", "pl", "tk", "th", "ar"};
-
- enum rspamd_language_category { /* script group of a language; indexes the per-category trigrams[] and stop_words[] arrays of the detector */
- RSPAMD_LANGUAGE_LATIN = 0, /* "latin" type in a language file */
- RSPAMD_LANGUAGE_CYRILLIC, /* "cyrillic" type in a language file */
- RSPAMD_LANGUAGE_DEVANAGARI, /* "devanagari" type in a language file */
- RSPAMD_LANGUAGE_ARAB, /* "arab" type in a language file */
- RSPAMD_LANGUAGE_MAX, /* number of categories (array size), not a real category */
- };
-
- struct rspamd_language_elt { /* one loaded language, filled by rspamd_language_detector_read_file() */
- const char *name; /* e.g. "en" or "ru"; taken from the file name without its extension */
- int flags; /* enum rspamd_language_elt_flags */
- enum rspamd_language_category category; /* script group parsed from the "type" key */
- unsigned int trigrams_words; /* element at index 2 of the "n_words" array */
- unsigned int stop_words; /* number of stop word patterns registered for this language */
- double mean; /* running mean of the values under the "freq" key */
- double std; /* sample standard deviation of the values under the "freq" key */
- unsigned int occurrences; /* total number of parts with this language */
- };
-
- struct rspamd_ngramm_elt { /* one language's share of a particular trigram */
- struct rspamd_language_elt *elt; /* language this entry belongs to */
- double prob; /* trigram frequency divided by the language's total counted frequency; may be scaled later by process_chain */
- };
-
- struct rspamd_ngramm_chain { /* value stored in the trigram hash: every language that contains the trigram */
- GPtrArray *languages; /* of struct rspamd_ngramm_elt * */
- double mean; /* mean of the elements' prob, computed by rspamd_language_detector_process_chain() */
- double std; /* sample standard deviation of the elements' prob */
- char *utf; /* UTF-8 form of the trigram, copied into the config pool */
- };
-
- struct rspamd_stop_word_range { /* span of multipattern ids that belong to one language */
- unsigned int start; /* pattern count before this language's stop words were added */
- unsigned int stop; /* pattern count after this language's stop words were added */
- struct rspamd_language_elt *elt; /* owning language */
- };
-
- struct rspamd_stop_word_elt { /* stop words of one language category */
- struct rspamd_multipattern *mp; /* all stop word patterns of the category */
- GArray *ranges; /* of rspamd_stop_word_range */
- };
-
- #define msg_debug_lang_det(...) rspamd_conditional_debug_fast(NULL, NULL, \
- rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__) /* debug logging under the "langdet" module; expands to use a variable named `task` from the calling scope */
- #define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast(NULL, NULL, \
- rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
- G_STRFUNC, \
- __VA_ARGS__) /* same as msg_debug_lang_det, but expands to use a variable named `cfg` from the calling scope */
-
- INIT_LOG_MODULE_PUBLIC(langdet) /* presumably defines rspamd_langdet_log_id used by the debug macros - confirm against libserver/logger.h */
-
- static const struct rspamd_language_unicode_match *
- rspamd_language_search_unicode_match(const char *key,
- const struct rspamd_language_unicode_match *elts, size_t nelts)
- {
- size_t i;
-
- for (i = 0; i < nelts; i++) {
- if (strcmp(elts[i].lang, key) == 0) {
- return &elts[i];
- }
- }
-
- return NULL;
- }
-
- static gboolean
- rspamd_language_search_str(const char *key, const char *elts[], size_t nelts)
- {
- size_t i;
-
- for (i = 0; i < nelts; i++) {
- if (strcmp(elts[i], key) == 0) {
- return TRUE;
- }
- }
- return FALSE;
- }
-
- static unsigned int
- rspamd_trigram_hash_func(gconstpointer key)
- {
- return rspamd_cryptobox_fast_hash(key, 3 * sizeof(UChar32),
- rspamd_hash_seed());
- }
-
- static gboolean
- rspamd_trigram_equal_func(gconstpointer v, gconstpointer v2)
- {
- return memcmp(v, v2, 3 * sizeof(UChar32)) == 0;
- }
-
- KHASH_INIT(rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true,
- rspamd_trigram_hash_func, rspamd_trigram_equal_func); /* map: pointer to 3 UChar32 code points -> chain of languages containing that trigram */
- KHASH_INIT(rspamd_candidates_hash, const char *,
- struct rspamd_lang_detector_res *, true,
- rspamd_str_hash, rspamd_str_equal); /* map: string key -> detection result; presumably keyed by language name - confirm at the use site */
- KHASH_INIT(rspamd_stopwords_hash, rspamd_ftok_t *,
- char, false,
- rspamd_ftok_hash, rspamd_ftok_equal); /* set (no values) of stop word tokens */
-
- KHASH_INIT(rspamd_languages_hash, const char *, struct rspamd_language_elt *, true,
- rspamd_str_hash, rspamd_str_equal); /* map: language name -> loaded language element */
- struct rspamd_lang_detector { /* detector state built by rspamd_language_detector_init() */
- khash_t(rspamd_languages_hash) * languages; /* all loaded languages by name */
- khash_t(rspamd_trigram_hash) * trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */
- struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; /* stop word patterns per language category */
- khash_t(rspamd_stopwords_hash) * stop_words_norm; /* stop words after stemming (or verbatim when no stemmer exists) */
- UConverter *uchar_converter; /* obtained from rspamd_get_utf8_converter() */
- gsize short_text_limit; /* from the "short_text_limit" option or default_short_text_limit */
- bool prefer_fasttext; /* from the "prefer_fasttext" option, true by default */
- gsize total_occurrences; /* number of all languages found */
- gpointer fasttext_detector; /* handle returned by rspamd_lang_detection_fasttext_init() */
- ref_entry_t ref; /* reference counter; rspamd_language_detector_dtor is its destructor */
- };
-
- static void
- rspamd_language_detector_ucs_lowercase(UChar32 *s, gsize len)
- {
- gsize i;
-
- for (i = 0; i < len; i++) {
- s[i] = u_tolower(s[i]);
- }
- }
-
- static gboolean
- rspamd_language_detector_ucs_is_latin(const UChar32 *s, gsize len)
- {
- gsize i;
- gboolean ret = TRUE;
-
- for (i = 0; i < len; i++) {
- if (s[i] >= 128 || !(g_ascii_isalnum(s[i]) || s[i] == ' ')) {
- ret = FALSE;
- break;
- }
- }
-
- return ret;
- }
-
- struct rspamd_language_ucs_elt {
- unsigned int freq;
- const char *utf;
- UChar32 s[0];
- };
-
- static void
- rspamd_language_detector_init_ngramm(struct rspamd_config *cfg,
- struct rspamd_lang_detector *d,
- struct rspamd_language_elt *lelt,
- struct rspamd_language_ucs_elt *ucs,
- unsigned int len,
- unsigned int freq,
- unsigned int total,
- khash_t(rspamd_trigram_hash) * htb)
- {
- struct rspamd_ngramm_chain *chain = NULL, st_chain;
- struct rspamd_ngramm_elt *elt;
- khiter_t k;
- unsigned int i;
- gboolean found;
-
- switch (len) {
- case 1:
- case 2:
- g_assert_not_reached();
- break;
- case 3:
- k = kh_get(rspamd_trigram_hash, htb, ucs->s);
- if (k != kh_end(htb)) {
- chain = &kh_value(htb, k);
- }
- break;
- default:
- g_assert_not_reached();
- break;
- }
-
- if (chain == NULL) {
- /* New element */
- chain = &st_chain;
- memset(chain, 0, sizeof(st_chain));
- chain->languages = g_ptr_array_sized_new(32);
- rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard,
- chain->languages);
- chain->utf = rspamd_mempool_strdup(cfg->cfg_pool, ucs->utf);
- elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt));
- elt->elt = lelt;
- elt->prob = ((double) freq) / ((double) total);
- g_ptr_array_add(chain->languages, elt);
-
- k = kh_put(rspamd_trigram_hash, htb, ucs->s, &i);
- kh_value(htb, k) = *chain;
- }
- else {
- /* Check sanity */
- found = FALSE;
-
- PTR_ARRAY_FOREACH(chain->languages, i, elt)
- {
- if (strcmp(elt->elt->name, lelt->name) == 0) {
- found = TRUE;
- elt->prob += ((double) freq) / ((double) total);
- break;
- }
- }
-
- if (!found) {
- elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt));
- elt->elt = lelt;
- elt->prob = ((double) freq) / ((double) total);
- g_ptr_array_add(chain->languages, elt);
- }
- }
- }
-
- static inline enum rspamd_language_category
- rspamd_language_detector_get_category(unsigned int uflags)
- {
- enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN;
-
- if (uflags & RSPAMD_UNICODE_CYRILLIC) {
- cat = RSPAMD_LANGUAGE_CYRILLIC;
- }
- else if (uflags & RSPAMD_UNICODE_DEVANAGARI) {
- cat = RSPAMD_LANGUAGE_DEVANAGARI;
- }
- else if (uflags & RSPAMD_UNICODE_ARABIC) {
- cat = RSPAMD_LANGUAGE_ARAB;
- }
-
- return cat;
- }
-
- static const char *
- rspamd_language_detector_print_flags(struct rspamd_language_elt *elt)
- {
- static char flags_buf[256];
- goffset r = 0;
-
- if (elt->flags & RS_LANGUAGE_TIER1) {
- r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier1,");
- }
- if (elt->flags & RS_LANGUAGE_TIER0) {
- r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier0,");
- }
- if (elt->flags & RS_LANGUAGE_LATIN) {
- r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "latin,");
- }
-
- if (r > 0) {
- flags_buf[r - 1] = '\0';
- }
- else {
- flags_buf[r] = '\0';
- }
-
- return flags_buf;
- }
-
- static int
- rspamd_language_detector_cmp_ngramm(gconstpointer a, gconstpointer b)
- {
- struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **) a;
- struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **) b;
-
- return (int) e2->freq - (int) e1->freq;
- }
-
- static void
- rspamd_language_detector_read_file(struct rspamd_config *cfg,
- struct rspamd_lang_detector *d,
- const char *path,
- const ucl_object_t *stop_words)
- {
- struct ucl_parser *parser;
- ucl_object_t *top;
- const ucl_object_t *freqs, *n_words, *cur, *type, *flags;
- ucl_object_iter_t it = NULL;
- UErrorCode uc_err = U_ZERO_ERROR;
- struct rspamd_language_elt *nelt;
- struct rspamd_language_ucs_elt *ucs_elt;
- khash_t(rspamd_trigram_hash) *htb = NULL;
- char *pos;
- unsigned int total = 0, total_latin = 0, total_ngramms = 0, i, skipped,
- loaded;
- double mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0;
- enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX;
-
- parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS);
- if (!ucl_parser_add_file(parser, path)) {
- msg_warn_config("cannot parse file %s: %s", path,
- ucl_parser_get_error(parser));
- ucl_parser_free(parser);
-
- return;
- }
-
- top = ucl_parser_get_object(parser);
- ucl_parser_free(parser);
-
- freqs = ucl_object_lookup(top, "freq");
-
- if (freqs == NULL) {
- msg_warn_config("file %s has no 'freq' key", path);
- ucl_object_unref(top);
-
- return;
- }
-
- pos = strrchr(path, '/');
- g_assert(pos != NULL);
- nelt = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*nelt));
- nelt->name = rspamd_mempool_strdup(cfg->cfg_pool, pos + 1);
- /* Remove extension */
- pos = strchr(nelt->name, '.');
- g_assert(pos != NULL);
- *pos = '\0';
-
- n_words = ucl_object_lookup(top, "n_words");
-
- if (n_words == NULL || ucl_object_type(n_words) != UCL_ARRAY ||
- n_words->len != 3) {
- msg_warn_config("cannot find n_words in language %s", nelt->name);
- ucl_object_unref(top);
-
- return;
- }
- else {
- nelt->trigrams_words = ucl_object_toint(ucl_array_find_index(n_words,
- 2));
- }
-
- type = ucl_object_lookup(top, "type");
-
- if (type == NULL || ucl_object_type(type) != UCL_STRING) {
- msg_debug_config("cannot find type in language %s", nelt->name);
- ucl_object_unref(top);
-
- return;
- }
- else {
- const char *stype = ucl_object_tostring(type);
-
- if (strcmp(stype, "latin") == 0) {
- cat = RSPAMD_LANGUAGE_LATIN;
- }
- else if (strcmp(stype, "cyrillic") == 0) {
- cat = RSPAMD_LANGUAGE_CYRILLIC;
- }
- else if (strcmp(stype, "arab") == 0) {
- cat = RSPAMD_LANGUAGE_ARAB;
- }
- else if (strcmp(stype, "devanagari") == 0) {
- cat = RSPAMD_LANGUAGE_DEVANAGARI;
- }
- else {
- msg_debug_config("unknown type %s of language %s", stype, nelt->name);
- ucl_object_unref(top);
-
- return;
- }
- }
-
- flags = ucl_object_lookup(top, "flags");
-
- if (flags != NULL && ucl_object_type(flags) == UCL_ARRAY) {
- ucl_object_iter_t it = NULL;
- const ucl_object_t *cur;
-
- while ((cur = ucl_object_iterate(flags, &it, true)) != NULL) {
- const char *fl = ucl_object_tostring(cur);
-
- if (cur) {
- if (strcmp(fl, "diacritics") == 0) {
- nelt->flags |= RS_LANGUAGE_DIACRITICS;
- }
- else if (strcmp(fl, "ascii") == 0) {
- nelt->flags |= RS_LANGUAGE_ASCII;
- }
- else {
- msg_debug_config("unknown flag %s of language %s", fl, nelt->name);
- }
- }
- else {
- msg_debug_config("unknown flags type of language %s", nelt->name);
- }
- }
- }
-
- if (stop_words) {
- const ucl_object_t *specific_stop_words;
-
- specific_stop_words = ucl_object_lookup(stop_words, nelt->name);
-
- if (specific_stop_words) {
- struct sb_stemmer *stem = NULL;
- it = NULL;
- const ucl_object_t *w;
- unsigned int start, stop;
-
- stem = sb_stemmer_new(nelt->name, "UTF_8");
- start = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp);
-
- while ((w = ucl_object_iterate(specific_stop_words, &it, true)) != NULL) {
- gsize wlen;
- const char *word = ucl_object_tolstring(w, &wlen);
- const char *saved;
- unsigned int mp_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8;
-
- if (rspamd_multipattern_has_hyperscan()) {
- mp_flags |= RSPAMD_MULTIPATTERN_RE;
- }
-
- rspamd_multipattern_add_pattern_len(d->stop_words[cat].mp,
- word, wlen,
- mp_flags);
- nelt->stop_words++;
-
- /* Also lemmatise and store normalised */
- if (stem) {
- const char *nw = sb_stemmer_stem(stem, word, wlen);
-
-
- if (nw) {
- saved = nw;
- wlen = strlen(nw);
- }
- else {
- saved = word;
- }
- }
- else {
- saved = word;
- }
-
- if (saved) {
- int rc;
- rspamd_ftok_t *tok;
- char *dst;
-
- tok = rspamd_mempool_alloc(cfg->cfg_pool,
- sizeof(*tok) + wlen + 1);
- dst = ((char *) tok) + sizeof(*tok);
- rspamd_strlcpy(dst, saved, wlen + 1);
- tok->begin = dst;
- tok->len = wlen;
-
- kh_put(rspamd_stopwords_hash, d->stop_words_norm,
- tok, &rc);
- }
- }
-
- if (stem) {
- sb_stemmer_delete(stem);
- }
-
- stop = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp);
-
- struct rspamd_stop_word_range r;
-
- r.start = start;
- r.stop = stop;
- r.elt = nelt;
-
- g_array_append_val(d->stop_words[cat].ranges, r);
- it = NULL;
- }
- }
-
- nelt->category = cat;
- htb = d->trigrams[cat];
-
- GPtrArray *ngramms;
- unsigned int nsym;
-
- if (rspamd_language_search_str(nelt->name, tier1_langs,
- G_N_ELEMENTS(tier1_langs))) {
- nelt->flags |= RS_LANGUAGE_TIER1;
- }
-
- if (rspamd_language_search_str(nelt->name, tier0_langs,
- G_N_ELEMENTS(tier0_langs))) {
- nelt->flags |= RS_LANGUAGE_TIER0;
- }
-
- it = NULL;
- ngramms = g_ptr_array_sized_new(freqs->len);
- i = 0;
- skipped = 0;
- loaded = 0;
-
- while ((cur = ucl_object_iterate(freqs, &it, true)) != NULL) {
- const char *key;
- gsize keylen;
- unsigned int freq;
-
- key = ucl_object_keyl(cur, &keylen);
- freq = ucl_object_toint(cur);
-
- i++;
- delta = freq - mean;
- mean += delta / i;
- delta2 = freq - mean;
- m2 += delta * delta2;
-
- if (key != NULL) {
- UChar32 *cur_ucs;
- const char *end = key + keylen, *cur_utf = key;
-
- ucs_elt = rspamd_mempool_alloc(cfg->cfg_pool,
- sizeof(*ucs_elt) + (keylen + 1) * sizeof(UChar32));
-
- cur_ucs = ucs_elt->s;
- nsym = 0;
- uc_err = U_ZERO_ERROR;
-
- while (cur_utf < end) {
- *cur_ucs++ = ucnv_getNextUChar(d->uchar_converter, &cur_utf,
- end, &uc_err);
- if (!U_SUCCESS(uc_err)) {
- break;
- }
-
- nsym++;
- }
-
- if (!U_SUCCESS(uc_err)) {
- msg_warn_config("cannot convert key %*s to unicode: %s",
- (int) keylen, key, u_errorName(uc_err));
-
- continue;
- }
-
- ucs_elt->utf = key;
- rspamd_language_detector_ucs_lowercase(ucs_elt->s, nsym);
-
- if (nsym == 3) {
- g_ptr_array_add(ngramms, ucs_elt);
- }
- else {
- continue;
- }
-
- if (rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) {
- total_latin++;
- }
-
- ucs_elt->freq = freq;
-
- total_ngramms++;
- }
- }
-
- std = sqrt(m2 / (i - 1));
-
- if (total_latin >= total_ngramms / 3) {
- nelt->flags |= RS_LANGUAGE_LATIN;
- }
-
- nsym = 3;
-
- total = 0;
- PTR_ARRAY_FOREACH(ngramms, i, ucs_elt)
- {
-
- if (!(nelt->flags & RS_LANGUAGE_LATIN) &&
- rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) {
- ucs_elt->freq = 0;
- /* Skip latin ngramm for non-latin language to avoid garbage */
- skipped++;
- continue;
- }
-
- /* Now, discriminate low frequency ngramms */
-
- total += ucs_elt->freq;
- loaded++;
- }
-
- g_ptr_array_sort(ngramms, rspamd_language_detector_cmp_ngramm);
-
- PTR_ARRAY_FOREACH(ngramms, i, ucs_elt)
- {
- if (ucs_elt->freq > 0) {
- rspamd_language_detector_init_ngramm(cfg, d,
- nelt, ucs_elt, nsym,
- ucs_elt->freq, total, htb);
- }
- }
-
- #ifdef EXTRA_LANGDET_DEBUG
- /* Useful for debug */
- for (i = 0; i < 10; i++) {
- ucs_elt = g_ptr_array_index(ngramms, i);
-
- msg_debug_lang_det_cfg("%s -> %s: %d", nelt->name,
- ucs_elt->utf, ucs_elt->freq);
- }
- #endif
-
- g_ptr_array_free(ngramms, TRUE);
- nelt->mean = mean;
- nelt->std = std;
-
- msg_debug_lang_det_cfg("loaded %s language, %d trigrams, "
- "%d ngramms loaded; "
- "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; "
- "(%s)",
- nelt->name,
- (int) nelt->trigrams_words,
- total,
- std, mean,
- skipped, loaded, nelt->stop_words,
- rspamd_language_detector_print_flags(nelt));
-
- int ret;
- khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret);
- g_assert(ret > 0); /* must be unique */
- kh_value(d->languages, k) = nelt;
- ucl_object_unref(top);
- }
-
- static gboolean
- rspamd_ucl_array_find_str(const char *str, const ucl_object_t *ar)
- {
- ucl_object_iter_t it = NULL;
- const ucl_object_t *cur;
-
- if (ar == NULL || ar->len == 0) {
- return FALSE;
- }
-
- while ((cur = ucl_object_iterate(ar, &it, true)) != NULL) {
- if (ucl_object_type(cur) == UCL_STRING && rspamd_strcase_equal(
- ucl_object_tostring(cur), str)) {
- return TRUE;
- }
- }
-
- return FALSE;
- }
-
- static void
- rspamd_language_detector_process_chain(struct rspamd_config *cfg,
- struct rspamd_ngramm_chain *chain)
- {
- struct rspamd_ngramm_elt *elt;
- unsigned int i;
- double delta, mean = 0, delta2, m2 = 0, std;
-
- if (chain->languages->len > 3) {
- PTR_ARRAY_FOREACH(chain->languages, i, elt)
- {
- delta = elt->prob - mean;
- mean += delta / (i + 1);
- delta2 = elt->prob - mean;
- m2 += delta * delta2;
- }
-
- std = sqrt(m2 / (i - 1));
- chain->mean = mean;
- chain->std = std;
-
- /* Now, filter elements that are lower than mean */
- PTR_ARRAY_FOREACH(chain->languages, i, elt)
- {
- if (elt->prob < mean) {
- g_ptr_array_remove_index_fast(chain->languages, i);
- #ifdef EXTRA_LANGDET_DEBUG
- msg_debug_lang_det_cfg("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f",
- elt->elt->name, chain->utf, elt->prob, mean, std);
- #endif
- }
- }
- }
- else {
- /* We have a unique ngramm, increase its weight */
- PTR_ARRAY_FOREACH(chain->languages, i, elt)
- {
- elt->prob *= 4.0;
- #ifdef EXTRA_LANGDET_DEBUG
- msg_debug_lang_det_cfg("increase weight of %s in %s; prob: %.4f",
- elt->elt->name, chain->utf, elt->prob);
- #endif
- }
- }
- }
-
- static void
- rspamd_language_detector_dtor(struct rspamd_lang_detector *d)
- {
- if (d) {
- for (unsigned int i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
- kh_destroy(rspamd_trigram_hash, d->trigrams[i]);
- rspamd_multipattern_destroy(d->stop_words[i].mp);
- g_array_free(d->stop_words[i].ranges, TRUE);
- }
-
- if (d->languages) {
- kh_destroy(rspamd_languages_hash, d->languages);
- }
-
- kh_destroy(rspamd_stopwords_hash, d->stop_words_norm);
- rspamd_lang_detection_fasttext_destroy(d->fasttext_detector);
- }
- }
-
- struct rspamd_lang_detector *
- rspamd_language_detector_init(struct rspamd_config *cfg)
- {
- const ucl_object_t *section, *elt, *languages_enable = NULL,
- *languages_disable = NULL;
- const char *languages_path = default_languages_path;
- glob_t gl;
- size_t i, short_text_limit = default_short_text_limit, total = 0;
- UErrorCode uc_err = U_ZERO_ERROR;
- GString *languages_pattern;
- struct rspamd_ngramm_chain *chain, schain;
- char *fname;
- struct rspamd_lang_detector *ret = NULL;
- struct ucl_parser *parser;
- ucl_object_t *stop_words;
- bool prefer_fasttext = true;
-
- section = ucl_object_lookup(cfg->cfg_ucl_obj, "lang_detection");
-
- if (section != NULL) {
- elt = ucl_object_lookup(section, "languages");
-
- if (elt) {
- languages_path = ucl_object_tostring(elt);
- }
-
- elt = ucl_object_lookup(section, "short_text_limit");
-
- if (elt) {
- short_text_limit = ucl_object_toint(elt);
- }
-
- languages_enable = ucl_object_lookup(section, "languages_enable");
- languages_disable = ucl_object_lookup(section, "languages_disable");
-
- elt = ucl_object_lookup(section, "prefer_fasttext");
- if (elt) {
- prefer_fasttext = ucl_object_toboolean(elt);
- }
- }
-
- languages_pattern = g_string_sized_new(PATH_MAX);
- rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path);
- parser = ucl_parser_new(UCL_PARSER_DEFAULT);
-
- if (ucl_parser_add_file(parser, languages_pattern->str)) {
- stop_words = ucl_parser_get_object(parser);
- }
- else {
- msg_err_config("cannot read stop words from %s: %s",
- languages_pattern->str,
- ucl_parser_get_error(parser));
- stop_words = NULL;
- }
-
- ucl_parser_free(parser);
- languages_pattern->len = 0;
-
- rspamd_printf_gstring(languages_pattern, "%s/*.json", languages_path);
- memset(&gl, 0, sizeof(gl));
-
- if (glob(languages_pattern->str, 0, NULL, &gl) != 0) {
- msg_err_config("cannot read any files matching %v", languages_pattern);
- goto end;
- }
-
- ret = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*ret));
- ret->languages = kh_init(rspamd_languages_hash);
- kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc);
- ret->uchar_converter = rspamd_get_utf8_converter();
- ret->short_text_limit = short_text_limit;
- ret->stop_words_norm = kh_init(rspamd_stopwords_hash);
- ret->prefer_fasttext = prefer_fasttext;
-
- /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
- for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
- ret->trigrams[i] = kh_init(rspamd_trigram_hash);
- #ifdef WITH_HYPERSCAN
- ret->stop_words[i].mp = rspamd_multipattern_create(
- RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 |
- RSPAMD_MULTIPATTERN_RE);
- #else
- ret->stop_words[i].mp = rspamd_multipattern_create(
- RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8);
- #endif
-
- ret->stop_words[i].ranges = g_array_new(FALSE, FALSE,
- sizeof(struct rspamd_stop_word_range));
- }
-
- g_assert(uc_err == U_ZERO_ERROR);
-
- for (i = 0; i < gl.gl_pathc; i++) {
- fname = g_path_get_basename(gl.gl_pathv[i]);
-
- if (!rspamd_ucl_array_find_str(fname, languages_disable) ||
- (languages_enable == NULL ||
- rspamd_ucl_array_find_str(fname, languages_enable))) {
- rspamd_language_detector_read_file(cfg, ret, gl.gl_pathv[i],
- stop_words);
- }
- else {
- msg_info_config("skip language file %s: disabled", fname);
- }
-
- g_free(fname);
- }
-
- for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) {
- GError *err = NULL;
-
- kh_foreach_value(ret->trigrams[i], schain, {
- chain = &schain;
- rspamd_language_detector_process_chain(cfg, chain);
- });
-
- if (!rspamd_multipattern_compile(ret->stop_words[i].mp, 0, &err)) {
- msg_err_config("cannot compile stop words for %z language group: %e",
- i, err);
- g_error_free(err);
- }
-
- total += kh_size(ret->trigrams[i]);
- }
-
- ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg);
- char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector);
-
- msg_info_config("loaded %d languages, "
- "%d trigrams; %s",
- (int) kh_size(ret->languages),
- (int) total, fasttext_status);
- g_free(fasttext_status);
-
- if (stop_words) {
- ucl_object_unref(stop_words);
- }
-
- REF_INIT_RETAIN(ret, rspamd_language_detector_dtor);
- rspamd_mempool_add_destructor(cfg->cfg_pool,
- (rspamd_mempool_destruct_t) rspamd_language_detector_unref,
- ret);
-
- end:
- if (gl.gl_pathc > 0) {
- globfree(&gl);
- }
-
- g_string_free(languages_pattern, TRUE);
-
- return ret;
- }
-
- static void
- rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
- goffset *offsets_out,
- uint64_t *seed)
- {
- unsigned int step_len, remainder, i, out_idx;
- uint64_t coin, sel;
- rspamd_stat_token_t *tok;
-
- g_assert(nwords != 0);
- g_assert(offsets_out != NULL);
- g_assert(ucs_tokens->len >= nwords);
- /*
- * We split input array into `nwords` parts. For each part we randomly select
- * an element from this particular split. Here is an example:
- *
- * nwords=2, input_len=5
- *
- * w1 w2 w3 w4 w5
- * ^ ^
- * part1 part2
- * vv vv
- * w2 w5
- *
- * So we have 2 output words from 5 input words selected randomly within
- * their splits. It is not uniform distribution but it seems to be better
- * to include words from different text parts
- */
- step_len = ucs_tokens->len / nwords;
- remainder = ucs_tokens->len % nwords;
-
- out_idx = 0;
- coin = rspamd_random_uint64_fast_seed(seed);
- sel = coin % (step_len + remainder);
- offsets_out[out_idx] = sel;
-
- for (i = step_len + remainder; i < ucs_tokens->len;
- i += step_len, out_idx++) {
- unsigned int ntries = 0;
- coin = rspamd_random_uint64_fast_seed(seed);
- sel = (coin % step_len) + i;
-
- for (;;) {
- tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel);
- /* Filter bad tokens */
-
- if (tok->unicode.len >= 2 &&
- !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) &&
- u_isalpha(tok->unicode.begin[0]) &&
- u_isalpha(tok->unicode.begin[tok->unicode.len - 1])) {
- offsets_out[out_idx] = sel;
- break;
- }
- else {
- ntries++;
- coin = rspamd_random_uint64_fast_seed(seed);
-
- if (ntries < step_len) {
- sel = (coin % step_len) + i;
- }
- else if (ntries < ucs_tokens->len) {
- sel = coin % ucs_tokens->len;
- }
- else {
- offsets_out[out_idx] = sel;
- break;
- }
- }
- }
- }
-
- /*
- * Fisher-Yates algorithm:
- * for i from 0 to n−2 do
- * j ← random integer such that i ≤ j < n
- * exchange a[i] and a[j]
- */
- #if 0
- if (out_idx > 2) {
- for (i = 0; i < out_idx - 2; i++) {
- coin = rspamd_random_uint64_fast ();
- sel = (coin % (out_idx - i)) + i;
- /* swap */
- tmp = offsets_out[i];
- offsets_out[i] = offsets_out[sel];
- offsets_out[sel] = tmp;
- }
- }
- #endif
- }
-
- static goffset
- rspamd_language_detector_next_ngramm(rspamd_stat_token_t *tok, UChar32 *window,
- unsigned int wlen, goffset cur_off)
- {
- unsigned int i;
-
- if (wlen > 1) {
- /* Deal with spaces at the beginning and ending */
-
- if (cur_off == 0) {
- window[0] = (UChar32) ' ';
-
- for (i = 0; i < wlen - 1; i++) {
- window[i + 1] = tok->unicode.begin[i];
- }
- }
- else if (cur_off + wlen == tok->unicode.len + 1) {
- /* Add trailing space */
- for (i = 0; i < wlen - 1; i++) {
- window[i] = tok->unicode.begin[cur_off + i];
- }
- window[wlen - 1] = (UChar32) ' ';
- }
- else if (cur_off + wlen > tok->unicode.len + 1) {
- /* No more fun */
- return -1;
- }
- else {
- /* Normal case */
- for (i = 0; i < wlen; i++) {
- window[i] = tok->unicode.begin[cur_off + i];
- }
- }
- }
- else {
- if (tok->normalized.len <= cur_off) {
- return -1;
- }
-
- window[0] = tok->unicode.begin[cur_off];
- }
-
- return cur_off + 1;
- }
-
- /*
- * Do full guess for a specific ngramm, checking all languages defined
- */
- static void
- rspamd_language_detector_process_ngramm_full(struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- UChar32 *window,
- khash_t(rspamd_candidates_hash) * candidates,
- khash_t(rspamd_trigram_hash) * trigrams)
- {
- unsigned int i;
- int ret;
- struct rspamd_ngramm_chain *chain = NULL;
- struct rspamd_ngramm_elt *elt;
- struct rspamd_lang_detector_res *cand;
- khiter_t k;
- double prob;
-
- k = kh_get(rspamd_trigram_hash, trigrams, window);
- if (k != kh_end(trigrams)) {
- chain = &kh_value(trigrams, k);
- }
-
- if (chain) {
- PTR_ARRAY_FOREACH(chain->languages, i, elt)
- {
- prob = elt->prob;
-
- if (prob < chain->mean) {
- continue;
- }
-
- k = kh_get(rspamd_candidates_hash, candidates, elt->elt->name);
- if (k != kh_end(candidates)) {
- cand = kh_value(candidates, k);
- }
- else {
- cand = NULL;
- }
-
- #ifdef NGRAMMS_DEBUG
- msg_err("gramm: %s, lang: %s, prob: %.3f", chain->utf,
- elt->elt->name, log2(elt->prob));
- #endif
- if (cand == NULL) {
- cand = rspamd_mempool_alloc(task->task_pool, sizeof(*cand));
- cand->elt = elt->elt;
- cand->lang = elt->elt->name;
- cand->prob = prob;
-
- k = kh_put(rspamd_candidates_hash, candidates, elt->elt->name,
- &ret);
- kh_value(candidates, k) = cand;
- }
- else {
- /* Update guess */
- cand->prob += prob;
- }
- }
- }
- }
-
- static void
- rspamd_language_detector_detect_word(struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- rspamd_stat_token_t *tok,
- khash_t(rspamd_candidates_hash) * candidates,
- khash_t(rspamd_trigram_hash) * trigrams)
- {
- const unsigned int wlen = 3;
- UChar32 window[3];
- goffset cur = 0;
-
- /* Split words */
- while ((cur = rspamd_language_detector_next_ngramm(tok, window, wlen, cur)) != -1) {
- rspamd_language_detector_process_ngramm_full(task,
- d, window, candidates, trigrams);
- }
- }
-
- static const double cutoff_limit = -8.0;
- /*
- * Converts frequencies to log probabilities, filter those candidates who
- * has the lowest probabilities
- */
-
- static inline void
- rspamd_language_detector_filter_step1(struct rspamd_task *task,
- struct rspamd_lang_detector_res *cand,
- double *max_prob, unsigned int *filtered)
- {
- if (!isnan(cand->prob)) {
- if (cand->prob == 0) {
- cand->prob = NAN;
- msg_debug_lang_det(
- "exclude language %s",
- cand->lang);
- (*filtered)++;
- }
- else {
- cand->prob = log2(cand->prob);
- if (cand->prob < cutoff_limit) {
- msg_debug_lang_det(
- "exclude language %s: %.3f, cutoff limit: %.3f",
- cand->lang, cand->prob, cutoff_limit);
- cand->prob = NAN;
- (*filtered)++;
- }
- else if (cand->prob > *max_prob) {
- *max_prob = cand->prob;
- }
- }
- }
- }
-
- static inline void
- rspamd_language_detector_filter_step2(struct rspamd_task *task,
- struct rspamd_lang_detector_res *cand,
- double max_prob, unsigned int *filtered)
- {
- /*
- * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
- * prob2 is 2^4 less than prob1
- */
- if (!isnan(cand->prob) && max_prob - cand->prob > 1) {
- msg_debug_lang_det("exclude language %s: %.3f (%.3f max)",
- cand->lang, cand->prob, max_prob);
- cand->prob = NAN;
- (*filtered)++;
- }
- }
-
- static void
- rspamd_language_detector_filter_negligible(struct rspamd_task *task,
- khash_t(rspamd_candidates_hash) * candidates)
- {
- struct rspamd_lang_detector_res *cand;
- unsigned int filtered = 0;
- double max_prob = -(G_MAXDOUBLE);
-
- kh_foreach_value(candidates, cand,
- rspamd_language_detector_filter_step1(task, cand, &max_prob, &filtered));
- kh_foreach_value(candidates, cand,
- rspamd_language_detector_filter_step2(task, cand, max_prob, &filtered));
-
- msg_debug_lang_det("removed %d languages", filtered);
- }
-
- static void
- rspamd_language_detector_detect_type(struct rspamd_task *task,
- unsigned int nwords,
- struct rspamd_lang_detector *d,
- GArray *words,
- enum rspamd_language_category cat,
- khash_t(rspamd_candidates_hash) * candidates,
- struct rspamd_mime_text_part *part)
- {
- unsigned int nparts = MIN(words->len, nwords);
- goffset *selected_words;
- rspamd_stat_token_t *tok;
- unsigned int i;
- uint64_t seed;
-
- /* Seed PRNG with part digest to provide some sort of determinism */
- memcpy(&seed, part->mime_part->digest, sizeof(seed));
- selected_words = g_new0(goffset, nparts);
- rspamd_language_detector_random_select(words, nparts, selected_words, &seed);
- msg_debug_lang_det("randomly selected %d words", nparts);
-
- for (i = 0; i < nparts; i++) {
- tok = &g_array_index(words, rspamd_stat_token_t,
- selected_words[i]);
-
- if (tok->unicode.len >= 3) {
- rspamd_language_detector_detect_word(task, d, tok, candidates,
- d->trigrams[cat]);
- }
- }
-
- /* Filter negligible candidates */
- rspamd_language_detector_filter_negligible(task, candidates);
- g_free(selected_words);
- }
-
- static int
- rspamd_language_detector_cmp(gconstpointer a, gconstpointer b)
- {
- const struct rspamd_lang_detector_res
- *canda = *(const struct rspamd_lang_detector_res **) a,
- *candb = *(const struct rspamd_lang_detector_res **) b;
-
- if (canda->prob > candb->prob) {
- return -1;
- }
- else if (candb->prob > canda->prob) {
- return 1;
- }
-
- return 0;
- }
-
/* Outcome of a detection pass, by the number of candidates that survived */
enum rspamd_language_detected_type {
	rs_detect_none = 0,     /* no candidate left */
	rs_detect_single = 1,   /* exactly one candidate left */
	rs_detect_multiple = 2, /* more than one candidate left */
};
-
- static enum rspamd_language_detected_type
- rspamd_language_detector_try_ngramm(struct rspamd_task *task,
- unsigned int nwords,
- struct rspamd_lang_detector *d,
- GArray *ucs_tokens,
- enum rspamd_language_category cat,
- khash_t(rspamd_candidates_hash) * candidates,
- struct rspamd_mime_text_part *part)
- {
- unsigned int cand_len = 0;
- struct rspamd_lang_detector_res *cand;
-
- rspamd_language_detector_detect_type(task,
- nwords,
- d,
- ucs_tokens,
- cat,
- candidates,
- part);
-
- kh_foreach_value(candidates, cand, {
- if (!isnan(cand->prob)) {
- cand_len++;
- }
- });
-
- if (cand_len == 0) {
- return rs_detect_none;
- }
- else if (cand_len == 1) {
- return rs_detect_single;
- }
-
- return rs_detect_multiple;
- }
-
/* Bit flags that tune the frequency-based candidate comparator */
enum rspamd_language_sort_flags {
	RSPAMD_LANG_FLAG_DEFAULT = 0,      /* no special handling */
	RSPAMD_LANG_FLAG_SHORT = (1 << 0), /* text is short: tier adjustments are amplified */
};
-
- struct rspamd_frequency_sort_cbdata {
- struct rspamd_lang_detector *d;
- enum rspamd_language_sort_flags flags;
- double std;
- double mean;
- };
-
- static const double tier0_adjustment = 1.2;
- static const double tier1_adjustment = 0.8;
- static const double frequency_adjustment = 0.8;
-
- static int
- rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b,
- gpointer ud)
- {
- struct rspamd_frequency_sort_cbdata *cbd = ud;
- struct rspamd_lang_detector_res
- *canda = *(struct rspamd_lang_detector_res **) a,
- *candb = *(struct rspamd_lang_detector_res **) b;
- double adj;
- double proba_adjusted, probb_adjusted, freqa, freqb;
-
- if (cbd->d->total_occurrences == 0) {
- /* Not enough data, compare directly */
- return rspamd_language_detector_cmp(a, b);
- }
-
- freqa = ((double) canda->elt->occurrences) /
- (double) cbd->d->total_occurrences;
- freqb = ((double) candb->elt->occurrences) /
- (double) cbd->d->total_occurrences;
-
- proba_adjusted = canda->prob;
- probb_adjusted = candb->prob;
-
- if (isnormal(freqa) && isnormal(freqb)) {
- proba_adjusted += cbd->std * (frequency_adjustment * freqa);
- probb_adjusted += cbd->std * (frequency_adjustment * freqb);
- }
-
- if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
- adj = tier1_adjustment * 2.0;
- }
- else {
- adj = tier1_adjustment;
- }
- if (canda->elt->flags & RS_LANGUAGE_TIER1) {
- proba_adjusted += cbd->std * adj;
- }
-
- if (candb->elt->flags & RS_LANGUAGE_TIER1) {
- probb_adjusted += cbd->std * adj;
- }
-
- if (cbd->flags & RSPAMD_LANG_FLAG_SHORT) {
- adj = tier0_adjustment * 16.0;
- }
- else {
- adj = tier0_adjustment;
- }
-
- if (canda->elt->flags & RS_LANGUAGE_TIER0) {
- proba_adjusted += cbd->std * adj;
- }
-
- if (candb->elt->flags & RS_LANGUAGE_TIER0) {
- probb_adjusted += cbd->std * adj;
- }
-
- /* Hack: adjust probability directly */
- canda->prob = proba_adjusted;
- candb->prob = probb_adjusted;
-
- if (proba_adjusted > probb_adjusted) {
- return -1;
- }
- else if (probb_adjusted > proba_adjusted) {
- return 1;
- }
-
- return 0;
- }
-
- static void
- rspamd_language_detector_unicode_scripts(struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- unsigned int *pchinese,
- unsigned int *pspecial)
- {
- const char *p = part->utf_stripped_content->data, *end;
- unsigned int i = 0, cnt = 0;
- end = p + part->utf_stripped_content->len;
- int32_t uc, sc;
- unsigned int nlatin = 0, nchinese = 0, nspecial = 0;
- const unsigned int cutoff_limit = 32;
-
- while (p + i < end) {
- U8_NEXT(p, i, part->utf_stripped_content->len, uc);
-
- if (((int32_t) uc) < 0) {
- break;
- }
-
- if (u_isalpha(uc)) {
- sc = ublock_getCode(uc);
- cnt++;
-
- switch (sc) {
- case UBLOCK_BASIC_LATIN:
- case UBLOCK_LATIN_1_SUPPLEMENT:
- part->unicode_scripts |= RSPAMD_UNICODE_LATIN;
- nlatin++;
- break;
- case UBLOCK_HEBREW:
- part->unicode_scripts |= RSPAMD_UNICODE_HEBREW;
- nspecial++;
- break;
- case UBLOCK_GREEK:
- part->unicode_scripts |= RSPAMD_UNICODE_GREEK;
- nspecial++;
- break;
- case UBLOCK_CYRILLIC:
- part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC;
- nspecial++;
- break;
- case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
- case UBLOCK_CJK_COMPATIBILITY:
- case UBLOCK_CJK_RADICALS_SUPPLEMENT:
- case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A:
- case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B:
- part->unicode_scripts |= RSPAMD_UNICODE_CJK;
- nchinese++;
- break;
- case UBLOCK_HIRAGANA:
- case UBLOCK_KATAKANA:
- part->unicode_scripts |= RSPAMD_UNICODE_JP;
- nspecial++;
- break;
- case UBLOCK_HANGUL_JAMO:
- case UBLOCK_HANGUL_COMPATIBILITY_JAMO:
- part->unicode_scripts |= RSPAMD_UNICODE_HANGUL;
- nspecial++;
- break;
- case UBLOCK_ARABIC:
- part->unicode_scripts |= RSPAMD_UNICODE_ARABIC;
- nspecial++;
- break;
- case UBLOCK_DEVANAGARI:
- part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI;
- nspecial++;
- break;
- case UBLOCK_ARMENIAN:
- part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN;
- nspecial++;
- break;
- case UBLOCK_GEORGIAN:
- part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN;
- nspecial++;
- break;
- case UBLOCK_GUJARATI:
- part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI;
- nspecial++;
- break;
- case UBLOCK_TELUGU:
- part->unicode_scripts |= RSPAMD_UNICODE_TELUGU;
- nspecial++;
- break;
- case UBLOCK_TAMIL:
- part->unicode_scripts |= RSPAMD_UNICODE_TAMIL;
- nspecial++;
- break;
- case UBLOCK_THAI:
- part->unicode_scripts |= RSPAMD_UNICODE_THAI;
- nspecial++;
- break;
- case RSPAMD_UNICODE_MALAYALAM:
- part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM;
- nspecial++;
- break;
- case RSPAMD_UNICODE_SINHALA:
- part->unicode_scripts |= RSPAMD_UNICODE_SINHALA;
- nspecial++;
- break;
- }
- }
-
- if (nspecial > cutoff_limit && nspecial > nlatin) {
- break;
- }
- else if (nchinese > cutoff_limit && nchinese > nlatin) {
- if (nspecial > 0) {
- /* Likely japanese */
- break;
- }
- }
- }
-
- msg_debug_lang_det("stop after checking %d characters, "
- "%d latin, %d special, %d chinese",
- cnt, nlatin, nspecial, nchinese);
-
- *pchinese = nchinese;
- *pspecial = nspecial;
- }
-
- static inline void
- rspamd_language_detector_set_language(struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- const char *code,
- struct rspamd_language_elt *elt)
- {
- struct rspamd_lang_detector_res *r;
-
- r = rspamd_mempool_alloc0(task->task_pool, sizeof(*r));
- r->prob = 1.0;
- r->lang = code;
- r->elt = elt;
-
- if (part->languages == NULL) {
- part->languages = g_ptr_array_sized_new(1);
- }
-
- g_ptr_array_add(part->languages, r);
- part->language = code;
- }
-
- static gboolean
- rspamd_language_detector_try_uniscript(struct rspamd_task *task,
- struct rspamd_mime_text_part *part,
- unsigned int nchinese,
- unsigned int nspecial)
- {
- unsigned int i;
-
- for (i = 0; i < G_N_ELEMENTS(unicode_langs); i++) {
- if (unicode_langs[i].unicode_code & part->unicode_scripts) {
-
- if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) {
- msg_debug_lang_det("set language based on unicode script %s",
- unicode_langs[i].lang);
- rspamd_language_detector_set_language(task, part,
- unicode_langs[i].lang, NULL);
-
- return TRUE;
- }
- else {
- /* Japanese <-> Chinese guess */
-
- /*
- * Typically there might be around 0-70% of kanji glyphs
- * and the rest are Haragana/Katakana
- *
- * If we discover that Kanji is more than 80% then we consider
- * it Chinese
- */
- if (nchinese <= 5 || nchinese < nspecial * 5) {
- msg_debug_lang_det("set language based on unicode script %s",
- unicode_langs[i].lang);
- rspamd_language_detector_set_language(task, part,
- unicode_langs[i].lang, NULL);
-
- return TRUE;
- }
- }
- }
- }
-
- if (part->unicode_scripts & RSPAMD_UNICODE_CJK) {
- msg_debug_lang_det("guess chinese based on CJK characters: %d chinese, %d special",
- nchinese, nspecial);
- rspamd_language_detector_set_language(task, part,
- "zh-CN", NULL);
-
- return TRUE;
- }
-
- return FALSE;
- }
-
- static unsigned int
- rspamd_langelt_hash_func(gconstpointer key)
- {
- const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *) key;
- return rspamd_cryptobox_fast_hash(elt->name, strlen(elt->name),
- rspamd_hash_seed());
- }
-
- static gboolean
- rspamd_langelt_equal_func(gconstpointer v, gconstpointer v2)
- {
- const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *) v,
- *elt2 = (const struct rspamd_language_elt *) v2;
- return strcmp(elt1->name, elt2->name) == 0;
- }
-
- /* This hash set stores a word index in the language to avoid duplicate stop words */
- KHASH_INIT(rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal);
-
- KHASH_INIT(rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1,
- rspamd_langelt_hash_func, rspamd_langelt_equal_func);
-
- struct rspamd_sw_cbdata {
- struct rspamd_task *task;
- khash_t(rspamd_sw_hash) * res;
- GArray *ranges;
- };
-
- static int
- rspamd_ranges_cmp(const void *k, const void *memb)
- {
- int pos = GPOINTER_TO_INT(k);
- const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *) memb;
-
- if (pos >= r->start && pos < r->stop) {
- return 0;
- }
- else if (pos < r->start) {
- return -1;
- }
-
- return 1;
- }
-
- static int
- rspamd_language_detector_sw_cb(struct rspamd_multipattern *mp,
- unsigned int strnum,
- int match_start,
- int match_pos,
- const char *text,
- gsize len,
- void *context)
- {
- /* Check if boundary */
- const char *prev = text, *next = text + len;
- struct rspamd_stop_word_range *r;
- struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *) context;
- khiter_t k;
- static const gsize max_stop_words = 80;
- struct rspamd_task *task;
-
- if (match_start > 0) {
- prev = text + match_start - 1;
-
- if (!(g_ascii_isspace(*prev) || g_ascii_ispunct(*prev))) {
- return 0;
- }
- }
-
- if (match_pos < len) {
- next = text + match_pos;
-
- if (!(g_ascii_isspace(*next) || g_ascii_ispunct(*next))) {
- return 0;
- }
- }
-
- /* We have a word on the boundary, check range */
- task = cbdata->task;
- r = bsearch(GINT_TO_POINTER(strnum), cbdata->ranges->data,
- cbdata->ranges->len, sizeof(*r), rspamd_ranges_cmp);
-
- g_assert(r != NULL);
-
- k = kh_get(rspamd_sw_hash, cbdata->res, r->elt);
- int nwords = 1;
-
- if (k != kh_end(cbdata->res)) {
- khiter_t set_k;
- int tt;
-
- set_k = kh_get(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum);
- nwords = kh_size(kh_value(cbdata->res, k));
-
- if (set_k == kh_end(kh_value(cbdata->res, k))) {
- /* New word */
- set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
- msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)",
- (int) (next - prev - 1), prev + 1, r->elt->name, nwords);
- }
-
- if (nwords > max_stop_words) {
- return 1;
- }
- }
- else {
- int tt;
-
- k = kh_put(rspamd_sw_hash, cbdata->res, r->elt, &tt);
- kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set);
- kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
-
- msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)",
- (int) (next - prev - 1), prev + 1, r->elt->name, nwords);
- }
-
- return 0;
- }
-
- static gboolean
- rspamd_language_detector_try_stop_words(struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- struct rspamd_mime_text_part *part,
- enum rspamd_language_category cat)
- {
- struct rspamd_stop_word_elt *elt;
- struct rspamd_sw_cbdata cbdata;
- gboolean ret = FALSE;
- static const int stop_words_threshold = 4, /* minimum stop words count */
- strong_confidence_threshold = 10 /* we are sure that this is enough */;
-
- elt = &d->stop_words[cat];
- cbdata.res = kh_init(rspamd_sw_hash);
- cbdata.ranges = elt->ranges;
- cbdata.task = task;
-
- rspamd_multipattern_lookup(elt->mp, part->utf_stripped_content->data,
- part->utf_stripped_content->len, rspamd_language_detector_sw_cb,
- &cbdata, NULL);
-
- if (kh_size(cbdata.res) > 0) {
- khash_t(rspamd_sw_res_set) * cur_res;
- double max_rate = G_MINDOUBLE;
- struct rspamd_language_elt *cur_lang, *sel = NULL;
- gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
-
- again:
- kh_foreach(cbdata.res, cur_lang, cur_res, {
- int cur_matches = kh_size(cur_res);
-
- if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
- /* Restart matches */
- ignore_ascii = TRUE;
- sel = NULL;
- max_rate = G_MINDOUBLE;
- msg_debug_lang_det("ignore ascii after finding %d stop words from %s",
- cur_matches, cur_lang->name);
- goto again;
- }
-
- if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) {
- /* Restart matches */
- ignore_latin = TRUE;
- sel = NULL;
- max_rate = G_MINDOUBLE;
- msg_debug_lang_det("ignore latin after finding stop %d words from %s",
- cur_matches, cur_lang->name);
- goto again;
- }
-
- if (cur_matches < stop_words_threshold) {
- continue;
- }
-
- if (cur_matches < strong_confidence_threshold) {
- /* Ignore mixed languages when not enough confidence */
- if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) {
- continue;
- }
-
- if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) {
- continue;
- }
- }
-
- double rate = (double) cur_matches / (double) cur_lang->stop_words;
-
- if (rate > max_rate) {
- max_rate = rate;
- sel = cur_lang;
- }
-
- msg_debug_lang_det("found %d stop words from %s: %3f rate",
- cur_matches, cur_lang->name, rate);
- });
-
- /* Cleanup */
- kh_foreach(cbdata.res, cur_lang, cur_res, {
- kh_destroy(rspamd_sw_res_set, cur_res);
- });
-
- if (max_rate > 0 && sel) {
- msg_debug_lang_det("set language based on stop words script %s, %.3f found",
- sel->name, max_rate);
- rspamd_language_detector_set_language(task, part,
- sel->name, sel);
-
- ret = TRUE;
- }
- }
- else {
- msg_debug_lang_det("found no stop words in a text");
- }
-
- kh_destroy(rspamd_sw_hash, cbdata.res);
-
- return ret;
- }
-
- gboolean
- rspamd_language_detector_detect(struct rspamd_task *task,
- struct rspamd_lang_detector *d,
- struct rspamd_mime_text_part *part)
- {
- khash_t(rspamd_candidates_hash) * candidates;
- GPtrArray *result;
- double mean, std, start_ticks, end_ticks;
- unsigned int cand_len;
- enum rspamd_language_category cat;
- struct rspamd_lang_detector_res *cand;
- enum rspamd_language_detected_type r;
- struct rspamd_frequency_sort_cbdata cbd;
- /* Check if we have sorted candidates based on frequency */
- gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
-
- if (!part->utf_stripped_content) {
- return FALSE;
- }
-
- start_ticks = rspamd_get_ticks(TRUE);
-
- unsigned int nchinese = 0, nspecial = 0;
- rspamd_language_detector_unicode_scripts(task, part, &nchinese, &nspecial);
-
- /* Disable internal language detection heuristics if we have fasttext */
- if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) {
- /* Apply unicode scripts heuristic */
- if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
- ret = TRUE;
- }
-
- cat = rspamd_language_detector_get_category(part->unicode_scripts);
-
- if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
- ret = TRUE;
- }
- }
-
- if (!ret) {
- unsigned ndetected = 0;
- if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
- rspamd_fasttext_predict_result_t fasttext_predict_result =
- rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task,
- part->utf_words, 4);
-
- ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
-
- if (ndetected > 0) {
- candidates = kh_init(rspamd_candidates_hash);
- kh_resize(rspamd_candidates_hash, candidates, ndetected);
-
- /* Now fill all results where probability is above threshold */
- float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0);
-
- for (unsigned int i = 0; i < ndetected; i++) {
- float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
- if (prob > max_prob * 0.75) {
- char *lang = rspamd_mempool_strdup(task->task_pool,
- rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i));
- int tmp;
- khiter_t k = kh_put(rspamd_candidates_hash, candidates, lang, &tmp);
-
- kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand));
- cand = kh_value(candidates, k);
- cand->lang = lang;
- cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i);
-
- /* Find the corresponding language elt */
- k = kh_get(rspamd_languages_hash, d->languages, lang);
- if (k != kh_end(d->languages)) {
- cand->elt = kh_value(d->languages, k);
- }
- }
- }
-
- if (kh_size(candidates) == 1) {
- r = rs_detect_single;
- }
- else if (kh_size(candidates) > 1) {
- r = rs_detect_multiple;
- }
- else {
- r = rs_detect_none;
- }
- }
-
- rspamd_fasttext_predict_result_destroy(fasttext_predict_result);
- }
- if (ndetected == 0) {
- if (part->utf_words->len < default_short_text_limit) {
- r = rs_detect_none;
- msg_debug_lang_det("text is too short for trigrams detection: "
- "%d words; at least %d words required",
- (int) part->utf_words->len,
- (int) default_short_text_limit);
- switch (cat) {
- case RSPAMD_LANGUAGE_CYRILLIC:
- rspamd_language_detector_set_language(task, part, "ru", NULL);
- break;
- case RSPAMD_LANGUAGE_DEVANAGARI:
- rspamd_language_detector_set_language(task, part, "hi", NULL);
- break;
- case RSPAMD_LANGUAGE_ARAB:
- rspamd_language_detector_set_language(task, part, "ar", NULL);
- break;
- default:
- case RSPAMD_LANGUAGE_LATIN:
- rspamd_language_detector_set_language(task, part, "en", NULL);
- break;
- }
- msg_debug_lang_det("set %s language based on symbols category",
- part->language);
-
- candidates = kh_init(rspamd_candidates_hash);
- }
- else {
- candidates = kh_init(rspamd_candidates_hash);
- kh_resize(rspamd_candidates_hash, candidates, 32);
-
- r = rspamd_language_detector_try_ngramm(task,
- default_words,
- d,
- part->utf_words,
- cat,
- candidates,
- part);
-
- if (r == rs_detect_none) {
- msg_debug_lang_det("no trigrams found, fallback to english");
- rspamd_language_detector_set_language(task, part, "en", NULL);
- }
- else if (r == rs_detect_multiple) {
- /* Check our guess */
-
- mean = 0.0;
- std = 0.0;
- cand_len = 0;
-
- /* Check distribution */
- kh_foreach_value(candidates, cand, {
- if (!isnan(cand->prob)) {
- mean += cand->prob;
- cand_len++;
- }
- });
-
- if (cand_len > 0) {
- mean /= cand_len;
-
- kh_foreach_value(candidates, cand, {
- double err;
- if (!isnan(cand->prob)) {
- err = cand->prob - mean;
- std += fabs(err);
- }
- });
-
- std /= cand_len;
- }
-
- msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
- cand_len, mean, std);
-
- if (cand_len > 0 && std / fabs(mean) < 0.25) {
- msg_debug_lang_det("apply frequency heuristic sorting");
- frequency_heuristic_applied = TRUE;
- cbd.d = d;
- cbd.mean = mean;
- cbd.std = std;
- cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
-
- if (part->nwords < default_words / 2) {
- cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
- }
- }
- }
- }
- }
-
- /* Now, convert hash to array and sort it */
- if (r != rs_detect_none && kh_size(candidates) > 0) {
- result = g_ptr_array_sized_new(kh_size(candidates));
-
- kh_foreach_value(candidates, cand, {
- if (!isnan(cand->prob)) {
- msg_debug_lang_det("pre-sorting probability %s -> %.2f", cand->lang,
- cand->prob);
- g_ptr_array_add(result, cand);
- }
- });
-
- if (frequency_heuristic_applied) {
- g_ptr_array_sort_with_data(result,
- rspamd_language_detector_cmp_heuristic,
- (gpointer) &cbd);
- }
- else {
- g_ptr_array_sort(result, rspamd_language_detector_cmp);
- }
-
- int i;
- PTR_ARRAY_FOREACH(result, i, cand)
- {
- msg_debug_lang_det("final probability %s -> %.2f", cand->lang,
- cand->prob);
- }
-
- if (part->languages != NULL) {
- g_ptr_array_unref(part->languages);
- }
-
- part->languages = result;
- part->language = ((struct rspamd_lang_detector_res *) g_ptr_array_index(result, 0))->lang;
- ret = TRUE;
- }
- else if (part->languages == NULL) {
- rspamd_language_detector_set_language(task, part, "en", NULL);
- }
-
- kh_destroy(rspamd_candidates_hash, candidates);
- }
-
- /* Update internal stat */
- if (part->languages != NULL && part->languages->len > 0 && !frequency_heuristic_applied) {
- cand = g_ptr_array_index(part->languages, 0);
- if (cand->elt) {
- cand->elt->occurrences++;
- d->total_occurrences++;
-
- msg_debug_lang_det("updated stat for %s: %d occurrences, %z total detected",
- cand->elt->name, cand->elt->occurrences,
- d->total_occurrences);
- }
- }
-
- end_ticks = rspamd_get_ticks(TRUE);
- msg_debug_lang_det("detected languages in %.0f ticks",
- (end_ticks - start_ticks));
-
- return ret;
- }
-
-
/* Takes one more reference on the detector and returns the same pointer */
struct rspamd_lang_detector *
rspamd_language_detector_ref(struct rspamd_lang_detector *d)
{
	REF_RETAIN(d);
	return d;
}
-
/* Drops one reference previously taken on the detector */
void
rspamd_language_detector_unref(struct rspamd_lang_detector *d)
{
	REF_RELEASE(d);
}
-
- gboolean
- rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d,
- const char *word, gsize wlen)
- {
- khiter_t k;
- rspamd_ftok_t search;
-
- search.begin = word;
- search.len = wlen;
-
- k = kh_get(rspamd_stopwords_hash, d->stop_words_norm, &search);
-
- if (k != kh_end(d->stop_words_norm)) {
- return TRUE;
- }
-
- return FALSE;
- }
-
- int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt)
- {
- if (elt) {
- return elt->flags;
- }
-
- return 0;
- }
|