diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-07-26 10:49:23 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-07-26 10:49:23 +0100 |
commit | 537a7180a0d5132c11636c4fd8b1450cd99d352c (patch) | |
tree | fb9f8c84955a411bdffbd6371ea32f2716fb3687 /src/libmime/lang_detection.c | |
parent | 5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365 (diff) | |
download | rspamd-537a7180a0d5132c11636c4fd8b1450cd99d352c.tar.gz rspamd-537a7180a0d5132c11636c4fd8b1450cd99d352c.zip |
[Rework] Use clang-format to unify formatting in all sources
No meaningful changes.
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 1218 |
1 files changed, 612 insertions, 606 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 4d9e1ae68..52221cd32 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -47,29 +47,28 @@ struct rspamd_language_unicode_match { * List of languages detected by unicode scripts */ static const struct rspamd_language_unicode_match unicode_langs[] = { - {"el", RSPAMD_UNICODE_GREEK}, - {"ml", RSPAMD_UNICODE_MALAYALAM}, - {"te", RSPAMD_UNICODE_TELUGU}, - {"ta", RSPAMD_UNICODE_TAMIL}, - {"gu", RSPAMD_UNICODE_GUJARATI}, - {"th", RSPAMD_UNICODE_THAI}, - {"ka", RSPAMD_UNICODE_GEORGIAN}, - {"si", RSPAMD_UNICODE_SINHALA}, - {"hy", RSPAMD_UNICODE_ARMENIAN}, - {"ja", RSPAMD_UNICODE_JP}, - {"ko", RSPAMD_UNICODE_HANGUL}, + {"el", RSPAMD_UNICODE_GREEK}, + {"ml", RSPAMD_UNICODE_MALAYALAM}, + {"te", RSPAMD_UNICODE_TELUGU}, + {"ta", RSPAMD_UNICODE_TAMIL}, + {"gu", RSPAMD_UNICODE_GUJARATI}, + {"th", RSPAMD_UNICODE_THAI}, + {"ka", RSPAMD_UNICODE_GEORGIAN}, + {"si", RSPAMD_UNICODE_SINHALA}, + {"hy", RSPAMD_UNICODE_ARMENIAN}, + {"ja", RSPAMD_UNICODE_JP}, + {"ko", RSPAMD_UNICODE_HANGUL}, }; /* * Top languages */ static const gchar *tier0_langs[] = { - "en", + "en", }; static const gchar *tier1_langs[] = { - "fr", "it", "de", "es", "nl", - "pt", "ru", "pl", "tk", "th", "ar" -}; + "fr", "it", "de", "es", "nl", + "pt", "ru", "pl", "tk", "th", "ar"}; enum rspamd_language_category { RSPAMD_LANGUAGE_LATIN = 0, @@ -81,7 +80,7 @@ enum rspamd_language_category { struct rspamd_language_elt { const gchar *name; /* e.g. "en" or "ru" */ - gint flags; /* enum rspamd_language_elt_flags */ + gint flags; /* enum rspamd_language_elt_flags */ enum rspamd_language_category category; guint trigrams_words; guint stop_words; @@ -113,25 +112,25 @@ struct rspamd_stop_word_elt { GArray *ranges; /* of rspamd_stop_word_range */ }; -#define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \ - rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \ - G_STRFUNC, \ - __VA_ARGS__) -#define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast (NULL, NULL, \ - rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \ - G_STRFUNC, \ - __VA_ARGS__) +#define msg_debug_lang_det(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast(NULL, NULL, \ + rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) INIT_LOG_MODULE_PUBLIC(langdet) static const struct rspamd_language_unicode_match * -rspamd_language_search_unicode_match (const gchar *key, - const struct rspamd_language_unicode_match *elts, size_t nelts) +rspamd_language_search_unicode_match(const gchar *key, + const struct rspamd_language_unicode_match *elts, size_t nelts) { size_t i; for (i = 0; i < nelts; i++) { - if (strcmp (elts[i].lang, key) == 0) { + if (strcmp(elts[i].lang, key) == 0) { return &elts[i]; } } @@ -140,12 +139,12 @@ rspamd_language_search_unicode_match (const gchar *key, } static gboolean -rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts) +rspamd_language_search_str(const gchar *key, const gchar *elts[], size_t nelts) { size_t i; for (i = 0; i < nelts; i++) { - if (strcmp (elts[i], key) == 0) { + if (strcmp(elts[i], key) == 0) { return TRUE; } } @@ -153,34 +152,34 @@ rspamd_language_search_str (const gchar *key, const gchar *elts[], size_t nelts) } static guint -rspamd_trigram_hash_func (gconstpointer key) +rspamd_trigram_hash_func(gconstpointer key) { - return rspamd_cryptobox_fast_hash (key, 3 * sizeof (UChar32), - rspamd_hash_seed ()); + return rspamd_cryptobox_fast_hash(key, 3 * sizeof(UChar32), + rspamd_hash_seed()); } static gboolean -rspamd_trigram_equal_func (gconstpointer v, gconstpointer v2) +rspamd_trigram_equal_func(gconstpointer v, gconstpointer v2) { - return memcmp (v, v2, 3 * sizeof (UChar32)) == 0; + return memcmp(v, v2, 3 * sizeof(UChar32)) == 0; } -KHASH_INIT (rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true, - rspamd_trigram_hash_func, rspamd_trigram_equal_func); -KHASH_INIT (rspamd_candidates_hash, const gchar *, - struct rspamd_lang_detector_res *, true, - rspamd_str_hash, rspamd_str_equal); -KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *, - char, false, - rspamd_ftok_hash, rspamd_ftok_equal); - -KHASH_INIT (rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true, - rspamd_str_hash, rspamd_str_equal); +KHASH_INIT(rspamd_trigram_hash, const UChar32 *, struct rspamd_ngramm_chain, true, + rspamd_trigram_hash_func, rspamd_trigram_equal_func); +KHASH_INIT(rspamd_candidates_hash, const gchar *, + struct rspamd_lang_detector_res *, true, + rspamd_str_hash, rspamd_str_equal); +KHASH_INIT(rspamd_stopwords_hash, rspamd_ftok_t *, + char, false, + rspamd_ftok_hash, rspamd_ftok_equal); + +KHASH_INIT(rspamd_languages_hash, const gchar *, struct rspamd_language_elt *, true, + rspamd_str_hash, rspamd_str_equal); struct rspamd_lang_detector { - khash_t(rspamd_languages_hash) *languages; - khash_t(rspamd_trigram_hash) *trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */ + khash_t(rspamd_languages_hash) * languages; + khash_t(rspamd_trigram_hash) * trigrams[RSPAMD_LANGUAGE_MAX]; /* trigrams frequencies */ struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; - khash_t(rspamd_stopwords_hash) *stop_words_norm; + khash_t(rspamd_stopwords_hash) * stop_words_norm; UConverter *uchar_converter; gsize short_text_limit; bool prefer_fasttext; @@ -190,23 +189,23 @@ struct rspamd_lang_detector { }; static void -rspamd_language_detector_ucs_lowercase (UChar32 *s, gsize len) +rspamd_language_detector_ucs_lowercase(UChar32 *s, gsize len) { gsize i; - for (i = 0; i < len; i ++) { - s[i] = u_tolower (s[i]); + for (i = 0; i < len; i++) { + s[i] = u_tolower(s[i]); } } static gboolean -rspamd_language_detector_ucs_is_latin (const UChar32 *s, gsize len) +rspamd_language_detector_ucs_is_latin(const UChar32 *s, gsize len) { gsize i; gboolean ret = TRUE; - for (i = 0; i < len; i ++) { - if (s[i] >= 128 || !(g_ascii_isalnum (s[i]) || s[i] == ' ')) { + for (i = 0; i < len; i++) { + if (s[i] >= 128 || !(g_ascii_isalnum(s[i]) || s[i] == ' ')) { ret = FALSE; break; } @@ -222,14 +221,14 @@ struct rspamd_language_ucs_elt { }; static void -rspamd_language_detector_init_ngramm (struct rspamd_config *cfg, - struct rspamd_lang_detector *d, - struct rspamd_language_elt *lelt, - struct rspamd_language_ucs_elt *ucs, - guint len, - guint freq, - guint total, - khash_t (rspamd_trigram_hash) *htb) +rspamd_language_detector_init_ngramm(struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + struct rspamd_language_elt *lelt, + struct rspamd_language_ucs_elt *ucs, + guint len, + guint freq, + guint total, + khash_t(rspamd_trigram_hash) * htb) { struct rspamd_ngramm_chain *chain = NULL, st_chain; struct rspamd_ngramm_elt *elt; @@ -240,58 +239,59 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg, switch (len) { case 1: case 2: - g_assert_not_reached (); + g_assert_not_reached(); break; case 3: - k = kh_get (rspamd_trigram_hash, htb, ucs->s); - if (k != kh_end (htb)) { - chain = &kh_value (htb, k); + k = kh_get(rspamd_trigram_hash, htb, ucs->s); + if (k != kh_end(htb)) { + chain = &kh_value(htb, k); } break; default: - g_assert_not_reached (); + g_assert_not_reached(); break; } if (chain == NULL) { /* New element */ chain = &st_chain; - memset (chain, 0, sizeof (st_chain)); - chain->languages = g_ptr_array_sized_new (32); - rspamd_mempool_add_destructor (cfg->cfg_pool, rspamd_ptr_array_free_hard, - chain->languages); - chain->utf = rspamd_mempool_strdup (cfg->cfg_pool, ucs->utf); - elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt)); + memset(chain, 0, sizeof(st_chain)); + chain->languages = g_ptr_array_sized_new(32); + rspamd_mempool_add_destructor(cfg->cfg_pool, rspamd_ptr_array_free_hard, + chain->languages); + chain->utf = rspamd_mempool_strdup(cfg->cfg_pool, ucs->utf); + elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt)); elt->elt = lelt; - elt->prob = ((gdouble)freq) / ((gdouble)total); - g_ptr_array_add (chain->languages, elt); + elt->prob = ((gdouble) freq) / ((gdouble) total); + g_ptr_array_add(chain->languages, elt); - k = kh_put (rspamd_trigram_hash, htb, ucs->s, &i); - kh_value (htb, k) = *chain; + k = kh_put(rspamd_trigram_hash, htb, ucs->s, &i); + kh_value(htb, k) = *chain; } else { /* Check sanity */ found = FALSE; - PTR_ARRAY_FOREACH (chain->languages, i, elt) { - if (strcmp (elt->elt->name, lelt->name) == 0) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { + if (strcmp(elt->elt->name, lelt->name) == 0) { found = TRUE; - elt->prob += ((gdouble)freq) / ((gdouble)total); + elt->prob += ((gdouble) freq) / ((gdouble) total); break; } } if (!found) { - elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt)); + elt = rspamd_mempool_alloc(cfg->cfg_pool, sizeof(*elt)); elt->elt = lelt; - elt->prob = ((gdouble)freq) / ((gdouble)total); - g_ptr_array_add (chain->languages, elt); + elt->prob = ((gdouble) freq) / ((gdouble) total); + g_ptr_array_add(chain->languages, elt); } } } static inline enum rspamd_language_category -rspamd_language_detector_get_category (guint uflags) +rspamd_language_detector_get_category(guint uflags) { enum rspamd_language_category cat = RSPAMD_LANGUAGE_LATIN; @@ -309,19 +309,19 @@ rspamd_language_detector_get_category (guint uflags) } static const gchar * -rspamd_language_detector_print_flags (struct rspamd_language_elt *elt) +rspamd_language_detector_print_flags(struct rspamd_language_elt *elt) { static gchar flags_buf[256]; goffset r = 0; if (elt->flags & RS_LANGUAGE_TIER1) { - r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier1,"); + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier1,"); } if (elt->flags & RS_LANGUAGE_TIER0) { - r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "tier0,"); + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "tier0,"); } if (elt->flags & RS_LANGUAGE_LATIN) { - r += rspamd_snprintf (flags_buf + r, sizeof (flags_buf) - r, "latin,"); + r += rspamd_snprintf(flags_buf + r, sizeof(flags_buf) - r, "latin,"); } if (r > 0) { @@ -335,19 +335,19 @@ rspamd_language_detector_print_flags (struct rspamd_language_elt *elt) } static gint -rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b) +rspamd_language_detector_cmp_ngramm(gconstpointer a, gconstpointer b) { - struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **)a; - struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **)b; + struct rspamd_language_ucs_elt *e1 = *(struct rspamd_language_ucs_elt **) a; + struct rspamd_language_ucs_elt *e2 = *(struct rspamd_language_ucs_elt **) b; - return (gint)e2->freq - (gint)e1->freq; + return (gint) e2->freq - (gint) e1->freq; } static void -rspamd_language_detector_read_file (struct rspamd_config *cfg, - struct rspamd_lang_detector *d, - const gchar *path, - const ucl_object_t *stop_words) +rspamd_language_detector_read_file(struct rspamd_config *cfg, + struct rspamd_lang_detector *d, + const gchar *path, + const ucl_object_t *stop_words) { struct ucl_parser *parser; ucl_object_t *top; @@ -356,110 +356,110 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, UErrorCode uc_err = U_ZERO_ERROR; struct rspamd_language_elt *nelt; struct rspamd_language_ucs_elt *ucs_elt; - khash_t (rspamd_trigram_hash) *htb = NULL; + khash_t(rspamd_trigram_hash) *htb = NULL; gchar *pos; guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, - loaded, nstop = 0; + loaded, nstop = 0; gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0; enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX; - parser = ucl_parser_new (UCL_PARSER_NO_FILEVARS); - if (!ucl_parser_add_file (parser, path)) { - msg_warn_config ("cannot parse file %s: %s", path, - ucl_parser_get_error (parser)); - ucl_parser_free (parser); + parser = ucl_parser_new(UCL_PARSER_NO_FILEVARS); + if (!ucl_parser_add_file(parser, path)) { + msg_warn_config("cannot parse file %s: %s", path, + ucl_parser_get_error(parser)); + ucl_parser_free(parser); return; } - top = ucl_parser_get_object (parser); - ucl_parser_free (parser); + top = ucl_parser_get_object(parser); + ucl_parser_free(parser); - freqs = ucl_object_lookup (top, "freq"); + freqs = ucl_object_lookup(top, "freq"); if (freqs == NULL) { - msg_warn_config ("file %s has no 'freq' key", path); - ucl_object_unref (top); + msg_warn_config("file %s has no 'freq' key", path); + ucl_object_unref(top); return; } - pos = strrchr (path, '/'); - g_assert (pos != NULL); - nelt = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*nelt)); - nelt->name = rspamd_mempool_strdup (cfg->cfg_pool, pos + 1); + pos = strrchr(path, '/'); + g_assert(pos != NULL); + nelt = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*nelt)); + nelt->name = rspamd_mempool_strdup(cfg->cfg_pool, pos + 1); /* Remove extension */ - pos = strchr (nelt->name, '.'); - g_assert (pos != NULL); + pos = strchr(nelt->name, '.'); + g_assert(pos != NULL); *pos = '\0'; - n_words = ucl_object_lookup (top, "n_words"); + n_words = ucl_object_lookup(top, "n_words"); - if (n_words == NULL || ucl_object_type (n_words) != UCL_ARRAY || - n_words->len != 3) { - msg_warn_config ("cannot find n_words in language %s", nelt->name); - ucl_object_unref (top); + if (n_words == NULL || ucl_object_type(n_words) != UCL_ARRAY || + n_words->len != 3) { + msg_warn_config("cannot find n_words in language %s", nelt->name); + ucl_object_unref(top); return; } else { - nelt->trigrams_words = ucl_object_toint (ucl_array_find_index (n_words, - 2)); + nelt->trigrams_words = ucl_object_toint(ucl_array_find_index(n_words, + 2)); } - type = ucl_object_lookup (top, "type"); + type = ucl_object_lookup(top, "type"); - if (type == NULL || ucl_object_type (type) != UCL_STRING) { - msg_debug_config ("cannot find type in language %s", nelt->name); - ucl_object_unref (top); + if (type == NULL || ucl_object_type(type) != UCL_STRING) { + msg_debug_config("cannot find type in language %s", nelt->name); + ucl_object_unref(top); return; } else { - const gchar *stype = ucl_object_tostring (type); + const gchar *stype = ucl_object_tostring(type); - if (strcmp (stype, "latin") == 0) { + if (strcmp(stype, "latin") == 0) { cat = RSPAMD_LANGUAGE_LATIN; } - else if (strcmp (stype, "cyrillic") == 0) { + else if (strcmp(stype, "cyrillic") == 0) { cat = RSPAMD_LANGUAGE_CYRILLIC; } - else if (strcmp (stype, "arab") == 0) { + else if (strcmp(stype, "arab") == 0) { cat = RSPAMD_LANGUAGE_ARAB; } - else if (strcmp (stype, "devanagari") == 0) { + else if (strcmp(stype, "devanagari") == 0) { cat = RSPAMD_LANGUAGE_DEVANAGARI; } else { - msg_debug_config ("unknown type %s of language %s", stype, nelt->name); - ucl_object_unref (top); + msg_debug_config("unknown type %s of language %s", stype, nelt->name); + ucl_object_unref(top); return; } } - flags = ucl_object_lookup (top, "flags"); + flags = ucl_object_lookup(top, "flags"); - if (flags != NULL && ucl_object_type (flags) == UCL_ARRAY) { + if (flags != NULL && ucl_object_type(flags) == UCL_ARRAY) { ucl_object_iter_t it = NULL; const ucl_object_t *cur; - while ((cur = ucl_object_iterate (flags, &it, true)) != NULL) { - const gchar *fl = ucl_object_tostring (cur); + while ((cur = ucl_object_iterate(flags, &it, true)) != NULL) { + const gchar *fl = ucl_object_tostring(cur); if (cur) { - if (strcmp (fl, "diacritics") == 0) { + if (strcmp(fl, "diacritics") == 0) { nelt->flags |= RS_LANGUAGE_DIACRITICS; } - else if (strcmp (fl, "ascii") == 0) { + else if (strcmp(fl, "ascii") == 0) { nelt->flags |= RS_LANGUAGE_ASCII; } else { - msg_debug_config ("unknown flag %s of language %s", fl, nelt->name); + msg_debug_config("unknown flag %s of language %s", fl, nelt->name); } } else { - msg_debug_config ("unknown flags type of language %s", nelt->name); + msg_debug_config("unknown flags type of language %s", nelt->name); } } } @@ -467,7 +467,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, if (stop_words) { const ucl_object_t *specific_stop_words; - specific_stop_words = ucl_object_lookup (stop_words, nelt->name); + specific_stop_words = ucl_object_lookup(stop_words, nelt->name); if (specific_stop_words) { struct sb_stemmer *stem = NULL; @@ -475,33 +475,33 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, const ucl_object_t *w; guint start, stop; - stem = sb_stemmer_new (nelt->name, "UTF_8"); - start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); + stem = sb_stemmer_new(nelt->name, "UTF_8"); + start = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp); - while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) { + while ((w = ucl_object_iterate(specific_stop_words, &it, true)) != NULL) { gsize wlen; - const char *word = ucl_object_tolstring (w, &wlen); + const char *word = ucl_object_tolstring(w, &wlen); const char *saved; - guint mp_flags = RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8; + guint mp_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8; - if (rspamd_multipattern_has_hyperscan ()) { + if (rspamd_multipattern_has_hyperscan()) { mp_flags |= RSPAMD_MULTIPATTERN_RE; } - rspamd_multipattern_add_pattern_len (d->stop_words[cat].mp, - word, wlen, - mp_flags); - nelt->stop_words ++; - nstop ++; + rspamd_multipattern_add_pattern_len(d->stop_words[cat].mp, + word, wlen, + mp_flags); + nelt->stop_words++; + nstop++; /* Also lemmatise and store normalised */ if (stem) { - const char *nw = sb_stemmer_stem (stem, word, wlen); + const char *nw = sb_stemmer_stem(stem, word, wlen); if (nw) { saved = nw; - wlen = strlen (nw); + wlen = strlen(nw); } else { saved = word; @@ -516,23 +516,23 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, rspamd_ftok_t *tok; gchar *dst; - tok = rspamd_mempool_alloc (cfg->cfg_pool, - sizeof (*tok) + wlen + 1); - dst = ((gchar *)tok) + sizeof (*tok); - rspamd_strlcpy (dst, saved, wlen + 1); + tok = rspamd_mempool_alloc(cfg->cfg_pool, + sizeof(*tok) + wlen + 1); + dst = ((gchar *) tok) + sizeof(*tok); + rspamd_strlcpy(dst, saved, wlen + 1); tok->begin = dst; tok->len = wlen; - kh_put (rspamd_stopwords_hash, d->stop_words_norm, - tok, &rc); + kh_put(rspamd_stopwords_hash, d->stop_words_norm, + tok, &rc); } } if (stem) { - sb_stemmer_delete (stem); + sb_stemmer_delete(stem); } - stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); + stop = rspamd_multipattern_get_npatterns(d->stop_words[cat].mp); struct rspamd_stop_word_range r; @@ -540,7 +540,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, r.stop = stop; r.elt = nelt; - g_array_append_val (d->stop_words[cat].ranges, r); + g_array_append_val(d->stop_words[cat].ranges, r); it = NULL; } } @@ -551,31 +551,31 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, GPtrArray *ngramms; guint nsym; - if (rspamd_language_search_str (nelt->name, tier1_langs, - G_N_ELEMENTS (tier1_langs))) { + if (rspamd_language_search_str(nelt->name, tier1_langs, + G_N_ELEMENTS(tier1_langs))) { nelt->flags |= RS_LANGUAGE_TIER1; } - if (rspamd_language_search_str (nelt->name, tier0_langs, - G_N_ELEMENTS (tier0_langs))) { + if (rspamd_language_search_str(nelt->name, tier0_langs, + G_N_ELEMENTS(tier0_langs))) { nelt->flags |= RS_LANGUAGE_TIER0; } it = NULL; - ngramms = g_ptr_array_sized_new (freqs->len); + ngramms = g_ptr_array_sized_new(freqs->len); i = 0; skipped = 0; loaded = 0; - while ((cur = ucl_object_iterate (freqs, &it, true)) != NULL) { + while ((cur = ucl_object_iterate(freqs, &it, true)) != NULL) { const gchar *key; gsize keylen; guint freq; - key = ucl_object_keyl (cur, &keylen); - freq = ucl_object_toint (cur); + key = ucl_object_keyl(cur, &keylen); + freq = ucl_object_toint(cur); - i ++; + i++; delta = freq - mean; mean += delta / i; delta2 = freq - mean; @@ -585,41 +585,41 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, UChar32 *cur_ucs; const char *end = key + keylen, *cur_utf = key; - ucs_elt = rspamd_mempool_alloc (cfg->cfg_pool, - sizeof (*ucs_elt) + (keylen + 1) * sizeof (UChar32)); + ucs_elt = rspamd_mempool_alloc(cfg->cfg_pool, + sizeof(*ucs_elt) + (keylen + 1) * sizeof(UChar32)); cur_ucs = ucs_elt->s; nsym = 0; uc_err = U_ZERO_ERROR; while (cur_utf < end) { - *cur_ucs++ = ucnv_getNextUChar (d->uchar_converter, &cur_utf, - end, &uc_err); - if (!U_SUCCESS (uc_err)) { + *cur_ucs++ = ucnv_getNextUChar(d->uchar_converter, &cur_utf, + end, &uc_err); + if (!U_SUCCESS(uc_err)) { break; } - nsym ++; + nsym++; } - if (!U_SUCCESS (uc_err)) { - msg_warn_config ("cannot convert key %*s to unicode: %s", - (gint)keylen, key, u_errorName (uc_err)); + if (!U_SUCCESS(uc_err)) { + msg_warn_config("cannot convert key %*s to unicode: %s", + (gint) keylen, key, u_errorName(uc_err)); continue; } ucs_elt->utf = key; - rspamd_language_detector_ucs_lowercase (ucs_elt->s, nsym); + rspamd_language_detector_ucs_lowercase(ucs_elt->s, nsym); if (nsym == 3) { - g_ptr_array_add (ngramms, ucs_elt); + g_ptr_array_add(ngramms, ucs_elt); } else { continue; } - if (rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) { + if (rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) { total_latin++; } @@ -629,7 +629,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, } } - std = sqrt (m2 / (i - 1)); + std = sqrt(m2 / (i - 1)); if (total_latin >= total_ngramms / 3) { nelt->flags |= RS_LANGUAGE_LATIN; @@ -638,66 +638,68 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, nsym = 3; total = 0; - PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) { + PTR_ARRAY_FOREACH(ngramms, i, ucs_elt) + { if (!(nelt->flags & RS_LANGUAGE_LATIN) && - rspamd_language_detector_ucs_is_latin (ucs_elt->s, nsym)) { + rspamd_language_detector_ucs_is_latin(ucs_elt->s, nsym)) { ucs_elt->freq = 0; /* Skip latin ngramm for non-latin language to avoid garbage */ - skipped ++; + skipped++; continue; } /* Now, discriminate low frequency ngramms */ total += ucs_elt->freq; - loaded ++; + loaded++; } - g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm); + g_ptr_array_sort(ngramms, rspamd_language_detector_cmp_ngramm); - PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) { + PTR_ARRAY_FOREACH(ngramms, i, ucs_elt) + { if (ucs_elt->freq > 0) { - rspamd_language_detector_init_ngramm (cfg, d, - nelt, ucs_elt, nsym, - ucs_elt->freq, total, htb); + rspamd_language_detector_init_ngramm(cfg, d, + nelt, ucs_elt, nsym, + ucs_elt->freq, total, htb); } } #ifdef EXTRA_LANGDET_DEBUG /* Useful for debug */ - for (i = 0; i < 10; i ++) { - ucs_elt = g_ptr_array_index (ngramms, i); + for (i = 0; i < 10; i++) { + ucs_elt = g_ptr_array_index(ngramms, i); - msg_debug_lang_det_cfg ("%s -> %s: %d", nelt->name, - ucs_elt->utf, ucs_elt->freq); - } + msg_debug_lang_det_cfg("%s -> %s: %d", nelt->name, + ucs_elt->utf, ucs_elt->freq); + } #endif - g_ptr_array_free (ngramms, TRUE); + g_ptr_array_free(ngramms, TRUE); nelt->mean = mean; nelt->std = std; - msg_debug_lang_det_cfg ("loaded %s language, %d trigrams, " - "%d ngramms loaded; " - "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; " - "(%s)", - nelt->name, - (gint)nelt->trigrams_words, - total, - std, mean, - skipped, loaded, nelt->stop_words, - rspamd_language_detector_print_flags (nelt)); + msg_debug_lang_det_cfg("loaded %s language, %d trigrams, " + "%d ngramms loaded; " + "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; " + "(%s)", + nelt->name, + (gint) nelt->trigrams_words, + total, + std, mean, + skipped, loaded, nelt->stop_words, + rspamd_language_detector_print_flags(nelt)); int ret; khiter_t k = kh_put(rspamd_languages_hash, d->languages, nelt->name, &ret); - g_assert (ret > 0); /* must be unique */ + g_assert(ret > 0); /* must be unique */ kh_value(d->languages, k) = nelt; - ucl_object_unref (top); + ucl_object_unref(top); } static gboolean -rspamd_ucl_array_find_str (const gchar *str, const ucl_object_t *ar) +rspamd_ucl_array_find_str(const gchar *str, const ucl_object_t *ar) { ucl_object_iter_t it = NULL; const ucl_object_t *cur; @@ -706,9 +708,9 @@ rspamd_ucl_array_find_str (const gchar *str, const ucl_object_t *ar) return FALSE; } - while ((cur = ucl_object_iterate (ar, &it, true)) != NULL) { - if (ucl_object_type (cur) == UCL_STRING && rspamd_strcase_equal ( - ucl_object_tostring (cur), str)) { + while ((cur = ucl_object_iterate(ar, &it, true)) != NULL) { + if (ucl_object_type(cur) == UCL_STRING && rspamd_strcase_equal( + ucl_object_tostring(cur), str)) { return TRUE; } } @@ -717,72 +719,75 @@ rspamd_ucl_array_find_str (const gchar *str, const ucl_object_t *ar) } static void -rspamd_language_detector_process_chain (struct rspamd_config *cfg, - struct rspamd_ngramm_chain *chain) +rspamd_language_detector_process_chain(struct rspamd_config *cfg, + struct rspamd_ngramm_chain *chain) { struct rspamd_ngramm_elt *elt; guint i; gdouble delta, mean = 0, delta2, m2 = 0, std; if (chain->languages->len > 3) { - PTR_ARRAY_FOREACH (chain->languages, i, elt) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { delta = elt->prob - mean; mean += delta / (i + 1); delta2 = elt->prob - mean; m2 += delta * delta2; } - std = sqrt (m2 / (i - 1)); + std = sqrt(m2 / (i - 1)); chain->mean = mean; chain->std = std; /* Now, filter elements that are lower than mean */ - PTR_ARRAY_FOREACH (chain->languages, i, elt) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { if (elt->prob < mean) { - g_ptr_array_remove_index_fast (chain->languages, i); + g_ptr_array_remove_index_fast(chain->languages, i); #ifdef EXTRA_LANGDET_DEBUG - msg_debug_lang_det_cfg ("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f", - elt->elt->name, chain->utf, elt->prob, mean, std); + msg_debug_lang_det_cfg("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f", + elt->elt->name, chain->utf, elt->prob, mean, std); #endif } } } else { /* We have a unique ngramm, increase its weight */ - PTR_ARRAY_FOREACH (chain->languages, i, elt) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { elt->prob *= 4.0; #ifdef EXTRA_LANGDET_DEBUG - msg_debug_lang_det_cfg ("increase weight of %s in %s; prob: %.4f", - elt->elt->name, chain->utf, elt->prob); + msg_debug_lang_det_cfg("increase weight of %s in %s; prob: %.4f", + elt->elt->name, chain->utf, elt->prob); #endif } } } static void -rspamd_language_detector_dtor (struct rspamd_lang_detector *d) +rspamd_language_detector_dtor(struct rspamd_lang_detector *d) { if (d) { - for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { - kh_destroy (rspamd_trigram_hash, d->trigrams[i]); - rspamd_multipattern_destroy (d->stop_words[i].mp); - g_array_free (d->stop_words[i].ranges, TRUE); + for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + kh_destroy(rspamd_trigram_hash, d->trigrams[i]); + rspamd_multipattern_destroy(d->stop_words[i].mp); + g_array_free(d->stop_words[i].ranges, TRUE); } if (d->languages) { - kh_destroy (rspamd_languages_hash, d->languages); + kh_destroy(rspamd_languages_hash, d->languages); } - kh_destroy (rspamd_stopwords_hash, d->stop_words_norm); + kh_destroy(rspamd_stopwords_hash, d->stop_words_norm); rspamd_lang_detection_fasttext_destroy(d->fasttext_detector); } } -struct rspamd_lang_detector* -rspamd_language_detector_init (struct rspamd_config *cfg) +struct rspamd_lang_detector * +rspamd_language_detector_init(struct rspamd_config *cfg) { const ucl_object_t *section, *elt, *languages_enable = NULL, - *languages_disable = NULL; + *languages_disable = NULL; const gchar *languages_path = default_languages_path; glob_t gl; size_t i, short_text_limit = default_short_text_limit, total = 0; @@ -795,153 +800,153 @@ rspamd_language_detector_init (struct rspamd_config *cfg) ucl_object_t *stop_words; bool prefer_fasttext = true; - section = ucl_object_lookup (cfg->rcl_obj, "lang_detection"); + section = ucl_object_lookup(cfg->rcl_obj, "lang_detection"); if (section != NULL) { - elt = ucl_object_lookup (section, "languages"); + elt = ucl_object_lookup(section, "languages"); if (elt) { - languages_path = ucl_object_tostring (elt); + languages_path = ucl_object_tostring(elt); } - elt = ucl_object_lookup (section, "short_text_limit"); + elt = ucl_object_lookup(section, "short_text_limit"); if (elt) { - short_text_limit = ucl_object_toint (elt); + short_text_limit = ucl_object_toint(elt); } - languages_enable = ucl_object_lookup (section, "languages_enable"); - languages_disable = ucl_object_lookup (section, "languages_disable"); + languages_enable = ucl_object_lookup(section, "languages_enable"); + languages_disable = ucl_object_lookup(section, "languages_disable"); elt = ucl_object_lookup(section, "prefer_fasttext"); if (elt) { - prefer_fasttext = ucl_object_toboolean (elt); + prefer_fasttext = ucl_object_toboolean(elt); } } - languages_pattern = g_string_sized_new (PATH_MAX); - rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path); - parser = ucl_parser_new (UCL_PARSER_DEFAULT); + languages_pattern = g_string_sized_new(PATH_MAX); + rspamd_printf_gstring(languages_pattern, "%s/stop_words", languages_path); + parser = ucl_parser_new(UCL_PARSER_DEFAULT); - if (ucl_parser_add_file (parser, languages_pattern->str)) { - stop_words = ucl_parser_get_object (parser); + if (ucl_parser_add_file(parser, languages_pattern->str)) { + stop_words = ucl_parser_get_object(parser); } else { - msg_err_config ("cannot read stop words from %s: %s", - languages_pattern->str, - ucl_parser_get_error (parser)); + msg_err_config("cannot read stop words from %s: %s", + languages_pattern->str, + ucl_parser_get_error(parser)); stop_words = NULL; } - ucl_parser_free (parser); + ucl_parser_free(parser); languages_pattern->len = 0; - rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path); - memset (&gl, 0, sizeof (gl)); + rspamd_printf_gstring(languages_pattern, "%s/*.json", languages_path); + memset(&gl, 0, sizeof(gl)); - if (glob (languages_pattern->str, 0, NULL, &gl) != 0) { - msg_err_config ("cannot read any files matching %v", languages_pattern); + if (glob(languages_pattern->str, 0, NULL, &gl) != 0) { + msg_err_config("cannot read any files matching %v", languages_pattern); goto end; } - ret = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*ret)); + ret = rspamd_mempool_alloc0(cfg->cfg_pool, sizeof(*ret)); ret->languages = kh_init(rspamd_languages_hash); kh_resize(rspamd_languages_hash, ret->languages, gl.gl_pathc); - ret->uchar_converter = rspamd_get_utf8_converter (); + ret->uchar_converter = rspamd_get_utf8_converter(); ret->short_text_limit = short_text_limit; - ret->stop_words_norm = kh_init (rspamd_stopwords_hash); + ret->stop_words_norm = kh_init(rspamd_stopwords_hash); ret->prefer_fasttext = prefer_fasttext; /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */ - for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { - ret->trigrams[i] = kh_init (rspamd_trigram_hash); + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { + ret->trigrams[i] = kh_init(rspamd_trigram_hash); #ifdef WITH_HYPERSCAN - ret->stop_words[i].mp = rspamd_multipattern_create ( - RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8| - RSPAMD_MULTIPATTERN_RE); + ret->stop_words[i].mp = rspamd_multipattern_create( + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8 | + RSPAMD_MULTIPATTERN_RE); #else - ret->stop_words[i].mp = rspamd_multipattern_create ( - RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); + ret->stop_words[i].mp = rspamd_multipattern_create( + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_UTF8); #endif - ret->stop_words[i].ranges = g_array_new (FALSE, FALSE, - sizeof (struct rspamd_stop_word_range)); + ret->stop_words[i].ranges = g_array_new(FALSE, FALSE, + sizeof(struct rspamd_stop_word_range)); } - g_assert (uc_err == U_ZERO_ERROR); + g_assert(uc_err == U_ZERO_ERROR); - for (i = 0; i < gl.gl_pathc; i ++) { - fname = g_path_get_basename (gl.gl_pathv[i]); + for (i = 0; i < gl.gl_pathc; i++) { + fname = g_path_get_basename(gl.gl_pathv[i]); - if (!rspamd_ucl_array_find_str (fname, languages_disable) || - (languages_enable == NULL || - rspamd_ucl_array_find_str (fname, languages_enable))) { - rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i], - stop_words); + if (!rspamd_ucl_array_find_str(fname, languages_disable) || + (languages_enable == NULL || + rspamd_ucl_array_find_str(fname, languages_enable))) { + rspamd_language_detector_read_file(cfg, ret, gl.gl_pathv[i], + stop_words); } else { - msg_info_config ("skip language file %s: disabled", fname); + msg_info_config("skip language file %s: disabled", fname); } - g_free (fname); + g_free(fname); } - for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { + for (i = 0; i < RSPAMD_LANGUAGE_MAX; i++) { GError *err = NULL; - kh_foreach_value (ret->trigrams[i], schain, { + kh_foreach_value(ret->trigrams[i], schain, { chain = &schain; - rspamd_language_detector_process_chain (cfg, chain); + rspamd_language_detector_process_chain(cfg, chain); }); - if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) { - msg_err_config ("cannot compile stop words for %z language group: %e", - i, err); - g_error_free (err); + if (!rspamd_multipattern_compile(ret->stop_words[i].mp, &err)) { + msg_err_config("cannot compile stop words for %z language group: %e", + i, err); + g_error_free(err); } - total += kh_size (ret->trigrams[i]); + total += kh_size(ret->trigrams[i]); } ret->fasttext_detector = rspamd_lang_detection_fasttext_init(cfg); char *fasttext_status = rspamd_lang_detection_fasttext_show_info(ret->fasttext_detector); - msg_info_config ("loaded %d languages, " - "%d trigrams; %s", - (gint)kh_size(ret->languages), - (gint)total, fasttext_status); - g_free (fasttext_status); + msg_info_config("loaded %d languages, " + "%d trigrams; %s", + (gint) kh_size(ret->languages), + (gint) total, fasttext_status); + g_free(fasttext_status); if (stop_words) { - ucl_object_unref (stop_words); + ucl_object_unref(stop_words); } - REF_INIT_RETAIN (ret, rspamd_language_detector_dtor); - rspamd_mempool_add_destructor (cfg->cfg_pool, - (rspamd_mempool_destruct_t)rspamd_language_detector_unref, - ret); + REF_INIT_RETAIN(ret, rspamd_language_detector_dtor); + rspamd_mempool_add_destructor(cfg->cfg_pool, + (rspamd_mempool_destruct_t) rspamd_language_detector_unref, + ret); end: if (gl.gl_pathc > 0) { - globfree (&gl); + globfree(&gl); } - g_string_free (languages_pattern, TRUE); + g_string_free(languages_pattern, TRUE); return ret; } static void -rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, - goffset *offsets_out) +rspamd_language_detector_random_select(GArray *ucs_tokens, guint nwords, + goffset *offsets_out) { guint step_len, remainder, i, out_idx; guint64 coin, sel; rspamd_stat_token_t *tok; - g_assert (nwords != 0); - g_assert (offsets_out != NULL); - g_assert (ucs_tokens->len >= nwords); + g_assert(nwords != 0); + g_assert(offsets_out != NULL); + g_assert(ucs_tokens->len >= nwords); /* * We split input array into `nwords` parts. For each part we randomly select * an element from this particular split. Here is an example: @@ -962,30 +967,30 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, remainder = ucs_tokens->len % nwords; out_idx = 0; - coin = rspamd_random_uint64_fast (); + coin = rspamd_random_uint64_fast(); sel = coin % (step_len + remainder); offsets_out[out_idx] = sel; for (i = step_len + remainder; i < ucs_tokens->len; - i += step_len, out_idx ++) { + i += step_len, out_idx++) { guint ntries = 0; - coin = rspamd_random_uint64_fast (); + coin = rspamd_random_uint64_fast(); sel = (coin % step_len) + i; for (;;) { - tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, sel); + tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel); /* Filter bad tokens */ if (tok->unicode.len >= 2 && - !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) && - u_isalpha (tok->unicode.begin[0]) && - u_isalpha (tok->unicode.begin[tok->unicode.len - 1])) { + !(tok->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION) && + u_isalpha(tok->unicode.begin[0]) && + u_isalpha(tok->unicode.begin[tok->unicode.len - 1])) { offsets_out[out_idx] = sel; break; } else { - ntries ++; - coin = rspamd_random_uint64_fast (); + ntries++; + coin = rspamd_random_uint64_fast(); if (ntries < step_len) { sel = (coin % step_len) + i; @@ -1022,8 +1027,8 @@ rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords, } static goffset -rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar32 *window, - guint wlen, goffset cur_off) +rspamd_language_detector_next_ngramm(rspamd_stat_token_t *tok, UChar32 *window, + guint wlen, goffset cur_off) { guint i; @@ -1031,18 +1036,18 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar32 *window, /* Deal with spaces at the beginning and ending */ if (cur_off == 0) { - window[0] = (UChar32)' '; + window[0] = (UChar32) ' '; - for (i = 0; i < wlen - 1; i ++) { + for (i = 0; i < wlen - 1; i++) { window[i + 1] = tok->unicode.begin[i]; } } else if (cur_off + wlen == tok->unicode.len + 1) { /* Add trailing space */ - for (i = 0; i < wlen - 1; i ++) { + for (i = 0; i < wlen - 1; i++) { window[i] = tok->unicode.begin[cur_off + i]; } - window[wlen - 1] = (UChar32)' '; + window[wlen - 1] = (UChar32) ' '; } else if (cur_off + wlen > tok->unicode.len + 1) { /* No more fun */ @@ -1070,11 +1075,11 @@ rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar32 *window, * Do full guess for a specific ngramm, checking all languages defined */ static void -rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, - struct rspamd_lang_detector *d, - UChar32 *window, - khash_t(rspamd_candidates_hash) *candidates, - khash_t(rspamd_trigram_hash) *trigrams) +rspamd_language_detector_process_ngramm_full(struct rspamd_task *task, + struct rspamd_lang_detector *d, + UChar32 *window, + khash_t(rspamd_candidates_hash) * candidates, + khash_t(rspamd_trigram_hash) * trigrams) { guint i; gint ret; @@ -1084,41 +1089,43 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, khiter_t k; gdouble prob; - k = kh_get (rspamd_trigram_hash, trigrams, window); - if (k != kh_end (trigrams)) { - chain = &kh_value (trigrams, k); + k = kh_get(rspamd_trigram_hash, trigrams, window); + if (k != kh_end(trigrams)) { + chain = &kh_value(trigrams, k); } if (chain) { - PTR_ARRAY_FOREACH (chain->languages, i, elt) { + PTR_ARRAY_FOREACH(chain->languages, i, elt) + { prob = elt->prob; if (prob < chain->mean) { continue; } - k = kh_get (rspamd_candidates_hash, candidates, elt->elt->name); - if (k != kh_end (candidates)) { - cand = kh_value (candidates, k); + k = kh_get(rspamd_candidates_hash, candidates, elt->elt->name); + if (k != kh_end(candidates)) { + cand = kh_value(candidates, k); } else { cand = NULL; } #ifdef NGRAMMS_DEBUG - msg_err ("gramm: %s, lang: %s, prob: %.3f", chain->utf, - elt->elt->name, log2 (elt->prob)); + msg_err("gramm: %s, lang: %s, prob: %.3f", chain->utf, + elt->elt->name, log2(elt->prob)); #endif if (cand == NULL) { - cand = rspamd_mempool_alloc (task->task_pool, sizeof (*cand)); + cand = rspamd_mempool_alloc(task->task_pool, sizeof(*cand)); cand->elt = elt->elt; cand->lang = elt->elt->name; cand->prob = prob; - k = kh_put (rspamd_candidates_hash, candidates, elt->elt->name, - &ret); - kh_value (candidates, k) = cand; - } else { + k = kh_put(rspamd_candidates_hash, candidates, elt->elt->name, + &ret); + kh_value(candidates, k) = cand; + } + else { /* Update guess */ cand->prob += prob; } @@ -1127,21 +1134,20 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task, } static void -rspamd_language_detector_detect_word (struct rspamd_task *task, - struct rspamd_lang_detector *d, - rspamd_stat_token_t *tok, - khash_t(rspamd_candidates_hash) *candidates, - khash_t(rspamd_trigram_hash) *trigrams) +rspamd_language_detector_detect_word(struct rspamd_task *task, + struct rspamd_lang_detector *d, + rspamd_stat_token_t *tok, + khash_t(rspamd_candidates_hash) * candidates, + khash_t(rspamd_trigram_hash) * trigrams) { const guint wlen = 3; UChar32 window[3]; goffset cur = 0; /* Split words */ - while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur)) - != -1) { - rspamd_language_detector_process_ngramm_full (task, - d, window, candidates, trigrams); + while ((cur = rspamd_language_detector_next_ngramm(tok, window, wlen, cur)) != -1) { + rspamd_language_detector_process_ngramm_full(task, + d, window, candidates, trigrams); } } @@ -1152,24 +1158,24 @@ static const gdouble cutoff_limit = -8.0; */ static inline void -rspamd_language_detector_filter_step1 (struct rspamd_task *task, - struct rspamd_lang_detector_res *cand, - gdouble *max_prob, guint *filtered) +rspamd_language_detector_filter_step1(struct rspamd_task *task, + struct rspamd_lang_detector_res *cand, + gdouble *max_prob, guint *filtered) { - if (!isnan (cand->prob)) { + if (!isnan(cand->prob)) { if (cand->prob == 0) { cand->prob = NAN; - msg_debug_lang_det ( - "exclude language %s", - cand->lang); + msg_debug_lang_det( + "exclude language %s", + cand->lang); (*filtered)++; } else { - cand->prob = log2 (cand->prob); + cand->prob = log2(cand->prob); if (cand->prob < cutoff_limit) { - msg_debug_lang_det ( - "exclude language %s: %.3f, cutoff limit: %.3f", - cand->lang, cand->prob, cutoff_limit); + msg_debug_lang_det( + "exclude language %s: %.3f, cutoff limit: %.3f", + cand->lang, cand->prob, cutoff_limit); cand->prob = NAN; (*filtered)++; } @@ -1181,76 +1187,76 @@ rspamd_language_detector_filter_step1 (struct rspamd_task *task, } static inline void -rspamd_language_detector_filter_step2 (struct rspamd_task *task, - struct rspamd_lang_detector_res *cand, - gdouble max_prob, guint *filtered) +rspamd_language_detector_filter_step2(struct rspamd_task *task, + struct rspamd_lang_detector_res *cand, + gdouble max_prob, guint *filtered) { /* * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that * prob2 is 2^4 less than prob1 */ - if (!isnan (cand->prob) && max_prob - cand->prob > 1) { - msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)", - cand->lang, cand->prob, max_prob); + if (!isnan(cand->prob) && max_prob - cand->prob > 1) { + msg_debug_lang_det("exclude language %s: %.3f (%.3f max)", + cand->lang, cand->prob, max_prob); cand->prob = NAN; - (*filtered) ++; + (*filtered)++; } } static void -rspamd_language_detector_filter_negligible (struct rspamd_task *task, - khash_t(rspamd_candidates_hash) *candidates) +rspamd_language_detector_filter_negligible(struct rspamd_task *task, + khash_t(rspamd_candidates_hash) * candidates) { struct rspamd_lang_detector_res *cand; guint filtered = 0; gdouble max_prob = -(G_MAXDOUBLE); - kh_foreach_value (candidates, cand, - rspamd_language_detector_filter_step1 (task, cand, &max_prob, &filtered)); - kh_foreach_value (candidates, cand, - rspamd_language_detector_filter_step2 (task, cand, max_prob, &filtered)); + kh_foreach_value(candidates, cand, + rspamd_language_detector_filter_step1(task, cand, &max_prob, &filtered)); + kh_foreach_value(candidates, cand, + rspamd_language_detector_filter_step2(task, cand, max_prob, &filtered)); - msg_debug_lang_det ("removed %d languages", filtered); + msg_debug_lang_det("removed %d languages", filtered); } static void -rspamd_language_detector_detect_type (struct rspamd_task *task, - guint nwords, - struct rspamd_lang_detector *d, - GArray *words, - enum rspamd_language_category cat, - khash_t(rspamd_candidates_hash) *candidates) +rspamd_language_detector_detect_type(struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, + GArray *words, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) * candidates) { - guint nparts = MIN (words->len, nwords); + guint nparts = MIN(words->len, nwords); goffset *selected_words; rspamd_stat_token_t *tok; guint i; - selected_words = g_new0 (goffset, nparts); - rspamd_language_detector_random_select (words, nparts, selected_words); - msg_debug_lang_det ("randomly selected %d words", nparts); + selected_words = g_new0(goffset, nparts); + rspamd_language_detector_random_select(words, nparts, selected_words); + msg_debug_lang_det("randomly selected %d words", nparts); for (i = 0; i < nparts; i++) { - tok = &g_array_index (words, rspamd_stat_token_t, - selected_words[i]); + tok = &g_array_index(words, rspamd_stat_token_t, + selected_words[i]); if (tok->unicode.len >= 3) { - rspamd_language_detector_detect_word (task, d, tok, candidates, - d->trigrams[cat]); + rspamd_language_detector_detect_word(task, d, tok, candidates, + d->trigrams[cat]); } } /* Filter negligible candidates */ - rspamd_language_detector_filter_negligible (task, candidates); - g_free (selected_words); + rspamd_language_detector_filter_negligible(task, candidates); + g_free(selected_words); } static gint -rspamd_language_detector_cmp (gconstpointer a, gconstpointer b) +rspamd_language_detector_cmp(gconstpointer a, gconstpointer b) { const struct rspamd_lang_detector_res - *canda = *(const struct rspamd_lang_detector_res **)a, - *candb = *(const struct rspamd_lang_detector_res **)b; + *canda = *(const struct rspamd_lang_detector_res **) a, + *candb = *(const struct rspamd_lang_detector_res **) b; if (canda->prob > candb->prob) { return -1; @@ -1269,26 +1275,26 @@ enum rspamd_language_detected_type { }; static enum rspamd_language_detected_type -rspamd_language_detector_try_ngramm (struct rspamd_task *task, - guint nwords, - struct rspamd_lang_detector *d, - GArray *ucs_tokens, - enum rspamd_language_category cat, - khash_t(rspamd_candidates_hash) *candidates) +rspamd_language_detector_try_ngramm(struct rspamd_task *task, + guint nwords, + struct rspamd_lang_detector *d, + GArray *ucs_tokens, + enum rspamd_language_category cat, + khash_t(rspamd_candidates_hash) * candidates) { guint cand_len = 0; struct rspamd_lang_detector_res *cand; - rspamd_language_detector_detect_type (task, - nwords, - d, - ucs_tokens, - cat, - candidates); + rspamd_language_detector_detect_type(task, + nwords, + d, + ucs_tokens, + cat, + candidates); - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - cand_len ++; + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + cand_len++; } }); @@ -1319,13 +1325,13 @@ static const gdouble tier1_adjustment = 0.8; static const gdouble frequency_adjustment = 0.8; static gint -rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, - gpointer ud) +rspamd_language_detector_cmp_heuristic(gconstpointer a, gconstpointer b, + gpointer ud) { struct rspamd_frequency_sort_cbdata *cbd = ud; const struct rspamd_lang_detector_res - *canda = *(const struct rspamd_lang_detector_res **)a, - *candb = *(const struct rspamd_lang_detector_res **)b; + *canda = *(const struct rspamd_lang_detector_res **) a, + *candb = *(const struct rspamd_lang_detector_res **) b; gdouble adj; gdouble proba_adjusted, probb_adjusted, freqa, freqb; @@ -1333,15 +1339,15 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, return 0; } - freqa = ((gdouble)canda->elt->occurrences) / - (gdouble)cbd->d->total_occurrences; - freqb = ((gdouble)candb->elt->occurrences) / - (gdouble)cbd->d->total_occurrences; + freqa = ((gdouble) canda->elt->occurrences) / + (gdouble) cbd->d->total_occurrences; + freqb = ((gdouble) candb->elt->occurrences) / + (gdouble) cbd->d->total_occurrences; proba_adjusted = canda->prob; probb_adjusted = candb->prob; - if (isnormal (freqa) && isnormal (freqb)) { + if (isnormal(freqa) && isnormal(freqb)) { proba_adjusted += cbd->std * (frequency_adjustment * freqa); probb_adjusted += cbd->std * (frequency_adjustment * freqb); } @@ -1386,10 +1392,10 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, } static void -rspamd_language_detector_unicode_scripts (struct rspamd_task *task, - struct rspamd_mime_text_part *part, - guint *pchinese, - guint *pspecial) +rspamd_language_detector_unicode_scripts(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + guint *pchinese, + guint *pspecial) { const gchar *p = part->utf_stripped_content->data, *end; guint i = 0, cnt = 0; @@ -1399,33 +1405,33 @@ rspamd_language_detector_unicode_scripts (struct rspamd_task *task, const guint cutoff_limit = 32; while (p + i < end) { - U8_NEXT (p, i, part->utf_stripped_content->len, uc); + U8_NEXT(p, i, part->utf_stripped_content->len, uc); if (((gint32) uc) < 0) { break; } - if (u_isalpha (uc)) { - sc = ublock_getCode (uc); - cnt ++; + if (u_isalpha(uc)) { + sc = ublock_getCode(uc); + cnt++; switch (sc) { case UBLOCK_BASIC_LATIN: case UBLOCK_LATIN_1_SUPPLEMENT: part->unicode_scripts |= RSPAMD_UNICODE_LATIN; - nlatin ++; + nlatin++; break; case UBLOCK_HEBREW: part->unicode_scripts |= RSPAMD_UNICODE_HEBREW; - nspecial ++; + nspecial++; break; case UBLOCK_GREEK: part->unicode_scripts |= RSPAMD_UNICODE_GREEK; - nspecial ++; + nspecial++; break; case UBLOCK_CYRILLIC: part->unicode_scripts |= RSPAMD_UNICODE_CYRILLIC; - nspecial ++; + nspecial++; break; case UBLOCK_CJK_UNIFIED_IDEOGRAPHS: case UBLOCK_CJK_COMPATIBILITY: @@ -1433,57 +1439,57 @@ rspamd_language_detector_unicode_scripts (struct rspamd_task *task, case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A: case UBLOCK_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B: part->unicode_scripts |= RSPAMD_UNICODE_CJK; - nchinese ++; + nchinese++; break; case UBLOCK_HIRAGANA: case UBLOCK_KATAKANA: part->unicode_scripts |= RSPAMD_UNICODE_JP; - nspecial ++; + nspecial++; break; case UBLOCK_HANGUL_JAMO: case UBLOCK_HANGUL_COMPATIBILITY_JAMO: part->unicode_scripts |= RSPAMD_UNICODE_HANGUL; - nspecial ++; + nspecial++; break; case UBLOCK_ARABIC: part->unicode_scripts |= RSPAMD_UNICODE_ARABIC; - nspecial ++; + nspecial++; break; case UBLOCK_DEVANAGARI: part->unicode_scripts |= RSPAMD_UNICODE_DEVANAGARI; - nspecial ++; + nspecial++; break; case UBLOCK_ARMENIAN: part->unicode_scripts |= RSPAMD_UNICODE_ARMENIAN; - nspecial ++; + nspecial++; break; case UBLOCK_GEORGIAN: part->unicode_scripts |= RSPAMD_UNICODE_GEORGIAN; - nspecial ++; + nspecial++; break; case UBLOCK_GUJARATI: part->unicode_scripts |= RSPAMD_UNICODE_GUJARATI; - nspecial ++; + nspecial++; break; case UBLOCK_TELUGU: part->unicode_scripts |= RSPAMD_UNICODE_TELUGU; - nspecial ++; + nspecial++; break; case UBLOCK_TAMIL: part->unicode_scripts |= RSPAMD_UNICODE_TAMIL; - nspecial ++; + nspecial++; break; case UBLOCK_THAI: part->unicode_scripts |= RSPAMD_UNICODE_THAI; - nspecial ++; + nspecial++; break; case RSPAMD_UNICODE_MALAYALAM: part->unicode_scripts |= RSPAMD_UNICODE_MALAYALAM; - nspecial ++; + nspecial++; break; case RSPAMD_UNICODE_SINHALA: part->unicode_scripts |= RSPAMD_UNICODE_SINHALA; - nspecial ++; + nspecial++; break; } } @@ -1499,51 +1505,51 @@ rspamd_language_detector_unicode_scripts (struct rspamd_task *task, } } - msg_debug_lang_det ("stop after checking %d characters, " - "%d latin, %d special, %d chinese", - cnt, nlatin, nspecial, nchinese); + msg_debug_lang_det("stop after checking %d characters, " + "%d latin, %d special, %d chinese", + cnt, nlatin, nspecial, nchinese); *pchinese = nchinese; *pspecial = nspecial; } static inline void -rspamd_language_detector_set_language (struct rspamd_task *task, - struct rspamd_mime_text_part *part, - const gchar *code, - struct rspamd_language_elt *elt) +rspamd_language_detector_set_language(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + const gchar *code, + struct rspamd_language_elt *elt) { struct rspamd_lang_detector_res *r; - r = rspamd_mempool_alloc0 (task->task_pool, sizeof (*r)); + r = rspamd_mempool_alloc0(task->task_pool, sizeof(*r)); r->prob = 1.0; r->lang = code; r->elt = elt; if (part->languages == NULL) { - part->languages = g_ptr_array_sized_new (1); + part->languages = g_ptr_array_sized_new(1); } - g_ptr_array_add (part->languages, r); + g_ptr_array_add(part->languages, r); part->language = code; } static gboolean -rspamd_language_detector_try_uniscript (struct rspamd_task *task, - struct rspamd_mime_text_part *part, - guint nchinese, - guint nspecial) +rspamd_language_detector_try_uniscript(struct rspamd_task *task, + struct rspamd_mime_text_part *part, + guint nchinese, + guint nspecial) { guint i; - for (i = 0; i < G_N_ELEMENTS (unicode_langs); i ++) { + for (i = 0; i < G_N_ELEMENTS(unicode_langs); i++) { if (unicode_langs[i].unicode_code & part->unicode_scripts) { if (unicode_langs[i].unicode_code != RSPAMD_UNICODE_JP) { - msg_debug_lang_det ("set language based on unicode script %s", - unicode_langs[i].lang); - rspamd_language_detector_set_language (task, part, - unicode_langs[i].lang, NULL); + msg_debug_lang_det("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language(task, part, + unicode_langs[i].lang, NULL); return TRUE; } @@ -1558,10 +1564,10 @@ rspamd_language_detector_try_uniscript (struct rspamd_task *task, * it Chinese */ if (nchinese <= 5 || nchinese < nspecial * 5) { - msg_debug_lang_det ("set language based on unicode script %s", - unicode_langs[i].lang); - rspamd_language_detector_set_language (task, part, - unicode_langs[i].lang, NULL); + msg_debug_lang_det("set language based on unicode script %s", + unicode_langs[i].lang); + rspamd_language_detector_set_language(task, part, + unicode_langs[i].lang, NULL); return TRUE; } @@ -1570,10 +1576,10 @@ rspamd_language_detector_try_uniscript (struct rspamd_task *task, } if (part->unicode_scripts & RSPAMD_UNICODE_CJK) { - msg_debug_lang_det ("guess chinese based on CJK characters: %d chinese, %d special", - nchinese, nspecial); - rspamd_language_detector_set_language (task, part, - "zh-CN", NULL); + msg_debug_lang_det("guess chinese based on CJK characters: %d chinese, %d special", + nchinese, nspecial); + rspamd_language_detector_set_language(task, part, + "zh-CN", NULL); return TRUE; } @@ -1582,38 +1588,38 @@ rspamd_language_detector_try_uniscript (struct rspamd_task *task, } static guint -rspamd_langelt_hash_func (gconstpointer key) +rspamd_langelt_hash_func(gconstpointer key) { - const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *)key; - return rspamd_cryptobox_fast_hash (elt->name, strlen (elt->name), - rspamd_hash_seed ()); + const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *) key; + return rspamd_cryptobox_fast_hash(elt->name, strlen(elt->name), + rspamd_hash_seed()); } static gboolean -rspamd_langelt_equal_func (gconstpointer v, gconstpointer v2) +rspamd_langelt_equal_func(gconstpointer v, gconstpointer v2) { - const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *)v, - *elt2 = (const struct rspamd_language_elt *)v2; - return strcmp (elt1->name, elt2->name) == 0; + const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *) v, + *elt2 = (const struct rspamd_language_elt *) v2; + return strcmp(elt1->name, elt2->name) == 0; } /* This hash set stores a word index in the language to avoid duplicate stop words */ -KHASH_INIT (rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal); +KHASH_INIT(rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal); -KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1, - rspamd_langelt_hash_func, rspamd_langelt_equal_func); +KHASH_INIT(rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1, + rspamd_langelt_hash_func, rspamd_langelt_equal_func); struct rspamd_sw_cbdata { struct rspamd_task *task; - khash_t (rspamd_sw_hash) *res; + khash_t(rspamd_sw_hash) * res; GArray *ranges; }; static gint -rspamd_ranges_cmp (const void *k, const void *memb) +rspamd_ranges_cmp(const void *k, const void *memb) { - gint pos = GPOINTER_TO_INT (k); - const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb; + gint pos = GPOINTER_TO_INT(k); + const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *) memb; if (pos >= r->start && pos < r->stop) { return 0; @@ -1626,18 +1632,18 @@ rspamd_ranges_cmp (const void *k, const void *memb) } static gint -rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, - guint strnum, - gint match_start, - gint match_pos, - const gchar *text, - gsize len, - void *context) +rspamd_language_detector_sw_cb(struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) { /* Check if boundary */ const gchar *prev = text, *next = text + len; struct rspamd_stop_word_range *r; - struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context; + struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *) context; khiter_t k; static const gsize max_stop_words = 80; struct rspamd_task *task; @@ -1645,7 +1651,7 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, if (match_start > 0) { prev = text + match_start - 1; - if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) { + if (!(g_ascii_isspace(*prev) || g_ascii_ispunct(*prev))) { return 0; } } @@ -1653,22 +1659,22 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, if (match_pos < len) { next = text + match_pos; - if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) { + if (!(g_ascii_isspace(*next) || g_ascii_ispunct(*next))) { return 0; } } /* We have a word on the boundary, check range */ task = cbdata->task; - r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data, - cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp); + r = bsearch(GINT_TO_POINTER(strnum), cbdata->ranges->data, + cbdata->ranges->len, sizeof(*r), rspamd_ranges_cmp); - g_assert (r != NULL); + g_assert(r != NULL); - k = kh_get (rspamd_sw_hash, cbdata->res, r->elt); + k = kh_get(rspamd_sw_hash, cbdata->res, r->elt); gint nwords = 1; - if (k != kh_end (cbdata->res)) { + if (k != kh_end(cbdata->res)) { khiter_t set_k; int tt; @@ -1678,8 +1684,8 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, if (set_k == kh_end(kh_value(cbdata->res, k))) { /* New word */ set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt); - msg_debug_lang_det ("found new word %*s from %s language (%d stop words found so far)", - (int)(next - prev - 1), prev + 1, r->elt->name, nwords); + msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)", + (int) (next - prev - 1), prev + 1, r->elt->name, nwords); } if (nwords > max_stop_words) { @@ -1689,46 +1695,46 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, else { gint tt; - k = kh_put (rspamd_sw_hash, cbdata->res, r->elt, &tt); + k = kh_put(rspamd_sw_hash, cbdata->res, r->elt, &tt); kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set); kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt); - msg_debug_lang_det ("found new word %*s from %s language (%d stop words found so far)", - (int)(next - prev - 1), prev + 1, r->elt->name, nwords); + msg_debug_lang_det("found new word %*s from %s language (%d stop words found so far)", + (int) (next - prev - 1), prev + 1, r->elt->name, nwords); } return 0; } static gboolean -rspamd_language_detector_try_stop_words (struct rspamd_task *task, - struct rspamd_lang_detector *d, - struct rspamd_mime_text_part *part, - enum rspamd_language_category cat) +rspamd_language_detector_try_stop_words(struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part, + enum rspamd_language_category cat) { struct rspamd_stop_word_elt *elt; struct rspamd_sw_cbdata cbdata; gboolean ret = FALSE; static const int stop_words_threshold = 4, /* minimum stop words count */ - strong_confidence_threshold = 10 /* we are sure that this is enough */; + strong_confidence_threshold = 10 /* we are sure that this is enough */; elt = &d->stop_words[cat]; - cbdata.res = kh_init (rspamd_sw_hash); + cbdata.res = kh_init(rspamd_sw_hash); cbdata.ranges = elt->ranges; cbdata.task = task; - rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data, - part->utf_stripped_content->len, rspamd_language_detector_sw_cb, - &cbdata, NULL); + rspamd_multipattern_lookup(elt->mp, part->utf_stripped_content->data, + part->utf_stripped_content->len, rspamd_language_detector_sw_cb, + &cbdata, NULL); - if (kh_size (cbdata.res) > 0) { - khash_t(rspamd_sw_res_set) *cur_res; + if (kh_size(cbdata.res) > 0) { + khash_t(rspamd_sw_res_set) * cur_res; double max_rate = G_MINDOUBLE; struct rspamd_language_elt *cur_lang, *sel = NULL; gboolean ignore_ascii = FALSE, ignore_latin = FALSE; - again: - kh_foreach (cbdata.res, cur_lang, cur_res, { + again: + kh_foreach(cbdata.res, cur_lang, cur_res, { int cur_matches = kh_size(cur_res); if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) { @@ -1736,8 +1742,8 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task, ignore_ascii = TRUE; sel = NULL; max_rate = G_MINDOUBLE; - msg_debug_lang_det ("ignore ascii after finding %d stop words from %s", - cur_matches, cur_lang->name); + msg_debug_lang_det("ignore ascii after finding %d stop words from %s", + cur_matches, cur_lang->name); goto again; } @@ -1746,8 +1752,8 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task, ignore_latin = TRUE; sel = NULL; max_rate = G_MINDOUBLE; - msg_debug_lang_det ("ignore latin after finding stop %d words from %s", - cur_matches, cur_lang->name); + msg_debug_lang_det("ignore latin after finding stop %d words from %s", + cur_matches, cur_lang->name); goto again; } @@ -1766,46 +1772,46 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task, } } - double rate = (double)cur_matches / (double)cur_lang->stop_words; + double rate = (double) cur_matches / (double) cur_lang->stop_words; if (rate > max_rate) { max_rate = rate; sel = cur_lang; } - msg_debug_lang_det ("found %d stop words from %s: %3f rate", - cur_matches, cur_lang->name, rate); + msg_debug_lang_det("found %d stop words from %s: %3f rate", + cur_matches, cur_lang->name, rate); }); /* Cleanup */ - kh_foreach (cbdata.res, cur_lang, cur_res, { - kh_destroy (rspamd_sw_res_set, cur_res); + kh_foreach(cbdata.res, cur_lang, cur_res, { + kh_destroy(rspamd_sw_res_set, cur_res); }); if (max_rate > 0 && sel) { - msg_debug_lang_det ("set language based on stop words script %s, %.3f found", - sel->name, max_rate); - rspamd_language_detector_set_language (task, part, - sel->name, sel); + msg_debug_lang_det("set language based on stop words script %s, %.3f found", + sel->name, max_rate); + rspamd_language_detector_set_language(task, part, + sel->name, sel); ret = TRUE; } } else { - msg_debug_lang_det ("found no stop words in a text"); + msg_debug_lang_det("found no stop words in a text"); } - kh_destroy (rspamd_sw_hash, cbdata.res); + kh_destroy(rspamd_sw_hash, cbdata.res); return ret; } gboolean -rspamd_language_detector_detect (struct rspamd_task *task, - struct rspamd_lang_detector *d, - struct rspamd_mime_text_part *part) +rspamd_language_detector_detect(struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part) { - khash_t(rspamd_candidates_hash) *candidates; + khash_t(rspamd_candidates_hash) * candidates; GPtrArray *result; gdouble mean, std, start_ticks, end_ticks; guint cand_len; @@ -1820,10 +1826,10 @@ rspamd_language_detector_detect (struct rspamd_task *task, return FALSE; } - start_ticks = rspamd_get_ticks (TRUE); + start_ticks = rspamd_get_ticks(TRUE); guint nchinese = 0, nspecial = 0; - rspamd_language_detector_unicode_scripts (task, part, &nchinese, &nspecial); + rspamd_language_detector_unicode_scripts(task, part, &nchinese, &nspecial); /* Disable internal language detection heuristics if we have fasttext */ if (!rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector) || !d->prefer_fasttext) { @@ -1844,24 +1850,24 @@ rspamd_language_detector_detect (struct rspamd_task *task, if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { rspamd_fasttext_predict_result_t fasttext_predict_result = rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task, - part->utf_words, 4); + part->utf_words, 4); ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); if (ndetected > 0) { - candidates = kh_init (rspamd_candidates_hash); - kh_resize (rspamd_candidates_hash, candidates, ndetected); + candidates = kh_init(rspamd_candidates_hash); + kh_resize(rspamd_candidates_hash, candidates, ndetected); /* Now fill all results where probability is above threshold */ float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0); - for (unsigned int i = 0; i < ndetected; i ++) { + for (unsigned int i = 0; i < ndetected; i++) { float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); if (prob > max_prob * 0.75) { char *lang = rspamd_mempool_strdup(task->task_pool, - rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i)); + rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i)); int tmp; - khiter_t k = kh_put (rspamd_candidates_hash, candidates, lang, &tmp); + khiter_t k = kh_put(rspamd_candidates_hash, candidates, lang, &tmp); kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand)); cand = kh_value(candidates, k); @@ -1892,45 +1898,46 @@ rspamd_language_detector_detect (struct rspamd_task *task, if (ndetected == 0) { if (part->utf_words->len < default_short_text_limit) { r = rs_detect_none; - msg_debug_lang_det ("text is too short for trigrams detection: " - "%d words; at least %d words required", - (int)part->utf_words->len, - (int)default_short_text_limit); + msg_debug_lang_det("text is too short for trigrams detection: " + "%d words; at least %d words required", + (int) part->utf_words->len, + (int) default_short_text_limit); switch (cat) { case RSPAMD_LANGUAGE_CYRILLIC: - rspamd_language_detector_set_language (task, part, "ru", NULL); + rspamd_language_detector_set_language(task, part, "ru", NULL); break; case RSPAMD_LANGUAGE_DEVANAGARI: - rspamd_language_detector_set_language (task, part, "hi", NULL); + rspamd_language_detector_set_language(task, part, "hi", NULL); break; case RSPAMD_LANGUAGE_ARAB: - rspamd_language_detector_set_language (task, part, "ar", NULL); + rspamd_language_detector_set_language(task, part, "ar", NULL); break; default: case RSPAMD_LANGUAGE_LATIN: - rspamd_language_detector_set_language (task, part, "en", NULL); + rspamd_language_detector_set_language(task, part, "en", NULL); break; } - msg_debug_lang_det ("set %s language based on symbols category", - part->language); + msg_debug_lang_det("set %s language based on symbols category", + part->language); - candidates = kh_init (rspamd_candidates_hash); + candidates = kh_init(rspamd_candidates_hash); } else { - candidates = kh_init (rspamd_candidates_hash); - kh_resize (rspamd_candidates_hash, candidates, 32); + candidates = kh_init(rspamd_candidates_hash); + kh_resize(rspamd_candidates_hash, candidates, 32); - r = rspamd_language_detector_try_ngramm (task, - default_words, - d, - part->utf_words, - cat, - candidates); + r = rspamd_language_detector_try_ngramm(task, + default_words, + d, + part->utf_words, + cat, + candidates); if (r == rs_detect_none) { - msg_debug_lang_det ("no trigrams found, fallback to english"); - rspamd_language_detector_set_language (task, part, "en", NULL); - } else if (r == rs_detect_multiple) { + msg_debug_lang_det("no trigrams found, fallback to english"); + rspamd_language_detector_set_language(task, part, "en", NULL); + } + else if (r == rs_detect_multiple) { /* Check our guess */ mean = 0.0; @@ -1938,8 +1945,8 @@ rspamd_language_detector_detect (struct rspamd_task *task, cand_len = 0; /* Check distribution */ - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { mean += cand->prob; cand_len++; } @@ -1948,22 +1955,22 @@ rspamd_language_detector_detect (struct rspamd_task *task, if (cand_len > 0) { mean /= cand_len; - kh_foreach_value (candidates, cand, { + kh_foreach_value(candidates, cand, { gdouble err; - if (!isnan (cand->prob)) { + if (!isnan(cand->prob)) { err = cand->prob - mean; - std += fabs (err); + std += fabs(err); } }); std /= cand_len; } - msg_debug_lang_det ("trigrams checked, %d candidates, %.3f mean, %.4f stddev", - cand_len, mean, std); + msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev", + cand_len, mean, std); - if (cand_len > 0 && std / fabs (mean) < 0.25) { - msg_debug_lang_det ("apply frequency heuristic sorting"); + if (cand_len > 0 && std / fabs(mean) < 0.25) { + msg_debug_lang_det("apply frequency heuristic sorting"); frequency_heuristic_applied = TRUE; cbd.d = d; cbd.mean = mean; @@ -1979,26 +1986,27 @@ rspamd_language_detector_detect (struct rspamd_task *task, } /* Now, convert hash to array and sort it */ - if (r != rs_detect_none && kh_size (candidates) > 0) { - result = g_ptr_array_sized_new (kh_size (candidates)); - - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, - cand->prob); - g_ptr_array_add (result, cand); + if (r != rs_detect_none && kh_size(candidates) > 0) { + result = g_ptr_array_sized_new(kh_size(candidates)); + + kh_foreach_value(candidates, cand, { + if (!isnan(cand->prob)) { + msg_debug_lang_det("final probability %s -> %.2f", cand->lang, + cand->prob); + g_ptr_array_add(result, cand); } }); if (frequency_heuristic_applied) { - g_ptr_array_sort_with_data (result, - rspamd_language_detector_cmp_heuristic, (gpointer) &cbd); - } else { - g_ptr_array_sort (result, rspamd_language_detector_cmp); + g_ptr_array_sort_with_data(result, + rspamd_language_detector_cmp_heuristic, (gpointer) &cbd); + } + else { + g_ptr_array_sort(result, rspamd_language_detector_cmp); } if (result->len > 0 && !frequency_heuristic_applied) { - cand = g_ptr_array_index (result, 0); + cand = g_ptr_array_index(result, 0); if (cand->elt) { cand->elt->occurrences++; } @@ -2006,45 +2014,44 @@ rspamd_language_detector_detect (struct rspamd_task *task, } if (part->languages != NULL) { - g_ptr_array_unref (part->languages); + g_ptr_array_unref(part->languages); } part->languages = result; - part->language = ((struct rspamd_lang_detector_res *)g_ptr_array_index (result, 0))->lang; + part->language = ((struct rspamd_lang_detector_res *) g_ptr_array_index(result, 0))->lang; ret = TRUE; } else if (part->languages == NULL) { - rspamd_language_detector_set_language (task, part, "en", NULL); + rspamd_language_detector_set_language(task, part, "en", NULL); } - kh_destroy (rspamd_candidates_hash, candidates); + kh_destroy(rspamd_candidates_hash, candidates); } - end_ticks = rspamd_get_ticks (TRUE); - msg_debug_lang_det ("detected languages in %.0f ticks", - (end_ticks - start_ticks)); + end_ticks = rspamd_get_ticks(TRUE); + msg_debug_lang_det("detected languages in %.0f ticks", + (end_ticks - start_ticks)); return ret; } -struct rspamd_lang_detector* -rspamd_language_detector_ref (struct rspamd_lang_detector* d) +struct rspamd_lang_detector * +rspamd_language_detector_ref(struct rspamd_lang_detector *d) { - REF_RETAIN (d); + REF_RETAIN(d); return d; } -void -rspamd_language_detector_unref (struct rspamd_lang_detector* d) +void rspamd_language_detector_unref(struct rspamd_lang_detector *d) { - REF_RELEASE (d); + REF_RELEASE(d); } gboolean -rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d, - const gchar *word, gsize wlen) +rspamd_language_detector_is_stop_word(struct rspamd_lang_detector *d, + const gchar *word, gsize wlen) { khiter_t k; rspamd_ftok_t search; @@ -2052,17 +2059,16 @@ rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d, search.begin = word; search.len = wlen; - k = kh_get (rspamd_stopwords_hash, d->stop_words_norm, &search); + k = kh_get(rspamd_stopwords_hash, d->stop_words_norm, &search); - if (k != kh_end (d->stop_words_norm)) { + if (k != kh_end(d->stop_words_norm)) { return TRUE; } return FALSE; } -gint -rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt) +gint rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt) { if (elt) { return elt->flags; |