rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \
G_STRFUNC, \
__VA_ARGS__)
+#define msg_debug_lang_det_cfg(...) rspamd_conditional_debug_fast (NULL, NULL, \
+ rspamd_langdet_log_id, "langdet", cfg->cfg_pool->tag.uid, \
+ G_STRFUNC, \
+ __VA_ARGS__) /* Debug logging under the "langdet" log id, tagged with
+ * cfg->cfg_pool->tag.uid. The expansion reads `cfg` from the caller's
+ * scope, so a variable with that name must be visible wherever this
+ * macro is used. */
INIT_LOG_MODULE(langdet)
gboolean ret = TRUE;
for (i = 0; i < len; i ++) {
- if (!((s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= 'a' && s[i] <= 'z'))) {
+ if (!((s[i] >= 'A' && s[i] <= 'Z') || (s[i] >= 'a' && s[i] <= 'z')
+ || s[i] == ' ')) {
ret = FALSE;
break;
}
struct rspamd_language_ucs_elt { /* One ngramm collected while reading a language file */
guint freq; /* Ngramm frequency; the loader resets it to 0 for discarded (low frequency) ngramms */
+ const gchar *utf; /* Original key string, assigned from `key` by the loader and printed in debug output. NOTE(review): the pointer is stored, not copied - confirm `key` outlives this element */
UChar s[0]; /* Unicode (UChar) form of the key, filled by the loader's key conversion; trailing storage, must stay the last member */
};
return flags_buf;
}
+/**
+ * Comparator for g_ptr_array_sort: orders ngramm elements by descending
+ * frequency.
+ * @param a pointer to an array slot holding a struct rspamd_language_ucs_elt *
+ * @param b pointer to the other array slot, same layout as `a`
+ * @return negative if a's element is more frequent than b's, positive if it
+ * is less frequent, 0 if both frequencies are equal
+ */
+static gint
+rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b)
+{
+	const struct rspamd_language_ucs_elt *e1 =
+			*(const struct rspamd_language_ucs_elt * const *)a;
+	const struct rspamd_language_ucs_elt *e2 =
+			*(const struct rspamd_language_ucs_elt * const *)b;
+
+	/*
+	 * Compare instead of subtracting: freq is unsigned, so casting both
+	 * values to gint and taking the difference could overflow and yield
+	 * an inconsistent ordering for large frequencies.
+	 */
+	return (e2->freq > e1->freq) - (e2->freq < e1->freq);
+}
+
static void
rspamd_language_detector_read_file (struct rspamd_config *cfg,
struct rspamd_lang_detector *d,
ucs_elt->s, keylen + 1,
key,
keylen, &uc_err);
+ ucs_elt->utf = key;
if (uc_err != U_ZERO_ERROR) {
msg_warn_config ("cannot convert key to unicode: %s",
}
/* Now, discriminate low frequency ngramms */
- if (ucs_elt->freq < mean + std / 8.0) {
+ if (ucs_elt->freq < mean) {
ucs_elt->freq = 0;
skipped ++;
continue;
loaded ++;
}
+ g_ptr_array_sort (ngramms, rspamd_language_detector_cmp_ngramm);
+
PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
if (ucs_elt->freq > 0) {
rspamd_language_detector_init_ngramm (cfg, d, nelt, ucs_elt->s,
}
}
+	/*
+	 * Useful for debug: dump the first ngramms of the array, which has been
+	 * sorted by descending frequency above. The loop is bounded by the array
+	 * length as well, since g_ptr_array_index does no bounds checking and a
+	 * language file may yield fewer than 10 ngramms.
+	 */
+	for (i = 0; i < 10 && i < ngramms->len; i ++) {
+		ucs_elt = g_ptr_array_index (ngramms, i);
+
+		msg_debug_lang_det_cfg ("%s -> %s: %d", nelt->name,
+				ucs_elt->utf, ucs_elt->freq);
+	}
+
g_ptr_array_free (ngramms, TRUE);
msg_info_config ("loaded %s language, %d unigramms, %d trigramms, "
"%d ngramms loaded; "
struct rspamd_ngramm_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTable *ngramms;
+ /* Ignore if ngramm is found in that amount of languages */
+ static const guint languages_cutoff = 10;
switch (type) {
case rs_unigramm:
ar = g_hash_table_lookup (ngramms, window);
- if (ar) {
+ if (ar && ar->len < languages_cutoff) {
PTR_ARRAY_FOREACH (ar, i, elt) {
cand = g_hash_table_lookup (candidates, elt->elt->name);