summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-02-05 20:20:24 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-02-05 20:20:24 +0000
commit27faff16112850f1d29de49c287ec00534c323b4 (patch)
tree4e6246452f88684c1b18c3608a665c7d2fbc6806
parent4e85642cbab29192b48c7f662d2ab2921912d9bc (diff)
downloadrspamd-27faff16112850f1d29de49c287ec00534c323b4.tar.gz
rspamd-27faff16112850f1d29de49c287ec00534c323b4.zip
[Minor] Use ngramm chains structure
-rw-r--r--src/libmime/lang_detection.c54
1 files changed, 29 insertions, 25 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 1c3a5bded..cb2427724 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -100,6 +100,11 @@ struct rspamd_ngramm_elt {
gdouble prob;
};
+struct rspamd_ngramm_chain {
+ GPtrArray *languages;
+ gchar *utf;
+};
+
struct rspamd_lang_detector {
GPtrArray *languages;
GHashTable *unigramms; /* unigramms frequencies */
@@ -200,14 +205,20 @@ rspamd_language_detector_ucs_is_latin (UChar *s, gsize len)
return ret;
}
+struct rspamd_language_ucs_elt {
+ guint freq;
+ const gchar *utf;
+ UChar s[0];
+};
+
static void
rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
struct rspamd_lang_detector *d,
struct rspamd_language_elt *lelt,
- UChar *s, guint len, guint freq, guint total)
+ struct rspamd_language_ucs_elt *ucs, guint len, guint freq, guint total)
{
GHashTable *target;
- GPtrArray *ar;
+ struct rspamd_ngramm_chain *chain;
struct rspamd_ngramm_elt *elt;
guint i;
gboolean found;
@@ -227,23 +238,25 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
break;
}
- ar = g_hash_table_lookup (target, s);
+ chain = g_hash_table_lookup (target, ucs->s);
- if (ar == NULL) {
+ if (chain == NULL) {
/* New element */
- ar = g_ptr_array_sized_new (32);
+ chain = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*chain));
+ chain->languages = g_ptr_array_sized_new (32);
+ chain->utf = rspamd_mempool_strdup (cfg->cfg_pool, ucs->utf);
elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt));
elt->elt = lelt;
elt->prob = ((gdouble)freq) / ((gdouble)total);
- g_ptr_array_add (ar, elt);
+ g_ptr_array_add (chain->languages, elt);
- g_hash_table_insert (target, s, ar);
+ g_hash_table_insert (target, ucs->s, chain);
}
else {
/* Check sanity */
found = FALSE;
- PTR_ARRAY_FOREACH (ar, i, elt) {
+ PTR_ARRAY_FOREACH (chain->languages, i, elt) {
if (strcmp (elt->elt->name, lelt->name) == 0) {
found = TRUE;
elt->prob += ((gdouble)freq) / ((gdouble)total);
@@ -255,16 +268,11 @@ rspamd_language_detector_init_ngramm (struct rspamd_config *cfg,
elt = rspamd_mempool_alloc (cfg->cfg_pool, sizeof (*elt));
elt->elt = lelt;
elt->prob = ((gdouble)freq) / ((gdouble)total);
- g_ptr_array_add (ar, elt);
+ g_ptr_array_add (chain->languages, elt);
}
}
}
-struct rspamd_language_ucs_elt {
- guint freq;
- const gchar *utf;
- UChar s[0];
-};
static const gchar *
rspamd_language_detector_print_flags (struct rspamd_language_elt *elt)
@@ -489,7 +497,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
PTR_ARRAY_FOREACH (ngramms, i, ucs_elt) {
if (ucs_elt->freq > 0) {
rspamd_language_detector_init_ngramm (cfg, d,
- nelt, ucs_elt->s, nsym,
+ nelt, ucs_elt, nsym,
ucs_elt->freq, total);
}
}
@@ -803,7 +811,7 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
GHashTable *candidates)
{
guint i;
- GPtrArray *ar;
+ struct rspamd_ngramm_chain *chain;
struct rspamd_ngramm_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTable *ngramms;
@@ -819,19 +827,15 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
}
- ar = g_hash_table_lookup (ngramms, window);
+ chain = g_hash_table_lookup (ngramms, window);
- if (ar) {
- PTR_ARRAY_FOREACH (ar, i, elt) {
+ if (chain) {
+ PTR_ARRAY_FOREACH (chain->languages, i, elt) {
cand = g_hash_table_lookup (candidates, elt->elt->name);
prob = elt->prob;
#ifdef NGRAMMS_DEBUG
- UConverter *ucnv;
- UErrorCode uc_err = U_ZERO_ERROR;
- char buf[1024];
- ucnv = ucnv_open ("UTF-8", &uc_err);
- ucnv_fromUChars (ucnv, buf, sizeof (buf), window, 3, &uc_err);
- msg_err ("gramm: %s, lang: %s, prob: %.3f", buf, elt->elt->name, log2 (elt->prob));
+ msg_err ("gramm: %s, lang: %s, prob: %.3f", chain->utf,
+ elt->elt->name, log2 (elt->prob));
#endif
if (cand == NULL) {
cand = g_malloc (sizeof (*cand));