Browse Source

[Project] Implement logic of ngramms application

tags/1.7.0
Vsevolod Stakhov 6 years ago
parent
commit
88950e4e4f
2 changed files with 107 additions and 14 deletions
  1. 105
    11
      src/libmime/lang_detection.c
  2. 2
    3
      src/libmime/lang_detection.h

+ 105
- 11
src/libmime/lang_detection.c View File

@@ -380,7 +380,8 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
UChar *window, enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
guint i, freq;
guint i;
gdouble freq, class_freq;
struct rspamd_language_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTable *ngramms;
@@ -391,16 +392,20 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
switch (type) {
case rs_unigramm:
ngramms = elt->unigramms;
class_freq = elt->unigramms_total;
break;
case rs_bigramm:
ngramms = elt->bigramms;
class_freq = elt->bigramms_total;
break;
case rs_trigramm:
ngramms = elt->trigramms;
class_freq = elt->trigramms_total;
break;
}

freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
freq = ((gdouble)GPOINTER_TO_UINT (
g_hash_table_lookup (ngramms, window))) / class_freq;
cand = g_hash_table_lookup (candidates, elt->name);

if (cand == NULL) {
@@ -419,12 +424,12 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
/*
* Check only candidates, if none found, switch to full version
*/
static void
static gboolean
rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
UChar *window, enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
guint freq, total_freq = 0;
gdouble freq, total_freq = 0.0, class_freq;
struct rspamd_language_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTableIter it;
@@ -440,16 +445,20 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
switch (type) {
case rs_unigramm:
ngramms = elt->unigramms;
class_freq = elt->unigramms_total;
break;
case rs_bigramm:
ngramms = elt->bigramms;
class_freq = elt->bigramms_total;
break;
case rs_trigramm:
ngramms = elt->trigramms;
class_freq = elt->trigramms_total;
break;
}

freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
freq = ((gdouble)GPOINTER_TO_UINT (
g_hash_table_lookup (ngramms, window))) / class_freq;

cand->prob += freq;
total_freq += freq;
@@ -458,7 +467,11 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
if (total_freq == 0) {
/* Nothing found , do full scan which will also update candidates */
rspamd_language_detector_process_ngramm_full (d, window, type, candidates);

return FALSE;
}

return TRUE;
}

static gboolean
@@ -469,6 +482,7 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
guint wlen;
UChar window[3];
goffset cur = 0;
gboolean ret = TRUE;

switch (type) {
case rs_unigramm:
@@ -485,8 +499,13 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
/* Split words */
while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
!= -1) {
rspamd_language_detector_process_ngramm_update (d, window, type, candidates);
if (!rspamd_language_detector_process_ngramm_update (d, window,
type, candidates)) {
ret = FALSE;
}
}

return ret;
}

static void
@@ -539,7 +558,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
g_hash_table_iter_remove (&it);
}
else {
cand->prob = log2 (cand->prob / cand->total_words);
cand->prob = log2 (cand->prob);

if (cand->prob > max_prob) {
max_prob = cand->prob;
@@ -547,6 +566,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
}
}

g_hash_table_iter_init (&it, candidates);
/* Filter step */
while (g_hash_table_iter_next (&it, &k, &v)) {
cand = (struct rspamd_lang_detector_res *) v;
@@ -565,7 +585,8 @@ static void
rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
GPtrArray *ucs_tokens,
GHashTable *candidates,
enum rspamd_language_gramm_type type)
enum rspamd_language_gramm_type type,
gboolean start_over)
{
guint nparts = MIN (ucs_tokens->len, default_words);
goffset *selected_words;
@@ -577,7 +598,13 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,

/* Deal with the first word in a special case */
tok = g_ptr_array_index (ucs_tokens, selected_words[0]);
rspamd_language_detector_detect_word (d, tok, candidates, type);

if (start_over) {
rspamd_language_detector_detect_word (d, tok, candidates, type);
}
else {
rspamd_language_detector_update_guess (d, tok, candidates, type);
}

for (i = 1; i < nparts; i ++) {
tok = g_ptr_array_index (ucs_tokens, selected_words[i]);
@@ -588,20 +615,87 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
rspamd_language_detector_filter_negligible (candidates);
}

const gchar *
/*
 * Comparator for g_ptr_array_sort (): orders detection results by their
 * probability, the most probable candidate first.
 *
 * The GPtrArray sort callback receives pointers to the array slots (that is,
 * pointers to the stored pointers), hence the extra dereference of both
 * arguments.
 *
 * @param a pointer to an array slot holding struct rspamd_lang_detector_res *
 * @param b pointer to another array slot of the same kind
 * @return negative if a is more probable than b, positive if less, 0 if equal
 */
static gint
rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
{
	const struct rspamd_lang_detector_res
			*canda = *(const struct rspamd_lang_detector_res **)a,
			*candb = *(const struct rspamd_lang_detector_res **)b;

	/* Higher probability sorts earlier (descending order) */
	if (canda->prob > candb->prob) {
		return -1;
	}
	else if (candb->prob > canda->prob) {
		return 1;
	}

	return 0;
}

/*
 * Detect candidate languages for a tokenized text.
 *
 * Texts shorter than d->short_text_limit are checked with trigramms only.
 * Longer texts start with unigramms; while more than one candidate remains
 * the guess is refined with bigramms over the same candidates, and finally a
 * separate trigramm pass is run from scratch. The trigramm result replaces
 * the current one only if it leaves fewer candidates.
 *
 * @param d language detector
 * @param ucs_tokens array of tokens to analyse
 * @param words_len value compared against d->short_text_limit
 * @return newly created array of struct rspamd_lang_detector_res * ordered by
 * rspamd_language_detector_cmp; the array is created with g_free as the
 * element free function, so it owns the results it holds
 */
GPtrArray *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
		GPtrArray *ucs_tokens, gsize words_len)
{
	GHashTable *candidates;
	GPtrArray *result;
	GHashTableIter it;
	gpointer k, v;
	struct rspamd_lang_detector_res *cand;
	guint cand_len, prev_len;

	candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
			NULL, g_free);

	if (words_len < d->short_text_limit) {
		/* For short text, start directly from trigramms */
		rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
				rs_trigramm, TRUE);
	}
	else {
		/* Start with unigramms */
		rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
				rs_unigramm, TRUE);
		cand_len = g_hash_table_size (candidates);

		if (cand_len > 1) {
			/*
			 * Try bigramms: refine the existing candidates instead of
			 * starting over
			 */
			rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
					rs_bigramm, FALSE);

			cand_len = g_hash_table_size (candidates);

			if (cand_len > 1) {
				GHashTable *ncandidates;

				prev_len = cand_len;
				/* Try trigramms on a fresh set of candidates */
				ncandidates = g_hash_table_new_full (rspamd_str_hash,
						rspamd_str_equal, NULL, g_free);
				rspamd_language_detector_detect_type (d, ucs_tokens,
						ncandidates, rs_trigramm, TRUE);
				cand_len = g_hash_table_size (ncandidates);

				if (cand_len < prev_len) {
					/* Trigramms narrowed the guess, so adopt their result */
					g_hash_table_unref (candidates);
					candidates = ncandidates;
				}
				else {
					/* Not a better guess */
					g_hash_table_unref (ncandidates);
				}
			}
		}
	}

	/* Now, convert hash to array and sort it */
	result = g_ptr_array_new_full (g_hash_table_size (candidates), g_free);
	g_hash_table_iter_init (&it, candidates);

	while (g_hash_table_iter_next (&it, &k, &v)) {
		cand = (struct rspamd_lang_detector_res *) v;
		g_ptr_array_add (result, cand);
		/*
		 * Steal the entry so that the table's value destructor does not
		 * free a result that now belongs to the array
		 */
		g_hash_table_iter_steal (&it);
	}

	g_ptr_array_sort (result, rspamd_language_detector_cmp);
	g_hash_table_unref (candidates);

	return result;
}

+ 2
- 3
src/libmime/lang_detection.h View File

@@ -26,7 +26,6 @@ struct rspamd_language_elt;

/*
 * A single language candidate tracked by the detector.
 */
struct rspamd_lang_detector_res {
/*
 * Score of the candidate: ngramm frequencies are accumulated here and the
 * sum is later replaced by its log2 when negligible candidates are filtered
 */
gdouble prob;
/*
 * NOTE(review): the older normalisation divided prob by this value; the newer
 * code in this view no longer reads it - confirm whether the field is still
 * part of the struct
 */
gdouble total_words;
/* presumably the language name/code of the candidate - TODO confirm */
const gchar *lang;
/* presumably the language element this candidate refers to - TODO confirm */
struct rspamd_language_elt *elt;
};
@@ -53,9 +52,9 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
* @param d
* @param ucs_tokens
* @param words_len
* @return language code or NULL if language has not been detected
* @return array of struct rspamd_lang_detector_res sorted by freq descending
*/
const gchar * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GPtrArray *ucs_tokens, gsize words_len);

#endif

Loading…
Cancel
Save