Browse Source

[Project] Add ngramms frequencies detector

tags/1.7.0
Vsevolod Stakhov 6 years ago
parent
commit
f581bcea91
2 changed files with 197 additions and 1 deletions
  1. 195
    1
      src/libmime/lang_detection.c
  2. 2
    0
      src/libmime/lang_detection.h

+ 195
- 1
src/libmime/lang_detection.c View File

@@ -303,13 +303,207 @@ rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
}
}

/*
 * Order of the ngramm model to consult. The value selects both the sliding
 * window length (1, 2 or 3 characters) and which per-language frequency
 * table (unigramms, bigramms, trigramms) is queried.
 */
enum rspamd_language_gramm_type {
	rs_unigramm = 0, /* single characters */
	rs_bigramm = 1,  /* pairs of characters */
	rs_trigramm = 2  /* triples of characters */
};

/*
 * Extracts the next ngramm of `wlen` characters from `tok` into `window`.
 *
 * For wlen > 1 the word is treated as if it were surrounded by spaces: the
 * first window (cur_off == 0) starts with a space followed by the first
 * wlen - 1 characters of the token, and the last window ends with a space.
 * For wlen == 1 characters are returned one by one without any padding.
 *
 * @param tok token to split; tok->begin is read as an array of UChar and
 *            tok->len is compared against character offsets
 *            (NOTE(review): assumes tok->len counts UChar units - confirm)
 * @param window output buffer with room for at least `wlen` UChar elements
 * @param wlen ngramm length
 * @param cur_off offset returned by the previous call, 0 for the first call
 * @return offset to pass to the next call, or -1 when the token is exhausted
 *         (in which case `window` is not meaningful)
 */
static goffset
rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
		guint wlen, goffset cur_off)
{
	const UChar *chars = (const UChar *)tok->begin;
	guint i;

	if (wlen > 1) {
		/* Deal with spaces at the beginning and ending */

		if (cur_off == 0) {
			if (tok->len < wlen - 1) {
				/* Token is too short to fill the window: avoid overread */
				return -1;
			}

			window[0] = (UChar)' ';

			for (i = 0; i < wlen - 1; i ++) {
				window[i + 1] = chars[i];
			}
		}
		else if (cur_off + wlen == tok->len + 1) {
			/* Add trailing space */
			for (i = 0; i < wlen - 1; i ++) {
				window[i] = chars[cur_off + i];
			}

			window[wlen - 1] = (UChar)' ';
		}
		else if (cur_off + wlen > tok->len + 1) {
			/* No more fun */
			return -1;
		}
		else {
			/*
			 * Normal case: must be exclusive with the padded branches above,
			 * otherwise it overwrites the padding and, for the trailing
			 * window, reads one element past the end of the token.
			 */
			for (i = 0; i < wlen; i ++) {
				window[i] = chars[cur_off + i];
			}
		}
	}
	else {
		/* Stop once the offset reaches the end of the token */
		if (tok->len <= cur_off) {
			return -1;
		}

		window[0] = chars[cur_off];
	}

	return cur_off + 1;
}

/*
 * Do full guess for a specific ngramm, checking all languages defined.
 *
 * For every language in d->languages the frequency stored for `window` in
 * the table selected by `type` is looked up and added to that language's
 * entry in `candidates`. A missing entry is allocated with g_malloc and
 * inserted under the language name, so the table owns it from then on
 * (NOTE(review): freeing depends on how `candidates` was created - confirm
 * it has a value destroy function or is cleaned up by the caller).
 *
 * @param d detector holding the array of known languages
 * @param window ngramm used as the lookup key
 *               (NOTE(review): key format depends on the hash function of
 *               the ngramm tables, which is not visible here)
 * @param type which ngramm table to consult; unknown values are ignored
 * @param candidates map of language name -> struct rspamd_lang_detector_res
 */
static void
rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
		UChar *window, enum rspamd_language_gramm_type type,
		GHashTable *candidates)
{
	guint i, freq;
	struct rspamd_language_elt *elt;
	struct rspamd_lang_detector_res *cand;
	GHashTable *ngramms;

	for (i = 0; i < d->languages->len; i ++) {
		elt = g_ptr_array_index (d->languages, i);

		switch (type) {
		case rs_unigramm:
			ngramms = elt->unigramms;
			break;
		case rs_bigramm:
			ngramms = elt->bigramms;
			break;
		case rs_trigramm:
			ngramms = elt->trigramms;
			break;
		default:
			/* Unknown ngramm type: there is no table to query */
			return;
		}

		freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
		cand = g_hash_table_lookup (candidates, elt->name);

		if (cand == NULL) {
			cand = g_malloc (sizeof (*cand));
			cand->elt = elt;
			cand->lang = elt->name;
			cand->prob = freq;

			/*
			 * Register the new candidate, otherwise it is leaked and the
			 * next lookup allocates yet another one.
			 */
			g_hash_table_insert (candidates, (gpointer)elt->name, cand);
		}
		else {
			/* Update guess */
			cand->prob += freq;
		}
	}
}

/*
 * Scores `window` against the languages already present in `candidates`.
 * Each candidate's probability is increased by the frequency found in its
 * table for the given ngramm type. When the frequencies of all candidates
 * sum to zero (including the case of an empty candidates table), the full
 * scan over all known languages is performed instead.
 */
static void
rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
		UChar *window, enum rspamd_language_gramm_type type,
		GHashTable *candidates)
{
	GHashTableIter iter;
	gpointer key, value;
	guint matched = 0;

	g_hash_table_iter_init (&iter, candidates);

	while (g_hash_table_iter_next (&iter, &key, &value)) {
		struct rspamd_lang_detector_res *res =
				(struct rspamd_lang_detector_res *)value;
		struct rspamd_language_elt *lang = res->elt;
		GHashTable *table;
		guint hits;

		switch (type) {
		case rs_unigramm:
			table = lang->unigramms;
			break;
		case rs_bigramm:
			table = lang->bigramms;
			break;
		case rs_trigramm:
			table = lang->trigramms;
			break;
		}

		hits = GPOINTER_TO_UINT (g_hash_table_lookup (table, window));
		res->prob += hits;
		matched += hits;
	}

	if (matched != 0) {
		/* At least one known candidate matched, nothing more to do */
		return;
	}

	/* No candidate knows this ngramm: fall back to checking every language */
	rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
}

/*
 * Splits `tok` into ngramms of the requested type and scores each of them
 * against the current candidates (falling back to all languages when no
 * candidate matches, see rspamd_language_detector_process_ngramm_update).
 *
 * @param d language detector
 * @param tok word to process
 * @param candidates map of language name -> struct rspamd_lang_detector_res
 * @param type ngramm type, defines the window length (1, 2 or 3)
 * @return TRUE if at least one ngramm was extracted and processed, FALSE
 *         otherwise (token too short or unknown type)
 *         NOTE(review): the original had no return statement at all; confirm
 *         that this is the meaning callers expect.
 */
static gboolean
rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
		rspamd_stat_token_t *tok, GHashTable *candidates,
		enum rspamd_language_gramm_type type)
{
	guint wlen;
	UChar window[3];
	goffset cur = 0;
	gboolean processed = FALSE;

	switch (type) {
	case rs_unigramm:
		wlen = 1;
		break;
	case rs_bigramm:
		wlen = 2;
		break;
	case rs_trigramm:
		wlen = 3;
		break;
	default:
		/* Unknown type: window length is undefined, do nothing */
		return FALSE;
	}

	/* Split words */
	while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
			!= -1) {
		rspamd_language_detector_process_ngramm_update (d, window, type,
				candidates);
		processed = TRUE;
	}

	return processed;
}

/*
 * Splits `tok` into ngramms of the requested type and checks each of them
 * against all languages known to the detector, accumulating the results in
 * `candidates`.
 *
 * @param d language detector
 * @param tok word to process
 * @param candidates map of language name -> struct rspamd_lang_detector_res
 * @param type ngramm type, defines the window length (1, 2 or 3); unknown
 *             values are ignored
 */
static void
rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
		rspamd_stat_token_t *tok, GHashTable *candidates,
		enum rspamd_language_gramm_type type)
{
	guint wlen;
	UChar window[3];
	goffset cur = 0;

	switch (type) {
	case rs_unigramm:
		wlen = 1;
		break;
	case rs_bigramm:
		wlen = 2;
		break;
	case rs_trigramm:
		wlen = 3;
		break;
	default:
		/* Unknown type: window length is undefined, do nothing */
		return;
	}

	/* Split words */
	while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
			!= -1) {
		rspamd_language_detector_process_ngramm_full (d, window, type,
				candidates);
	}
}

const gchar *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GPtrArray *ucs_tokens, gsize words_len)
{
if (words_len < d->short_text_limit) {
/* For short text, start directly from trigramms */
return rspamd_language_detector_detect_trigramm ();
}

/* Start with unigramms */

+ 2
- 0
src/libmime/lang_detection.h View File

@@ -22,10 +22,12 @@
#include "libstat/stat_api.h"

struct rspamd_lang_detector;
struct rspamd_language_elt;

/**
 * Detection result (candidate) for a single language
 */
struct rspamd_lang_detector_res {
gdouble prob; /**< accumulated score: sum of the ngramm frequencies matched for this language */
const gchar *lang; /**< language name, points to the name stored in `elt` */
struct rspamd_language_elt *elt; /**< language element whose ngramm tables are consulted */
};

/**

Loading…
Cancel
Save