UChar *window, enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
- guint i, freq;
+ guint i;
+ gdouble freq, class_freq;
struct rspamd_language_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTable *ngramms;
switch (type) {
case rs_unigramm:
ngramms = elt->unigramms;
+ class_freq = elt->unigramms_total;
break;
case rs_bigramm:
ngramms = elt->bigramms;
+ class_freq = elt->bigramms_total;
break;
case rs_trigramm:
ngramms = elt->trigramms;
+ class_freq = elt->trigramms_total;
break;
}
- freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
+ freq = ((gdouble)GPOINTER_TO_UINT (
+ g_hash_table_lookup (ngramms, window))) / class_freq;
cand = g_hash_table_lookup (candidates, elt->name);
if (cand == NULL) {
/*
* Check only candidates, if none found, switch to full version
*/
-static void
+static gboolean
rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
UChar *window, enum rspamd_language_gramm_type type,
GHashTable *candidates)
{
- guint freq, total_freq = 0;
+ gdouble freq, total_freq = 0.0, class_freq;
struct rspamd_language_elt *elt;
struct rspamd_lang_detector_res *cand;
GHashTableIter it;
switch (type) {
case rs_unigramm:
ngramms = elt->unigramms;
+ class_freq = elt->unigramms_total;
break;
case rs_bigramm:
ngramms = elt->bigramms;
+ class_freq = elt->bigramms_total;
break;
case rs_trigramm:
ngramms = elt->trigramms;
+ class_freq = elt->trigramms_total;
break;
}
- freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
+ freq = ((gdouble)GPOINTER_TO_UINT (
+ g_hash_table_lookup (ngramms, window))) / class_freq;
cand->prob += freq;
total_freq += freq;
if (total_freq == 0) {
/* Nothing found , do full scan which will also update candidates */
rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
+
+ return FALSE;
}
+
+ return TRUE;
}
static gboolean
guint wlen;
UChar window[3];
goffset cur = 0;
+ gboolean ret = TRUE;
switch (type) {
case rs_unigramm:
/* Split words */
while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
!= -1) {
- rspamd_language_detector_process_ngramm_update (d, window, type, candidates);
+ if (!rspamd_language_detector_process_ngramm_update (d, window,
+ type, candidates)) {
+ ret = FALSE;
+ }
}
+
+ return ret;
}
static void
g_hash_table_iter_remove (&it);
}
else {
- cand->prob = log2 (cand->prob / cand->total_words);
+ cand->prob = log2 (cand->prob);
if (cand->prob > max_prob) {
max_prob = cand->prob;
}
}
+ g_hash_table_iter_init (&it, candidates);
/* Filter step */
while (g_hash_table_iter_next (&it, &k, &v)) {
cand = (struct rspamd_lang_detector_res *) v;
rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
GPtrArray *ucs_tokens,
GHashTable *candidates,
- enum rspamd_language_gramm_type type)
+ enum rspamd_language_gramm_type type,
+ gboolean start_over)
{
guint nparts = MIN (ucs_tokens->len, default_words);
goffset *selected_words;
/* Deal with the first word in a special case */
tok = g_ptr_array_index (ucs_tokens, selected_words[0]);
- rspamd_language_detector_detect_word (d, tok, candidates, type);
+
+ if (start_over) {
+ rspamd_language_detector_detect_word (d, tok, candidates, type);
+ }
+ else {
+ rspamd_language_detector_update_guess (d, tok, candidates, type);
+ }
for (i = 1; i < nparts; i ++) {
tok = g_ptr_array_index (ucs_tokens, selected_words[i]);
rspamd_language_detector_filter_negligible (candidates);
}
-const gchar *
+/**
+ * Comparator for g_ptr_array_sort () over an array whose elements are
+ * pointers to struct rspamd_lang_detector_res.
+ *
+ * g_ptr_array_sort () hands the comparator pointers to the array slots,
+ * hence each argument is dereferenced once to obtain the candidate.
+ *
+ * @param a pointer to the first array slot
+ * @param b pointer to the second array slot
+ * @return 1 if the first candidate has the larger prob, -1 if the second
+ * one has, 0 if they are equal
+ *
+ * NOTE(review): this ordering sorts ascending by prob, so the candidate
+ * with the highest prob ends up last in the array - confirm that callers
+ * expect that and not the best guess at index 0.
+ */
+static gint
+rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
+{
+ const struct rspamd_lang_detector_res
+ *canda = *(const struct rspamd_lang_detector_res * const *)a,
+ *candb = *(const struct rspamd_lang_detector_res * const *)b;
+
+ if (canda->prob > candb->prob) {
+ return 1;
+ }
+ else if (candb->prob > canda->prob) {
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * Detects candidate languages for a tokenized text.
+ *
+ * Short texts (words_len below d->short_text_limit) are analysed with
+ * trigramms only. Longer texts start with unigramms; while more than one
+ * candidate remains the guess is refined with bigramms on the same
+ * candidates table, and finally trigramms are evaluated from scratch on a
+ * separate table which replaces the previous one only if it holds fewer
+ * candidates.
+ *
+ * @param d language detector
+ * @param ucs_tokens tokens of the text to analyse
+ * @param words_len length value compared against d->short_text_limit
+ * @return newly created array of struct rspamd_lang_detector_res pointers
+ * sorted with rspamd_language_detector_cmp; the array is created with
+ * g_free as its element free function and takes over the candidates stolen
+ * from the hash table (presumably the caller has to release the array -
+ * confirm against callers)
+ */
+GPtrArray *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GPtrArray *ucs_tokens, gsize words_len)
{
GHashTable *candidates;
+ GPtrArray *result;
+ GHashTableIter it;
+ gpointer k, v;
+ struct rspamd_lang_detector_res *cand;
+ guint cand_len, prev_len;
candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
NULL, g_free);
if (words_len < d->short_text_limit) {
/* For short text, start directly from trigramms */
rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
- rs_trigramm);
+ rs_trigramm, TRUE);
}
else {
/* Start with unigramms */
+ rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
+ rs_unigramm, TRUE);
+ cand_len = g_hash_table_size (candidates);
+
+ if (cand_len > 1) {
+ /* Try bigramms: refine the existing candidates, do not start over */
+ rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
+ rs_bigramm, FALSE);
+
+ cand_len = g_hash_table_size (candidates);
+ if (cand_len > 1) {
+ prev_len = cand_len;
+ /* Try trigramms */
+ GHashTable *ncandidates;
+ ncandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
+ NULL, g_free);
+ rspamd_language_detector_detect_type (d, ucs_tokens, ncandidates,
+ rs_trigramm, TRUE);
+ cand_len = g_hash_table_size (ncandidates);
+
+ /*
+ * NOTE(review): an empty trigramm result (cand_len == 0) also
+ * satisfies this condition and would discard the previous
+ * candidates - confirm whether that is intended.
+ */
+ if (cand_len < prev_len) {
+ g_hash_table_unref (candidates);
+ candidates = ncandidates;
+ }
+ else {
+ /* Not a better guess */
+ g_hash_table_unref (ncandidates);
+ }
+ }
+ }
}
+
+ /* Now, convert hash to array and sort it */
+ result = g_ptr_array_new_full (g_hash_table_size (candidates), g_free);
+ g_hash_table_iter_init (&it, candidates);
+
+ while (g_hash_table_iter_next (&it, &k, &v)) {
+ cand = (struct rspamd_lang_detector_res *) v;
+ g_ptr_array_add (result, cand);
+ /* Steal the entry so the table's g_free does not free what the array now owns */
+ g_hash_table_iter_steal (&it);
+ }
+
+ g_ptr_array_sort (result, rspamd_language_detector_cmp);
+ g_hash_table_unref (candidates);
+
+ return result;
}
\ No newline at end of file