}
static void
-rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
+rspamd_language_detector_random_select (GArray *ucs_tokens, guint nwords,
goffset *offsets_out)
{
guint step_len, remainder, i, out_idx;
}
}
else {
- if (tok->len >= cur_off) {
+ if (tok->len <= cur_off) {
return -1;
}
freq = ((gdouble)GPOINTER_TO_UINT (
g_hash_table_lookup (ngramms, window))) / class_freq;
- cand = g_hash_table_lookup (candidates, elt->name);
- if (cand == NULL) {
- cand = g_malloc (sizeof (*cand));
- cand->elt = elt;
- cand->lang = elt->name;
- cand->prob = freq;
- }
- else {
- /* Update guess */
- cand->prob += freq;
+ if (freq > 0) {
+ cand = g_hash_table_lookup (candidates, elt->name);
+
+ if (cand == NULL) {
+ cand = g_malloc (sizeof (*cand));
+ cand->elt = elt;
+ cand->lang = elt->name;
+ cand->prob = freq;
+
+ g_hash_table_insert (candidates, (gpointer)elt->name, cand);
+ } else {
+ /* Update guess */
+ cand->prob += freq;
+ }
}
}
}
static void
rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
- GPtrArray *ucs_tokens,
+ GArray *ucs_tokens,
GHashTable *candidates,
enum rspamd_language_gramm_type type,
gboolean start_over)
rspamd_language_detector_random_select (ucs_tokens, nparts, selected_words);
/* Deal with the first word in a special case */
- tok = g_ptr_array_index (ucs_tokens, selected_words[0]);
+ tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[0]);
if (start_over) {
rspamd_language_detector_detect_word (d, tok, candidates, type);
}
for (i = 1; i < nparts; i ++) {
- tok = g_ptr_array_index (ucs_tokens, selected_words[i]);
+ tok = &g_array_index (ucs_tokens, rspamd_stat_token_t, selected_words[i]);
rspamd_language_detector_update_guess (d, tok, candidates, type);
}
{
const struct rspamd_lang_detector_res
*canda = *(const struct rspamd_lang_detector_res **)a,
- *candb = *(const struct rspamd_lang_detector_res **)a;
+ *candb = *(const struct rspamd_lang_detector_res **)b;
if (canda->prob > candb->prob) {
- return 1;
+ return -1;
}
else if (candb->prob > canda->prob) {
- return -1;
+ return 1;
}
return 0;
GPtrArray *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
- GPtrArray *ucs_tokens, gsize words_len)
+ GArray *ucs_tokens, gsize words_len)
{
GHashTable *candidates;
GPtrArray *result;
while (g_hash_table_iter_next (&it, &k, &v)) {
cand = (struct rspamd_lang_detector_res *) v;
+ msg_err ("%s -> %.2f", cand->lang, cand->prob);
g_ptr_array_add (result, cand);
g_hash_table_iter_steal (&it);
}
}
if (part->ucs32_words) {
+ struct rspamd_lang_detector_res *lang;
+
for (i = 0; i < part->normalized_words->len; i++) {
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
ucs_len += ucs_w.len;
}
+ part->languages = rspamd_language_detector_detect (task->lang_det,
+ part->ucs32_words, ucs_len);
+
+ if (part->languages->len > 0) {
+ lang = g_ptr_array_index (part->languages, 0);
+ part->language = lang->lang;
+
+ msg_info_task ("detected part language: %s", part->language);
+ }
+
#ifdef WITH_SNOWBALL
static GHashTable *stemmers = NULL;
if (RSPAMD_TASK_IS_EMPTY (task)) {
/* Don't do anything with empty task */
-
return TRUE;
}