}
}
+enum rspamd_language_gramm_type {
+ rs_unigramm = 0,
+ rs_bigramm,
+ rs_trigramm
+};
+
+static goffset
+rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
+ guint wlen, goffset cur_off)
+{
+ guint i;
+
+ if (wlen > 1) {
+ /* Deal with spaces at the beginning and ending */
+
+ if (cur_off == 0) {
+ window[0] = (UChar)' ';
+
+ for (i = 0; i < wlen - 1; i ++) {
+ window[i + 1] = *(((UChar *)tok->begin) + i);
+ }
+ }
+ else if (cur_off + wlen == tok->len + 1) {
+ /* Add trailing space */
+ for (i = 0; i < wlen - 1; i ++) {
+ window[i] = *(((UChar *)tok->begin) + cur_off + i);
+ }
+ window[wlen - 1] = (UChar)' ';
+ }
+ else if (cur_off + wlen > tok->len + 1) {
+ /* No more fun */
+ return -1;
+ }
+
+ /* Normal case */
+ for (i = 0; i < wlen; i ++) {
+ window[i] = *(((UChar *)tok->begin) + cur_off + i);
+ }
+ }
+ else {
+ if (tok->len >= cur_off) {
+ return -1;
+ }
+
+ window[0] = *(((UChar *)tok->begin) + cur_off);
+ }
+
+ return cur_off + 1;
+}
+
+/*
+ * Do full guess for a specific ngramm, checking all languages defined
+ */
+static void
+rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
+ UChar *window, enum rspamd_language_gramm_type type,
+ GHashTable *candidates)
+{
+ guint i, freq;
+ struct rspamd_language_elt *elt;
+ struct rspamd_lang_detector_res *cand;
+ GHashTable *ngramms;
+
+ for (i = 0; i < d->languages->len; i ++) {
+ elt = g_ptr_array_index (d->languages, i);
+
+ switch (type) {
+ case rs_unigramm:
+ ngramms = elt->unigramms;
+ break;
+ case rs_bigramm:
+ ngramms = elt->bigramms;
+ break;
+ case rs_trigramm:
+ ngramms = elt->trigramms;
+ break;
+ }
+
+ freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
+ cand = g_hash_table_lookup (candidates, elt->name);
+
+ if (cand == NULL) {
+ cand = g_malloc (sizeof (*cand));
+ cand->elt = elt;
+ cand->lang = elt->name;
+ cand->prob = freq;
+ }
+ else {
+ /* Update guess */
+ cand->prob += freq;
+ }
+ }
+}
+
+/*
+ * Check only candidates, if none found, switch to full version
+ */
+static void
+rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
+ UChar *window, enum rspamd_language_gramm_type type,
+ GHashTable *candidates)
+{
+ guint freq, total_freq = 0;
+ struct rspamd_language_elt *elt;
+ struct rspamd_lang_detector_res *cand;
+ GHashTableIter it;
+ gpointer k, v;
+ GHashTable *ngramms;
+
+ g_hash_table_iter_init (&it, candidates);
+
+ while (g_hash_table_iter_next (&it, &k, &v)) {
+ cand = (struct rspamd_lang_detector_res *)v;
+ elt = cand->elt;
+
+ switch (type) {
+ case rs_unigramm:
+ ngramms = elt->unigramms;
+ break;
+ case rs_bigramm:
+ ngramms = elt->bigramms;
+ break;
+ case rs_trigramm:
+ ngramms = elt->trigramms;
+ break;
+ }
+
+ freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
+
+ cand->prob += freq;
+ total_freq += freq;
+ }
+
+ if (total_freq == 0) {
+ /* Nothing found , do full scan which will also update candidates */
+ rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
+ }
+}
+
+static gboolean
+rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
+ rspamd_stat_token_t *tok, GHashTable *candidates,
+ enum rspamd_language_gramm_type type)
+{
+ guint wlen;
+ UChar window[3];
+ goffset cur = 0;
+
+ switch (type) {
+ case rs_unigramm:
+ wlen = 1;
+ break;
+ case rs_bigramm:
+ wlen = 2;
+ break;
+ case rs_trigramm:
+ wlen = 3;
+ break;
+ }
+
+ /* Split words */
+ while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
+ != -1) {
+
+ }
+}
+
+static void
+rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
+ rspamd_stat_token_t *tok, GHashTable *candidates,
+ enum rspamd_language_gramm_type type)
+{
+ guint wlen;
+ UChar window[3];
+ goffset cur = 0;
+
+ switch (type) {
+ case rs_unigramm:
+ wlen = 1;
+ break;
+ case rs_bigramm:
+ wlen = 2;
+ break;
+ case rs_trigramm:
+ wlen = 3;
+ break;
+ }
+
+ /* Split words */
+ while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
+ != -1) {
+
+ }
+}
+
const gchar *
rspamd_language_detector_detect (struct rspamd_lang_detector *d,
GPtrArray *ucs_tokens, gsize words_len)
{
if (words_len < d->short_text_limit) {
/* For short text, start directly from trigramms */
- return rspamd_language_detector_detect_trigramm ();
}
/* Start with unigramms */