struct rspamd_language_elt {
const gchar *name; /* e.g. "en" or "ru" */
+ guint unigramms_total; /* sum of the frequencies of all unigramms inserted into the table below */
+ GHashTable *unigramms; /* unigramm (a single UChar key) -> frequency stored via GUINT_TO_POINTER */
guint bigramms_total; /* total frequencies for bigramms */
- GHashTable *bigramms; /* bigrams frequencies */
+ GHashTable *bigramms; /* bigramms frequencies; keyed with rspamd_bigram_hash/rspamd_bigram_equal */
guint trigramms_total; /* total frequencies for trigramms */
GHashTable *trigramms; /* trigramms frequencies */
};
UConverter *uchar_converter;
};
+/**
+ * Hash callback for the unigramms table: hashes exactly one UChar worth of
+ * bytes at `key` with the process-wide hash seed.
+ */
+static guint
+rspamd_unigram_hash (gconstpointer key)
+{
+	/* A unigramm key is a single UChar, so only that many bytes are hashed */
+	const gsize unigram_bytes = sizeof (UChar);
+
+	return rspamd_cryptobox_fast_hash (key, unigram_bytes, rspamd_hash_seed ());
+}
+
+/**
+ * Equality callback for the unigramms table: two keys are equal when their
+ * first sizeof (UChar) bytes are identical.
+ */
+static gboolean
+rspamd_unigram_equal (gconstpointer v, gconstpointer v2)
+{
+	if (memcmp (v, v2, sizeof (UChar)) != 0) {
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
static guint
rspamd_bigram_hash (gconstpointer key)
{
pos = strchr (nelt->name, '.');
g_assert (pos != NULL);
*pos = '\0';
+ nelt->unigramms = g_hash_table_new (rspamd_unigram_hash, rspamd_unigram_equal);
nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal);
nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal);
GUINT_TO_POINTER (freq));
nelt->trigramms_total += freq;
}
+ else if (nsym == 1) {
+ g_hash_table_insert (nelt->unigramms, ucs_key,
+ GUINT_TO_POINTER (freq));
+ nelt->unigramms_total += freq;
+ }
else if (nsym > 3) {
msg_warn_config ("have more than 3 characters in key: %d", nsym);
}
}
}
- msg_info_config ("loaded %s language, %d digramms, %d trigramms",
- nelt->name, (gint)g_hash_table_size (nelt->bigramms),
+ msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
+ nelt->name,
+ (gint)g_hash_table_size (nelt->unigramms),
+ (gint)g_hash_table_size (nelt->bigramms),
(gint)g_hash_table_size (nelt->trigramms));
g_ptr_array_add (d->languages, nelt);
return ret;
}
+
+
+/**
+ * Convert a token's text to an ICU UChar string using the detector's converter.
+ *
+ * The output buffer is allocated from `pool`. On success `ucs_token->len`
+ * holds the number of UChar units produced (not bytes). On any conversion
+ * failure the result is an empty token (`len == 0`). `ucs_token->begin` is
+ * assigned on every path, so callers that pass an uninitialised token never
+ * end up storing a garbage pointer.
+ *
+ * @param d detector providing `uchar_converter`
+ * @param pool memory pool used for the output buffer
+ * @param utf_token source token (`begin`/`len` in the converter's source encoding)
+ * @param ucs_token destination token; `flags`, `begin` and `len` are overwritten
+ */
+void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
+		rspamd_mempool_t *pool,
+		rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
+{
+	UChar *out;
+	int32_t nsym;
+	UErrorCode uc_err = U_ZERO_ERROR;
+
+	ucs_token->flags = utf_token->flags;
+	out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1));
+	/* Assign the pointer up front so the failure path leaves no field unset */
+	ucs_token->begin = (const gchar *) out;
+	nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1),
+			utf_token->begin, utf_token->len, &uc_err);
+
+	/*
+	 * ICU reports failures through uc_err, not through the return value: on
+	 * buffer overflow the return value is the length that would have been
+	 * needed, which may exceed what was allocated. Trust nsym only when the
+	 * error code says the conversion succeeded.
+	 */
+	if (U_SUCCESS (uc_err) && nsym >= 0) {
+		ucs_token->len = nsym;
+	}
+	else {
+		ucs_token->len = 0;
+	}
+}
\ No newline at end of file
#include "config.h"
#include "libserver/cfg_file.h"
+#include "libstat/stat_api.h"
struct rspamd_lang_detector;
+/**
+ * Create a new language detector object using the configuration object
+ * @param cfg configuration object used to set up the detector
+ * @return pointer to the new detector (NOTE(review): behaviour on failure is not visible here - confirm)
+ */
struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config *cfg);
+/**
+ * Convert a token's text to an ICU UChar string using the detector's converter.
+ * NOTE(review): earlier wording said "ucs32", but the output is an array of ICU
+ * UChar units (UTF-16 per the ICU documentation), not UCS-4.
+ * @param d language detector whose converter is used
+ * @param pool memory pool the output buffer is allocated from
+ * @param utf_token input token (presumably UTF-8 text - confirm against the converter setup)
+ * @param ucs_token output token: flags are copied from utf_token; len is the
+ * number of UChar units produced, or 0 when the conversion yields no result
+ */
+void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
+ rspamd_mempool_t *pool,
+ rspamd_stat_token_t *utf_token,
+ rspamd_stat_token_t *ucs_token);
#endif
#include "smtp_parsers.h"
#include "mime_parser.h"
#include "mime_encoding.h"
+#include "lang_detection.h"
#include "libutil/multipattern.h"
#include "libserver/mempool_vars_internal.h"
#ifdef WITH_SNOWBALL
struct sb_stemmer *stem = NULL;
#endif
- rspamd_stat_token_t *w;
+ rspamd_stat_token_t *w, ucs_w;
gchar *temp_word;
const guchar *r;
- guint i, nlen, total_len = 0, short_len = 0;
+ guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
gdouble avg_len = 0;
#ifdef WITH_SNOWBALL
part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
sizeof (guint64), part->normalized_words->len);
+ if (IS_PART_UTF (part) && task->lang_det) {
+ part->ucs32_words = g_array_sized_new (FALSE, FALSE,
+ sizeof (rspamd_stat_token_t), part->normalized_words->len);
+ }
+
for (i = 0; i < part->normalized_words->len; i ++) {
guint64 h;
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+
+ if (part->ucs32_words) {
+ rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
+ w, &ucs_w);
+ g_array_append_val (part->ucs32_words, ucs_w);
+ ucs_len += ucs_w.len;
+ }
+
r = NULL;
#ifdef WITH_SNOWBALL
if (stem) {