return memcmp (v, v2, 3 * sizeof (UChar)) == 0;
}
-KHASH_INIT (rspamd_unigram_hash, UChar *, struct rspamd_ngramm_chain, true,
+KHASH_INIT (rspamd_unigram_hash, const UChar *, struct rspamd_ngramm_chain, true, /* key type is now const-qualified */
rspamd_unigram_hash_func, rspamd_unigram_equal_func);
-KHASH_INIT (rspamd_trigram_hash, UChar *, struct rspamd_ngramm_chain, true,
+KHASH_INIT (rspamd_trigram_hash, const UChar *, struct rspamd_ngramm_chain, true, /* key type is now const-qualified */
rspamd_trigram_hash_func, rspamd_trigram_equal_func);
+KHASH_INIT (rspamd_candidates_hash, const gchar *, /* per-detection candidates, keyed by language name string */
+ struct rspamd_lang_detector_res *, true, /* value: pointer to the candidate result; `true` is khash's is-map flag */
+ rspamd_str_hash, rspamd_str_equal);
struct rspamd_lang_detector {
GPtrArray *languages;
rspamd_language_detector_process_ngramm_full (struct rspamd_task *task,
struct rspamd_lang_detector *d,
UChar *window, enum rspamd_language_gramm_type type,
- GHashTable *candidates)
+ khash_t(rspamd_candidates_hash) *candidates)
{
guint i;
+ gint ret;
struct rspamd_ngramm_chain *chain = NULL;
struct rspamd_ngramm_elt *elt;
struct rspamd_lang_detector_res *cand;
if (chain) {
PTR_ARRAY_FOREACH (chain->languages, i, elt) {
- cand = g_hash_table_lookup (candidates, elt->elt->name);
prob = elt->prob;
if (prob < chain->mean) {
continue;
}
+
+ k = kh_get (rspamd_candidates_hash, candidates, elt->elt->name);
+ if (k != kh_end (candidates)) {
+ cand = kh_value (candidates, k);
+ }
+ else {
+ cand = NULL;
+ }
+
#ifdef NGRAMMS_DEBUG
msg_err ("gramm: %s, lang: %s, prob: %.3f", chain->utf,
elt->elt->name, log2 (elt->prob));
#endif
if (cand == NULL) {
- cand = g_malloc (sizeof (*cand));
+ cand = rspamd_mempool_alloc (task->task_pool, sizeof (*cand));
cand->elt = elt->elt;
cand->lang = elt->elt->name;
cand->prob = prob;
- g_hash_table_insert (candidates, (gpointer)cand->lang, cand);
+ k = kh_put (rspamd_candidates_hash, candidates, elt->elt->name,
+ &ret);
+ kh_value (candidates, k) = cand;
} else {
/* Update guess */
cand->prob += prob;
static void
rspamd_language_detector_detect_word (struct rspamd_task *task,
struct rspamd_lang_detector *d,
- rspamd_stat_token_t *tok, GHashTable *candidates,
+ rspamd_stat_token_t *tok,
+ khash_t(rspamd_candidates_hash) *candidates,
enum rspamd_language_gramm_type type)
{
guint wlen;
* Converts frequencies to log probabilities, filter those candidates who
* has the lowest probabilities
*/
-static void
-rspamd_language_detector_filter_negligible (struct rspamd_task *task,
- GHashTable *candidates)
-{
- GHashTableIter it;
- gpointer k, v;
- struct rspamd_lang_detector_res *cand;
- guint filtered = 0;
- gdouble max_prob = -(G_MAXDOUBLE);
-
- /* Normalize step */
- g_hash_table_iter_init (&it, candidates);
-
- while (g_hash_table_iter_next (&it, &k, &v)) {
- cand = (struct rspamd_lang_detector_res *)v;
+static inline void
+rspamd_language_detector_filter_step1 (struct rspamd_task *task,
+ struct rspamd_lang_detector_res *cand,
+ gdouble *max_prob, guint *filtered)
+{ /* First filter pass, applied to a single candidate: excludes it (prob = NAN)
+ * when its probability is 0 or when log2 of it is below cutoff_limit;
+ * otherwise stores the log2 value in cand->prob and raises *max_prob when the
+ * new value is larger. *filtered is incremented for every exclusion. */
+ if (!isnan (cand->prob)) { /* NAN marks a candidate that was already excluded; leave it untouched */
if (cand->prob == 0) {
- msg_debug_lang_det ("exclude language %s: %.3f",
- cand->lang, cand->prob, max_prob);
- g_hash_table_iter_remove (&it);
- filtered ++;
+ cand->prob = NAN; /* zero probability: exclude in place instead of removing the hash entry */
+ msg_debug_lang_det (
+ "exclude language %s",
+ cand->lang);
+ (*filtered)++;
}
else {
cand->prob = log2 (cand->prob);
if (cand->prob < cutoff_limit) {
- msg_debug_lang_det ("exclude language %s: %.3f, cutoff limit: %.3f",
+ msg_debug_lang_det (
+ "exclude language %s: %.3f, cutoff limit: %.3f",
cand->lang, cand->prob, cutoff_limit);
- g_hash_table_iter_remove (&it);
- filtered ++;
+ cand->prob = NAN; /* log probability is below cutoff_limit: exclude */
+ (*filtered)++;
}
- else if (cand->prob > max_prob) {
- max_prob = cand->prob;
+ else if (cand->prob > *max_prob) { /* survivor: track the highest log probability seen so far */
+ *max_prob = cand->prob;
}
}
}
+}
- g_hash_table_iter_init (&it, candidates);
- /* Filter step */
- while (g_hash_table_iter_next (&it, &k, &v)) {
- cand = (struct rspamd_lang_detector_res *) v;
-
- /*
+static inline void
+rspamd_language_detector_filter_step2 (struct rspamd_task *task,
+ struct rspamd_lang_detector_res *cand,
+ gdouble max_prob, guint *filtered)
+{ /* Second filter pass, applied to a single candidate: excludes it (prob = NAN)
+ * when its log probability is more than 1 below max_prob. Candidates that
+ * are already NAN are skipped. *filtered is incremented on exclusion. */
+ /*
* Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
* prob2 is 2^4 less than prob1
*/
- if (max_prob - cand->prob > 1) {
- msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)",
- cand->lang, cand->prob, max_prob);
- g_hash_table_iter_remove (&it);
- filtered ++;
- }
+ if (!isnan (cand->prob) && max_prob - cand->prob > 1) { /* NOTE(review): the comment above uses a gap of 4 as its example, but the threshold tested here is 1, i.e. a factor of 2 */
+ msg_debug_lang_det ("exclude language %s: %.3f (%.3f max)",
+ cand->lang, cand->prob, max_prob);
+ cand->prob = NAN; /* mark as excluded; the entry itself stays in the hash */
+ (*filtered) ++;
}
+}
+
+static void
+rspamd_language_detector_filter_negligible (struct rspamd_task *task,
+ khash_t(rspamd_candidates_hash) *candidates)
+{ /* Runs both filter passes over every value stored in `candidates`. The passes
+ * are separate loops because step2 compares each candidate against the
+ * final max_prob, which is only known once step1 has visited all of them. */
+ struct rspamd_lang_detector_res *cand;
+ guint filtered = 0; /* number of candidates excluded by either pass */
+ gdouble max_prob = -(G_MAXDOUBLE); /* start at the lowest finite double so the first step1 survivor above it becomes the maximum */
+
+ kh_foreach_value (candidates, cand, /* pass 1: convert to log2, apply zero/cutoff exclusion, find max_prob */
+ rspamd_language_detector_filter_step1 (task, cand, &max_prob, &filtered));
+ kh_foreach_value (candidates, cand, /* pass 2: exclude candidates too far below max_prob */
+ rspamd_language_detector_filter_step2 (task, cand, max_prob, &filtered));
msg_debug_lang_det ("removed %d languages", filtered);
}
GArray *ucs_tokens,
goffset *selected_words,
gsize nparts,
- GHashTable *candidates)
+ khash_t(rspamd_candidates_hash) *candidates)
{
guint i, j, total_found = 0, total_checked = 0;
rspamd_stat_token_t *tok;
UChar t;
- gint uc_script;
+ gint uc_script, ret;
+ khint_t k;
struct rspamd_language_elt *elt;
struct rspamd_lang_detector_res *cand;
- GHashTableIter it;
- gpointer k, v;
for (i = 0; i < nparts; i++) {
tok = &g_array_index (ucs_tokens, rspamd_stat_token_t,
elt = g_hash_table_lookup (d->unicode_scripts, &uc_script);
if (elt) {
- cand = g_hash_table_lookup (candidates, elt->name);
+ k = kh_get (rspamd_candidates_hash, candidates, elt->name);
+ if (k != kh_end (candidates)) {
+ cand = kh_value (candidates, k);
+ }
+ else {
+ cand = NULL;
+ }
if (cand == NULL) {
- cand = g_malloc (sizeof (*cand));
+ cand = rspamd_mempool_alloc (task->task_pool,
+ sizeof (*cand));
cand->elt = elt;
cand->lang = elt->name;
cand->prob = 1;
- g_hash_table_insert (candidates, (gpointer)cand->lang, cand);
+ k = kh_put (rspamd_candidates_hash, candidates, elt->name, &ret);
+ kh_value (candidates, k) = cand;
} else {
/* Update guess */
cand->prob ++;
}
else {
/* Filter candidates */
- g_hash_table_iter_init (&it, candidates);
-
- while (g_hash_table_iter_next (&it, &k, &v)) {
- cand = (struct rspamd_lang_detector_res *)v;
-
+ kh_foreach_value (candidates, cand, {
cand->prob = cand->prob / total_checked;
- }
+ });
}
return TRUE;
guint nwords,
struct rspamd_lang_detector *d,
GArray *ucs_tokens,
- GHashTable *candidates,
+ khash_t(rspamd_candidates_hash) *candidates,
enum rspamd_language_gramm_type type) {
guint nparts = MIN (ucs_tokens->len, nwords);
goffset *selected_words;
msg_debug_lang_det ("randomly selected %d words", nparts);
/* Check unicode scripts */
- if (g_hash_table_size (candidates) != 0 ||
+ if (kh_size (candidates) != 0 ||
!rspamd_language_detector_is_unicode (task, d, ucs_tokens,
selected_words, nparts, candidates)) {
struct rspamd_lang_detector *d,
GArray *ucs_tokens,
enum rspamd_language_gramm_type type,
- GHashTable *candidates)
+ khash_t(rspamd_candidates_hash) *candidates)
{
- guint cand_len;
+ guint cand_len = 0;
+ struct rspamd_lang_detector_res *cand;
rspamd_language_detector_detect_type (task,
nwords,
candidates,
type);
- cand_len = g_hash_table_size (candidates);
+ kh_foreach_value (candidates, cand, {
+ if (!isnan (cand->prob)) {
+ cand_len ++;
+ }
+ });
if (cand_len == 0) {
return rs_detect_none;
struct rspamd_lang_detector *d,
GArray *ucs_tokens, gsize words_len)
{
- GHashTable *candidates;
+ khash_t(rspamd_candidates_hash) *candidates;
GPtrArray *result;
- GHashTableIter it;
- gpointer k, v;
gdouble mean, std, start_ticks, end_ticks;
+ guint cand_len;
struct rspamd_lang_detector_res *cand;
enum rspamd_language_detected_type r;
struct rspamd_frequency_sort_cbdata cbd;
}
start_ticks = rspamd_get_ticks (TRUE);
- candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
- NULL, g_free);
+ candidates = kh_init (rspamd_candidates_hash);
+ kh_resize (rspamd_candidates_hash, candidates, 32);
r = rspamd_language_detector_try_ngramm (task, default_words, d,
ucs_tokens, rs_trigramm,
}
else if (r == rs_detect_multiple) {
/* Check our guess */
- msg_debug_lang_det ("trigramms pass finished, found %d candidates",
- (gint)g_hash_table_size (candidates));
mean = 0.0;
std = 0.0;
- g_hash_table_iter_init (&it, candidates);
+ cand_len = 0;
/* Check distirbution */
- while (g_hash_table_iter_next (&it, &k, &v)) {
- cand = (struct rspamd_lang_detector_res *) v;
- mean += cand->prob;
- }
+ kh_foreach_value (candidates, cand, {
+ if (!isnan (cand->prob)) {
+ mean += cand->prob;
+ cand_len ++;
+ }
+ });
- mean /= g_hash_table_size (candidates);
+ if (cand_len > 0) {
+ mean /= cand_len;
- g_hash_table_iter_init (&it, candidates);
- while (g_hash_table_iter_next (&it, &k, &v)) {
- gdouble err;
- cand = (struct rspamd_lang_detector_res *) v;
- err = cand->prob - mean;
- std += fabs (err);
- }
+ kh_foreach_value (candidates, cand, {
+ gdouble err;
+ if (!isnan (cand->prob)) {
+ err = cand->prob - mean;
+ std += fabs (err);
+ }
+ });
- std /= g_hash_table_size (candidates);
+ std /= cand_len;
+ }
- msg_debug_lang_det ("trigramms checked, %.3f mean, %.4f stddev",
- mean, std);
+ msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev",
+ cand_len, mean, std);
- if (std / fabs (mean) < 0.25) {
+ if (cand_len > 0 && std / fabs (mean) < 0.25) {
msg_debug_lang_det ("apply frequency heuristic sorting");
frequency_heuristic_applied = TRUE;
cbd.d = d;
}
/* Now, convert hash to array and sort it */
- result = g_ptr_array_new_full (g_hash_table_size (candidates), g_free);
- g_hash_table_iter_init (&it, candidates);
+ result = g_ptr_array_sized_new (kh_size (candidates));
- while (g_hash_table_iter_next (&it, &k, &v)) {
- cand = (struct rspamd_lang_detector_res *) v;
- msg_debug_lang_det ("final probability %s -> %.2f", cand->lang, cand->prob);
- g_ptr_array_add (result, cand);
- g_hash_table_iter_steal (&it);
- }
+ kh_foreach_value (candidates, cand, {
+ if (!isnan (cand->prob)) {
+ msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
+ cand->prob);
+ g_ptr_array_add (result, cand);
+ }
+ });
if (frequency_heuristic_applied) {
g_ptr_array_sort_with_data (result,
else {
g_ptr_array_sort (result, rspamd_language_detector_cmp);
}
- g_hash_table_unref (candidates);
+ kh_destroy (rspamd_candidates_hash, candidates);
if (result->len > 0 && !frequency_heuristic_applied) {
cand = g_ptr_array_index (result, 0);