}
if (!ret) {
- if (part->utf_words->len < default_short_text_limit) {
+ if (part->nwords < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det ("text is too short for trigramms detection: "
"%d words; at least %d words required",
- (int)part->utf_words->len,
+ (int)part->nwords,
(int)default_short_text_limit);
rspamd_language_detector_set_language (task, part, "en");
candidates = kh_init (rspamd_candidates_hash);
cbd.std = std;
cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
- if (part->utf_words->len < default_words / 2) {
+ if (part->nwords < default_words / 2) {
cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
}
}
}
}
+ if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ part->nwords ++;
+ }
+
if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE|
RSPAMD_STAT_TOKEN_FLAG_NORMALISED|
RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
}
}
- if (part->utf_words && part->utf_words->len) {
+ if (part->utf_words->len) {
gdouble *avg_len_p, *short_len_p;
avg_len_p = rspamd_mempool_get_variable (task->task_pool,
rspamd_mime_part_extract_words (task, text_part);
if (text_part->utf_words) {
- total_words += text_part->utf_words->len;
+ total_words += text_part->nwords;
}
}
guint flags;
guint nlines;
guint spaces;
+ guint nwords;
guint non_ascii_chars;
guint ascii_chars;
guint double_spaces;
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->utf_words->len);
+ lua_pushinteger (L, part->nwords);
}
return 1;
*/
part->capital_letters += ncap;
- cur_score /= (gdouble)part->utf_words->len;
+ cur_score /= (gdouble)part->nwords;
if (cur_score > 2.0) {
cur_score = 2.0;