Bladeren bron

[Rework] Rework language detector

tags/1.8.0
Vsevolod Stakhov 5 jaren geleden
bovenliggende
commit
6c4212543a
5 gewijzigde bestanden met toevoegingen van 499 en 450 verwijderingen
  1. 472
    412
      src/libmime/lang_detection.c
  2. 23
    2
      src/libmime/lang_detection.h
  3. 3
    32
      src/libmime/message.c
  4. 1
    1
      src/libmime/message.h
  5. 0
    3
      src/libserver/task.c

+ 472
- 412
src/libmime/lang_detection.c
Diff onderdrukt omdat het bestand te groot is
Bestand weergeven


+ 23
- 2
src/libmime/lang_detection.h Bestand weergeven

@@ -20,11 +20,32 @@
#include "config.h"
#include "libserver/cfg_file.h"
#include "libstat/stat_api.h"
#include "libmime/message.h"

struct rspamd_lang_detector;
struct rspamd_language_elt;
struct rspamd_task;

/*
 * Unicode script identifiers, one bit per script.
 *
 * Every enumerator is a distinct power of two (bits 0..16), so any set of
 * scripts can be represented as a single integer mask built with bitwise OR.
 * NOTE(review): presumably accumulated in the `unicode_scripts` field of
 * struct rspamd_mime_text_part -- confirm against the detector code.
 */
enum rspamd_unicode_scripts {
	RSPAMD_UNICODE_LATIN      = 0x00001, /* bit 0  */
	RSPAMD_UNICODE_GREEK      = 0x00002, /* bit 1  */
	RSPAMD_UNICODE_CYRILLIC   = 0x00004, /* bit 2  */
	RSPAMD_UNICODE_HEBREW     = 0x00008, /* bit 3  */
	RSPAMD_UNICODE_CJK        = 0x00010, /* bit 4  */
	RSPAMD_UNICODE_JP         = 0x00020, /* bit 5  */
	RSPAMD_UNICODE_ARABIC     = 0x00040, /* bit 6  */
	RSPAMD_UNICODE_DEVANAGARI = 0x00080, /* bit 7  */
	RSPAMD_UNICODE_THAI       = 0x00100, /* bit 8  */
	RSPAMD_UNICODE_ARMENIAN   = 0x00200, /* bit 9  */
	RSPAMD_UNICODE_GEORGIAN   = 0x00400, /* bit 10 */
	RSPAMD_UNICODE_GUJARATI   = 0x00800, /* bit 11 */
	RSPAMD_UNICODE_TAMIL      = 0x01000, /* bit 12 */
	RSPAMD_UNICODE_TELUGU     = 0x02000, /* bit 13 */
	RSPAMD_UNICODE_MALAYALAM  = 0x04000, /* bit 14 */
	RSPAMD_UNICODE_SINHALA    = 0x08000, /* bit 15 */
	RSPAMD_UNICODE_HANGUL     = 0x10000, /* bit 16 */
};

struct rspamd_lang_detector_res {
gdouble prob;
const gchar *lang;
@@ -59,8 +80,8 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
* @param part text part to analyse
* @return TRUE if detection produced a result (NOTE(review): callers then read part->languages, presumably still sorted by freq descending -- confirm)
*/
GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task,
gboolean rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
GArray *ucs_tokens);
struct rspamd_mime_text_part *part);

#endif

+ 3
- 32
src/libmime/message.c Bestand weergeven

@@ -188,12 +188,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
}
}

static guint
static void
rspamd_mime_part_create_words (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
rspamd_stat_token_t *w, ucs_w;
guint i, ucs_len = 0;
enum rspamd_tokenize_type tok_type;

if (IS_PART_UTF (part)) {
@@ -215,31 +213,8 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
if (part->utf_words) {
part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
sizeof (guint64), part->utf_words->len);

if (IS_PART_UTF (part) && task->lang_det) {
part->unicode_words = g_array_sized_new (FALSE, FALSE,
sizeof (rspamd_stat_token_t), part->utf_words->len);
}

if (part->unicode_words) {


for (i = 0; i < part->utf_words->len; i++) {
w = &g_array_index (part->utf_words, rspamd_stat_token_t,
i);

if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
rspamd_language_detector_to_ucs (task->lang_det,
task->task_pool,
w, &ucs_w);
g_array_append_val (part->unicode_words, ucs_w);
ucs_len += ucs_w.len;
}
}
}
}

return ucs_len;
}

static void
@@ -248,12 +223,8 @@ rspamd_mime_part_detect_language (struct rspamd_task *task,
{
struct rspamd_lang_detector_res *lang;

if (part->unicode_words) {
part->languages = rspamd_language_detector_detect (task,
task->lang_det,
part->unicode_words);

if (part->languages->len > 0) {
if (part->utf_words) {
if (rspamd_language_detector_detect (task, task->lang_det, part)) {
lang = g_ptr_array_index (part->languages, 0);
part->language = lang->lang;


+ 1
- 1
src/libmime/message.h Bestand weergeven

@@ -103,7 +103,6 @@ struct rspamd_mime_text_part {
/* Unicode content, used by libicu */
GArray *unicode_raw_content; /* unicode raw content (of UChar) */
GArray *unicode_content; /* unicode processed content (of UChar) */
GArray *unicode_words;

GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
struct html_content *html;
@@ -120,6 +119,7 @@ struct rspamd_mime_text_part {
guint empty_lines;
guint capital_letters;
guint numeric_characters;
guint unicode_scripts;
};

enum rspamd_received_type {

+ 0
- 3
src/libserver/task.c Bestand weergeven

@@ -248,9 +248,6 @@ rspamd_task_free (struct rspamd_task *task)
if (tp->normalized_hashes) {
g_array_free (tp->normalized_hashes, TRUE);
}
if (tp->unicode_words) {
g_array_free (tp->unicode_words, TRUE);
}
if (tp->languages) {
g_ptr_array_unref (tp->languages);
}

Laden…
Annuleren
Opslaan