1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732 |
- /*
- * Copyright 2024 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "config.h"
- #include "util.h"
- #include "rspamd.h"
- #include "message.h"
- #include "libserver/html/html.h"
- #include "images.h"
- #include "archives.h"
- #include "tokenizers/tokenizers.h"
- #include "smtp_parsers.h"
- #include "mime_parser.h"
- #include "mime_encoding.h"
- #include "lang_detection.h"
- #include "libutil/multipattern.h"
- #include "libserver/mempool_vars_internal.h"
-
- #ifdef WITH_SNOWBALL
- #include "libstemmer.h"
- #endif
-
- #include <math.h>
- #include <unicode/uchar.h>
- #include "sodium.h"
- #include "libserver/cfg_file_private.h"
- #include "lua/lua_common.h"
- #include "contrib/uthash/utlist.h"
- #include "contrib/t1ha/t1ha.h"
- #include "received.h"
-
- #define GTUBE_SYMBOL "GTUBE"
-
- #define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_TEXT_PART_FLAG_UTF)
- #define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_TEXT_PART_FLAG_UTF)
-
- static const gchar gtube_pattern_reject[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*"
- "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
- static const gchar gtube_pattern_add_header[] = "YJS*C4JDBQADN1.NSBN3*2IDNEN*"
- "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
- static const gchar gtube_pattern_rewrite_subject[] = "ZJS*C4JDBQADN1.NSBN3*2IDNEN*"
- "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
- static const gchar gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*"
- "GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
- struct rspamd_multipattern *gtube_matcher = NULL;
- static const guint64 words_hash_seed = 0xdeadbabe;
-
- static void
- free_byte_array_callback(void *pointer)
- {
- GByteArray *arr = (GByteArray *) pointer;
- g_byte_array_free(arr, TRUE);
- }
-
- static void
- rspamd_mime_part_extract_words(struct rspamd_task *task,
- struct rspamd_mime_text_part *part)
- {
- rspamd_stat_token_t *w;
- guint i, total_len = 0, short_len = 0;
-
- if (part->utf_words) {
- rspamd_stem_words(part->utf_words, task->task_pool, part->language,
- task->lang_det);
-
- for (i = 0; i < part->utf_words->len; i++) {
- guint64 h;
-
- w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
-
- if (w->stemmed.len > 0) {
- /*
- * We use static hash seed if we would want to use that in shingles
- * computation in future
- */
- h = rspamd_cryptobox_fast_hash_specific(
- RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
- w->stemmed.begin, w->stemmed.len, words_hash_seed);
- g_array_append_val(part->normalized_hashes, h);
- total_len += w->stemmed.len;
-
- if (w->stemmed.len <= 3) {
- short_len++;
- }
-
- if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT &&
- !(w->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
- part->nwords++;
- }
- }
-
- if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE |
- RSPAMD_STAT_TOKEN_FLAG_NORMALISED |
- RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
- task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
- }
- }
-
- if (part->utf_words->len) {
- gdouble *avg_len_p, *short_len_p;
-
- avg_len_p = rspamd_mempool_get_variable(task->task_pool,
- RSPAMD_MEMPOOL_AVG_WORDS_LEN);
-
- if (avg_len_p == NULL) {
- avg_len_p = rspamd_mempool_alloc(task->task_pool,
- sizeof(double));
- *avg_len_p = total_len;
- rspamd_mempool_set_variable(task->task_pool,
- RSPAMD_MEMPOOL_AVG_WORDS_LEN, avg_len_p, NULL);
- }
- else {
- *avg_len_p += total_len;
- }
-
- short_len_p = rspamd_mempool_get_variable(task->task_pool,
- RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
-
- if (short_len_p == NULL) {
- short_len_p = rspamd_mempool_alloc(task->task_pool,
- sizeof(double));
- *short_len_p = short_len;
- rspamd_mempool_set_variable(task->task_pool,
- RSPAMD_MEMPOOL_SHORT_WORDS_CNT, avg_len_p, NULL);
- }
- else {
- *short_len_p += short_len;
- }
- }
- }
- }
-
- static void
- rspamd_mime_part_create_words(struct rspamd_task *task,
- struct rspamd_mime_text_part *part)
- {
- enum rspamd_tokenize_type tok_type;
-
- if (IS_TEXT_PART_UTF(part)) {
-
- #if U_ICU_VERSION_MAJOR_NUM < 50
- /* Hack to prevent hang with Thai in old libicu */
- const gchar *p = part->utf_stripped_content->data, *end;
- guint i = 0;
- end = p + part->utf_stripped_content->len;
- gint32 uc, sc;
-
- tok_type = RSPAMD_TOKENIZE_UTF;
-
- while (p + i < end) {
- U8_NEXT(p, i, part->utf_stripped_content->len, uc);
-
- if (((gint32) uc) < 0) {
- tok_type = RSPAMD_TOKENIZE_RAW;
- break;
- }
-
- if (u_isalpha(uc)) {
- sc = ublock_getCode(uc);
-
- if (sc == UBLOCK_THAI) {
- msg_info_task("enable workaround for Thai characters for old libicu");
- tok_type = RSPAMD_TOKENIZE_RAW;
- break;
- }
- }
- }
- #else
- tok_type = RSPAMD_TOKENIZE_UTF;
- #endif
- }
- else {
- tok_type = RSPAMD_TOKENIZE_RAW;
- }
-
- part->utf_words = rspamd_tokenize_text(
- part->utf_stripped_content->data,
- part->utf_stripped_content->len,
- &part->utf_stripped_text,
- tok_type, task->cfg,
- part->exceptions,
- NULL,
- NULL,
- task->task_pool);
-
-
- if (part->utf_words) {
- part->normalized_hashes = g_array_sized_new(FALSE, FALSE,
- sizeof(guint64), part->utf_words->len);
- rspamd_normalize_words(part->utf_words, task->task_pool);
- }
- }
-
- static void
- rspamd_mime_part_detect_language(struct rspamd_task *task,
- struct rspamd_mime_text_part *part)
- {
- struct rspamd_lang_detector_res *lang;
-
- if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 &&
- task->lang_det) {
- if (rspamd_language_detector_detect(task, task->lang_det, part)) {
- lang = g_ptr_array_index(part->languages, 0);
- part->language = lang->lang;
-
- msg_info_task("detected part language: %s", part->language);
- }
- else {
- part->language = "en"; /* Safe fallback */
- }
- }
- }
-
- static void
- rspamd_strip_newlines_parse(struct rspamd_task *task,
- const gchar *begin, const gchar *pe,
- struct rspamd_mime_text_part *part)
- {
- const gchar *p = begin, *c = begin;
- gboolean crlf_added = FALSE, is_utf = IS_TEXT_PART_UTF(part);
- gboolean url_open_bracket = FALSE;
- UChar32 uc;
-
- enum {
- normal_char,
- seen_cr,
- seen_lf,
- } state = normal_char;
-
- while (p < pe) {
- if (U8_IS_LEAD(*p) && is_utf) {
- gint32 off = p - begin;
- U8_NEXT(begin, off, pe - begin, uc);
-
- if (uc != -1) {
- while (p < pe && off < (pe - begin)) {
- if (IS_ZERO_WIDTH_SPACE(uc)) {
- /* Invisible space ! */
- task->flags |= RSPAMD_TASK_FLAG_BAD_UNICODE;
- part->spaces++;
-
- if (p > c) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) c, p - c);
- c = begin + off;
- p = c;
- }
-
- U8_NEXT(begin, off, pe - begin, uc);
-
- if (!IS_ZERO_WIDTH_SPACE(uc)) {
- break;
- }
-
- part->double_spaces++;
- p = begin + off;
- c = p;
- }
- else {
- break;
- }
- }
- }
- }
-
- if (G_UNLIKELY(p >= pe)) {
- /*
- * This is reached when there is a utf8 part and we
- * have zero width spaces at the end of the text
- * So we just check overflow and refuse to access *p if it is
- * after our real content.
- */
- break;
- }
- else if (*p == '\r') {
- switch (state) {
- case normal_char:
- state = seen_cr;
- if (p > c) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) c, p - c);
- }
-
- crlf_added = FALSE;
- c = p + 1;
- break;
- case seen_cr:
- /* Double \r\r */
- if (!crlf_added) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) " ", 1);
- crlf_added = TRUE;
- g_ptr_array_add(part->newlines,
- (((gpointer) (goffset) (part->utf_stripped_content->len))));
- }
-
- part->nlines++;
- part->empty_lines++;
- c = p + 1;
- break;
- case seen_lf:
- /* Likely \r\n\r...*/
- state = seen_cr;
- c = p + 1;
- break;
- }
-
- url_open_bracket = FALSE;
-
- p++;
- }
- else if (*p == '\n') {
- switch (state) {
- case normal_char:
- state = seen_lf;
-
- if (p > c) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) c, p - c);
- }
-
- c = p + 1;
-
- if (IS_TEXT_PART_HTML(part) || !url_open_bracket) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) " ", 1);
- g_ptr_array_add(part->newlines,
- (((gpointer) (goffset) (part->utf_stripped_content->len))));
- crlf_added = TRUE;
- }
- else {
- crlf_added = FALSE;
- }
-
- break;
- case seen_cr:
- /* \r\n */
- if (!crlf_added) {
- if (IS_TEXT_PART_HTML(part) || !url_open_bracket) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) " ", 1);
- crlf_added = TRUE;
- }
-
- g_ptr_array_add(part->newlines,
- (((gpointer) (goffset) (part->utf_stripped_content->len))));
- }
-
- c = p + 1;
- state = seen_lf;
-
- break;
- case seen_lf:
- /* Double \n\n */
- if (!crlf_added) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) " ", 1);
- crlf_added = TRUE;
- g_ptr_array_add(part->newlines,
- (((gpointer) (goffset) (part->utf_stripped_content->len))));
- }
-
- part->nlines++;
- part->empty_lines++;
-
- c = p + 1;
- break;
- }
- url_open_bracket = FALSE;
-
- p++;
- }
- else {
- if ((*p) == '<') {
- url_open_bracket = TRUE;
- }
- else if ((*p) == '>') {
- url_open_bracket = FALSE;
- }
-
- switch (state) {
- case normal_char:
- if (*p == ' ') {
- part->spaces++;
-
- if (p > begin && *(p - 1) == ' ') {
- part->double_spaces++;
- }
- }
- else {
- part->non_spaces++;
-
- if ((*p) & 0x80) {
- part->non_ascii_chars++;
- }
- else {
- if (g_ascii_isupper(*p)) {
- part->capital_letters++;
- }
- else if (g_ascii_isdigit(*p)) {
- part->numeric_characters++;
- }
-
- part->ascii_chars++;
- }
- }
- break;
- case seen_cr:
- case seen_lf:
- part->nlines++;
-
- if (!crlf_added) {
- g_ptr_array_add(part->newlines,
- (((gpointer) (goffset) (part->utf_stripped_content->len))));
- }
-
- /* Skip initial spaces */
- if (*p == ' ') {
- if (!crlf_added) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) " ", 1);
- }
-
- while (p < pe && *p == ' ') {
- p++;
- c++;
- part->spaces++;
- }
-
- if (p < pe && (*p == '\r' || *p == '\n')) {
- part->empty_lines++;
- }
- }
-
- state = normal_char;
- continue;
- }
-
- p++;
- }
- }
-
- /* Leftover */
- if (p > c) {
- if (p > pe) {
- p = pe;
- }
-
- switch (state) {
- case normal_char:
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) c, p - c);
-
- while (c < p) {
- if (*c == ' ') {
- part->spaces++;
-
- if (c > begin && *(c - 1) == ' ') {
- part->double_spaces++;
- }
- }
- else {
- part->non_spaces++;
-
- if ((*c) & 0x80) {
- part->non_ascii_chars++;
- }
- else {
- part->ascii_chars++;
- }
- }
-
- c++;
- }
- break;
- default:
-
- if (!crlf_added) {
- g_byte_array_append(part->utf_stripped_content,
- (const guint8 *) " ", 1);
- g_ptr_array_add(part->newlines,
- (((gpointer) (goffset) (part->utf_stripped_content->len))));
- }
-
- part->nlines++;
- break;
- }
- }
- }
-
- static void
- rspamd_u_text_dtor(void *p)
- {
- utext_close((UText *) p);
- }
-
- static void
- rspamd_normalize_text_part(struct rspamd_task *task,
- struct rspamd_mime_text_part *part)
- {
- const gchar *p, *end;
- guint i;
- goffset off;
- struct rspamd_process_exception *ex;
- UErrorCode uc_err = U_ZERO_ERROR;
-
- part->newlines = g_ptr_array_sized_new(128);
-
- if (IS_TEXT_PART_EMPTY(part)) {
- part->utf_stripped_content = g_byte_array_new();
- }
- else {
- part->utf_stripped_content = g_byte_array_sized_new(part->utf_content.len);
-
- p = (const gchar *) part->utf_content.begin;
- end = p + part->utf_content.len;
-
- rspamd_strip_newlines_parse(task, p, end, part);
-
- for (i = 0; i < part->newlines->len; i++) {
- ex = rspamd_mempool_alloc(task->task_pool, sizeof(*ex));
- off = (goffset) g_ptr_array_index(part->newlines, i);
- g_ptr_array_index(part->newlines, i) = (gpointer) (goffset) (part->utf_stripped_content->data + off);
- ex->pos = off;
- ex->len = 0;
- ex->type = RSPAMD_EXCEPTION_NEWLINE;
- part->exceptions = g_list_prepend(part->exceptions, ex);
- }
- }
-
- if (IS_TEXT_PART_UTF(part)) {
- utext_openUTF8(&part->utf_stripped_text,
- part->utf_stripped_content->data,
- part->utf_stripped_content->len,
- &uc_err);
-
- if (!U_SUCCESS(uc_err)) {
- msg_warn_task("cannot open text from utf content");
- /* Probably, should be an assertion */
- }
- else {
- rspamd_mempool_add_destructor(task->task_pool,
- rspamd_u_text_dtor,
- &part->utf_stripped_text);
- }
- }
-
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) free_byte_array_callback,
- part->utf_stripped_content);
- rspamd_mempool_notify_alloc(task->task_pool,
- part->utf_stripped_content->len);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
- part->newlines);
- }
-
- #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
-
- static guint
- rspamd_words_levenshtein_distance(struct rspamd_task *task,
- GArray *w1, GArray *w2)
- {
- guint s1len, s2len, x, y, lastdiag, olddiag;
- guint *column, ret;
- guint64 h1, h2;
- gint eq;
- static const guint max_words = 8192;
-
- s1len = w1->len;
- s2len = w2->len;
-
- if (s1len + s2len > max_words) {
- msg_info_task("cannot direct compare multipart/alternative parts with more than %ud words in total: "
- "(%ud words in one part and %ud in another)",
- max_words, s1len, s2len);
-
- /* Use approximate comparison of number of words */
- if (s1len > s2len) {
- return s1len - s2len;
- }
- else {
- return s2len - s1len;
- }
- }
-
- column = g_malloc0((s1len + 1) * sizeof(guint));
-
- for (y = 1; y <= s1len; y++) {
- column[y] = y;
- }
-
- for (x = 1; x <= s2len; x++) {
- column[0] = x;
-
- for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
- olddiag = column[y];
- h1 = g_array_index(w1, guint64, y - 1);
- h2 = g_array_index(w2, guint64, x - 1);
- eq = (h1 == h2) ? 1 : 0;
- /*
- * Cost of replacement is twice higher than cost of add/delete
- * to calculate percentage properly
- */
- column[y] = MIN3(column[y] + 1, column[y - 1] + 1,
- lastdiag + (eq * 2));
- lastdiag = olddiag;
- }
- }
-
- ret = column[s1len];
- g_free(column);
-
- return ret;
- }
-
- static gint
- rspamd_multipattern_gtube_cb(struct rspamd_multipattern *mp,
- guint strnum,
- gint match_start,
- gint match_pos,
- const gchar *text,
- gsize len,
- void *context)
- {
- struct rspamd_task *task = (struct rspamd_task *) context;
-
- if (strnum > 0) {
- if (task->cfg->gtube_patterns_policy == RSPAMD_GTUBE_ALL) {
- return strnum + 1;
- }
-
- return 0;
- }
-
- return strnum + 1; /* To distinguish from zero */
- }
-
- static enum rspamd_action_type
- rspamd_check_gtube(struct rspamd_task *task, struct rspamd_mime_text_part *part)
- {
- static const gsize max_check_size = 8 * 1024;
- gint ret;
- enum rspamd_action_type act = METRIC_ACTION_NOACTION;
- enum rspamd_gtube_patterns_policy policy = task->cfg ? task->cfg->gtube_patterns_policy : RSPAMD_GTUBE_REJECT;
- g_assert(part != NULL);
-
- if (gtube_matcher == NULL && policy != RSPAMD_GTUBE_DISABLED) {
- gtube_matcher = rspamd_multipattern_create(RSPAMD_MULTIPATTERN_DEFAULT);
-
- rspamd_multipattern_add_pattern(gtube_matcher,
- gtube_pattern_reject,
- RSPAMD_MULTIPATTERN_DEFAULT);
- rspamd_multipattern_add_pattern(gtube_matcher,
- gtube_pattern_add_header,
- RSPAMD_MULTIPATTERN_DEFAULT);
- rspamd_multipattern_add_pattern(gtube_matcher,
- gtube_pattern_rewrite_subject,
- RSPAMD_MULTIPATTERN_DEFAULT);
- rspamd_multipattern_add_pattern(gtube_matcher,
- gtube_pattern_no_action,
- RSPAMD_MULTIPATTERN_DEFAULT);
-
- GError *err = NULL;
- rspamd_multipattern_compile(gtube_matcher, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err);
-
- if (err != NULL) {
- /* It will be expensive, but I don't care, still better than to abort */
- msg_err("cannot compile gtube matcher: %s", err->message);
- g_error_free(err);
- }
- }
-
- if (part->utf_content.len >= sizeof(gtube_pattern_reject) &&
- part->utf_content.len <= max_check_size &&
- policy != RSPAMD_GTUBE_DISABLED) {
- if ((ret = rspamd_multipattern_lookup(gtube_matcher, part->utf_content.begin,
- part->utf_content.len,
- rspamd_multipattern_gtube_cb, task, NULL)) > 0) {
-
- switch (ret) {
- case 1:
- act = METRIC_ACTION_REJECT;
- break;
- case 2:
- act = METRIC_ACTION_ADD_HEADER;
- break;
- case 3:
- act = METRIC_ACTION_REWRITE_SUBJECT;
- break;
- case 4:
- act = METRIC_ACTION_NOACTION;
- break;
- }
-
- if (ret != 0) {
- task->flags |= RSPAMD_TASK_FLAG_SKIP;
- task->flags |= RSPAMD_TASK_FLAG_GTUBE;
- msg_info_task(
- "gtube %s pattern has been found in part of length %uz",
- rspamd_action_to_str(act),
- part->utf_content.len);
- }
- }
- }
-
- return act;
- }
-
- static gint
- exceptions_compare_func(gconstpointer a, gconstpointer b)
- {
- const struct rspamd_process_exception *ea = a, *eb = b;
-
- return ea->pos - eb->pos;
- }
-
- static gboolean
- rspamd_message_process_plain_text_part(struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
- {
- if (text_part->parsed.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
-
- return TRUE;
- }
-
- rspamd_mime_text_part_maybe_convert(task, text_part);
-
- if (text_part->utf_raw_content != NULL) {
- /* Just have the same content */
- text_part->utf_content.begin = (const gchar *) text_part->utf_raw_content->data;
- text_part->utf_content.len = text_part->utf_raw_content->len;
- }
- else {
- /*
- * We ignore unconverted parts from now as it is dangerous
- * to treat them as text parts
- */
- text_part->utf_content.begin = NULL;
- text_part->utf_content.len = 0;
-
- return FALSE;
- }
-
- return TRUE;
- }
-
- static gboolean
- rspamd_message_process_html_text_part(struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part,
- uint16_t *cur_url_order)
- {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
-
- if (text_part->parsed.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
-
- return TRUE;
- }
-
- rspamd_mime_text_part_maybe_convert(task, text_part);
-
- if (text_part->utf_raw_content == NULL) {
- return FALSE;
- }
-
-
- text_part->html = rspamd_html_process_part_full(
- task,
- text_part->utf_raw_content,
- &text_part->exceptions,
- MESSAGE_FIELD(task, urls),
- text_part->mime_part->urls,
- task->cfg ? task->cfg->enable_css_parser : true,
- cur_url_order);
- rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
-
- if (text_part->utf_content.len == 0) {
- text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
- }
-
- return TRUE;
- }
-
- enum rspamd_message_part_is_text_result {
- RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN = 0,
- RSPAMD_MESSAGE_PART_IS_TEXT_HTML,
- RSPAMD_MESSAGE_PART_IS_NOT_TEXT
- };
-
- static enum rspamd_message_part_is_text_result
- rspamd_message_part_can_be_parsed_as_text(struct rspamd_task *task,
- struct rspamd_mime_part *mime_part)
- {
- enum rspamd_message_part_is_text_result res = RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
-
- if ((mime_part->ct && (mime_part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) ||
- (mime_part->detected_type && strcmp(mime_part->detected_type, "text") == 0)) {
-
- res = RSPAMD_MESSAGE_PART_IS_TEXT_PLAIN;
- rspamd_ftok_t html_tok, xhtml_tok;
-
- html_tok.begin = "html";
- html_tok.len = 4;
- xhtml_tok.begin = "xhtml";
- xhtml_tok.len = 5;
-
- if (rspamd_ftok_casecmp(&mime_part->ct->subtype, &html_tok) == 0 ||
- rspamd_ftok_casecmp(&mime_part->ct->subtype, &xhtml_tok) == 0 ||
- (mime_part->detected_ext &&
- strcmp(mime_part->detected_ext, "html") == 0)) {
- res = RSPAMD_MESSAGE_PART_IS_TEXT_HTML;
- }
- }
-
- /* Skip attachments */
- if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT &&
- (mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
- if (!task->cfg->check_text_attachements) {
- debug_task("skip attachments for checking as text parts");
- return RSPAMD_MESSAGE_PART_IS_NOT_TEXT;
- }
- }
-
- return res;
- }
-
- static gboolean
- rspamd_message_process_text_part_maybe(struct rspamd_task *task,
- struct rspamd_mime_part *mime_part,
- enum rspamd_message_part_is_text_result is_text,
- uint16_t *cur_url_order)
- {
- struct rspamd_mime_text_part *text_part;
- guint flags = 0;
- enum rspamd_action_type act;
-
- /* Skip attachments */
- if ((mime_part->cd && mime_part->cd->type == RSPAMD_CT_ATTACHMENT)) {
- flags |= RSPAMD_MIME_TEXT_PART_ATTACHMENT;
- }
-
- text_part = rspamd_mempool_alloc0(task->task_pool,
- sizeof(struct rspamd_mime_text_part));
- text_part->mime_part = mime_part;
- text_part->raw.begin = mime_part->raw_data.begin;
- text_part->raw.len = mime_part->raw_data.len;
- text_part->parsed.begin = mime_part->parsed_data.begin;
- text_part->parsed.len = mime_part->parsed_data.len;
- text_part->utf_stripped_text = (UText) UTEXT_INITIALIZER;
- text_part->flags |= flags;
-
- if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
- if (!rspamd_message_process_html_text_part(task, text_part, cur_url_order)) {
- return FALSE;
- }
- }
- else {
- if (!rspamd_message_process_plain_text_part(task, text_part)) {
- return FALSE;
- }
- }
-
- g_ptr_array_add(MESSAGE_FIELD(task, text_parts), text_part);
- mime_part->part_type = RSPAMD_MIME_PART_TEXT;
- mime_part->specific.txt = text_part;
-
- act = rspamd_check_gtube(task, text_part);
- if (act != METRIC_ACTION_NOACTION) {
- struct rspamd_action *action;
- gdouble score = NAN;
-
- action = rspamd_config_get_action_by_type(task->cfg, act);
-
- if (action) {
- score = action->threshold;
-
- rspamd_add_passthrough_result(task, action,
- RSPAMD_PASSTHROUGH_CRITICAL,
- score, "Gtube pattern",
- "GTUBE", 0, NULL);
- }
-
- rspamd_task_insert_result(task, GTUBE_SYMBOL, 0, NULL);
-
- return TRUE;
- }
-
- /* Post process part */
- rspamd_normalize_text_part(task, text_part);
-
- if (!IS_TEXT_PART_HTML(text_part)) {
- if (mime_part->parent_part) {
- struct rspamd_mime_part *parent = mime_part->parent_part;
-
- if (IS_PART_MULTIPART(parent) && parent->specific.mp->children->len == 2) {
- /*
- * Use strict extraction mode: we will extract missing urls from
- * an html part if needed
- */
- rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
- RSPAMD_URL_FIND_STRICT);
- }
- else {
- /*
- * Fall back to full text extraction using TLD patterns
- */
- rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
- RSPAMD_URL_FIND_ALL);
- }
- }
- else {
- /*
- * Fall back to full text extraction using TLD patterns
- */
- rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
- RSPAMD_URL_FIND_ALL);
- }
- }
- else {
- rspamd_url_text_extract(task->task_pool, task, text_part, cur_url_order,
- RSPAMD_URL_FIND_STRICT);
- }
-
- if (text_part->exceptions) {
- text_part->exceptions = g_list_sort(text_part->exceptions,
- exceptions_compare_func);
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) g_list_free,
- text_part->exceptions);
- }
-
- rspamd_mime_part_create_words(task, text_part);
-
- return TRUE;
- }
-
- /* Creates message from various data using libmagic to detect type */
- static void
- rspamd_message_from_data(struct rspamd_task *task, const guchar *start,
- gsize len)
- {
- struct rspamd_content_type *ct = NULL;
- struct rspamd_mime_part *part;
- const char *mb = "application/octet-stream";
- gchar *mid;
- rspamd_ftok_t srch, *tok;
- gchar cdbuf[1024];
-
- g_assert(start != NULL);
-
- part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part));
-
- part->raw_data.begin = start;
- part->raw_data.len = len;
- part->parsed_data.begin = start;
- part->parsed_data.len = len;
- part->part_number = MESSAGE_FIELD(task, parts)->len;
- part->urls = g_ptr_array_new();
- part->raw_headers = rspamd_message_headers_new();
- part->headers_order = NULL;
-
- tok = rspamd_task_get_request_header(task, "Content-Type");
-
- if (tok) {
- /* We have Content-Type defined */
- ct = rspamd_content_type_parse(tok->begin, tok->len,
- task->task_pool);
- part->ct = ct;
- }
- else if (task->cfg && task->cfg->libs_ctx) {
- lua_State *L = task->cfg->lua_state;
-
- if (rspamd_lua_require_function(L,
- "lua_magic", "detect_mime_part")) {
-
- struct rspamd_mime_part **pmime;
- struct rspamd_task **ptask;
-
- pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
- rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
- *pmime = part;
- ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
- rspamd_lua_setclass(L, rspamd_task_classname, -1);
- *ptask = task;
-
- if (lua_pcall(L, 2, 2, 0) != 0) {
- msg_err_task("cannot detect type: %s", lua_tostring(L, -1));
- }
- else {
- if (lua_istable(L, -1)) {
- lua_pushstring(L, "ct");
- lua_gettable(L, -2);
-
- if (lua_isstring(L, -1)) {
- mb = rspamd_mempool_strdup(task->task_pool,
- lua_tostring(L, -1));
- }
- }
- }
-
- lua_settop(L, 0);
- }
- else {
- msg_err_task("cannot require lua_magic.detect_mime_part");
- }
-
- if (mb) {
- srch.begin = mb;
- srch.len = strlen(mb);
- ct = rspamd_content_type_parse(srch.begin, srch.len,
- task->task_pool);
-
- if (!part->ct) {
- msg_info_task("construct fake mime of type: %s", mb);
- part->ct = ct;
- }
- else {
- /* Check sanity */
- if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT)) {
- RSPAMD_FTOK_FROM_STR(&srch, "application");
-
- if (rspamd_ftok_cmp(&ct->type, &srch) == 0) {
- msg_info_task("construct fake mime of type: %s", mb);
- part->ct = ct;
- }
- }
- else {
- msg_info_task("construct fake mime of type: %T/%T, detected %s",
- &part->ct->type, &part->ct->subtype, mb);
- }
- }
-
- part->detected_ct = ct;
- }
- }
-
-
- tok = rspamd_task_get_request_header(task, "Filename");
-
- if (tok) {
- rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline; filename=\"%T\"", tok);
- }
- else {
- rspamd_snprintf(cdbuf, sizeof(cdbuf), "inline");
- }
-
- part->cd = rspamd_content_disposition_parse(cdbuf, strlen(cdbuf),
- task->task_pool);
-
- g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
- rspamd_mime_parser_calc_digest(part);
-
- /* Generate message ID */
- mid = rspamd_mime_message_id_generate("localhost.localdomain");
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) g_free, mid);
- MESSAGE_FIELD(task, message_id) = mid;
- task->queue_id = mid;
- }
-
- static void
- rspamd_message_dtor(struct rspamd_message *msg)
- {
- guint i;
- struct rspamd_mime_part *p;
- struct rspamd_mime_text_part *tp;
-
-
- PTR_ARRAY_FOREACH(msg->parts, i, p)
- {
- if (p->raw_headers) {
- rspamd_message_headers_unref(p->raw_headers);
- }
-
- if (IS_PART_MULTIPART(p)) {
- if (p->specific.mp->children) {
- g_ptr_array_free(p->specific.mp->children, TRUE);
- }
- }
-
- if (p->part_type == RSPAMD_MIME_PART_CUSTOM_LUA &&
- p->specific.lua_specific.cbref != -1) {
- luaL_unref(msg->task->cfg->lua_state,
- LUA_REGISTRYINDEX,
- p->specific.lua_specific.cbref);
- }
-
- if (p->urls) {
- g_ptr_array_unref(p->urls);
- }
- }
-
- PTR_ARRAY_FOREACH(msg->text_parts, i, tp)
- {
- if (tp->utf_words) {
- g_array_free(tp->utf_words, TRUE);
- }
- if (tp->normalized_hashes) {
- g_array_free(tp->normalized_hashes, TRUE);
- }
- if (tp->languages) {
- g_ptr_array_unref(tp->languages);
- }
- }
-
- rspamd_message_headers_unref(msg->raw_headers);
-
- g_ptr_array_unref(msg->text_parts);
- g_ptr_array_unref(msg->parts);
-
- kh_destroy(rspamd_url_hash, msg->urls);
- }
-
- struct rspamd_message *
- rspamd_message_new(struct rspamd_task *task)
- {
- struct rspamd_message *msg;
-
- msg = rspamd_mempool_alloc0(task->task_pool, sizeof(*msg));
-
- msg->raw_headers = rspamd_message_headers_new();
- msg->urls = kh_init(rspamd_url_hash);
- msg->parts = g_ptr_array_sized_new(4);
- msg->text_parts = g_ptr_array_sized_new(2);
- msg->task = task;
-
- REF_INIT_RETAIN(msg, rspamd_message_dtor);
-
- return msg;
- }
-
- gboolean
- rspamd_message_parse(struct rspamd_task *task)
- {
- const gchar *p;
- gsize len;
- guint i;
- GError *err = NULL;
- guint64 n[2], seed;
-
- if (RSPAMD_TASK_IS_EMPTY(task)) {
- /* Don't do anything with empty task */
- task->flags |= RSPAMD_TASK_FLAG_SKIP_PROCESS;
- return TRUE;
- }
-
- p = task->msg.begin;
- len = task->msg.len;
-
- /* Skip any space characters to avoid some bad messages to be unparsed */
- while (len > 0 && g_ascii_isspace(*p)) {
- p++;
- len--;
- }
-
- /*
- * Exim somehow uses mailbox format for messages being scanned:
- * From xxx@xxx.com Fri May 13 19:08:48 2016
- *
- * So we check if a task has this line to avoid possible issues
- */
- if (len > sizeof("From ") - 1) {
- if (memcmp(p, "From ", sizeof("From ") - 1) == 0) {
- /* Skip to CRLF */
- msg_info_task("mailbox input detected, enable workaround");
- p += sizeof("From ") - 1;
- len -= sizeof("From ") - 1;
-
- while (len > 0 && *p != '\n') {
- p++;
- len--;
- }
- while (len > 0 && g_ascii_isspace(*p)) {
- p++;
- len--;
- }
- }
- }
-
- task->msg.begin = p;
- task->msg.len = len;
-
- /* Cleanup old message */
- if (task->message) {
- rspamd_message_unref(task->message);
- }
-
- task->message = rspamd_message_new(task);
-
- if (task->flags & RSPAMD_TASK_FLAG_MIME) {
- enum rspamd_mime_parse_error ret;
-
- debug_task("construct mime parser from string length %d",
- (gint) task->msg.len);
- ret = rspamd_mime_parse_task(task, &err);
-
- switch (ret) {
- case RSPAMD_MIME_PARSE_FATAL:
- msg_err_task("cannot construct mime from stream: %e", err);
-
- if (task->cfg && (!task->cfg->allow_raw_input)) {
- msg_err_task("cannot construct mime from stream");
- if (err) {
- task->err = err;
- }
-
- return FALSE;
- }
- else {
- task->flags &= ~RSPAMD_TASK_FLAG_MIME;
- rspamd_message_from_data(task, p, len);
- }
- break;
- case RSPAMD_MIME_PARSE_NESTING:
- msg_warn_task("cannot construct full mime from stream: %e", err);
- task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
- break;
- case RSPAMD_MIME_PARSE_OK:
- default:
- break;
- }
-
- if (err) {
- g_error_free(err);
- }
- }
- else {
- rspamd_message_from_data(task, p, len);
- }
-
-
- if (MESSAGE_FIELD(task, message_id) == NULL) {
- MESSAGE_FIELD(task, message_id) = "undef";
- }
-
- debug_task("found %ud parts in message", MESSAGE_FIELD(task, parts)->len);
- if (task->queue_id == NULL) {
- task->queue_id = "undef";
- }
-
- rspamd_received_maybe_fix_task(task);
-
- struct rspamd_mime_part *part;
-
- /* Blake2b applied to string 'rspamd' */
- static const guchar RSPAMD_ALIGNED(32) hash_key[] = {
- 0xef,
- 0x43,
- 0xae,
- 0x80,
- 0xcc,
- 0x8d,
- 0xc3,
- 0x4c,
- 0x6f,
- 0x1b,
- 0xd6,
- 0x18,
- 0x1b,
- 0xae,
- 0x87,
- 0x74,
- 0x0c,
- 0xca,
- 0xf7,
- 0x8e,
- 0x5f,
- 0x2e,
- 0x54,
- 0x32,
- 0xf6,
- 0x79,
- 0xb9,
- 0x27,
- 0x26,
- 0x96,
- 0x20,
- 0x92,
- 0x70,
- 0x07,
- 0x85,
- 0xeb,
- 0x83,
- 0xf7,
- 0x89,
- 0xe0,
- 0xd7,
- 0x32,
- 0x2a,
- 0xd2,
- 0x1a,
- 0x64,
- 0x41,
- 0xef,
- 0x49,
- 0xff,
- 0xc3,
- 0x8c,
- 0x54,
- 0xf9,
- 0x67,
- 0x74,
- 0x30,
- 0x1e,
- 0x70,
- 0x2e,
- 0xb7,
- 0x12,
- 0x09,
- 0xfe,
- };
-
- memcpy(&seed, hash_key, sizeof(seed));
-
- PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
- {
- n[0] = t1ha2_atonce128(&n[1],
- part->digest, sizeof(part->digest),
- seed);
-
- seed = n[0] ^ n[1];
- }
-
- memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n));
-
- if (MESSAGE_FIELD(task, subject)) {
- p = MESSAGE_FIELD(task, subject);
- len = strlen(p);
- n[0] = t1ha2_atonce128(&n[1],
- p, len,
- seed);
- memcpy(MESSAGE_FIELD(task, digest), n, sizeof(n));
- }
-
- if (task->queue_id) {
- msg_info_task("loaded message; id: <%s>; queue-id: <%s>; size: %z; "
- "checksum: <%*xs>",
- MESSAGE_FIELD(task, message_id), task->queue_id, task->msg.len,
- (gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest));
- }
- else {
- msg_info_task("loaded message; id: <%s>; size: %z; "
- "checksum: <%*xs>",
- MESSAGE_FIELD(task, message_id), task->msg.len,
- (gint) sizeof(MESSAGE_FIELD(task, digest)), MESSAGE_FIELD(task, digest));
- }
-
- return TRUE;
- }
-
-
- /*
- * A helper structure to store text parts positions, if it was C++, I could just use std::pair,
- * but here I have to make it all manually, sigh...
- */
- struct rspamd_mime_part_text_position {
- unsigned pos;
- enum rspamd_message_part_is_text_result res;
- };
-
- /* Place html parts first during analysis */
- static int
- rspamd_mime_text_part_position_compare_func(const void *v1, const void *v2)
- {
- const struct rspamd_mime_part_text_position *p1 = (const struct rspamd_mime_part_text_position *) v1;
- const struct rspamd_mime_part_text_position *p2 = (const struct rspamd_mime_part_text_position *) v2;
-
- if (p1->res == p2->res) {
- return (int) p2->pos - (int) p1->pos;
- }
- else {
- if (p1->res == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
- return -1;
- }
- else {
- return 1;
- }
- }
- }
-
- void rspamd_message_process(struct rspamd_task *task)
- {
- guint i;
- struct rspamd_mime_text_part *p1, *p2;
- gdouble diff, *pdiff;
- guint tw, *ptw, dw;
- struct rspamd_mime_part *part;
- lua_State *L = NULL;
- gint magic_func_pos = -1, content_func_pos = -1, old_top = -1, funcs_top = -1;
-
- if (task->cfg) {
- L = task->cfg->lua_state;
- }
-
- rspamd_archives_process(task);
-
- if (L) {
- old_top = lua_gettop(L);
- }
-
- if (L && rspamd_lua_require_function(L,
- "lua_magic", "detect_mime_part")) {
- magic_func_pos = lua_gettop(L);
- }
- else {
- msg_err_task("cannot require lua_magic.detect_mime_part");
- }
-
- if (L && rspamd_lua_require_function(L,
- "lua_content", "maybe_process_mime_part")) {
- content_func_pos = lua_gettop(L);
- }
- else {
- msg_err_task("cannot require lua_content.maybe_process_mime_part");
- }
-
- if (L) {
- funcs_top = lua_gettop(L);
- }
-
- GArray *detected_text_parts = g_array_sized_new(FALSE, FALSE, sizeof(struct rspamd_mime_part_text_position), 2);
-
- PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
- {
- if (magic_func_pos != -1 && part->parsed_data.len > 0) {
- struct rspamd_mime_part **pmime;
- struct rspamd_task **ptask;
-
- lua_pushcfunction(L, &rspamd_lua_traceback);
- gint err_idx = lua_gettop(L);
- lua_pushvalue(L, magic_func_pos);
- pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
- rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
- *pmime = part;
- ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
- rspamd_lua_setclass(L, rspamd_task_classname, -1);
- *ptask = task;
-
- if (lua_pcall(L, 2, 2, err_idx) != 0) {
- msg_err_task("cannot detect type: %s", lua_tostring(L, -1));
- }
- else {
- if (lua_istable(L, -1)) {
- const gchar *mb;
-
- /* First returned value */
- part->detected_ext = rspamd_mempool_strdup(task->task_pool,
- lua_tostring(L, -2));
-
- lua_pushstring(L, "ct");
- lua_gettable(L, -2);
-
- if (lua_isstring(L, -1)) {
- mb = lua_tostring(L, -1);
-
- if (mb) {
- rspamd_ftok_t srch;
-
- srch.begin = mb;
- srch.len = strlen(mb);
- part->detected_ct = rspamd_content_type_parse(srch.begin,
- srch.len,
- task->task_pool);
- }
- }
-
- lua_pop(L, 1);
-
- lua_pushstring(L, "type");
- lua_gettable(L, -2);
-
- if (lua_isstring(L, -1)) {
- part->detected_type = rspamd_mempool_strdup(task->task_pool,
- lua_tostring(L, -1));
- }
-
- lua_pop(L, 1);
-
- lua_pushstring(L, "no_text");
- lua_gettable(L, -2);
-
- if (lua_isboolean(L, -1)) {
- if (!!lua_toboolean(L, -1)) {
- part->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION;
- }
- }
-
- lua_pop(L, 1);
- }
- }
-
- lua_settop(L, funcs_top);
- }
-
- /* Now detect content */
- if (content_func_pos != -1 && part->parsed_data.len > 0 &&
- part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
- struct rspamd_mime_part **pmime;
- struct rspamd_task **ptask;
-
- lua_pushcfunction(L, &rspamd_lua_traceback);
- gint err_idx = lua_gettop(L);
- lua_pushvalue(L, content_func_pos);
- pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
- rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
- *pmime = part;
- ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
- rspamd_lua_setclass(L, rspamd_task_classname, -1);
- *ptask = task;
-
- if (lua_pcall(L, 2, 0, err_idx) != 0) {
- msg_err_task("cannot detect content: %s", lua_tostring(L, -1));
- }
-
- lua_settop(L, funcs_top);
- }
-
- /* Try to detect image before checking for text */
- rspamd_images_process_mime_part_maybe(task, part);
-
- if (part->part_type == RSPAMD_MIME_PART_UNDEFINED &&
- !(part->flags & RSPAMD_MIME_PART_NO_TEXT_EXTRACTION)) {
- enum rspamd_message_part_is_text_result res = rspamd_message_part_can_be_parsed_as_text(task, part);
-
- if (res != RSPAMD_MESSAGE_PART_IS_NOT_TEXT) {
- struct rspamd_mime_part_text_position p = {
- .pos = i,
- .res = res};
- g_array_append_val(detected_text_parts, p);
- }
- }
- }
-
- uint16_t cur_url_order = 0;
- g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
- /* One more iteration to process text parts in a more specific order */
- for (i = 0; i < detected_text_parts->len; i++) {
- part = g_ptr_array_index(MESSAGE_FIELD(task, parts),
- g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
- rspamd_message_process_text_part_maybe(task, part,
- g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order);
- }
-
- g_array_free(detected_text_parts, TRUE);
-
- if (old_top != -1) {
- lua_settop(L, old_top);
- }
-
- /* Parse urls inside Subject header */
- if (MESSAGE_FIELD(task, subject)) {
- rspamd_url_find_multiple(task->task_pool, MESSAGE_FIELD(task, subject),
- strlen(MESSAGE_FIELD(task, subject)),
- RSPAMD_URL_FIND_STRICT, NULL,
- rspamd_url_task_subject_callback,
- task);
- }
-
- /* Calculate average words length and number of short words */
- struct rspamd_mime_text_part *text_part;
- gdouble *var;
- guint total_words = 0;
-
- PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
- {
- if (!text_part->language) {
- rspamd_mime_part_detect_language(task, text_part);
- }
-
- rspamd_mime_part_extract_words(task, text_part);
-
- if (text_part->utf_words) {
- total_words += text_part->nwords;
- }
- }
-
- /* Calculate distance for 2-parts messages */
- if (i == 2) {
- p1 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 0);
- p2 = g_ptr_array_index(MESSAGE_FIELD(task, text_parts), 1);
-
- /* First of all check parent object */
- if (p1->mime_part->parent_part) {
- rspamd_ftok_t srch;
-
- srch.begin = "alternative";
- srch.len = 11;
-
- if (rspamd_ftok_cmp(&p1->mime_part->parent_part->ct->subtype, &srch) == 0) {
- if (!IS_TEXT_PART_EMPTY(p1) && !IS_TEXT_PART_EMPTY(p2) &&
- p1->normalized_hashes && p2->normalized_hashes) {
- /*
- * We also detect language on one part and propagate it to
- * another one
- */
- struct rspamd_mime_text_part *sel;
-
- /* Prefer HTML as text part is not displayed normally */
- if (IS_TEXT_PART_HTML(p1)) {
- sel = p1;
- }
- else if (IS_TEXT_PART_HTML(p2)) {
- sel = p2;
- }
- else {
- if (p1->utf_content.len > p2->utf_content.len) {
- sel = p1;
- }
- else {
- sel = p2;
- }
- }
-
- if (sel->language && sel->language[0]) {
- /* Propagate language */
- if (sel == p1) {
- if (p2->languages) {
- g_ptr_array_unref(p2->languages);
- }
-
- p2->language = sel->language;
- p2->languages = g_ptr_array_ref(sel->languages);
- }
- else {
- if (p1->languages) {
- g_ptr_array_unref(p1->languages);
- }
-
- p1->language = sel->language;
- p1->languages = g_ptr_array_ref(sel->languages);
- }
- }
-
- tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
-
- if (tw > 0) {
- dw = rspamd_words_levenshtein_distance(task,
- p1->normalized_hashes,
- p2->normalized_hashes);
- diff = dw / (gdouble) tw;
-
- msg_debug_task(
- "different words: %d, total words: %d, "
- "got diff between parts of %.2f",
- dw, tw,
- diff);
-
- pdiff = rspamd_mempool_alloc(task->task_pool,
- sizeof(gdouble));
- *pdiff = diff;
- rspamd_mempool_set_variable(task->task_pool,
- "parts_distance",
- pdiff,
- NULL);
- ptw = rspamd_mempool_alloc(task->task_pool,
- sizeof(gint));
- *ptw = tw;
- rspamd_mempool_set_variable(task->task_pool,
- "total_words",
- ptw,
- NULL);
- }
- }
- }
- }
- else {
- debug_task(
- "message contains two parts but they are in different multi-parts");
- }
- }
-
- if (total_words > 0) {
- var = rspamd_mempool_get_variable(task->task_pool,
- RSPAMD_MEMPOOL_AVG_WORDS_LEN);
-
- if (var) {
- *var /= (double) total_words;
- }
-
- var = rspamd_mempool_get_variable(task->task_pool,
- RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
-
- if (var) {
- *var /= (double) total_words;
- }
- }
-
- rspamd_images_link(task);
- rspamd_tokenize_meta_words(task);
- }
-
-
- struct rspamd_message *
- rspamd_message_ref(struct rspamd_message *msg)
- {
- REF_RETAIN(msg);
-
- return msg;
- }
-
- void rspamd_message_unref(struct rspamd_message *msg)
- {
- if (msg) {
- REF_RELEASE(msg);
- }
- }
-
- void rspamd_message_update_digest(struct rspamd_message *msg,
- const void *input, gsize len)
- {
- guint64 n[2];
- /* Sanity */
- G_STATIC_ASSERT(sizeof(n) == sizeof(msg->digest));
-
- memcpy(n, msg->digest, sizeof(msg->digest));
- n[0] = t1ha2_atonce128(&n[1], input, len, n[0]);
- memcpy(msg->digest, n, sizeof(msg->digest));
- }
|