mirrors
/
rspamd
mirror of https://github.com/vstakhov/rspamd.git


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904
							/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "config.h"
#include "util.h"
#include "rspamd.h"
#include "message.h"
#include "cfg_file.h"
#include "libutil/regexp.h"
#include "html.h"
#include "images.h"
#include "utlist.h"
#include "tokenizers/tokenizers.h"

#ifdef WITH_SNOWBALL
#include "libstemmer.h"
#endif

#include "acism.h"

#include <iconv.h>

#define RECURSION_LIMIT 5
#define UTF8_CHARSET "UTF-8"
#define GTUBE_SYMBOL "GTUBE"

#define SET_PART_RAW(part) ((part)->flags &= ~RSPAMD_MIME_PART_FLAG_UTF)
#define SET_PART_UTF(part) ((part)->flags |= RSPAMD_MIME_PART_FLAG_UTF)

static ac_trie_t *gtube_trie = NULL;
static const gchar gtube_pattern[] = "XJS*C4JDBQADN1.NSBN3*2IDNEN*"
		"GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL*C.34X";
static rspamd_regexp_t *utf_compatible_re = NULL;

static GQuark
rspamd_message_quark (void)
{
	return g_quark_from_static_string ("mime-error");
}

static void
parse_qmail_recv (rspamd_mempool_t * pool,
	gchar *line,
	struct received_header *r)
{
	gchar *s, *p, t;

	/* We are interested only with received from network headers */
	if ((p = strstr (line, "from network")) == NULL) {
		r->is_error = 2;
		return;
	}

	p += sizeof ("from network") - 1;
	while (g_ascii_isspace (*p) || *p == '[') {
		p++;
	}
	/* format is ip/host */
	s = p;
	if (*p) {
		while (g_ascii_isdigit (*++p) || *p == '.') ;
		if (*p != '/') {
			r->is_error = 1;
			return;
		}
		else {
			*p = '\0';
			r->real_ip = rspamd_mempool_strdup (pool, s);
			*p = '/';
			/* Now try to parse hostname */
			s = ++p;
			while (g_ascii_isalnum (*p) || *p == '.' || *p == '-' || *p ==
				'_') {
				p++;
			}
			t = *p;
			*p = '\0';
			r->real_hostname = rspamd_mempool_strdup (pool, s);
			*p = t;
		}
	}
}

static void
parse_recv_header (rspamd_mempool_t * pool,
	struct raw_header *rh,
	struct received_header *r)
{
	gchar *p, *s, t, **res = NULL;
	gchar *line;
	enum {
		RSPAMD_RECV_STATE_INIT = 0,
		RSPAMD_RECV_STATE_FROM,
		RSPAMD_RECV_STATE_IP_BLOCK,
		RSPAMD_RECV_STATE_BRACES_BLOCK,
		RSPAMD_RECV_STATE_BY_BLOCK,
		RSPAMD_RECV_STATE_PARSE_IP,
		RSPAMD_RECV_STATE_PARSE_IP6,
		RSPAMD_RECV_STATE_SKIP_SPACES,
		RSPAMD_RECV_STATE_ERROR
	} state = RSPAMD_RECV_STATE_INIT, next_state = RSPAMD_RECV_STATE_INIT;
	gboolean is_exim = FALSE;

	line = rh->decoded;
	if (line == NULL) {
		return;
	}

	g_strstrip (line);
	p = line;
	s = line;

	while (*p) {
		switch (state) {
		/* Initial state, search for from */
		case RSPAMD_RECV_STATE_INIT:
			if (*p == 'f' || *p == 'F') {
				if (g_ascii_tolower (*++p) == 'r' && g_ascii_tolower (*++p) ==
					'o' && g_ascii_tolower (*++p) == 'm') {
					p++;
					state = RSPAMD_RECV_STATE_SKIP_SPACES;
					next_state = RSPAMD_RECV_STATE_FROM;
				}
			}
			else if (g_ascii_tolower (*p) == 'b' &&
				g_ascii_tolower (*(p + 1)) == 'y') {
				state = RSPAMD_RECV_STATE_IP_BLOCK;
			}
			else {
				/* This can be qmail header, parse it separately */
				parse_qmail_recv (pool, line, r);
				return;
			}
			break;
		/* Read hostname */
		case RSPAMD_RECV_STATE_FROM:
			if (*p == '[') {
				/* This should be IP address */
				res = &r->from_ip;
				state = RSPAMD_RECV_STATE_PARSE_IP;
				next_state = RSPAMD_RECV_STATE_IP_BLOCK;
				s = ++p;
			}
			else if (g_ascii_isalnum (*p) || *p == '.' || *p == '-' || *p ==
				'_') {
				p++;
			}
			else {
				t = *p;
				*p = '\0';
				r->from_hostname = rspamd_mempool_strdup (pool, s);
				*p = t;
				state = RSPAMD_RECV_STATE_SKIP_SPACES;
				next_state = RSPAMD_RECV_STATE_IP_BLOCK;
			}
			break;
		/* Try to extract additional info */
		case RSPAMD_RECV_STATE_IP_BLOCK:
			/* Try to extract ip or () info or by */
			if (g_ascii_tolower (*p) == 'b' && g_ascii_tolower (*(p + 1)) ==
				'y') {
				p += 2;
				/* Skip spaces after by */
				state = RSPAMD_RECV_STATE_SKIP_SPACES;
				next_state = RSPAMD_RECV_STATE_BY_BLOCK;
			}
			else if (*p == '(') {
				state = RSPAMD_RECV_STATE_SKIP_SPACES;
				next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
				p++;
			}
			else if (*p == '[') {
				/* Got ip before '(' so extract it */
				s = ++p;
				res = &r->from_ip;
				state = RSPAMD_RECV_STATE_PARSE_IP;
				next_state = RSPAMD_RECV_STATE_IP_BLOCK;
			}
			else {
				p++;
			}
			break;
		/* We are in () block. Here can be found real hostname and real ip, this is written by some MTA */
		case RSPAMD_RECV_STATE_BRACES_BLOCK:
			/* End of block */
			if (g_ascii_isalnum (*p) || *p == '.' || *p == '-' ||
				*p == '_' || *p == ':') {
				p++;
			}
			else if (*p == '[') {
				s = ++p;
				state = RSPAMD_RECV_STATE_PARSE_IP;
				res = &r->real_ip;
				next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
			}
			else {
				if (p > s) {
					/* Got some real hostname */
					/* check whether it is helo or p is not space symbol */
					if (!g_ascii_isspace (*p) || *(p + 1) != '[') {
						/* Exim style ([ip]:port helo=hostname) */
						if (*s == ':' && (g_ascii_isspace (*p) || *p == ')')) {
							/* Ip ending */
							is_exim = TRUE;
							state = RSPAMD_RECV_STATE_SKIP_SPACES;
							next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
						}
						else if (p - s == 4 && memcmp (s, "helo=", 5) == 0) {
							p++;
							is_exim = TRUE;
							if (r->real_hostname == NULL && r->from_hostname !=
								NULL) {
								r->real_hostname = r->from_hostname;
							}
							s = p;
							while (*p != ')' && !g_ascii_isspace (*p) && *p !=
								'\0') {
								p++;
							}
							if (p > s) {
								r->from_hostname = rspamd_mempool_alloc (pool,
										p - s + 1);
								rspamd_strlcpy (r->from_hostname, s, p - s + 1);
							}
						}
						else if (p - s == 4 && memcmp (s, "port=", 5) == 0) {
							p++;
							is_exim = TRUE;
							while (g_ascii_isdigit (*p)) {
								p++;
							}
							state = RSPAMD_RECV_STATE_SKIP_SPACES;
							next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
						}
						else if (*p == '=' && is_exim) {
							/* Just skip unknown pairs */
							p++;
							while (!g_ascii_isspace (*p) && *p != ')' && *p !=
								'\0') {
								p++;
							}
							state = RSPAMD_RECV_STATE_SKIP_SPACES;
							next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
						}
						else {
							/* skip all  */
							while (*p++ != ')' && *p != '\0') ;
							state = RSPAMD_RECV_STATE_IP_BLOCK;
						}
					}
					else {
						/* Postfix style (hostname [ip]) */
						t = *p;
						*p = '\0';
						r->real_hostname = rspamd_mempool_strdup (pool, s);
						*p = t;
						/* Now parse ip */
						p += 2;
						s = p;
						res = &r->real_ip;
						state = RSPAMD_RECV_STATE_PARSE_IP;
						next_state = RSPAMD_RECV_STATE_BRACES_BLOCK;
						continue;
					}
					if (*p == ')') {
						p++;
						state = RSPAMD_RECV_STATE_SKIP_SPACES;
						next_state = RSPAMD_RECV_STATE_IP_BLOCK;
					}
				}
				else if (*p == ')') {
					p++;
					state = RSPAMD_RECV_STATE_SKIP_SPACES;
					next_state = RSPAMD_RECV_STATE_IP_BLOCK;
				}
				else {
					r->is_error = 1;
					return;
				}
			}
			break;
		/* Got by word */
		case RSPAMD_RECV_STATE_BY_BLOCK:
			/* Here can be only hostname */
			if ((g_ascii_isalnum (*p) || *p == '.' || *p == '-'
				|| *p == '_') && p[1] != '\0') {
				p++;
			}
			else {
				/* We got something like hostname */
				if (p[1] != '\0') {
					t = *p;
					*p = '\0';
					r->by_hostname = rspamd_mempool_strdup (pool, s);
					*p = t;
				}
				else {
					r->by_hostname = rspamd_mempool_strdup (pool, s);
				}
				/* Now end of parsing */
				if (is_exim) {
					/* Adjust for exim received */
					if (r->real_ip == NULL && r->from_ip != NULL) {
						r->real_ip = r->from_ip;
					}
					else if (r->from_ip == NULL && r->real_ip != NULL) {
						r->from_ip = r->real_ip;
						if (r->real_hostname == NULL && r->from_hostname !=
							NULL) {
							r->real_hostname = r->from_hostname;
						}
					}
				}
				return;
			}
			break;

		/* Extract ip */
		case RSPAMD_RECV_STATE_PARSE_IP:
			if (*p == 'I') {
				/* IPv6: */
				state = RSPAMD_RECV_STATE_PARSE_IP6;
			}
			else {
				while (g_ascii_isxdigit (*p) || *p == '.' || *p == ':') {
					p++;
				}
				if (*p != ']') {
					/* Not an ip in fact */
					state = RSPAMD_RECV_STATE_SKIP_SPACES;
					p++;
				}
				else {
					*p = '\0';
					*res = rspamd_mempool_strdup (pool, s);
					*p = ']';
					p++;
					state = RSPAMD_RECV_STATE_SKIP_SPACES;
				}
			}
			break;
		case RSPAMD_RECV_STATE_PARSE_IP6:
			if (g_ascii_strncasecmp (p, "IPv6:", sizeof ("IPv6") - 1) == 0) {
				p += sizeof ("IPv6") - 1;
				s = p;
				state = RSPAMD_RECV_STATE_PARSE_IP;
			}
			else {
				state = RSPAMD_RECV_STATE_SKIP_SPACES;
			}
			break;
		/* Skip spaces */
		case RSPAMD_RECV_STATE_SKIP_SPACES:
			if (!g_ascii_isspace (*p)) {
				state = next_state;
				s = p;
			}
			else {
				p++;
			}
			break;
		default:
			r->is_error = 1;
			return;
			break;
		}
	}

	r->is_error = 1;
	return;
}

static void
append_raw_header (struct rspamd_task *task,
		GHashTable *target, struct raw_header *rh)
{
	struct raw_header *lp;

	rh->next = NULL;
	rh->prev = rh;
	if ((lp =
			g_hash_table_lookup (target, rh->name)) != NULL) {
		DL_APPEND (lp, rh);
	}
	else {
		g_hash_table_insert (target, rh->name, rh);
	}
	msg_debug_task ("add raw header %s: %s", rh->name, rh->value);
}

/* Convert raw headers to a list of struct raw_header * */
static void
process_raw_headers (struct rspamd_task *task, GHashTable *target,
		const gchar *in, gsize len)
{
	struct raw_header *new = NULL;
	const gchar *p, *c, *end;
	gchar *tmp, *tp;
	gint state = 0, l, next_state = 100, err_state = 100, t_state;
	gboolean valid_folding = FALSE;

	p = in;
	end = p + len;
	c = p;

	while (p < end) {
		/* FSM for processing headers */
		switch (state) {
		case 0:
			/* Begin processing headers */
			if (!g_ascii_isalpha (*p)) {
				/* We have some garbage at the beginning of headers, skip this line */
				state = 100;
				next_state = 0;
			}
			else {
				state = 1;
				c = p;
			}
			break;
		case 1:
			/* We got something like header's name */
			if (*p == ':') {
				new =
					rspamd_mempool_alloc0 (task->task_pool,
						sizeof (struct raw_header));
				new->prev = new;
				l = p - c;
				tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
				rspamd_strlcpy (tmp, c, l + 1);
				new->name = tmp;
				new->empty_separator = TRUE;
				p++;
				state = 2;
				c = p;
			}
			else if (g_ascii_isspace (*p)) {
				/* Not header but some garbage */
				task->flags |= RSPAMD_TASK_FLAG_BROKEN_HEADERS;
				state = 100;
				next_state = 0;
			}
			else {
				p++;
			}
			break;
		case 2:
			/* We got header's name, so skip any \t or spaces */
			if (*p == '\t') {
				new->tab_separated = TRUE;
				new->empty_separator = FALSE;
				p++;
			}
			else if (*p == ' ') {
				new->empty_separator = FALSE;
				p++;
			}
			else if (*p == '\n' || *p == '\r') {
				/* Process folding */
				state = 99;
				l = p - c;
				if (l > 0) {
					tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
					rspamd_strlcpy (tmp, c, l + 1);
					new->separator = tmp;
				}
				next_state = 3;
				err_state = 5;
				c = p;
			}
			else {
				/* Process value */
				l = p - c;
				if (l >= 0) {
					tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
					rspamd_strlcpy (tmp, c, l + 1);
					new->separator = tmp;
				}
				c = p;
				state = 3;
			}
			break;
		case 3:
			if (*p == '\r' || *p == '\n') {
				/* Hold folding */
				state = 99;
				next_state = 3;
				err_state = 4;
			}
			else if (p + 1 == end) {
				state = 4;
			}
			else {
				p++;
			}
			break;
		case 4:
			/* Copy header's value */
			l = p - c;
			tmp = rspamd_mempool_alloc (task->task_pool, l + 1);
			tp = tmp;
			t_state = 0;
			while (l--) {
				if (t_state == 0) {
					/* Before folding */
					if (*c == '\n' || *c == '\r') {
						t_state = 1;
						c++;
						*tp++ = ' ';
					}
					else {
						*tp++ = *c++;
					}
				}
				else if (t_state == 1) {
					/* Inside folding */
					if (g_ascii_isspace (*c)) {
						c++;
					}
					else {
						t_state = 0;
						*tp++ = *c++;
					}
				}
			}
			/* Strip last space that can be added by \r\n parsing */
			if (*(tp - 1) == ' ') {
				tp--;
			}

			*tp = '\0';
			/* Strip the initial spaces that could also be added by folding */
			while (*tmp != '\0' && g_ascii_isspace (*tmp)) {
				tmp ++;
			}

			new->value = tmp;
			new->decoded = g_mime_utils_header_decode_text (new->value);
			rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t)g_free, new->decoded);
			append_raw_header (task, target, new);
			state = 0;
			break;
		case 5:
			/* Header has only name, no value */
			new->value = "";
			new->decoded = NULL;
			append_raw_header (task, target, new);
			state = 0;
			break;
		case 99:
			/* Folding state */
			if (p + 1 == end) {
				state = err_state;
			}
			else {
				if (*p == '\r' || *p == '\n') {
					p++;
					valid_folding = FALSE;
				}
				else if (*p == '\t' || *p == ' ') {
					/* Valid folding */
					p++;
					valid_folding = TRUE;
				}
				else {
					if (valid_folding) {
						debug_task ("go to state: %d->%d", state, next_state);
						state = next_state;
					}
					else {
						/* Fall back */
						debug_task ("go to state: %d->%d", state, err_state);
						state = err_state;
					}
				}
			}
			break;
		case 100:
			/* Fail state, skip line */

			if (*p == '\r') {
				if (*(p + 1) == '\n') {
					p++;
				}
				p++;
				state = next_state;
			}
			else if (*p == '\n') {
				if (*(p + 1) == '\r') {
					p++;
				}
				p++;
				state = next_state;
			}
			else if (p + 1 == end) {
				state = next_state;
				p++;
			}
			else {
				p++;
			}
			break;
		}
	}
}

static void
free_byte_array_callback (void *pointer)
{
	GByteArray *arr = (GByteArray *) pointer;
	g_byte_array_free (arr, TRUE);
}

static gboolean
charset_validate (rspamd_mempool_t *pool, const gchar *in, gchar **out)
{
	/*
	 * This is a simple routine to validate input charset
	 * we just check that charset starts with alphanumeric and ends
	 * with alphanumeric
	 */
	const gchar *begin, *end;
	gboolean changed = FALSE, to_uppercase = FALSE;

	begin = in;

	while (!g_ascii_isalnum (*begin)) {
		begin ++;
		changed = TRUE;
	}
	if (!g_ascii_islower(*begin)) {
		changed = TRUE;
		to_uppercase = TRUE;
	}
	end = begin + strlen (begin) - 1;
	while (!g_ascii_isalnum (*end)) {
		end --;
		changed = TRUE;
	}

	if (!changed) {
		*out = (gchar *)in;
	}
	else {
		*out = rspamd_mempool_alloc (pool, end - begin + 2);
		if (to_uppercase) {
			gchar *o = *out;

			while (begin != end + 1) {
				if (g_ascii_islower (*begin)) {
					*o++ = g_ascii_toupper (*begin ++);
				}
				else {
					*o++ = *begin++;
				}
			}
			*o = '\0';
		}
		else {
			rspamd_strlcpy (*out, begin, end - begin + 2);
		}
	}

	return TRUE;
}

static GQuark
converter_error_quark (void)
{
	return g_quark_from_static_string ("conversion error");
}

static gchar *
rspamd_text_to_utf8 (struct rspamd_task *task,
		gchar *input, gsize len, const gchar *in_enc,
		gsize *olen, GError **err)
{
	gchar *res, *s, *d;
	gsize outlen;
	iconv_t ic;
	gsize processed, ret;

	ic = iconv_open (UTF8_CHARSET, in_enc);

	if (ic == (iconv_t)-1) {
		g_set_error (err, converter_error_quark(), EINVAL,
				"cannot open iconv for: %s", in_enc);
		return NULL;
	}

	/* For the most of charsets utf8 notation is larger than native one */
	outlen = len * 2 + 1;

	res = rspamd_mempool_alloc (task->task_pool, outlen);
	s = input;
	d = res;
	processed = outlen - 1;

	while (len > 0 && processed > 0) {
		ret = iconv (ic, &s, &len, &d, &processed);
		if (ret == (gsize)-1) {
			switch (errno) {
			case E2BIG:
				g_set_error (err, converter_error_quark(), EINVAL,
						"output of size %zd is not enough to handle "
						"converison of %zd bytes", outlen, len);
				iconv_close (ic);
				return NULL;
			case EILSEQ:
			case EINVAL:
				/* Ignore bad characters */
				if (processed > 0 && len > 0) {
					*d++ = '?';
					s++;
					len --;
					processed --;
				}
				break;
			}
		}
		else if (ret == 0) {
			break;
		}
	}

	*d = '\0';
	*olen = d - res;

	iconv_close (ic);

	return res;
}


static GByteArray *
convert_text_to_utf (struct rspamd_task *task,
	GByteArray * part_content,
	GMimeContentType * type,
	struct mime_text_part *text_part)
{
	GError *err = NULL;
	gsize write_bytes;
	const gchar *charset;
	gchar *res_str, *ocharset;
	GByteArray *result_array;

	if (task->cfg->raw_mode) {
		SET_PART_RAW (text_part);
		return part_content;
	}

	if (utf_compatible_re == NULL) {
		utf_compatible_re = rspamd_regexp_new (
			"^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:us)|(?:ISO-8859-1)|"
			"(?:latin.*)|(?:CSASCII)$",
			"i", NULL);
	}

	if ((charset =
		g_mime_content_type_get_parameter (type, "charset")) == NULL) {
		SET_PART_RAW (text_part);
		return part_content;
	}
	if (!charset_validate (task->task_pool, charset, &ocharset)) {
		msg_info_task (
			"<%s>: has invalid charset",
			task->message_id);
		SET_PART_RAW (text_part);
		return part_content;
	}

	if (rspamd_regexp_match (utf_compatible_re, ocharset, strlen (ocharset), TRUE)) {
		if (g_utf8_validate (part_content->data, part_content->len, NULL)) {
			SET_PART_UTF (text_part);
			return part_content;
		}
		else {
			msg_info_task (
				"<%s>: contains invalid utf8 characters, assume it as raw",
				task->message_id);
			SET_PART_RAW (text_part);
			return part_content;
		}
	}
	else {
		res_str = rspamd_text_to_utf8 (task, part_content->data,
				part_content->len,
				ocharset,
				&write_bytes,
				&err);
		if (res_str == NULL) {
			msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
					task->message_id,
					ocharset,
					err ? err->message : "unknown problem");
			SET_PART_RAW (text_part);
			g_error_free (err);
			return part_content;
		}
	}

	result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
	result_array->data = res_str;
	result_array->len = write_bytes;
	SET_PART_UTF (text_part);

	return result_array;
}

struct language_match {
	const char *code;
	const char *name;
	GUnicodeScript script;
};

static int
language_elts_cmp (const void *a, const void *b)
{
	GUnicodeScript sc = *(const GUnicodeScript *)a;
	const struct language_match *bb = (const struct language_match *)b;

	return (sc - bb->script);
}

static void
detect_text_language (struct mime_text_part *part)
{
	/* Keep sorted */
	static const struct language_match language_codes[] = {
			{ "", "english", G_UNICODE_SCRIPT_COMMON },
			{ "", "", G_UNICODE_SCRIPT_INHERITED },
			{ "ar", "arabic", G_UNICODE_SCRIPT_ARABIC },
			{ "hy", "armenian", G_UNICODE_SCRIPT_ARMENIAN },
			{ "bn", "chineese", G_UNICODE_SCRIPT_BENGALI },
			{ "", "", G_UNICODE_SCRIPT_BOPOMOFO },
			{ "chr", "", G_UNICODE_SCRIPT_CHEROKEE },
			{ "cop", "",  G_UNICODE_SCRIPT_COPTIC  },
			{ "ru", "russian",  G_UNICODE_SCRIPT_CYRILLIC },
			/* Deseret was used to write English */
			{ "", "",  G_UNICODE_SCRIPT_DESERET },
			{ "hi", "",  G_UNICODE_SCRIPT_DEVANAGARI },
			{ "am", "",  G_UNICODE_SCRIPT_ETHIOPIC },
			{ "ka", "",  G_UNICODE_SCRIPT_GEORGIAN },
			{ "", "",  G_UNICODE_SCRIPT_GOTHIC },
			{ "el", "greek",  G_UNICODE_SCRIPT_GREEK },
			{ "gu", "",  G_UNICODE_SCRIPT_GUJARATI },
			{ "pa", "",  G_UNICODE_SCRIPT_GURMUKHI },
			{ "han", "chineese",  G_UNICODE_SCRIPT_HAN },
			{ "ko", "",  G_UNICODE_SCRIPT_HANGUL },
			{ "he", "hebrew",  G_UNICODE_SCRIPT_HEBREW },
			{ "ja", "",  G_UNICODE_SCRIPT_HIRAGANA },
			{ "kn", "",  G_UNICODE_SCRIPT_KANNADA },
			{ "ja", "",  G_UNICODE_SCRIPT_KATAKANA },
			{ "km", "",  G_UNICODE_SCRIPT_KHMER },
			{ "lo", "",  G_UNICODE_SCRIPT_LAO },
			{ "en", "english",  G_UNICODE_SCRIPT_LATIN },
			{ "ml", "",  G_UNICODE_SCRIPT_MALAYALAM },
			{ "mn", "",  G_UNICODE_SCRIPT_MONGOLIAN },
			{ "my", "",  G_UNICODE_SCRIPT_MYANMAR },
			/* Ogham was used to write old Irish */
			{ "", "",  G_UNICODE_SCRIPT_OGHAM },
			{ "", "",  G_UNICODE_SCRIPT_OLD_ITALIC },
			{ "or", "",  G_UNICODE_SCRIPT_ORIYA },
			{ "", "",  G_UNICODE_SCRIPT_RUNIC },
			{ "si", "",  G_UNICODE_SCRIPT_SINHALA },
			{ "syr", "",  G_UNICODE_SCRIPT_SYRIAC },
			{ "ta", "",  G_UNICODE_SCRIPT_TAMIL },
			{ "te", "",  G_UNICODE_SCRIPT_TELUGU },
			{ "dv", "",  G_UNICODE_SCRIPT_THAANA },
			{ "th", "",  G_UNICODE_SCRIPT_THAI },
			{ "bo", "",  G_UNICODE_SCRIPT_TIBETAN },
			{ "iu", "",  G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL },
			{ "", "",  G_UNICODE_SCRIPT_YI },
			{ "tl", "",  G_UNICODE_SCRIPT_TAGALOG },
			/* Phillipino languages/scripts */
			{ "hnn", "",  G_UNICODE_SCRIPT_HANUNOO },
			{ "bku", "",  G_UNICODE_SCRIPT_BUHID },
			{ "tbw", "",  G_UNICODE_SCRIPT_TAGBANWA },

			{ "", "",  G_UNICODE_SCRIPT_BRAILLE },
			{ "", "",  G_UNICODE_SCRIPT_CYPRIOT },
			{ "", "",  G_UNICODE_SCRIPT_LIMBU },
			/* Used for Somali (so) in the past */
			{ "", "",  G_UNICODE_SCRIPT_OSMANYA },
			/* The Shavian alphabet was designed for English */
			{ "", "",  G_UNICODE_SCRIPT_SHAVIAN },
			{ "", "",  G_UNICODE_SCRIPT_LINEAR_B },
			{ "", "",  G_UNICODE_SCRIPT_TAI_LE },
			{ "uga", "",  G_UNICODE_SCRIPT_UGARITIC },
			{ "", "",  G_UNICODE_SCRIPT_NEW_TAI_LUE },
			{ "bug", "",  G_UNICODE_SCRIPT_BUGINESE },
			{ "", "",  G_UNICODE_SCRIPT_GLAGOLITIC },
			/* Used for for Berber (ber), but Arabic script is more common */
			{ "", "",  G_UNICODE_SCRIPT_TIFINAGH },
			{ "syl", "",  G_UNICODE_SCRIPT_SYLOTI_NAGRI },
			{ "peo", "",  G_UNICODE_SCRIPT_OLD_PERSIAN },
			{ "", "",  G_UNICODE_SCRIPT_KHAROSHTHI },
			{ "", "",  G_UNICODE_SCRIPT_UNKNOWN },
			{ "", "",  G_UNICODE_SCRIPT_BALINESE },
			{ "", "",  G_UNICODE_SCRIPT_CUNEIFORM },
			{ "", "",  G_UNICODE_SCRIPT_PHOENICIAN },
			{ "", "",  G_UNICODE_SCRIPT_PHAGS_PA },
			{ "nqo", "", G_UNICODE_SCRIPT_NKO }
	};
	const struct language_match *lm;
	const int max_chars = 32;

	if (part != NULL) {
		if (IS_PART_UTF (part)) {
			/* Try to detect encoding by several symbols */
			const gchar *p, *pp;
			gunichar c;
			gint32 remain = part->content->len, max = 0, processed = 0;
			gint32 scripts[G_N_ELEMENTS (language_codes)];
			GUnicodeScript scc, sel = G_UNICODE_SCRIPT_COMMON;

			p = part->content->data;
			memset (scripts, 0, sizeof (scripts));

			while (remain > 0 && processed < max_chars) {
				c = g_utf8_get_char_validated (p, remain);
				if (c == (gunichar) -2 || c == (gunichar) -1) {
					break;
				}
				if (g_unichar_isalpha (c)) {
					scc = g_unichar_get_script (c);
					if (scc < (gint)G_N_ELEMENTS (scripts)) {
						scripts[scc]++;
					}
					processed ++;
				}
				pp = g_utf8_next_char (p);
				remain -= pp - p;
				p = pp;
			}
			for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
				if (scripts[remain] > max) {
					max = scripts[remain];
					sel = remain;
				}
			}
			part->script = sel;
			lm = bsearch (&sel, language_codes, G_N_ELEMENTS (language_codes),
					sizeof (language_codes[0]), &language_elts_cmp);

			if (lm != NULL) {
				part->lang_code = lm->code;
				part->language = lm->name;
			}
		}
	}
}

static void
rspamd_normalize_text_part (struct rspamd_task *task,
		struct mime_text_part *part)
{
#ifdef WITH_SNOWBALL
	struct sb_stemmer *stem = NULL;
#endif
	rspamd_ftok_t *w;
	const guchar *r;
	gchar *temp_word;
	guint i, nlen;

#ifdef WITH_SNOWBALL
	if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
		stem = sb_stemmer_new (part->language, "UTF_8");
		if (stem == NULL) {
			msg_info_task ("<%s> cannot create lemmatizer for %s language",
				task->message_id, part->language);
		}
	}
#endif

	/* Ugly workaround */
	part->normalized_words = rspamd_tokenize_text (part->content->data,
			part->content->len, IS_PART_UTF (part), task->cfg,
			part->urls_offset, FALSE,
			NULL);

	if (part->normalized_words) {
		for (i = 0; i < part->normalized_words->len; i ++) {
			w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
			r = NULL;
#ifdef WITH_SNOWBALL
			if (stem) {
				r = sb_stemmer_stem (stem, w->begin, w->len);
			}
#endif

			if (w->len > 0 && !(w->len == 6 && memcmp (w->begin, "!!EX!!", 6) == 0)) {
				if (r != NULL) {
					nlen = strlen (r);
					nlen = MIN (nlen, w->len);
					temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
					memcpy (temp_word, r, nlen);
					w->begin = temp_word;
					w->len = nlen;
				}
				else {
					temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
					memcpy (temp_word, w->begin, w->len);

					if (IS_PART_UTF (part)) {
						rspamd_str_lc_utf8 (temp_word, w->len);
					}
					else {
						rspamd_str_lc (temp_word, w->len);
					}

					w->begin = temp_word;
				}
			}
		}
	}
#ifdef WITH_SNOWBALL
	if (stem != NULL) {
		sb_stemmer_delete (stem);
	}
#endif
}

#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))

static guint
rspamd_words_levenshtein_distance (struct rspamd_task *task,
		GArray *w1, GArray *w2)
{
	guint s1len, s2len, x, y, lastdiag, olddiag;
	guint *column;
	rspamd_ftok_t *s1, *s2;
	gint eq;
	static const guint max_words = 8192;

	s1len = w1->len;
	s2len = w2->len;

	if (s1len > max_words) {
		msg_err_task ("cannot compare parts with more than %ud words: %ud",
				max_words, s1len);
		return 0;
	}

	column = g_alloca ((s1len + 1) * sizeof (guint));

	for (y = 1; y <= s1len; y++) {
		column[y] = y;
	}

	for (x = 1; x <= s2len; x++) {
		column[0] = x;

		for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
			olddiag = column[y];
			s1 = &g_array_index (w1, rspamd_ftok_t, y - 1);
			s2 = &g_array_index (w2, rspamd_ftok_t, x - 1);
			eq = rspamd_ftok_cmp (s1, s2) == 0 ? 0 : 1;
			column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
					lastdiag + (eq));
			lastdiag = olddiag;
		}
	}

	return column[s1len];
}

static int
rspamd_gtube_cb (int strnum, int textpos, void *context)
{
	return TRUE;
}

static gboolean
rspamd_check_gtube (struct rspamd_task *task, struct mime_text_part *part)
{
	static ac_trie_pat_t pat[1] = {
		{
			.ptr = gtube_pattern,
			.len = sizeof (gtube_pattern) - 1
		}
	};
	gint state = 0;

	g_assert (part != NULL);

	if (gtube_trie == NULL) {
		gtube_trie = acism_create (pat, G_N_ELEMENTS (pat));
	}

	if (part->content && part->content->len > sizeof (gtube_pattern)) {
		if (acism_lookup (gtube_trie, part->content->data, part->content->len,
				rspamd_gtube_cb, NULL, &state, FALSE)) {
			task->flags |= RSPAMD_TASK_FLAG_SKIP;
			task->flags |= RSPAMD_TASK_FLAG_GTUBE;
			msg_info_task ("<%s>: gtube pattern has been found in part of length %ud",
					task->message_id, part->content->len);

			return TRUE;
		}
	}

	return FALSE;
}

static void
process_text_part (struct rspamd_task *task,
	GByteArray *part_content,
	GMimeContentType *type,
	struct mime_part *mime_part,
	GMimeObject *parent,
	gboolean is_empty)
{
	struct mime_text_part *text_part;
	const gchar *cd, *p, *c;
	guint remain;

	/* Skip attachements */
#ifndef GMIME24
	cd = g_mime_part_get_content_disposition (GMIME_PART (mime_part->mime));
	if (cd &&
		g_ascii_strcasecmp (cd,
		"attachment") == 0 && !task->cfg->check_text_attachements) {
		debug_task ("skip attachments for checking as text parts");
		return;
	}
#else
	cd = g_mime_object_get_disposition (GMIME_OBJECT (mime_part->mime));
	if (cd &&
		g_ascii_strcasecmp (cd,
		GMIME_DISPOSITION_ATTACHMENT) == 0 &&
		!task->cfg->check_text_attachements) {
		debug_task ("skip attachments for checking as text parts");
		return;
	}
#endif

	if (g_mime_content_type_is_type (type, "text",
		"html") || g_mime_content_type_is_type (type, "text", "xhtml")) {

		text_part =
			rspamd_mempool_alloc0 (task->task_pool,
				sizeof (struct mime_text_part));
		text_part->flags |= RSPAMD_MIME_PART_FLAG_HTML;
		if (is_empty) {
			text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
			text_part->orig = NULL;
			text_part->content = NULL;
			g_ptr_array_add (task->text_parts, text_part);
			return;
		}
		text_part->orig = part_content;
		part_content = convert_text_to_utf (task,
				text_part->orig,
				type,
				text_part);
		text_part->html = rspamd_mempool_alloc0 (task->task_pool,
				sizeof (*text_part->html));
		text_part->parent = parent;
		text_part->mime_part = mime_part;

		text_part->flags |= RSPAMD_MIME_PART_FLAG_BALANCED;
		text_part->content = rspamd_html_process_part_full (
				task->task_pool,
				text_part->html,
				part_content,
				&text_part->urls_offset,
				task->urls,
				task->emails);

		if (text_part->content->len == 0) {
			text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
		}

		/* Handle offsets of this part */
		if (text_part->urls_offset != NULL) {
			text_part->urls_offset = g_list_reverse (text_part->urls_offset);
			rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t) g_list_free, text_part->urls_offset);
		}

		rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) free_byte_array_callback,
			text_part->content);
		g_ptr_array_add (task->text_parts, text_part);
	}
	else if (g_mime_content_type_is_type (type, "text", "*")) {

		text_part =
			rspamd_mempool_alloc0 (task->task_pool,
				sizeof (struct mime_text_part));
		text_part->parent = parent;
		text_part->mime_part = mime_part;

		if (is_empty) {
			text_part->flags |= RSPAMD_MIME_PART_FLAG_EMPTY;
			text_part->orig = NULL;
			text_part->content = NULL;
			g_ptr_array_add (task->text_parts, text_part);
			return;
		}

		text_part->content = convert_text_to_utf (task,
				part_content,
				type,
				text_part);
		text_part->orig = part_content;
		rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
		g_ptr_array_add (task->text_parts, text_part);
	}
	else {
		return;
	}

	if (rspamd_check_gtube (task, text_part)) {
		struct metric_result *mres;

		mres = rspamd_create_metric_result (task, DEFAULT_METRIC);

		if (mres != NULL) {
			mres->score = mres->metric->actions[METRIC_ACTION_REJECT].score;
			mres->action = METRIC_ACTION_REJECT;
		}

		task->pre_result.action = METRIC_ACTION_REJECT;
		task->pre_result.str = "Gtube pattern";
		rspamd_task_insert_result (task, GTUBE_SYMBOL, 0, NULL);

		return;
	}

	/* Post process part */
	detect_text_language (text_part);
	rspamd_normalize_text_part (task, text_part);

	/* Calculate number of lines */
	p = text_part->content->data;
	remain = text_part->content->len;
	c = p;

	while (p != NULL && remain > 0) {
		p = memchr (c, '\n', remain);

		if (p != NULL) {
			text_part->nlines ++;
			remain -= p - c + 1;
			c = p + 1;
		}
	}
}

struct mime_foreach_data {
	struct rspamd_task *task;
	guint parser_recursion;
	GMimeObject *parent;
};

#ifdef GMIME24
static void
mime_foreach_callback (GMimeObject * parent,
	GMimeObject * part,
	gpointer user_data)
#else
static void
mime_foreach_callback (GMimeObject * part, gpointer user_data)
#endif
{
	struct mime_foreach_data *md = user_data;
	struct rspamd_task *task;
	struct mime_part *mime_part;
	GMimeContentType *type;
	GMimeDataWrapper *wrapper;
	GMimeStream *part_stream;
	GByteArray *part_content;
	gchar *hdrs;

	task = md->task;
	/* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */

	/* find out what class 'part' is... */
	if (GMIME_IS_MESSAGE_PART (part)) {
		/* message/rfc822 or message/news */
		GMimeMessage *message;

		/* g_mime_message_foreach_part() won't descend into
		   child message parts, so if we want to count any
		   subparts of this child message, we'll have to call
		   g_mime_message_foreach_part() again here. */

		message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
		if (md->parser_recursion++ < RECURSION_LIMIT) {
#ifdef GMIME24
			g_mime_message_foreach (message, mime_foreach_callback, md);
#else
			g_mime_message_foreach_part (message, mime_foreach_callback, md);
#endif
		}
		else {
			msg_err_task ("too deep mime recursion detected: %d", md->parser_recursion);
			return;
		}
#ifndef GMIME24
		g_object_unref (message);
#endif
	}
	else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
		/* message/partial */

		/* this is an incomplete message part, probably a
		   large message that the sender has broken into
		   smaller parts and is sending us bit by bit. we
		   could save some info about it so that we could
		   piece this back together again once we get all the
		   parts? */
	}
	else if (GMIME_IS_MULTIPART (part)) {
		/* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
#ifndef GMIME24
		debug_task ("detected multipart part");
		/* we'll get to finding out if this is a signed/encrypted multipart later... */
		if (task->parser_recursion++ < RECURSION_LIMIT) {
			g_mime_multipart_foreach ((GMimeMultipart *) part,
				mime_foreach_callback,
				md);
		}
		else {
			msg_err_task ("endless recursion detected: %d", task->parser_recursion);
			return;
		}
#endif
		type = (GMimeContentType *) g_mime_object_get_content_type (GMIME_OBJECT (
				part));
		mime_part = rspamd_mempool_alloc0 (task->task_pool,
				sizeof (struct mime_part));

		hdrs = g_mime_object_get_headers (GMIME_OBJECT (part));
		mime_part->raw_headers = g_hash_table_new (rspamd_strcase_hash,
				rspamd_strcase_equal);
		rspamd_mempool_add_destructor (task->task_pool,
				(rspamd_mempool_destruct_t) g_hash_table_unref,
				mime_part->raw_headers);
		if (hdrs != NULL) {
			process_raw_headers (task, mime_part->raw_headers,
					hdrs, strlen (hdrs));
			g_free (hdrs);
		}

		mime_part->type = type;
		/* XXX: we don't need it, but it's sometimes dereferenced */
		mime_part->content = g_byte_array_new ();
		mime_part->parent = md->parent;
		mime_part->filename = NULL;
		mime_part->mime = part;

		debug_task ("found part with content-type: %s/%s",
				type->type,
				type->subtype);
		g_ptr_array_add (task->parts, mime_part);

		md->parent = part;
	}
	else if (GMIME_IS_PART (part)) {
		/* a normal leaf part, could be text/plain or image/jpeg etc */
#ifdef GMIME24
		type = (GMimeContentType *) g_mime_object_get_content_type (GMIME_OBJECT (
					part));
#else
		type =
			(GMimeContentType *) g_mime_part_get_content_type (GMIME_PART (part));
#endif

		if (type == NULL) {
			msg_warn_task ("type of part is unknown, assume text/plain");
			type = g_mime_content_type_new ("text", "plain");
#ifdef GMIME24
			rspamd_mempool_add_destructor (task->task_pool,
				(rspamd_mempool_destruct_t) g_object_unref,				 type);
#else
			rspamd_mempool_add_destructor (task->task_pool,
				(rspamd_mempool_destruct_t) g_mime_content_type_destroy, type);
#endif
		}
		wrapper = g_mime_part_get_content_object (GMIME_PART (part));
#ifdef GMIME24
		if (wrapper != NULL && GMIME_IS_DATA_WRAPPER (wrapper)) {
#else
		if (wrapper != NULL) {
#endif
			part_stream = g_mime_stream_mem_new ();
			if (g_mime_data_wrapper_write_to_stream (wrapper,
				part_stream) != -1) {

				g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (
						part_stream), FALSE);
				part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (
							part_stream));
				g_object_unref (part_stream);
				mime_part =
					rspamd_mempool_alloc0 (task->task_pool,
						sizeof (struct mime_part));

				hdrs = g_mime_object_get_headers (GMIME_OBJECT (part));
				mime_part->raw_headers = g_hash_table_new (rspamd_strcase_hash,
						rspamd_strcase_equal);
				rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t) g_hash_table_unref,
					mime_part->raw_headers);
				if (hdrs != NULL) {
					process_raw_headers (task, mime_part->raw_headers,
							hdrs, strlen (hdrs));
					g_free (hdrs);
				}

				mime_part->type = type;
				mime_part->content = part_content;
				mime_part->parent = md->parent;
				mime_part->filename = g_mime_part_get_filename (GMIME_PART (
							part));
				mime_part->mime = part;

				debug_task ("found part with content-type: %s/%s",
					type->type,
					type->subtype);
				g_ptr_array_add (task->parts, mime_part);
				/* Skip empty parts */
				process_text_part (task,
					part_content,
					type,
					mime_part,
					md->parent,
					(part_content->len <= 0));
			}
			else {
				msg_warn_task ("write to stream failed: %d, %s", errno,
					strerror (errno));
			}
#ifndef GMIME24
			g_object_unref (wrapper);
#endif
		}
		else {
			msg_warn_task ("cannot get wrapper for mime part, type of part: %s/%s",
				type->type,
				type->subtype);
		}
	}
	else {
		g_assert_not_reached ();
	}
}

static void
destroy_message (void *pointer)
{
	GMimeMessage *msg = pointer;

	g_object_unref (msg);
}

/* Creates message from various data using libmagic to detect type */
static void
rspamd_message_from_data (struct rspamd_task *task, GByteArray *data,
		GMimeStream *stream)
{
	GMimeMessage *message;
	GMimePart *part;
	GMimeDataWrapper *wrapper;
	GMimeContentType *ct = NULL;
	const char *mb = NULL;
	gchar *mid;

	g_assert (data != NULL);

	message = g_mime_message_new (TRUE);
	task->message = message;
	if (task->from_envelope) {
		g_mime_message_set_sender (task->message,
				rspamd_task_get_sender (task));
	}

	if (task->cfg->libs_ctx) {
		mb = magic_buffer (task->cfg->libs_ctx->libmagic,
				data->data,
				data->len);

		if (mb) {
			ct = g_mime_content_type_new_from_string (mb);
		}
	}

	msg_warn_task ("construct fake mime of type: %s", mb);

	part = g_mime_part_new ();

	if (ct != NULL) {
		g_mime_object_set_content_type (GMIME_OBJECT (part), ct);
		g_object_unref (ct);
	}

#ifdef GMIME24
	wrapper = g_mime_data_wrapper_new_with_stream (stream,
			GMIME_CONTENT_ENCODING_8BIT);
#else
	wrapper = g_mime_data_wrapper_new_with_stream (stream,
				GMIME_PART_ENCODING_8BIT);
#endif

	g_mime_part_set_content_object (part, wrapper);
	g_mime_message_set_mime_part (task->message, GMIME_OBJECT (part));
	/* Register destructors */
	rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) g_object_unref, wrapper);
	rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) g_object_unref, part);
	rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) destroy_message, task->message);

	/* Generate message ID */
	mid = g_mime_utils_generate_message_id ("localhost.localdomain");
	rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) g_free, mid);
	g_mime_message_set_message_id (task->message, mid);
	task->message_id = mid;
	task->queue_id = mid;

	/* Set headers for message */
	if (task->subject) {
		g_mime_message_set_subject (task->message, task->subject);
	}
}

gboolean
rspamd_message_parse (struct rspamd_task *task)
{
	GMimeMessage *message;
	GMimeParser *parser;
	GMimeStream *stream;
	GByteArray *tmp;
	GList *first, *cur;
	GMimeObject *parent;
	const GMimeContentType *ct;
	struct raw_header *rh;
	struct mime_text_part *p1, *p2;
	struct mime_foreach_data md;
	struct received_header *recv;
	gchar *url_str;
	const gchar *url_end, *p, *end;
	struct rspamd_url *subject_url;
	gsize len;
	goffset hdr_pos;
	gint rc, state = 0, diff, *pdiff;
	guint tw, dw;

	tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
	p = task->msg.begin;
	len = task->msg.len;

	/* Skip any space characters to avoid some bad messages to be unparsed */
	while (len > 0 && g_ascii_isspace (*p)) {
		p ++;
		len --;
	}

	tmp->data = (guint8 *)p;
	tmp->len = len;

	stream = g_mime_stream_mem_new_with_byte_array (tmp);
	/*
	 * This causes g_mime_stream not to free memory by itself as it is memory allocated by
	 * pool allocator
	 */
	g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);

	if (task->flags & RSPAMD_TASK_FLAG_MIME) {

		debug_task ("construct mime parser from string length %d",
				(gint) task->msg.len);
		/* create a new parser object to parse the stream */
		parser = g_mime_parser_new_with_stream (stream);

		/* parse the message from the stream */
		message = g_mime_parser_construct_message (parser);

		if (message == NULL) {
			if (!task->cfg->allow_raw_input) {
				msg_err_task ("cannot construct mime from stream");
				g_set_error (&task->err,
						rspamd_message_quark (),
						RSPAMD_FILTER_ERROR, \

						"cannot parse MIME in the message");
				/* TODO: backport to 0.9 */
				g_object_unref (parser);
				return FALSE;
			}
			else {
				task->flags &= ~RSPAMD_TASK_FLAG_MIME;
				rspamd_message_from_data (task, tmp, stream);
			}
		}
		else {
			GString str;

			task->message = message;
			rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t) destroy_message, task->message);
			str.str = tmp->data;
			str.len = tmp->len;

			hdr_pos = rspamd_string_find_eoh (&str);

			if (hdr_pos > 0 && hdr_pos < tmp->len) {
				task->raw_headers_content.begin = (gchar *) (p);
				task->raw_headers_content.len = (guint64) (hdr_pos);

				if (task->raw_headers_content.len > 0) {
					process_raw_headers (task, task->raw_headers,
							task->raw_headers_content.begin,
							task->raw_headers_content.len);
				}
			}
		}

		/* free the parser (and the stream) */
		g_object_unref (stream);
		g_object_unref (parser);
	}
	else {
		task->flags &= ~RSPAMD_TASK_FLAG_MIME;
		rspamd_message_from_data (task, tmp, stream);
		g_object_unref (stream);
	}


	/* Save message id for future use */
	task->message_id = g_mime_message_get_message_id (task->message);
	if (task->message_id == NULL) {
		task->message_id = "undef";
	}

	memset (&md, 0, sizeof (md));
	md.task = task;
#ifdef GMIME24
	g_mime_message_foreach (task->message, mime_foreach_callback, &md);
#else
	/*
	 * This is rather strange, but gmime 2.2 do NOT pass top-level part to foreach callback
	 * so we need to set up parent part by hands
	 */
	md.parent = g_mime_message_get_mime_part (task->message);
	g_object_unref (md.parent);
	g_mime_message_foreach_part (task->message, mime_foreach_callback, &md);
#endif

	debug_task ("found %ud parts in message", task->parts->len);
	if (task->queue_id == NULL) {
		task->queue_id = "undef";
	}

	rspamd_images_process (task);

	/* Parse received headers */
	first =
			rspamd_message_get_header (task, "Received", FALSE);
	cur = first;
	while (cur) {
		recv =
				rspamd_mempool_alloc0 (task->task_pool,
						sizeof (struct received_header));
		parse_recv_header (task->task_pool, cur->data, recv);
		g_ptr_array_add (task->received, recv);
		cur = g_list_next (cur);
	}

	/* Extract data from received header if we were not given IP */
	if (task->received->len > 0 && (task->flags & RSPAMD_TASK_FLAG_NO_IP)) {
		recv = g_ptr_array_index (task->received, 0);
		if (recv->real_ip) {
			if (!rspamd_parse_inet_address (&task->from_addr,
					recv->real_ip,
					0)) {
				msg_warn_task ("cannot get IP from received header: '%s'",
						recv->real_ip);
				task->from_addr = NULL;
			}
		}
		if (recv->real_hostname) {
			task->hostname = recv->real_hostname;
		}
	}

	if (task->from_envelope == NULL) {
		first = rspamd_message_get_header (task, "Return-Path", FALSE);

		if (first) {
			rh = first->data;
			task->from_envelope = internet_address_list_parse_string (rh->value);
			if (task->from_envelope) {
#ifdef GMIME24
				rspamd_mempool_add_destructor (task->task_pool,
						(rspamd_mempool_destruct_t) g_object_unref,
						task->from_envelope);
#else
				rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t) internet_address_list_destroy,
					task->from_envelope);
#endif
			}
		}
	}

	if (task->deliver_to == NULL) {
		first = rspamd_message_get_header (task, "Delivered-To", FALSE);

		if (first) {
			rh = first->data;
			task->deliver_to = rspamd_mempool_strdup (task->task_pool, rh->decoded);
		}
	}

	/* Set mime recipients and sender for the task */
	task->rcpt_mime = g_mime_message_get_all_recipients (task->message);
	if (task->rcpt_mime) {
#ifdef GMIME24
		rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) g_object_unref,
			task->rcpt_mime);
#else
		rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t) internet_address_list_destroy,
			task->rcpt_mime);
#endif
	}
	first = rspamd_message_get_header (task, "From", FALSE);

	if (first) {
		rh = first->data;
		task->from_mime = internet_address_list_parse_string (rh->value);
		if (task->from_mime) {
#ifdef GMIME24
			rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t) g_object_unref,
					task->from_mime);
#else
			rspamd_mempool_add_destructor (task->task_pool,
					(rspamd_mempool_destruct_t) internet_address_list_destroy,
					task->from_mime);
#endif
		}
	}

	/* Parse urls inside Subject header */
	cur = rspamd_message_get_header (task, "Subject", FALSE);
	if (cur) {
		p = cur->data;
		len = strlen (p);
		end = p + len;

		while (p < end) {
			/* Search to the end of url */
			if (rspamd_url_find (task->task_pool, p, end - p, NULL, &url_end,
				&url_str, FALSE, &state)) {
				if (url_str != NULL) {
					subject_url = rspamd_mempool_alloc0 (task->task_pool,
							sizeof (struct rspamd_url));
					rc = rspamd_url_parse (subject_url, url_str,
							strlen (url_str), task->task_pool);

					if ((rc == URI_ERRNO_OK) && subject_url->hostlen > 0) {
						if (subject_url->protocol != PROTOCOL_MAILTO) {
							if (!g_hash_table_lookup (task->urls, subject_url)) {
								g_hash_table_insert (task->urls,
										subject_url,
										subject_url);
							}
						}
					}
					else if (rc != URI_ERRNO_OK) {
						msg_info_task ("extract of url '%s' failed: %s",
								url_str,
								rspamd_url_strerror (rc));
					}
				}
			}
			else {
				break;
			}
			p = url_end + 1;
		}
	}

	/* Calculate distance for 2-parts messages */
	if (task->text_parts->len == 2) {
		p1 = g_ptr_array_index (task->text_parts, 0);
		p2 = g_ptr_array_index (task->text_parts, 1);

		/* First of all check parent object */
		if (p1->parent && p1->parent == p2->parent) {
			parent = p1->parent;
			ct = g_mime_object_get_content_type (parent);
			if (ct == NULL ||
					!g_mime_content_type_is_type ((GMimeContentType *)ct,
							"multipart", "alternative")) {
				debug_task (
						"two parts are not belong to multipart/alternative container, skip check");
			}
			else {
				if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
						p1->normalized_words && p2->normalized_words) {

					tw = MAX (p1->normalized_words->len, p2->normalized_words->len);

					if (tw > 0) {
						dw = rspamd_words_levenshtein_distance (task,
								p1->normalized_words,
								p2->normalized_words);
						diff = (100.0 * (gdouble)(tw - dw) / (gdouble)tw);

						debug_task (
								"different words: %d, total words: %d, "
								"got likeliness between parts of %d%%",
								dw, tw,
								diff);

						pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
						*pdiff = diff;
						rspamd_mempool_set_variable (task->task_pool,
								"parts_distance",
								pdiff,
								NULL);
					}
				}
			}
		}
		else {
			debug_task (
					"message contains two parts but they are in different multi-parts");
		}
	}
	else {
		debug_task (
				"message has too many text parts, so do not try to compare "
				"them with each other");
	}

	if (task->queue_id) {
		msg_info_task ("loaded message; id: <%s>; queue-id: <%s>",
				task->message_id, task->queue_id);
	}
	else {
		msg_info_task ("loaded message; id: <%s>",
				task->message_id);
	}

	return TRUE;
}

GList *
rspamd_message_get_header (struct rspamd_task *task,
	const gchar *field,
	gboolean strong)
{
	GList *gret = NULL;
	struct raw_header *rh;

	rh = g_hash_table_lookup (task->raw_headers, field);

	if (rh == NULL) {
		return NULL;
	}

	while (rh) {
		if (strong) {
			if (strcmp (rh->name, field) == 0) {
				gret = g_list_prepend (gret, rh);
			}
		}
		else {
			gret = g_list_prepend (gret, rh);
		}
		rh = rh->next;
	}

	if (gret != NULL) {
		gret = g_list_reverse (gret);
		rspamd_mempool_add_destructor (task->task_pool,
			(rspamd_mempool_destruct_t)g_list_free, gret);
	}

	return gret;
}