}
#endif
/* Ugly workaround */
- part->normalized_words = rspamd_tokenize_text (part->content->data,
- part->content->len, IS_PART_UTF (part), task->cfg,
- part->exceptions, FALSE,
- NULL);
+ if (IS_PART_HTML (part)) {
+ part->normalized_words = rspamd_tokenize_text (
+ part->content->data,
+ part->content->len, IS_PART_UTF (part), task->cfg,
+ part->exceptions, FALSE,
+ NULL);
+ }
+ else {
+ part->normalized_words = rspamd_tokenize_text (
+ part->stripped_content->data,
+ part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+ part->exceptions, FALSE,
+ NULL);
+ }
if (part->normalized_words) {
part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
enum {
skip_delimiters = 0,
feed_token,
- skip_exception,
process_signature
} state = skip_delimiters;
switch (state) {
case skip_delimiters:
if (ex != NULL && p - buf->begin == ex->pos) {
- if (ex->type == RSPAMD_EXCEPTION_URL) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- processed = token->len;
- }
- state = skip_exception;
- continue;
+ goto process_exception;
}
else if (u_isgraph (uc)) {
- if (!u_ispunct (uc)) {
+ if (u_isalnum (uc)) {
state = feed_token;
token->begin = p;
continue;
case feed_token:
if (ex != NULL && p - buf->begin == (gint)ex->pos) {
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
- goto set_token;
+ goto process_exception;
}
else if (!u_isgraph (uc) || u_ispunct (uc)) {
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
}
processed ++;
break;
- case skip_exception:
- *cur = p + ex->len;
- *exceptions = g_list_next (*exceptions);
- goto set_token;
- break;
case process_signature:
if (*p == '\r' || *p == '\n') {
msg_debug ("signature found: %*s", (gint)siglen, sig);
*cur = &s[i];
+ return TRUE;
+
+process_exception:
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token->begin = "!!EX!!";
+ token->len = sizeof ("!!EX!!") - 1;
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+ processed = token->len;
+ }
+ *cur = p + ex->len;
+ *exceptions = g_list_next (*exceptions);
+
+ if (rl) {
+ *rl = processed;
+ }
+
return TRUE;
}