aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- src/plugins/chartable.c 27
1 files changed, 17 insertions, 10 deletions
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 892815a5c..67ade942d 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -26,6 +26,8 @@
#include "libmime/message.h"
#include "rspamd.h"
#include "libstat/stat_api.h"
+#include "unicode/utf8.h"
+#include "unicode/uchar.h"
#define DEFAULT_SYMBOL "R_MIXED_CHARSET"
#define DEFAULT_URL_SYMBOL "R_MIXED_CHARSET_URL"
@@ -170,9 +172,9 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
{
const gchar *p, *end, *c;
gdouble badness = 0.0;
- gunichar uc;
- gint sc, last_sc;
- guint same_script_count = 0, nsym = 0;
+ UChar32 uc;
+ UBlockCode sc, last_sc;
+ guint same_script_count = 0, nsym = 0, i = 0;
enum {
start_process = 0,
got_alpha,
@@ -187,16 +189,21 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
/* We assume that w is normalized */
- while (p < end) {
- uc = g_utf8_get_char (p);
+ while (p + i < end) {
+ U8_NEXT_UNSAFE (p, i, uc);
+
+ if (u_isalpha (uc)) {
+ sc = ublock_getCode (uc);
- if (g_unichar_isalpha (uc)) {
- sc = g_unichar_get_script (uc);
+ if (sc <= UBLOCK_LATIN_EXTENDED_B) {
+ /* Assume all latin characters as basic latin */
+ sc = UBLOCK_BASIC_LATIN;
+ }
if (state == got_digit) {
/* Penalize digit -> alpha translations */
- if (!is_url && sc != G_UNICODE_SCRIPT_COMMON &&
- sc != G_UNICODE_SCRIPT_LATIN && prev_state != start_process) {
+ if (!is_url && sc != UBLOCK_BASIC_LATIN &&
+ prev_state != start_process) {
badness += 1.0;
}
}
@@ -222,7 +229,7 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
state = got_alpha;
}
- else if (g_unichar_isdigit (uc)) {
+ else if (u_isdigit (uc)) {
if (state != got_digit) {
prev_state = state;
}