about | summary | refs | log | tree | commit | diff | stats
path: root/contrib/snowball/runtime
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-02-25 09:55:31 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-02-25 09:55:31 +0000
commit b87995255fa2ef0de97d509b8cd27860f014e90f (patch)
tree ff7fcc84aa85fcd4cd129d94f6fb23ac5f91d4cb /contrib/snowball/runtime
parent 52154a6c1dd7e46c174d4aab782494b92f955df5 (diff)
download rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.tar.gz
rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.zip
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Diffstat (limited to 'contrib/snowball/runtime')
-rw-r--r-- contrib/snowball/runtime/api.c 10
-rw-r--r-- contrib/snowball/runtime/api.h 10
-rw-r--r-- contrib/snowball/runtime/header.h 3
-rw-r--r-- contrib/snowball/runtime/utilities.c 191
4 files changed, 119 insertions, 95 deletions
diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c
index 40039ef4a..21ea5f246 100644
--- a/contrib/snowball/runtime/api.c
+++ b/contrib/snowball/runtime/api.c
@@ -2,7 +2,7 @@
#include <stdlib.h> /* for calloc, free */
#include "header.h"
-extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
+extern struct SN_env * SN_create_env(int S_size, int I_size)
{
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
if (z == NULL) return NULL;
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
if (z->I == NULL) goto error;
}
- if (B_size)
- {
- z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
- if (z->B == NULL) goto error;
- }
-
return z;
error:
SN_close_env(z, S_size);
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
free(z->S);
}
free(z->I);
- free(z->B);
if (z->p) lose_s(z->p);
free(z);
}
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
z->c = 0;
return err;
}
-
diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h
index 8b997f0c2..ba9d1c14b 100644
--- a/contrib/snowball/runtime/api.h
+++ b/contrib/snowball/runtime/api.h
@@ -16,11 +16,17 @@ struct SN_env {
int c; int l; int lb; int bra; int ket;
symbol * * S;
int * I;
- unsigned char * B;
};
-extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern struct SN_env * SN_create_env(int S_size, int I_size);
extern void SN_close_env(struct SN_env * z, int S_size);
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h
index 4d3078f50..85a42fdb8 100644
--- a/contrib/snowball/runtime/header.h
+++ b/contrib/snowball/runtime/header.h
@@ -54,5 +54,6 @@ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
extern symbol * slice_to(struct SN_env * z, symbol * p);
extern symbol * assign_to(struct SN_env * z, symbol * p);
-extern void debug(struct SN_env * z, int number, int line_count);
+extern int len_utf8(const symbol * p);
+extern void debug(struct SN_env * z, int number, int line_count);
diff --git a/contrib/snowball/runtime/utilities.c b/contrib/snowball/runtime/utilities.c
index 1840f0280..1cfd1a17d 100644
--- a/contrib/snowball/runtime/utilities.c
+++ b/contrib/snowball/runtime/utilities.c
@@ -5,8 +5,6 @@
#include "header.h"
-#define unless(C) if(!(C))
-
#define CREATE_SIZE 1
extern symbol * create_s(void) {
@@ -15,7 +13,7 @@ extern symbol * create_s(void) {
if (mem == NULL) return NULL;
p = (symbol *) (HEAD + (char *) mem);
CAPACITY(p) = CREATE_SIZE;
- SET_SIZE(p, CREATE_SIZE);
+ SET_SIZE(p, 0);
return p;
}
@@ -27,7 +25,7 @@ extern void lose_s(symbol * p) {
/*
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
- position, or 0 on failure.
+ position, or -1 on failure.
-- used to implement hop and next in the utf8 case.
*/
@@ -66,77 +64,95 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
/* Code for character groupings: utf8 cases */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
- int b0, b1;
+ int b0, b1, b2;
if (c >= l) return 0;
b0 = p[c++];
if (b0 < 0xC0 || c == l) { /* 1100 0000 */
- * slot = b0; return 1;
+ *slot = b0;
+ return 1;
}
- b1 = p[c++];
+ b1 = p[c++] & 0x3F;
if (b0 < 0xE0 || c == l) { /* 1110 0000 */
- * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
+ *slot = (b0 & 0x1F) << 6 | b1;
+ return 2;
+ }
+ b2 = p[c++] & 0x3F;
+ if (b0 < 0xF0 || c == l) { /* 1111 0000 */
+ *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
+ return 3;
}
- * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
+ *slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
+ return 4;
}
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
- int b0, b1;
+ int a, b;
if (c <= lb) return 0;
- b0 = p[--c];
- if (b0 < 0x80 || c == lb) { /* 1000 0000 */
- * slot = b0; return 1;
+ b = p[--c];
+ if (b < 0x80 || c == lb) { /* 1000 0000 */
+ *slot = b;
+ return 1;
}
- b1 = p[--c];
- if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
- * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
+ a = b & 0x3F;
+ b = p[--c];
+ if (b >= 0xC0 || c == lb) { /* 1100 0000 */
+ *slot = (b & 0x1F) << 6 | a;
+ return 2;
}
- * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
+ a |= (b & 0x3F) << 6;
+ b = p[--c];
+ if (b >= 0xE0 || c == lb) { /* 1110 0000 */
+ *slot = (b & 0xF) << 12 | a;
+ return 3;
+ }
+ *slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a;
+ return 4;
}
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_utf8(z->p, z->c, z->l, & ch);
- unless (w) return -1;
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c += w;
+ int ch;
+ int w = get_utf8(z->p, z->c, z->l, & ch);
+ if (!w) return -1;
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return w;
+ z->c += w;
} while (repeat);
return 0;
}
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_b_utf8(z->p, z->c, z->lb, & ch);
- unless (w) return -1;
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c -= w;
+ int ch;
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+ if (!w) return -1;
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return w;
+ z->c -= w;
} while (repeat);
return 0;
}
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_utf8(z->p, z->c, z->l, & ch);
- unless (w) return -1;
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c += w;
+ int ch;
+ int w = get_utf8(z->p, z->c, z->l, & ch);
+ if (!w) return -1;
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
+ return w;
+ z->c += w;
} while (repeat);
return 0;
}
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_b_utf8(z->p, z->c, z->lb, & ch);
- unless (w) return -1;
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c -= w;
+ int ch;
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+ if (!w) return -1;
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
+ return w;
+ z->c -= w;
} while (repeat);
return 0;
}
@@ -145,48 +161,48 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min,
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c >= z->l) return -1;
- ch = z->p[z->c];
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c++;
+ int ch;
+ if (z->c >= z->l) return -1;
+ ch = z->p[z->c];
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c++;
} while (repeat);
return 0;
}
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c <= z->lb) return -1;
- ch = z->p[z->c - 1];
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c--;
+ int ch;
+ if (z->c <= z->lb) return -1;
+ ch = z->p[z->c - 1];
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c--;
} while (repeat);
return 0;
}
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c >= z->l) return -1;
- ch = z->p[z->c];
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c++;
+ int ch;
+ if (z->c >= z->l) return -1;
+ ch = z->p[z->c];
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
+ return 1;
+ z->c++;
} while (repeat);
return 0;
}
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c <= z->lb) return -1;
- ch = z->p[z->c - 1];
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c--;
+ int ch;
+ if (z->c <= z->lb) return -1;
+ ch = z->p[z->c - 1];
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
+ return 1;
+ z->c--;
} while (repeat);
return 0;
}
@@ -215,7 +231,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;
int c = z->c; int l = z->l;
- symbol * q = z->p + c;
+ const symbol * q = z->p + c;
const struct among * w;
@@ -224,7 +240,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int first_key_inspected = 0;
- while(1) {
+ while (1) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; /* smaller */
@@ -237,8 +253,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
common++;
}
}
- if (diff < 0) { j = k; common_j = common; }
- else { i = k; common_i = common; }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
if (j - i <= 1) {
if (i > 0) break; /* v->s has been inspected */
if (j == i) break; /* only one item in v */
@@ -251,7 +272,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
first_key_inspected = 1;
}
}
- while(1) {
+ while (1) {
w = v + i;
if (common_i >= w->s_size) {
z->c = c + w->s_size;
@@ -275,7 +296,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;
int c = z->c; int lb = z->lb;
- symbol * q = z->p + c - 1;
+ const symbol * q = z->p + c - 1;
const struct among * w;
@@ -284,7 +305,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int first_key_inspected = 0;
- while(1) {
+ while (1) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
@@ -306,7 +327,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
first_key_inspected = 1;
}
}
- while(1) {
+ while (1) {
w = v + i;
if (common_i >= w->s_size) {
z->c = c - w->s_size;
@@ -367,11 +388,10 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const
z->l += adjustment;
if (z->c >= c_ket)
z->c += adjustment;
- else
- if (z->c > c_bra)
- z->c = c_bra;
+ else if (z->c > c_bra)
+ z->c = c_bra;
}
- unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
+ if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (adjptr != NULL)
*adjptr = adjustment;
return 0;
@@ -417,12 +437,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo
}
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
- int adjustment;
- if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
- return -1;
- if (bra <= z->bra) z->bra += adjustment;
- if (bra <= z->ket) z->ket += adjustment;
- return 0;
+ return insert_s(z, bra, ket, SIZE(p), p);
}
extern symbol * slice_to(struct SN_env * z, symbol * p) {
@@ -455,6 +470,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) {
return p;
}
+extern int len_utf8(const symbol * p) {
+ int size = SIZE(p);
+ int len = 0;
+ while (size--) {
+ symbol b = *p++;
+ if (b >= 0xC0 || b < 0x80) ++len;
+ }
+ return len;
+}
+
#if 0
extern void debug(struct SN_env * z, int number, int line_count) {
int i;