diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-25 09:55:31 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-25 09:55:31 +0000 |
commit | b87995255fa2ef0de97d509b8cd27860f014e90f (patch) | |
tree | ff7fcc84aa85fcd4cd129d94f6fb23ac5f91d4cb /contrib/snowball/runtime | |
parent | 52154a6c1dd7e46c174d4aab782494b92f955df5 (diff) | |
download | rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.tar.gz rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.zip |
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Diffstat (limited to 'contrib/snowball/runtime')
-rw-r--r-- | contrib/snowball/runtime/api.c | 10 | ||||
-rw-r--r-- | contrib/snowball/runtime/api.h | 10 | ||||
-rw-r--r-- | contrib/snowball/runtime/header.h | 3 | ||||
-rw-r--r-- | contrib/snowball/runtime/utilities.c | 191 |
4 files changed, 119 insertions, 95 deletions
diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c index 40039ef4a..21ea5f246 100644 --- a/contrib/snowball/runtime/api.c +++ b/contrib/snowball/runtime/api.c @@ -2,7 +2,7 @@ #include <stdlib.h> /* for calloc, free */ #include "header.h" -extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) +extern struct SN_env * SN_create_env(int S_size, int I_size) { struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); if (z == NULL) return NULL; @@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) if (z->I == NULL) goto error; } - if (B_size) - { - z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char)); - if (z->B == NULL) goto error; - } - return z; error: SN_close_env(z, S_size); @@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size) free(z->S); } free(z->I); - free(z->B); if (z->p) lose_s(z->p); free(z); } @@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s) z->c = 0; return err; } - diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h index 8b997f0c2..ba9d1c14b 100644 --- a/contrib/snowball/runtime/api.h +++ b/contrib/snowball/runtime/api.h @@ -16,11 +16,17 @@ struct SN_env { int c; int l; int lb; int bra; int ket; symbol * * S; int * I; - unsigned char * B; }; -extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); +#ifdef __cplusplus +extern "C" { +#endif + +extern struct SN_env * SN_create_env(int S_size, int I_size); extern void SN_close_env(struct SN_env * z, int S_size); extern int SN_set_current(struct SN_env * z, int size, const symbol * s); +#ifdef __cplusplus +} +#endif diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h index 4d3078f50..85a42fdb8 100644 --- a/contrib/snowball/runtime/header.h +++ b/contrib/snowball/runtime/header.h @@ -54,5 +54,6 @@ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); extern symbol * slice_to(struct SN_env * z, symbol * p); extern symbol * assign_to(struct SN_env * z, symbol * p); -extern void debug(struct SN_env * z, int number, int line_count); +extern int len_utf8(const symbol * p); +extern void debug(struct SN_env * z, int number, int line_count); diff --git a/contrib/snowball/runtime/utilities.c b/contrib/snowball/runtime/utilities.c index 1840f0280..1cfd1a17d 100644 --- a/contrib/snowball/runtime/utilities.c +++ b/contrib/snowball/runtime/utilities.c @@ -5,8 +5,6 @@ #include "header.h" -#define unless(C) if(!(C)) - #define CREATE_SIZE 1 extern symbol * create_s(void) { @@ -15,7 +13,7 @@ extern symbol * create_s(void) { if (mem == NULL) return NULL; p = (symbol *) (HEAD + (char *) mem); CAPACITY(p) = CREATE_SIZE; - SET_SIZE(p, CREATE_SIZE); + SET_SIZE(p, 0); return p; } @@ -27,7 +25,7 @@ extern void lose_s(symbol * p) { /* new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new - position, or 0 on failure. + position, or -1 on failure. -- used to implement hop and next in the utf8 case. */ @@ -66,77 +64,95 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { /* Code for character groupings: utf8 cases */ static int get_utf8(const symbol * p, int c, int l, int * slot) { - int b0, b1; + int b0, b1, b2; if (c >= l) return 0; b0 = p[c++]; if (b0 < 0xC0 || c == l) { /* 1100 0000 */ - * slot = b0; return 1; + *slot = b0; + return 1; } - b1 = p[c++]; + b1 = p[c++] & 0x3F; if (b0 < 0xE0 || c == l) { /* 1110 0000 */ - * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; + *slot = (b0 & 0x1F) << 6 | b1; + return 2; + } + b2 = p[c++] & 0x3F; + if (b0 < 0xF0 || c == l) { /* 1111 0000 */ + *slot = (b0 & 0xF) << 12 | b1 << 6 | b2; + return 3; } - * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3; + *slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F); + return 4; } static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { - int b0, b1; + int a, b; if (c <= lb) return 0; - b0 = p[--c]; - if (b0 < 0x80 || c == lb) { /* 1000 0000 */ - * slot = b0; return 1; + b = p[--c]; + if (b < 0x80 || c == lb) { /* 1000 0000 */ + *slot = b; + return 1; } - b1 = p[--c]; - if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */ - * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2; + a = b & 0x3F; + b = p[--c]; + if (b >= 0xC0 || c == lb) { /* 1100 0000 */ + *slot = (b & 0x1F) << 6 | a; + return 2; } - * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3; + a |= (b & 0x3F) << 6; + b = p[--c]; + if (b >= 0xE0 || c == lb) { /* 1110 0000 */ + *slot = (b & 0xF) << 12 | a; + return 3; + } + *slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a; + return 4; } extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_utf8(z->p, z->c, z->l, & ch); - unless (w) return -1; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c += w; + int ch; + int w = get_utf8(z->p, z->c, z->l, & ch); + if (!w) return -1; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c += w; } while (repeat); return 0; } extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_b_utf8(z->p, z->c, z->lb, & ch); - unless (w) return -1; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c -= w; + int ch; + int w = get_b_utf8(z->p, z->c, z->lb, & ch); + if (!w) return -1; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c -= w; } while (repeat); return 0; } extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_utf8(z->p, z->c, z->l, & ch); - unless (w) return -1; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c += w; + int ch; + int w = get_utf8(z->p, z->c, z->l, & ch); + if (!w) return -1; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return w; + z->c += w; } while (repeat); return 0; } extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_b_utf8(z->p, z->c, z->lb, & ch); - unless (w) return -1; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c -= w; + int ch; + int w = get_b_utf8(z->p, z->c, z->lb, & ch); + if (!w) return -1; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return w; + z->c -= w; } while (repeat); return 0; } @@ -145,48 +161,48 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c >= z->l) return -1; - ch = z->p[z->c]; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c++; + int ch; + if (z->c >= z->l) return -1; + ch = z->p[z->c]; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c++; } while (repeat); return 0; } extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c <= z->lb) return -1; - ch = z->p[z->c - 1]; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c--; + int ch; + if (z->c <= z->lb) return -1; + ch = z->p[z->c - 1]; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c--; } while (repeat); return 0; } extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c >= z->l) return -1; - ch = z->p[z->c]; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c++; + int ch; + if (z->c >= z->l) return -1; + ch = z->p[z->c]; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return 1; + z->c++; } while (repeat); return 0; } extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c <= z->lb) return -1; - ch = z->p[z->c - 1]; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c--; + int ch; + if (z->c <= z->lb) return -1; + ch = z->p[z->c - 1]; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return 1; + z->c--; } while (repeat); return 0; } @@ -215,7 +231,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { int j = v_size; int c = z->c; int l = z->l; - symbol * q = z->p + c; + const symbol * q = z->p + c; const struct among * w; @@ -224,7 +240,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { int first_key_inspected = 0; - while(1) { + while (1) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; /* smaller */ @@ -237,8 +253,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { common++; } } - if (diff < 0) { j = k; common_j = common; } - else { i = k; common_i = common; } + if (diff < 0) { + j = k; + common_j = common; + } else { + i = k; + common_i = common; + } if (j - i <= 1) { if (i > 0) break; /* v->s has been inspected */ if (j == i) break; /* only one item in v */ @@ -251,7 +272,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { first_key_inspected = 1; } } - while(1) { + while (1) { w = v + i; if (common_i >= w->s_size) { z->c = c + w->s_size; @@ -275,7 +296,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { int j = v_size; int c = z->c; int lb = z->lb; - symbol * q = z->p + c - 1; + const symbol * q = z->p + c - 1; const struct among * w; @@ -284,7 +305,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { int first_key_inspected = 0; - while(1) { + while (1) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; @@ -306,7 +327,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { first_key_inspected = 1; } } - while(1) { + while (1) { w = v + i; if (common_i >= w->s_size) { z->c = c - w->s_size; @@ -367,11 +388,10 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const z->l += adjustment; if (z->c >= c_ket) z->c += adjustment; - else - if (z->c > c_bra) - z->c = c_bra; + else if (z->c > c_bra) + z->c = c_bra; } - unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); + if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); if (adjptr != NULL) *adjptr = adjustment; return 0; @@ -417,12 +437,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo } extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { - int adjustment; - if (replace_s(z, bra, ket, SIZE(p), p, &adjustment)) - return -1; - if (bra <= z->bra) z->bra += adjustment; - if (bra <= z->ket) z->ket += adjustment; - return 0; + return insert_s(z, bra, ket, SIZE(p), p); } extern symbol * slice_to(struct SN_env * z, symbol * p) { @@ -455,6 +470,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) { return p; } +extern int len_utf8(const symbol * p) { + int size = SIZE(p); + int len = 0; + while (size--) { + symbol b = *p++; + if (b >= 0xC0 || b < 0x80) ++len; + } + return len; +} + #if 0 extern void debug(struct SN_env * z, int number, int line_count) { int i; |