diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-25 09:55:31 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-25 09:55:31 +0000 |
commit | b87995255fa2ef0de97d509b8cd27860f014e90f (patch) | |
tree | ff7fcc84aa85fcd4cd129d94f6fb23ac5f91d4cb /contrib/snowball/compiler/driver.c | |
parent | 52154a6c1dd7e46c174d4aab782494b92f955df5 (diff) | |
download | rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.tar.gz rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.zip |
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Diffstat (limited to 'contrib/snowball/compiler/driver.c')
-rw-r--r-- | contrib/snowball/compiler/driver.c | 466 |
1 files changed, 391 insertions, 75 deletions
diff --git a/contrib/snowball/compiler/driver.c b/contrib/snowball/compiler/driver.c index fbb1e9cae..9cbe447a5 100644 --- a/contrib/snowball/compiler/driver.c +++ b/contrib/snowball/compiler/driver.c @@ -1,48 +1,86 @@ +#include <ctype.h> /* for toupper etc */ #include <stdio.h> /* for fprintf etc */ #include <stdlib.h> /* for free etc */ -#include <string.h> /* for strlen */ +#include <string.h> /* for strcmp */ #include "header.h" -#define DEFAULT_PACKAGE "org.tartarus.snowball.ext" -#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram" -#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among" -#define DEFAULT_STRING_CLASS "java.lang.StringBuilder" +#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext" +#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram" +#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among" +#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder" + +#define DEFAULT_GO_PACKAGE "snowball" +#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go" + +#define DEFAULT_CS_NAMESPACE "Snowball" +#define DEFAULT_CS_BASE_CLASS "Stemmer" +#define DEFAULT_CS_AMONG_CLASS "Among" +#define DEFAULT_CS_STRING_CLASS "StringBuilder" + +#define DEFAULT_JS_BASE_CLASS "BaseStemmer" + +#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer" static int eq(const char * s1, const char * s2) { - int s1_len = strlen(s1); - int s2_len = strlen(s2); - return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0; + return strcmp(s1, s2) == 0; } -static void print_arglist(void) { - fprintf(stderr, "Usage: snowball <file> [options]\n\n" - "options are: [-o[utput] file]\n" - " [-s[yntax]]\n" +static void print_arglist(int exit_code) { + FILE * f = exit_code ? stderr : stdout; + fprintf(f, "Usage: snowball SOURCE_FILE... [OPTIONS]\n\n" + "Supported options:\n" + " -o[utput] file\n" + " -s[yntax]\n" + " -comments\n" #ifndef DISABLE_JAVA - " [-j[ava]]\n" + " -j[ava]\n" #endif - " [-c++]\n" - " [-w[idechars]]\n" - " [-u[tf8]]\n" - " [-n[ame] class name]\n" - " [-ep[refix] string]\n" - " [-vp[refix] string]\n" - " [-i[nclude] directory]\n" - " [-r[untime] path to runtime headers]\n" -#ifndef DISABLE_JAVA - " [-p[arentclassname] fully qualified parent class name]\n" - " [-P[ackage] package name for stemmers]\n" - " [-S[tringclass] StringBuffer-compatible class]\n" - " [-a[mongclass] fully qualified name of the Among class]\n" +#ifndef DISABLE_CSHARP + " -cs[harp]\n" +#endif + " -c++\n" +#ifndef DISABLE_PASCAL + " -pascal\n" +#endif +#ifndef DISABLE_PYTHON + " -py[thon]\n" #endif +#ifndef DISABLE_JS + " -js\n" +#endif +#ifndef DISABLE_RUST + " -rust\n" +#endif +#ifndef DISABLE_GO + " -go\n" +#endif + " -w[idechars]\n" + " -u[tf8]\n" + " -n[ame] class name\n" + " -ep[refix] string\n" + " -vp[refix] string\n" + " -i[nclude] directory\n" + " -r[untime] path to runtime headers\n" + " -p[arentclassname] fully qualified parent class name\n" +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) + " -P[ackage] package name for stemmers\n" + " -S[tringclass] StringBuffer-compatible class\n" + " -a[mongclass] fully qualified name of the Among class\n" +#endif +#ifndef DISABLE_GO + " -gop[ackage] Go package name for stemmers\n" + " -gor[untime] Go snowball runtime package\n" +#endif + " --help display this help and exit\n" + " --version output version information and exit\n" ); - exit(1); + exit(exit_code); } static void check_lim(int i, int argc) { if (i >= argc) { fprintf(stderr, "argument list is one short\n"); - print_arglist(); + print_arglist(1); } } @@ -57,35 +95,47 @@ static FILE * get_output(symbol * b) { return output; } -static void read_options(struct options * o, int argc, char * argv[]) { +static int read_options(struct options * o, int argc, char * argv[]) { char * s; - int i = 2; + int i = 1; + int new_argc = 1; + /* Note down the last option used to specify an explicit encoding so + * we can warn we ignored it for languages with a fixed encoding. + */ + const char * encoding_opt = NULL; /* set defaults: */ o->output_file = 0; o->syntax_tree = false; - o->externals_prefix = ""; + o->comments = false; + o->externals_prefix = NULL; o->variables_prefix = 0; o->runtime_path = 0; - o->parent_class_name = DEFAULT_BASE_CLASS; - o->string_class = DEFAULT_STRING_CLASS; - o->among_class = DEFAULT_AMONG_CLASS; - o->package = DEFAULT_PACKAGE; - o->name = ""; + o->parent_class_name = NULL; + o->string_class = NULL; + o->among_class = NULL; + o->package = NULL; + o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; + o->name = NULL; o->make_lang = LANG_C; - o->widechars = false; o->includes = 0; o->includes_end = 0; - o->utf8 = false; + o->encoding = ENC_SINGLEBYTE; /* read options: */ - repeat { - if (i >= argc) break; + while (i < argc) { s = argv[i++]; - { if (eq(s, "-o") || eq(s, "-output")) { - check_lim(i, argc); + if (s[0] != '-') { + /* Non-option argument - shuffle down. */ + argv[new_argc++] = s; + continue; + } + + { + if (eq(s, "-o") || eq(s, "-output")) { + check_lim(i, argc); o->output_file = argv[i++]; continue; } @@ -94,10 +144,33 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->name = argv[i++]; continue; } +#ifndef DISABLE_JS + if (eq(s, "-js")) { + o->make_lang = LANG_JAVASCRIPT; + continue; + } +#endif +#ifndef DISABLE_RUST + if (eq(s, "-rust")) { + o->make_lang = LANG_RUST; + continue; + } +#endif +#ifndef DISABLE_GO + if (eq(s, "-go")) { + o->make_lang = LANG_GO; + continue; + } +#endif #ifndef DISABLE_JAVA if (eq(s, "-j") || eq(s, "-java")) { o->make_lang = LANG_JAVA; - o->widechars = true; + continue; + } +#endif +#ifndef DISABLE_CSHARP + if (eq(s, "-cs") || eq(s, "-csharp")) { + o->make_lang = LANG_CSHARP; continue; } #endif @@ -105,15 +178,31 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->make_lang = LANG_CPLUSPLUS; continue; } +#ifndef DISABLE_PASCAL + if (eq(s, "-pascal")) { + o->make_lang = LANG_PASCAL; + continue; + } +#endif +#ifndef DISABLE_PYTHON + if (eq(s, "-py") || eq(s, "-python")) { + o->make_lang = LANG_PYTHON; + continue; + } +#endif if (eq(s, "-w") || eq(s, "-widechars")) { - o->widechars = true; - o->utf8 = false; + encoding_opt = s; + o->encoding = ENC_WIDECHARS; continue; } if (eq(s, "-s") || eq(s, "-syntax")) { o->syntax_tree = true; continue; } + if (eq(s, "-comments")) { + o->comments = true; + continue; + } if (eq(s, "-ep") || eq(s, "-eprefix")) { check_lim(i, argc); o->externals_prefix = argv[i++]; @@ -145,16 +234,16 @@ static void read_options(struct options * o, int argc, char * argv[]) { continue; } if (eq(s, "-u") || eq(s, "-utf8")) { - o->utf8 = true; - o->widechars = false; + encoding_opt = s; + o->encoding = ENC_UTF8; continue; } -#ifndef DISABLE_JAVA if (eq(s, "-p") || eq(s, "-parentclassname")) { check_lim(i, argc); o->parent_class_name = argv[i++]; continue; } +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) if (eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; @@ -171,44 +260,216 @@ static void read_options(struct options * o, int argc, char * argv[]) { continue; } #endif +#ifndef DISABLE_GO + if (eq(s, "-gop") || eq(s, "-gopackage")) { + check_lim(i, argc); + o->package = argv[i++]; + continue; + } + if (eq(s, "-gor") || eq(s, "-goruntime")) { + check_lim(i, argc); + o->go_snowball_runtime = argv[i++]; + continue; + } +#endif + if (eq(s, "--help")) { + print_arglist(0); + } + + if (eq(s, "--version")) { + printf("Snowball compiler version " SNOWBALL_VERSION "\n"); + exit(0); + } + fprintf(stderr, "'%s' misplaced\n", s); - print_arglist(); + print_arglist(1); } } + if (new_argc == 1) { + fprintf(stderr, "no source files specified\n"); + print_arglist(1); + } + argv[new_argc] = NULL; + + /* Set language-dependent defaults. */ + switch (o->make_lang) { + case LANG_C: + case LANG_CPLUSPLUS: + encoding_opt = NULL; + break; + case LANG_CSHARP: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_CS_BASE_CLASS; + if (!o->string_class) + o->string_class = DEFAULT_CS_STRING_CLASS; + if (!o->among_class) + o->among_class = DEFAULT_CS_AMONG_CLASS; + if (!o->package) + o->package = DEFAULT_CS_NAMESPACE; + break; + case LANG_GO: + o->encoding = ENC_UTF8; + if (!o->package) + o->package = DEFAULT_GO_PACKAGE; + break; + case LANG_JAVA: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; + if (!o->string_class) + o->string_class = DEFAULT_JAVA_STRING_CLASS; + if (!o->among_class) + o->among_class = DEFAULT_JAVA_AMONG_CLASS; + if (!o->package) + o->package = DEFAULT_JAVA_PACKAGE; + break; + case LANG_JAVASCRIPT: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_JS_BASE_CLASS; + break; + case LANG_PYTHON: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; + break; + case LANG_RUST: + o->encoding = ENC_UTF8; + break; + default: + break; + } + + if (encoding_opt) { + fprintf(stderr, "warning: %s only meaningful for C and C++\n", + encoding_opt); + } + + if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { + if (o->runtime_path) { + fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); + } + if (o->externals_prefix) { + fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n"); + } + } + if (!o->externals_prefix) o->externals_prefix = ""; + + if (!o->name && o->output_file) { + /* Default class name to basename of output_file - this is the standard + * convention for at least Java and C#. + */ + const char * slash = strrchr(o->output_file, '/'); + size_t len; + const char * leaf = (slash == NULL) ? o->output_file : slash + 1; + + slash = strrchr(leaf, '\\'); + if (slash != NULL) leaf = slash + 1; + + { + const char * dot = strchr(leaf, '.'); + len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); + } + + { + char * new_name = malloc(len + 1); + switch (o->make_lang) { + case LANG_CSHARP: + case LANG_PASCAL: + /* Upper case initial letter. */ + memcpy(new_name, leaf, len); + new_name[0] = toupper(new_name[0]); + break; + case LANG_JAVASCRIPT: + case LANG_PYTHON: { + /* Upper case initial letter and change each + * underscore+letter or hyphen+letter to an upper case + * letter. + */ + size_t i, j = 0; + int uc_next = true; + for (i = 0; i != len; ++i) { + unsigned char ch = leaf[i]; + if (ch == '_' || ch == '-') { + uc_next = true; + } else { + if (uc_next) { + new_name[j] = toupper(ch); + uc_next = false; + } else { + new_name[j] = ch; + } + ++j; + } + } + len = j; + break; + } + default: + /* Just copy. */ + memcpy(new_name, leaf, len); + break; + } + new_name[len] = '\0'; + o->name = new_name; + } + } + + return new_argc; } extern int main(int argc, char * argv[]) { + int i; NEW(options, o); - if (argc == 1) print_arglist(); - read_options(o, argc, argv); + argc = read_options(o, argc, argv); { - symbol * filename = add_s_to_b(0, argv[1]); - char * file; - symbol * u = get_input(filename, &file); + char * file = argv[1]; + symbol * u = get_input(file); if (u == 0) { - fprintf(stderr, "Can't open input %s\n", argv[1]); + fprintf(stderr, "Can't open input %s\n", file); exit(1); } { struct tokeniser * t = create_tokeniser(u, file); struct analyser * a = create_analyser(t); - t->widechars = o->widechars; + struct input ** next_input_ptr = &(t->next); + a->encoding = t->encoding = o->encoding; t->includes = o->includes; - a->utf8 = t->utf8 = o->utf8; + /* If multiple source files are specified, set up the others to be + * read after the first in order, using the same mechanism as + * 'get' uses. */ + for (i = 2; i != argc; ++i) { + NEW(input, q); + file = argv[i]; + u = get_input(file); + if (u == 0) { + fprintf(stderr, "Can't open input %s\n", file); + exit(1); + } + q->p = u; + q->c = 0; + q->file = file; + q->file_needs_freeing = false; + q->line_number = 1; + *next_input_ptr = q; + next_input_ptr = &(q->next); + } + *next_input_ptr = NULL; read_program(a); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); close_tokeniser(t); - unless (o->syntax_tree) { + if (!o->syntax_tree) { struct generator * g; - char * s = o->output_file; - unless (s) { + const char * s = o->output_file; + if (!s) { fprintf(stderr, "Please include the -o option\n"); - print_arglist(); - exit(1); + print_arglist(1); } + g = create_generator(a, o); if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".h"); @@ -217,41 +478,96 @@ extern int main(int argc, char * argv[]) { if (o->make_lang == LANG_CPLUSPLUS) { b = add_s_to_b(b, "c"); } - o->output_c = get_output(b); + o->output_src = get_output(b); lose_b(b); - g = create_generator_c(a, o); generate_program_c(g); - close_generator_c(g); - fclose(o->output_c); + fclose(o->output_src); fclose(o->output_h); } #ifndef DISABLE_JAVA if (o->make_lang == LANG_JAVA) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".java"); - o->output_java = get_output(b); + o->output_src = get_output(b); lose_b(b); - g = create_generator_java(a, o); generate_program_java(g); - close_generator_java(g); - fclose(o->output_java); + fclose(o->output_src); } #endif +#ifndef DISABLE_PASCAL + if (o->make_lang == LANG_PASCAL) { + symbol *b = add_s_to_b(0, s); + b = add_s_to_b(b, ".pas"); + o->output_src = get_output(b); + lose_b(b); + generate_program_pascal(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_PYTHON + if (o->make_lang == LANG_PYTHON) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".py"); + o->output_src = get_output(b); + lose_b(b); + generate_program_python(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_JS + if (o->make_lang == LANG_JAVASCRIPT) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".js"); + o->output_src = get_output(b); + lose_b(b); + generate_program_js(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_CSHARP + if (o->make_lang == LANG_CSHARP) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".cs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_csharp(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_RUST + if (o->make_lang == LANG_RUST) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".rs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_rust(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_GO + if (o->make_lang == LANG_GO) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".go"); + o->output_src = get_output(b); + lose_b(b); + generate_program_go(g); + fclose(o->output_src); + } +#endif + close_generator(g); } close_analyser(a); } lose_b(u); - lose_b(filename); } { struct include * p = o->includes; - until (p == 0) - { struct include * q = p->next; + while (p) { + struct include * q = p->next; lose_b(p->b); FREE(p); p = q; } } FREE(o); - unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count); + if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } - |