about | summary | refs | log | tree | commit | diff | stats
path: root/contrib/snowball/compiler/driver.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-02-25 09:55:31 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-02-25 09:55:31 +0000
commit: b87995255fa2ef0de97d509b8cd27860f014e90f (patch)
tree: ff7fcc84aa85fcd4cd129d94f6fb23ac5f91d4cb /contrib/snowball/compiler/driver.c
parent: 52154a6c1dd7e46c174d4aab782494b92f955df5 (diff)
download: rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.tar.gz
download: rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.zip
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Diffstat (limited to 'contrib/snowball/compiler/driver.c')
-rw-r--r-- contrib/snowball/compiler/driver.c 466
1 files changed, 391 insertions, 75 deletions
diff --git a/contrib/snowball/compiler/driver.c b/contrib/snowball/compiler/driver.c
index fbb1e9cae..9cbe447a5 100644
--- a/contrib/snowball/compiler/driver.c
+++ b/contrib/snowball/compiler/driver.c
@@ -1,48 +1,86 @@
+#include <ctype.h> /* for toupper etc */
#include <stdio.h> /* for fprintf etc */
#include <stdlib.h> /* for free etc */
-#include <string.h> /* for strlen */
+#include <string.h> /* for strcmp */
#include "header.h"
-#define DEFAULT_PACKAGE "org.tartarus.snowball.ext"
-#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
-#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among"
-#define DEFAULT_STRING_CLASS "java.lang.StringBuilder"
+#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext"
+#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
+#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among"
+#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder"
+
+#define DEFAULT_GO_PACKAGE "snowball"
+#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go"
+
+#define DEFAULT_CS_NAMESPACE "Snowball"
+#define DEFAULT_CS_BASE_CLASS "Stemmer"
+#define DEFAULT_CS_AMONG_CLASS "Among"
+#define DEFAULT_CS_STRING_CLASS "StringBuilder"
+
+#define DEFAULT_JS_BASE_CLASS "BaseStemmer"
+
+#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer"
static int eq(const char * s1, const char * s2) {
- int s1_len = strlen(s1);
- int s2_len = strlen(s2);
- return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0;
+ return strcmp(s1, s2) == 0;
}
-static void print_arglist(void) {
- fprintf(stderr, "Usage: snowball <file> [options]\n\n"
- "options are: [-o[utput] file]\n"
- " [-s[yntax]]\n"
+static void print_arglist(int exit_code) {
+ FILE * f = exit_code ? stderr : stdout;
+ fprintf(f, "Usage: snowball SOURCE_FILE... [OPTIONS]\n\n"
+ "Supported options:\n"
+ " -o[utput] file\n"
+ " -s[yntax]\n"
+ " -comments\n"
#ifndef DISABLE_JAVA
- " [-j[ava]]\n"
+ " -j[ava]\n"
#endif
- " [-c++]\n"
- " [-w[idechars]]\n"
- " [-u[tf8]]\n"
- " [-n[ame] class name]\n"
- " [-ep[refix] string]\n"
- " [-vp[refix] string]\n"
- " [-i[nclude] directory]\n"
- " [-r[untime] path to runtime headers]\n"
-#ifndef DISABLE_JAVA
- " [-p[arentclassname] fully qualified parent class name]\n"
- " [-P[ackage] package name for stemmers]\n"
- " [-S[tringclass] StringBuffer-compatible class]\n"
- " [-a[mongclass] fully qualified name of the Among class]\n"
+#ifndef DISABLE_CSHARP
+ " -cs[harp]\n"
+#endif
+ " -c++\n"
+#ifndef DISABLE_PASCAL
+ " -pascal\n"
+#endif
+#ifndef DISABLE_PYTHON
+ " -py[thon]\n"
#endif
+#ifndef DISABLE_JS
+ " -js\n"
+#endif
+#ifndef DISABLE_RUST
+ " -rust\n"
+#endif
+#ifndef DISABLE_GO
+ " -go\n"
+#endif
+ " -w[idechars]\n"
+ " -u[tf8]\n"
+ " -n[ame] class name\n"
+ " -ep[refix] string\n"
+ " -vp[refix] string\n"
+ " -i[nclude] directory\n"
+ " -r[untime] path to runtime headers\n"
+ " -p[arentclassname] fully qualified parent class name\n"
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
+ " -P[ackage] package name for stemmers\n"
+ " -S[tringclass] StringBuffer-compatible class\n"
+ " -a[mongclass] fully qualified name of the Among class\n"
+#endif
+#ifndef DISABLE_GO
+ " -gop[ackage] Go package name for stemmers\n"
+ " -gor[untime] Go snowball runtime package\n"
+#endif
+ " --help display this help and exit\n"
+ " --version output version information and exit\n"
);
- exit(1);
+ exit(exit_code);
}
static void check_lim(int i, int argc) {
if (i >= argc) {
fprintf(stderr, "argument list is one short\n");
- print_arglist();
+ print_arglist(1);
}
}
@@ -57,35 +95,47 @@ static FILE * get_output(symbol * b) {
return output;
}
-static void read_options(struct options * o, int argc, char * argv[]) {
+static int read_options(struct options * o, int argc, char * argv[]) {
char * s;
- int i = 2;
+ int i = 1;
+ int new_argc = 1;
+ /* Note down the last option used to specify an explicit encoding so
+ * we can warn we ignored it for languages with a fixed encoding.
+ */
+ const char * encoding_opt = NULL;
/* set defaults: */
o->output_file = 0;
o->syntax_tree = false;
- o->externals_prefix = "";
+ o->comments = false;
+ o->externals_prefix = NULL;
o->variables_prefix = 0;
o->runtime_path = 0;
- o->parent_class_name = DEFAULT_BASE_CLASS;
- o->string_class = DEFAULT_STRING_CLASS;
- o->among_class = DEFAULT_AMONG_CLASS;
- o->package = DEFAULT_PACKAGE;
- o->name = "";
+ o->parent_class_name = NULL;
+ o->string_class = NULL;
+ o->among_class = NULL;
+ o->package = NULL;
+ o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME;
+ o->name = NULL;
o->make_lang = LANG_C;
- o->widechars = false;
o->includes = 0;
o->includes_end = 0;
- o->utf8 = false;
+ o->encoding = ENC_SINGLEBYTE;
/* read options: */
- repeat {
- if (i >= argc) break;
+ while (i < argc) {
s = argv[i++];
- { if (eq(s, "-o") || eq(s, "-output")) {
- check_lim(i, argc);
+ if (s[0] != '-') {
+ /* Non-option argument - shuffle down. */
+ argv[new_argc++] = s;
+ continue;
+ }
+
+ {
+ if (eq(s, "-o") || eq(s, "-output")) {
+ check_lim(i, argc);
o->output_file = argv[i++];
continue;
}
@@ -94,10 +144,33 @@ static void read_options(struct options * o, int argc, char * argv[]) {
o->name = argv[i++];
continue;
}
+#ifndef DISABLE_JS
+ if (eq(s, "-js")) {
+ o->make_lang = LANG_JAVASCRIPT;
+ continue;
+ }
+#endif
+#ifndef DISABLE_RUST
+ if (eq(s, "-rust")) {
+ o->make_lang = LANG_RUST;
+ continue;
+ }
+#endif
+#ifndef DISABLE_GO
+ if (eq(s, "-go")) {
+ o->make_lang = LANG_GO;
+ continue;
+ }
+#endif
#ifndef DISABLE_JAVA
if (eq(s, "-j") || eq(s, "-java")) {
o->make_lang = LANG_JAVA;
- o->widechars = true;
+ continue;
+ }
+#endif
+#ifndef DISABLE_CSHARP
+ if (eq(s, "-cs") || eq(s, "-csharp")) {
+ o->make_lang = LANG_CSHARP;
continue;
}
#endif
@@ -105,15 +178,31 @@ static void read_options(struct options * o, int argc, char * argv[]) {
o->make_lang = LANG_CPLUSPLUS;
continue;
}
+#ifndef DISABLE_PASCAL
+ if (eq(s, "-pascal")) {
+ o->make_lang = LANG_PASCAL;
+ continue;
+ }
+#endif
+#ifndef DISABLE_PYTHON
+ if (eq(s, "-py") || eq(s, "-python")) {
+ o->make_lang = LANG_PYTHON;
+ continue;
+ }
+#endif
if (eq(s, "-w") || eq(s, "-widechars")) {
- o->widechars = true;
- o->utf8 = false;
+ encoding_opt = s;
+ o->encoding = ENC_WIDECHARS;
continue;
}
if (eq(s, "-s") || eq(s, "-syntax")) {
o->syntax_tree = true;
continue;
}
+ if (eq(s, "-comments")) {
+ o->comments = true;
+ continue;
+ }
if (eq(s, "-ep") || eq(s, "-eprefix")) {
check_lim(i, argc);
o->externals_prefix = argv[i++];
@@ -145,16 +234,16 @@ static void read_options(struct options * o, int argc, char * argv[]) {
continue;
}
if (eq(s, "-u") || eq(s, "-utf8")) {
- o->utf8 = true;
- o->widechars = false;
+ encoding_opt = s;
+ o->encoding = ENC_UTF8;
continue;
}
-#ifndef DISABLE_JAVA
if (eq(s, "-p") || eq(s, "-parentclassname")) {
check_lim(i, argc);
o->parent_class_name = argv[i++];
continue;
}
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
if (eq(s, "-P") || eq(s, "-Package")) {
check_lim(i, argc);
o->package = argv[i++];
@@ -171,44 +260,216 @@ static void read_options(struct options * o, int argc, char * argv[]) {
continue;
}
#endif
+#ifndef DISABLE_GO
+ if (eq(s, "-gop") || eq(s, "-gopackage")) {
+ check_lim(i, argc);
+ o->package = argv[i++];
+ continue;
+ }
+ if (eq(s, "-gor") || eq(s, "-goruntime")) {
+ check_lim(i, argc);
+ o->go_snowball_runtime = argv[i++];
+ continue;
+ }
+#endif
+ if (eq(s, "--help")) {
+ print_arglist(0);
+ }
+
+ if (eq(s, "--version")) {
+ printf("Snowball compiler version " SNOWBALL_VERSION "\n");
+ exit(0);
+ }
+
fprintf(stderr, "'%s' misplaced\n", s);
- print_arglist();
+ print_arglist(1);
}
}
+ if (new_argc == 1) {
+ fprintf(stderr, "no source files specified\n");
+ print_arglist(1);
+ }
+ argv[new_argc] = NULL;
+
+ /* Set language-dependent defaults. */
+ switch (o->make_lang) {
+ case LANG_C:
+ case LANG_CPLUSPLUS:
+ encoding_opt = NULL;
+ break;
+ case LANG_CSHARP:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_CS_BASE_CLASS;
+ if (!o->string_class)
+ o->string_class = DEFAULT_CS_STRING_CLASS;
+ if (!o->among_class)
+ o->among_class = DEFAULT_CS_AMONG_CLASS;
+ if (!o->package)
+ o->package = DEFAULT_CS_NAMESPACE;
+ break;
+ case LANG_GO:
+ o->encoding = ENC_UTF8;
+ if (!o->package)
+ o->package = DEFAULT_GO_PACKAGE;
+ break;
+ case LANG_JAVA:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_JAVA_BASE_CLASS;
+ if (!o->string_class)
+ o->string_class = DEFAULT_JAVA_STRING_CLASS;
+ if (!o->among_class)
+ o->among_class = DEFAULT_JAVA_AMONG_CLASS;
+ if (!o->package)
+ o->package = DEFAULT_JAVA_PACKAGE;
+ break;
+ case LANG_JAVASCRIPT:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_JS_BASE_CLASS;
+ break;
+ case LANG_PYTHON:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS;
+ break;
+ case LANG_RUST:
+ o->encoding = ENC_UTF8;
+ break;
+ default:
+ break;
+ }
+
+ if (encoding_opt) {
+ fprintf(stderr, "warning: %s only meaningful for C and C++\n",
+ encoding_opt);
+ }
+
+ if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) {
+ if (o->runtime_path) {
+ fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n");
+ }
+ if (o->externals_prefix) {
+ fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n");
+ }
+ }
+ if (!o->externals_prefix) o->externals_prefix = "";
+
+ if (!o->name && o->output_file) {
+ /* Default class name to basename of output_file - this is the standard
+ * convention for at least Java and C#.
+ */
+ const char * slash = strrchr(o->output_file, '/');
+ size_t len;
+ const char * leaf = (slash == NULL) ? o->output_file : slash + 1;
+
+ slash = strrchr(leaf, '\\');
+ if (slash != NULL) leaf = slash + 1;
+
+ {
+ const char * dot = strchr(leaf, '.');
+ len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf);
+ }
+
+ {
+ char * new_name = malloc(len + 1);
+ switch (o->make_lang) {
+ case LANG_CSHARP:
+ case LANG_PASCAL:
+ /* Upper case initial letter. */
+ memcpy(new_name, leaf, len);
+ new_name[0] = toupper(new_name[0]);
+ break;
+ case LANG_JAVASCRIPT:
+ case LANG_PYTHON: {
+ /* Upper case initial letter and change each
+ * underscore+letter or hyphen+letter to an upper case
+ * letter.
+ */
+ size_t i, j = 0;
+ int uc_next = true;
+ for (i = 0; i != len; ++i) {
+ unsigned char ch = leaf[i];
+ if (ch == '_' || ch == '-') {
+ uc_next = true;
+ } else {
+ if (uc_next) {
+ new_name[j] = toupper(ch);
+ uc_next = false;
+ } else {
+ new_name[j] = ch;
+ }
+ ++j;
+ }
+ }
+ len = j;
+ break;
+ }
+ default:
+ /* Just copy. */
+ memcpy(new_name, leaf, len);
+ break;
+ }
+ new_name[len] = '\0';
+ o->name = new_name;
+ }
+ }
+
+ return new_argc;
}
extern int main(int argc, char * argv[]) {
+ int i;
NEW(options, o);
- if (argc == 1) print_arglist();
- read_options(o, argc, argv);
+ argc = read_options(o, argc, argv);
{
- symbol * filename = add_s_to_b(0, argv[1]);
- char * file;
- symbol * u = get_input(filename, &file);
+ char * file = argv[1];
+ symbol * u = get_input(file);
if (u == 0) {
- fprintf(stderr, "Can't open input %s\n", argv[1]);
+ fprintf(stderr, "Can't open input %s\n", file);
exit(1);
}
{
struct tokeniser * t = create_tokeniser(u, file);
struct analyser * a = create_analyser(t);
- t->widechars = o->widechars;
+ struct input ** next_input_ptr = &(t->next);
+ a->encoding = t->encoding = o->encoding;
t->includes = o->includes;
- a->utf8 = t->utf8 = o->utf8;
+ /* If multiple source files are specified, set up the others to be
+ * read after the first in order, using the same mechanism as
+ * 'get' uses. */
+ for (i = 2; i != argc; ++i) {
+ NEW(input, q);
+ file = argv[i];
+ u = get_input(file);
+ if (u == 0) {
+ fprintf(stderr, "Can't open input %s\n", file);
+ exit(1);
+ }
+ q->p = u;
+ q->c = 0;
+ q->file = file;
+ q->file_needs_freeing = false;
+ q->line_number = 1;
+ *next_input_ptr = q;
+ next_input_ptr = &(q->next);
+ }
+ *next_input_ptr = NULL;
read_program(a);
if (t->error_count > 0) exit(1);
if (o->syntax_tree) print_program(a);
close_tokeniser(t);
- unless (o->syntax_tree) {
+ if (!o->syntax_tree) {
struct generator * g;
- char * s = o->output_file;
- unless (s) {
+ const char * s = o->output_file;
+ if (!s) {
fprintf(stderr, "Please include the -o option\n");
- print_arglist();
- exit(1);
+ print_arglist(1);
}
+ g = create_generator(a, o);
if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".h");
@@ -217,41 +478,96 @@ extern int main(int argc, char * argv[]) {
if (o->make_lang == LANG_CPLUSPLUS) {
b = add_s_to_b(b, "c");
}
- o->output_c = get_output(b);
+ o->output_src = get_output(b);
lose_b(b);
- g = create_generator_c(a, o);
generate_program_c(g);
- close_generator_c(g);
- fclose(o->output_c);
+ fclose(o->output_src);
fclose(o->output_h);
}
#ifndef DISABLE_JAVA
if (o->make_lang == LANG_JAVA) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".java");
- o->output_java = get_output(b);
+ o->output_src = get_output(b);
lose_b(b);
- g = create_generator_java(a, o);
generate_program_java(g);
- close_generator_java(g);
- fclose(o->output_java);
+ fclose(o->output_src);
}
#endif
+#ifndef DISABLE_PASCAL
+ if (o->make_lang == LANG_PASCAL) {
+ symbol *b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".pas");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_pascal(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_PYTHON
+ if (o->make_lang == LANG_PYTHON) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".py");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_python(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_JS
+ if (o->make_lang == LANG_JAVASCRIPT) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".js");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_js(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_CSHARP
+ if (o->make_lang == LANG_CSHARP) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".cs");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_csharp(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_RUST
+ if (o->make_lang == LANG_RUST) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".rs");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_rust(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_GO
+ if (o->make_lang == LANG_GO) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".go");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_go(g);
+ fclose(o->output_src);
+ }
+#endif
+ close_generator(g);
}
close_analyser(a);
}
lose_b(u);
- lose_b(filename);
}
{ struct include * p = o->includes;
- until (p == 0)
- { struct include * q = p->next;
+ while (p) {
+ struct include * q = p->next;
lose_b(p->b); FREE(p); p = q;
}
}
FREE(o);
- unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count);
+ if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count);
return 0;
}
-