summaryrefslogtreecommitdiffstats
path: root/contrib/snowball/examples/stemwords.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/snowball/examples/stemwords.c')
-rw-r--r--contrib/snowball/examples/stemwords.c209
1 files changed, 209 insertions, 0 deletions
diff --git a/contrib/snowball/examples/stemwords.c b/contrib/snowball/examples/stemwords.c
new file mode 100644
index 000000000..995bf40dc
--- /dev/null
+++ b/contrib/snowball/examples/stemwords.c
@@ -0,0 +1,209 @@
+/* This is a simple program which uses libstemmer to provide a command
+ * line interface for stemming using any of the algorithms provided.
+ */
+
+#include <stdio.h>
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for memmove */
+#include <ctype.h> /* for isupper, tolower */
+
+#include "libstemmer.h"
+
+const char * progname;
+static int pretty = 1;
+
+static void
+stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
+{
+#define INC 10
+ int lim = INC;
+ sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
+
+ while(1) {
+ int ch = getc(f_in);
+ if (ch == EOF) {
+ free(b); return;
+ }
+ {
+ int i = 0;
+ int inlen = 0;
+ while(1) {
+ if (ch == '\n' || ch == EOF) break;
+ if (i == lim) {
+ sb_symbol * newb;
+ newb = (sb_symbol *)
+ realloc(b, (lim + INC) * sizeof(sb_symbol));
+ if (newb == 0) goto error;
+ b = newb;
+ lim = lim + INC;
+ }
+ /* Update count of utf-8 characters. */
+ if (ch < 0x80 || ch > 0xBF) inlen += 1;
+ /* force lower case: */
+ if (isupper(ch)) ch = tolower(ch);
+
+ b[i] = ch;
+ i++;
+ ch = getc(f_in);
+ }
+
+ {
+ const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
+ if (stemmed == NULL)
+ {
+ fprintf(stderr, "Out of memory");
+ exit(1);
+ }
+ else
+ {
+ if (pretty == 1) {
+ fwrite(b, i, 1, f_out);
+ fputs(" -> ", f_out);
+ } else if (pretty == 2) {
+ fwrite(b, i, 1, f_out);
+ if (sb_stemmer_length(stemmer) > 0) {
+ int j;
+ if (inlen < 30) {
+ for (j = 30 - inlen; j > 0; j--)
+ fputs(" ", f_out);
+ } else {
+ fputs("\n", f_out);
+ for (j = 30; j > 0; j--)
+ fputs(" ", f_out);
+ }
+ }
+ }
+
+ fputs((const char *)stemmed, f_out);
+ putc('\n', f_out);
+ }
+ }
+ }
+ }
+error:
+ if (b != 0) free(b);
+ return;
+}
+
+/** Display the command line syntax, and then exit.
+ * @param n The value to exit with.
+ */
+static void
+usage(int n)
+{
+ printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
+ "\n"
+ "The input file consists of a list of words to be stemmed, one per\n"
+ "line. Words should be in lower case, but (for English) A-Z letters\n"
+ "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
+ "used.\n"
+ "\n"
+ "If -c is given, the argument is the character encoding of the input\n"
+ "and output files. If it is omitted, the UTF-8 encoding is used.\n"
+ "\n"
+ "If -p is given the output file consists of each word of the input\n"
+ "file followed by \"->\" followed by its stemmed equivalent.\n"
+ "If -p2 is given the output file is a two column layout containing\n"
+ "the input words in the first column and the stemmed equivalents in\n"
+ "the second column.\n"
+ "Otherwise, the output file consists of the stemmed words, one per\n"
+ "line.\n"
+ "\n"
+ "-h displays this help\n",
+ progname);
+ exit(n);
+}
+
+int
+main(int argc, char * argv[])
+{
+ char * in = 0;
+ char * out = 0;
+ FILE * f_in;
+ FILE * f_out;
+ struct sb_stemmer * stemmer;
+
+ char * language = "english";
+ char * charenc = NULL;
+
+ char * s;
+ int i = 1;
+ pretty = 0;
+
+ progname = argv[0];
+
+ while(i < argc) {
+ s = argv[i++];
+ if (s[0] == '-') {
+ if (strcmp(s, "-o") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ out = argv[i++];
+ } else if (strcmp(s, "-i") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ in = argv[i++];
+ } else if (strcmp(s, "-l") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ language = argv[i++];
+ } else if (strcmp(s, "-c") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ charenc = argv[i++];
+ } else if (strcmp(s, "-p2") == 0) {
+ pretty = 2;
+ } else if (strcmp(s, "-p") == 0) {
+ pretty = 1;
+ } else if (strcmp(s, "-h") == 0) {
+ usage(0);
+ } else {
+ fprintf(stderr, "option %s unknown\n", s);
+ usage(1);
+ }
+ } else {
+ fprintf(stderr, "unexpected parameter %s\n", s);
+ usage(1);
+ }
+ }
+
+ /* prepare the files */
+ f_in = (in == 0) ? stdin : fopen(in, "r");
+ if (f_in == 0) {
+ fprintf(stderr, "file %s not found\n", in);
+ exit(1);
+ }
+ f_out = (out == 0) ? stdout : fopen(out, "w");
+ if (f_out == 0) {
+ fprintf(stderr, "file %s cannot be opened\n", out);
+ exit(1);
+ }
+
+ /* do the stemming process: */
+ stemmer = sb_stemmer_new(language, charenc);
+ if (stemmer == 0) {
+ if (charenc == NULL) {
+ fprintf(stderr, "language `%s' not available for stemming\n", language);
+ exit(1);
+ } else {
+ fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
+ exit(1);
+ }
+ }
+ stem_file(stemmer, f_in, f_out);
+ sb_stemmer_delete(stemmer);
+
+ if (in != 0) (void) fclose(f_in);
+ if (out != 0) (void) fclose(f_out);
+
+ return 0;
+}
+