summaryrefslogtreecommitdiffstats
path: root/contrib/snowball
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/snowball')
m---------contrib/snowball0
-rw-r--r--contrib/snowball/.gitignore5
-rw-r--r--contrib/snowball/.travis.yml4
-rw-r--r--contrib/snowball/AUTHORS27
-rw-r--r--contrib/snowball/CMakeLists.txt85
-rw-r--r--contrib/snowball/GNUmakefile300
-rw-r--r--contrib/snowball/README5
-rw-r--r--contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl91
-rw-r--r--contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl91
-rw-r--r--contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl164
-rw-r--r--contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl164
-rw-r--r--contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl229
-rw-r--r--contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl196
-rw-r--r--contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl248
-rw-r--r--contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl239
-rw-r--r--contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl139
-rw-r--r--contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl139
-rw-r--r--contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl145
-rw-r--r--contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl241
-rw-r--r--contrib/snowball/algorithms/hungarian/stem_Unicode.sbl241
-rw-r--r--contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl195
-rw-r--r--contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl195
-rw-r--r--contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl245
-rw-r--r--contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl208
-rw-r--r--contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl80
-rw-r--r--contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl80
-rw-r--r--contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl139
-rw-r--r--contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl218
-rw-r--r--contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl218
-rw-r--r--contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl236
-rw-r--r--contrib/snowball/algorithms/romanian/stem_Unicode.sbl236
-rw-r--r--contrib/snowball/algorithms/russian/stem_KOI8_R.sbl217
-rw-r--r--contrib/snowball/algorithms/russian/stem_Unicode.sbl215
-rw-r--r--contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl230
-rw-r--r--contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl230
-rw-r--r--contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl72
-rw-r--r--contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl72
-rw-r--r--contrib/snowball/algorithms/turkish/stem_Unicode.sbl477
-rw-r--r--contrib/snowball/compiler/analyser.c959
-rw-r--r--contrib/snowball/compiler/driver.c257
-rw-r--r--contrib/snowball/compiler/generator.c1465
-rw-r--r--contrib/snowball/compiler/generator_java.c1452
-rw-r--r--contrib/snowball/compiler/header.h324
-rw-r--r--contrib/snowball/compiler/space.c263
-rw-r--r--contrib/snowball/compiler/syswords.h84
-rw-r--r--contrib/snowball/compiler/syswords2.h13
-rw-r--r--contrib/snowball/compiler/tokeniser.c470
-rw-r--r--contrib/snowball/doc/TODO15
-rw-r--r--contrib/snowball/doc/libstemmer_c_README125
-rw-r--r--contrib/snowball/doc/libstemmer_java_README40
-rw-r--r--contrib/snowball/examples/stemwords.c209
-rw-r--r--contrib/snowball/include/libstemmer.h79
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/Among.java31
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java432
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java7
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/TestApp.java77
-rw-r--r--contrib/snowball/runtime/api.c66
-rw-r--r--contrib/snowball/runtime/api.h26
-rw-r--r--contrib/snowball/runtime/header.h58
-rw-r--r--contrib/snowball/runtime/utilities.c478
60 files changed, 13246 insertions, 0 deletions
diff --git a/contrib/snowball b/contrib/snowball
deleted file mode 160000
-Subproject c381f4fa958b59d41b0c596f0cbfe3ed48831e9
diff --git a/contrib/snowball/.gitignore b/contrib/snowball/.gitignore
new file mode 100644
index 000000000..2147da841
--- /dev/null
+++ b/contrib/snowball/.gitignore
@@ -0,0 +1,5 @@
+*.o
+/libstemmer
+/snowball
+/src_c
+/stemwords
diff --git a/contrib/snowball/.travis.yml b/contrib/snowball/.travis.yml
new file mode 100644
index 000000000..e576233dc
--- /dev/null
+++ b/contrib/snowball/.travis.yml
@@ -0,0 +1,4 @@
+language: c
+compiler: gcc
+# The checkout directory must match STEMMING_DATA in the GNUmakefile (../snowball-data),
+# otherwise `make check` cannot find the vocabulary/output files it compares against.
+before_script: git clone https://github.com/snowballstem/snowball-data ../snowball-data
+script: make check
diff --git a/contrib/snowball/AUTHORS b/contrib/snowball/AUTHORS
new file mode 100644
index 000000000..60eae6f04
--- /dev/null
+++ b/contrib/snowball/AUTHORS
@@ -0,0 +1,27 @@
+Authors
+=======
+
+Martin Porter
+-------------
+
+ - Designed the snowball language.
+ - Implemented the snowball to C compiler.
+ - Implemented the stemming algorithms in C.
+ - Wrote the documentation.
+
+Richard Boulton
+---------------
+
+ - Implemented Java backend of the snowball compiler.
+ - Developed build system.
+ - Assisted with website maintenance.
+
+
+Assistance from
+---------------
+
+Olivier Bornet - fixes to java packaging and build system.
+Andreas Jung - useful bug reports on the libstemmer library.
+Olly Betts - several patches, bug reports, and performance improvements.
+Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms.
+Ralf Junker - fix a potential memory leak in sb_stemmer_new().
diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt
new file mode 100644
index 000000000..1dee79490
--- /dev/null
+++ b/contrib/snowball/CMakeLists.txt
@@ -0,0 +1,85 @@
+PROJECT(snowball C)
+
+cmake_minimum_required(VERSION 2.8)
+
+INCLUDE(CheckCCompilerFlag)
+INCLUDE(FindPerl)
+
+# End of configuration
+SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian
+ italian norwegian porter portuguese romanian
+ russian spanish swedish turkish)
+SET(KOI8_ALGORITHMS russian)
+SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian
+ norwegian porter portuguese spanish swedish)
+SET(ISO_8859_2_ALGORITHMS hungarian romanian)
+SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins)
+SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS})
+
+SET(COMPILER_SOURCES compiler/space.c
+ compiler/tokeniser.c
+ compiler/analyser.c
+ compiler/generator.c
+ compiler/driver.c
+ compiler/generator_java.c)
+
+SET(SNOWBALL_RUNTIME runtime/api.c
+ runtime/utilities.c)
+SET(LIBSTEMMER_SOURCES libstemmer/libstemmer.c)
+SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c)
+#LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
+#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
+#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
+
+SET(STEMWORDS_SOURCES examples/stemwords.c)
+SET(MODULES_H "modules.h")
+CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY)
+
+# gen_stem(<algorithm list> <encoding>)
+# For each algorithm, add a rule that runs the snowball compiler to produce
+# libstemmer/stem_<encoding>_<algorithm>.c/.h in the build tree and append the
+# generated .c file to STEMMER_SOURCES.
+# For UTF_8 the input is stem_Unicode.sbl, falling back to stem_ISO_8859_1.sbl
+# when the algorithm ships no Unicode source. Algorithms with no usable input
+# are skipped.
+MACRO(gen_stem IN ENCODING)
+  FOREACH(_it ${IN})
+    SET(_base "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/stem_${ENCODING}_${_it}")
+    SET(_header "${_base}.h")
+    SET(_source "${_base}.c")
+    STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}")
+    SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl")
+    # Only UTF-8 stemmers are compiled in Unicode mode (-u). The single-byte
+    # encodings (KOI8_R, ISO_8859_*) must be generated without it, exactly as
+    # the GNUmakefile rules do.
+    SET(_utf8_flag "")
+    IF(${_in_enc} STREQUAL "Unicode")
+      SET(_utf8_flag "-u")
+      IF(NOT EXISTS "${_input}")
+        SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl")
+      ENDIF()
+    ENDIF()
+    IF(EXISTS "${_input}")
+      # The compiler writes both the .c and the .h; depend on the .sbl so that
+      # editing an algorithm regenerates its stemmer.
+      ADD_CUSTOM_COMMAND(OUTPUT ${_source} ${_header}
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball ${_input} -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ${CMAKE_CURRENT_SOURCE_DIR}/runtime ${_utf8_flag}
+        DEPENDS snowball ${_input})
+      LIST(APPEND STEMMER_SOURCES ${_source})
+    ENDIF()
+  ENDFOREACH()
+ENDMACRO()
+
+INCLUDE_DIRECTORIES("include")
+INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}/libstemmer")
+
+ADD_EXECUTABLE(snowball ${COMPILER_SOURCES})
+
+ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h
+ COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl -f ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h ${CMAKE_CURRENT_BINARY_DIR}/libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak)
+ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h")
+
+SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c")
+ADD_CUSTOM_TARGET(stemmer_deps ALL)
+ADD_DEPENDENCIES(stemmer_deps modules)
+
+gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8")
+gen_stem("${KOI8_ALGORITHMS}" "KOI8_R")
+gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1")
+gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2")
+
+
+ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES})
+ADD_DEPENDENCIES(stemmer stemmer_deps)
+
+ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES})
+TARGET_LINK_LIBRARIES(stemwords stemmer)
diff --git a/contrib/snowball/GNUmakefile b/contrib/snowball/GNUmakefile
new file mode 100644
index 000000000..a30cafd89
--- /dev/null
+++ b/contrib/snowball/GNUmakefile
@@ -0,0 +1,300 @@
+# -*- makefile -*-
+
+c_src_dir = src_c
+java_src_main_dir = java/org/tartarus/snowball
+java_src_dir = $(java_src_main_dir)/ext
+
+libstemmer_algorithms = danish dutch english finnish french german hungarian \
+ italian \
+ norwegian porter portuguese romanian \
+ russian spanish swedish turkish
+
+KOI8_R_algorithms = russian
+ISO_8859_1_algorithms = danish dutch english finnish french german italian \
+ norwegian porter portuguese spanish swedish
+ISO_8859_2_algorithms = hungarian romanian
+
+other_algorithms = german2 kraaij_pohlmann lovins
+
+all_algorithms = $(libstemmer_algorithms) $(other_algorithms)
+
+COMPILER_SOURCES = compiler/space.c \
+ compiler/tokeniser.c \
+ compiler/analyser.c \
+ compiler/generator.c \
+ compiler/driver.c \
+ compiler/generator_java.c
+COMPILER_HEADERS = compiler/header.h \
+ compiler/syswords.h \
+ compiler/syswords2.h
+
+RUNTIME_SOURCES = runtime/api.c \
+ runtime/utilities.c
+RUNTIME_HEADERS = runtime/api.h \
+ runtime/header.h
+
+JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
+ java/org/tartarus/snowball/SnowballProgram.java \
+ java/org/tartarus/snowball/SnowballStemmer.java \
+ java/org/tartarus/snowball/TestApp.java
+
+LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
+LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
+LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
+LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
+
+STEMWORDS_SOURCES = examples/stemwords.c
+
+ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
+C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
+ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
+ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \
+ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c)
+C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
+ $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \
+ $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \
+ $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h)
+C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
+C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
+JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
+
+COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
+RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
+LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o)
+LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o)
+STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o)
+C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o)
+C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
+JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
+JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)
+
+CFLAGS=-Iinclude -O2
+CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations
+
+all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS)
+
+clean:
+ rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
+ $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \
+ libstemmer.o stemwords \
+ libstemmer/modules.h \
+ libstemmer/modules_utf8.h \
+ snowball.splint \
+ $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
+ $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
+ $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
+ libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
+ libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
+ rm -rf dist
+ rmdir $(c_src_dir) || true
+
+snowball: $(COMPILER_OBJECTS)
+ $(CC) -o $@ $^
+
+$(COMPILER_OBJECTS): $(COMPILER_HEADERS)
+
+libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
+ sed 's/@MODULES_H@/modules.h/' $^ >$@
+
+libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
+ sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@
+
+libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt
+ libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak
+
+libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt
+ libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8
+
+libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)
+
+libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
+ $(AR) -cru $@ $^
+
+stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
+ $(CC) -o $@ $^
+
+algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
+ cp $^ $@
+
+$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball
+ @mkdir -p $(c_src_dir)
+ @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
+ o="$(c_src_dir)/stem_UTF_8_$${l}"; \
+ echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
+ ./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u
+
+$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball
+ @mkdir -p $(c_src_dir)
+ @l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \
+ o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
+ echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
+ ./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime
+
+$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball
+ @mkdir -p $(c_src_dir)
+ @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \
+ o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
+ echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
+ ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime
+
+$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball
+ @mkdir -p $(c_src_dir)
+ @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \
+ o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
+ echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
+ ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime
+
+$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
+ $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
+
+$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
+ @mkdir -p $(java_src_dir)
+ @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
+ o="$(java_src_dir)/$${l}Stemmer"; \
+ echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
+ ./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
+
+splint: snowball.splint
+snowball.splint: $(COMPILER_SOURCES)
+ splint $^ >$@ -weak
+
+# Make a full source distribution
+dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
+
+# Make a distribution of all the sources involved in snowball
+dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
+ $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
+ $(LIBSTEMMER_SOURCES) \
+ $(LIBSTEMMER_UTF8_SOURCES) \
+ $(LIBSTEMMER_HEADERS) \
+ $(LIBSTEMMER_EXTRA) \
+ $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \
+ GNUmakefile README doc/TODO libstemmer/mkmodules.pl
+ destname=snowball_code; \
+ dest=dist/$${destname}; \
+ rm -rf $${dest} && \
+ rm -f $${dest}.tgz && \
+ for file in $^; do \
+ dir=`dirname $$file` && \
+ mkdir -p $${dest}/$${dir} && \
+ cp -a $${file} $${dest}/$${dir} || exit 1 ; \
+ done && \
+ (cd dist && tar zcf $${destname}.tgz $${destname}) && \
+ rm -rf $${dest}
+
+# Make a distribution of all the sources required to compile the C library.
+dist_libstemmer_c: \
+ $(RUNTIME_SOURCES) \
+ $(RUNTIME_HEADERS) \
+ $(LIBSTEMMER_SOURCES) \
+ $(LIBSTEMMER_UTF8_SOURCES) \
+ $(LIBSTEMMER_HEADERS) \
+ $(LIBSTEMMER_EXTRA) \
+ $(C_LIB_SOURCES) \
+ $(C_LIB_HEADERS) \
+ libstemmer/mkinc.mak \
+ libstemmer/mkinc_utf8.mak
+ destname=libstemmer_c; \
+ dest=dist/$${destname}; \
+ rm -rf $${dest} && \
+ rm -f $${dest}.tgz && \
+ mkdir -p $${dest} && \
+ cp -a doc/libstemmer_c_README $${dest}/README && \
+ mkdir -p $${dest}/examples && \
+ cp -a examples/stemwords.c $${dest}/examples && \
+ mkdir -p $${dest}/$(c_src_dir) && \
+ cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
+ mkdir -p $${dest}/runtime && \
+ cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
+ mkdir -p $${dest}/libstemmer && \
+ cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
+ mkdir -p $${dest}/include && \
+ mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
+ (cd $${dest} && \
+ echo "README" >> MANIFEST && \
+ ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
+ ls runtime/*.c runtime/*.h >> MANIFEST && \
+ ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
+ ls include/*.h >> MANIFEST) && \
+ cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
+ echo 'include mkinc.mak' >> $${dest}/Makefile && \
+ echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \
+ echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \
+ echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
+ echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
+ echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \
+ echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \
+ echo 'clean:' >> $${dest}/Makefile && \
+ echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
+ (cd dist && tar zcf $${destname}.tgz $${destname}) && \
+ rm -rf $${dest}
+
+# Make a distribution of all the sources required to compile the Java library.
+dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
+ $(LIBSTEMMER_EXTRA) \
+ $(JAVA_SOURCES)
+ destname=libstemmer_java; \
+ dest=dist/$${destname}; \
+ rm -rf $${dest} && \
+ rm -f $${dest}.tgz && \
+ mkdir -p $${dest} && \
+ cp -a doc/libstemmer_java_README $${dest}/README && \
+ mkdir -p $${dest}/$(java_src_dir) && \
+ cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
+ mkdir -p $${dest}/$(java_src_main_dir) && \
+ cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
+ (cd $${dest} && \
+ echo "README" >> MANIFEST && \
+ ls $(java_src_dir)/*.java >> MANIFEST && \
+ ls $(java_src_main_dir)/*.java >> MANIFEST) && \
+ (cd dist && tar zcf $${destname}.tgz $${destname}) && \
+ rm -rf $${dest}
+
+check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
+
+check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
+
+check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)
+
+check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)
+
+check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)
+
+# Where the data files are located - assumed their repo is checked out as
+# a sibling to this one.
+STEMMING_DATA = ../snowball-data
+
+check_utf8_%: $(STEMMING_DATA)/% stemwords
+ @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
+ @./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt
+ @diff -u $</output.txt tmp.txt
+ @if [ -e $</diffs.txt ] ; \
+ then \
+ ./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \
+ diff -u $</diffs.txt tmp.txt; \
+ fi
+ @rm tmp.txt
+
+check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
+ @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
+ @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
+ ./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
+ @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
+ diff -u - tmp.txt
+ @rm tmp.txt
+
+check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
+ @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
+ @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
+ ./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
+ @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
+ diff -u - tmp.txt
+ @rm tmp.txt
+
+check_koi8r_%: $(STEMMING_DATA)/% stemwords
+ @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
+ @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
+ ./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
+ @python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
+ diff -u - tmp.txt
+ @rm tmp.txt
diff --git a/contrib/snowball/README b/contrib/snowball/README
new file mode 100644
index 000000000..afb51b340
--- /dev/null
+++ b/contrib/snowball/README
@@ -0,0 +1,5 @@
+This contains the source code for the snowball compiler and the stemming
+algorithms on the website.
+
+See http://snowball.tartarus.org/ for more details.
+
diff --git a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..0a8190a08
--- /dev/null
+++ b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl
@@ -0,0 +1,91 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+ undouble
+)
+
+externals ( stem )
+
+strings ( ch )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef ae hex 'E6'
+stringdef ao hex 'E5'
+stringdef o/ hex 'F8'
+
+define v 'aeiouy{ae}{ao}{o/}'
+
+define s_ending 'abcdfghjklmnoprtvyz{ao}'
+
+// mark_regions: compute p1, the mark the suffix routines below use as their
+// search limit (`setlimit tomark p1`).
+// p1 starts at limit. x is set to the position three characters into the
+// word (the `test` restores the cursor afterwards). p1 is then placed just
+// after the first non-vowel that follows a vowel, and finally raised to x if
+// it would otherwise lie before x.
+// If the word has fewer than three characters, or no vowel followed by a
+// non-vowel, the routine fails and p1 stays at limit.
+define mark_regions as (
+
+ $p1 = limit
+
+ test ( hop 3 setmark x )
+ goto v gopast non-v setmark p1
+ try ( $p1 < x $p1 = x )
+)
+
+// Suffix-stripping routines. All of them run in backward mode (from the end
+// of the word towards its start) and limit their suffix search to the part
+// of the word after mark p1.
+backwardmode (
+
+ // main_suffix: find the longest listed ending after p1. Every ending except
+ // 's' is deleted unconditionally; 's' is deleted only when the character
+ // before it belongs to the s_ending grouping.
+ define main_suffix as (
+ setlimit tomark p1 for ([substring])
+ among(
+
+ 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
+ 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
+ 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
+ 'erets' 'et' 'eret'
+ (delete)
+ 's'
+ (s_ending delete)
+ )
+ )
+
+ // consonant_pair: if the word ends in one of the listed consonant pairs
+ // (after p1), delete its last character only. The pair is checked inside
+ // `test`, so the cursor is back at the end of the word when `next]` closes
+ // the slice around the final character.
+ define consonant_pair as (
+ test (
+ setlimit tomark p1 for ([substring])
+ among(
+ 'gd' // significant in the call from other_suffix
+ 'dt' 'gt' 'kt'
+ )
+ )
+ next] delete
+ )
+
+ // other_suffix: first, if the word ends in 'st' preceded by 'ig', delete the
+ // 'st' (the `do` keeps a failed match from aborting the routine). Then find
+ // the longest listed ending after p1: 'ig'/'lig'/'elig'/'els' are deleted and
+ // consonant_pair is applied to what remains; 'l{o/}st' is replaced by 'l{o/}s'.
+ define other_suffix as (
+ do ( ['st'] 'ig' delete )
+ setlimit tomark p1 for ([substring])
+ among(
+ 'ig' 'lig' 'elig' 'els'
+ (delete do consonant_pair)
+ 'l{o/}st'
+ (<-'l{o/}s')
+ )
+ )
+ // undouble: if the word ends (after p1) in a non-vowel, copy that character
+ // into ch; if the same character also precedes it, delete the final one,
+ // turning a doubled consonant into a single one.
+ define undouble as (
+ setlimit tomark p1 for ([non-v] ->ch)
+ ch
+ delete
+ )
+)
+
+// stem: the externally visible entry point (declared in `externals`).
+// Computes the region mark going forwards, then applies the four suffix
+// routines in order, working backwards from the end of the word. Each step
+// is wrapped in `do`, so a step that fails does not stop the later steps.
+define stem as (
+
+ do mark_regions
+ backwards (
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ do undouble
+ )
+)
diff --git a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..1131a1cb7
--- /dev/null
+++ b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,91 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+ undouble
+)
+
+externals ( stem )
+
+strings ( ch )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef ae hex '91'
+stringdef ao hex '86'
+stringdef o/ hex '9B'
+
+define v 'aeiouy{ae}{ao}{o/}'
+
+define s_ending 'abcdfghjklmnoprtvyz{ao}'
+
+define mark_regions as (
+
+ $p1 = limit
+
+ test ( hop 3 setmark x )
+ goto v gopast non-v setmark p1
+ try ( $p1 < x $p1 = x )
+)
+
+backwardmode (
+
+ define main_suffix as (
+ setlimit tomark p1 for ([substring])
+ among(
+
+ 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
+ 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
+ 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
+ 'erets' 'et' 'eret'
+ (delete)
+ 's'
+ (s_ending delete)
+ )
+ )
+
+ define consonant_pair as (
+ test (
+ setlimit tomark p1 for ([substring])
+ among(
+ 'gd' // significant in the call from other_suffix
+ 'dt' 'gt' 'kt'
+ )
+ )
+ next] delete
+ )
+
+ define other_suffix as (
+ do ( ['st'] 'ig' delete )
+ setlimit tomark p1 for ([substring])
+ among(
+ 'ig' 'lig' 'elig' 'els'
+ (delete do consonant_pair)
+ 'l{o/}st'
+ (<-'l{o/}s')
+ )
+ )
+ define undouble as (
+ setlimit tomark p1 for ([non-v] ->ch)
+ ch
+ delete
+ )
+)
+
+define stem as (
+
+ do mark_regions
+ backwards (
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ do undouble
+ )
+)
diff --git a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..f7609f766
--- /dev/null
+++ b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
@@ -0,0 +1,164 @@
+routines (
+ prelude postlude
+ e_ending
+ en_ending
+ mark_regions
+ R1 R2
+ undouble
+ standard_suffix
+)
+
+externals ( stem )
+
+booleans ( e_found )
+
+integers ( p1 p2 )
+
+groupings ( v v_I v_j )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef e" hex 'EB'
+stringdef i" hex 'EF'
+stringdef o" hex 'F6'
+stringdef u" hex 'FC'
+
+stringdef a' hex 'E1'
+stringdef e' hex 'E9'
+stringdef i' hex 'ED'
+stringdef o' hex 'F3'
+stringdef u' hex 'FA'
+
+stringdef e` hex 'E8'
+
+define v 'aeiouy{e`}'
+define v_I v + 'I'
+define v_j v + 'j'
+
+define prelude as (
+ test repeat (
+ [substring] among(
+ '{a"}' '{a'}'
+ (<- 'a')
+ '{e"}' '{e'}'
+ (<- 'e')
+ '{i"}' '{i'}'
+ (<- 'i')
+ '{o"}' '{o'}'
+ (<- 'o')
+ '{u"}' '{u'}'
+ (<- 'u')
+ '' (next)
+ ) //or next
+ )
+ try(['y'] <- 'Y')
+ repeat goto (
+ v [('i'] v <- 'I') or
+ ('y'] <- 'Y')
+ )
+)
+
+// mark_regions: compute the marks p1 and p2 tested by the R1 and R2 routines.
+// Both start at limit. p1 is set just after the first non-vowel that follows
+// a vowel and is then raised to 3 if it is smaller. p2 is found by applying
+// the same vowel/non-vowel search again, continuing from where the first
+// search stopped (the adjustment of p1 does not move the cursor).
+// If either search fails the routine fails, leaving the remaining mark(s) at
+// limit.
+define mark_regions as (
+
+ $p1 = limit
+ $p2 = limit
+
+ gopast v gopast non-v setmark p1
+ try($p1 < 3 $p1 = 3) // at least 3
+ gopast v gopast non-v setmark p2
+
+)
+
+define postlude as repeat (
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'I' (<- 'i')
+ '' (next)
+ ) //or next
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define undouble as (
+ test among('kk' 'dd' 'tt') [next] delete
+ )
+
+ define e_ending as (
+ unset e_found
+ ['e'] R1 test non-v delete
+ set e_found
+ undouble
+ )
+
+ define en_ending as (
+ R1 non-v and not 'gem' delete
+ undouble
+ )
+
+ define standard_suffix as (
+ do (
+ [substring] among(
+ 'heden'
+ ( R1 <- 'heid'
+ )
+ 'en' 'ene'
+ ( en_ending
+ )
+ 's' 'se'
+ ( R1 non-v_j delete
+ )
+ )
+ )
+ do e_ending
+
+ do ( ['heid'] R2 not 'c' delete
+ ['en'] en_ending
+ )
+
+ do (
+ [substring] among(
+ 'end' 'ing'
+ ( R2 delete
+ (['ig'] R2 not 'e' delete) or undouble
+ )
+ 'ig'
+ ( R2 not 'e' delete
+ )
+ 'lijk'
+ ( R2 delete e_ending
+ )
+ 'baar'
+ ( R2 delete
+ )
+ 'bar'
+ ( R2 e_found delete
+ )
+ )
+ )
+ do (
+ non-v_I
+ test (
+ among ('aa' 'ee' 'oo' 'uu')
+ non-v
+ )
+ [next] delete
+ )
+ )
+)
+
+define stem as (
+
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..15b8718d1
--- /dev/null
+++ b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,164 @@
+routines (
+ prelude postlude
+ e_ending
+ en_ending
+ mark_regions
+ R1 R2
+ undouble
+ standard_suffix
+)
+
+externals ( stem )
+
+booleans ( e_found )
+
+integers ( p1 p2 )
+
+groupings ( v v_I v_j )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a" hex '84'
+stringdef e" hex '89'
+stringdef i" hex '8B'
+stringdef o" hex '94'
+stringdef u" hex '81'
+
+stringdef a' hex 'A0'
+stringdef e' hex '82'
+stringdef i' hex 'A1'
+stringdef o' hex 'A2'
+stringdef u' hex 'A3'
+
+stringdef e` hex '8A'
+
+define v 'aeiouy{e`}'
+define v_I v + 'I'
+define v_j v + 'j'
+
+define prelude as (
+ test repeat (
+ [substring] among(
+ '{a"}' '{a'}'
+ (<- 'a')
+ '{e"}' '{e'}'
+ (<- 'e')
+ '{i"}' '{i'}'
+ (<- 'i')
+ '{o"}' '{o'}'
+ (<- 'o')
+ '{u"}' '{u'}'
+ (<- 'u')
+ '' (next)
+ ) //or next
+ )
+ try(['y'] <- 'Y')
+ repeat goto (
+ v [('i'] v <- 'I') or
+ ('y'] <- 'Y')
+ )
+)
+
+define mark_regions as (
+
+ $p1 = limit
+ $p2 = limit
+
+ gopast v gopast non-v setmark p1
+ try($p1 < 3 $p1 = 3) // at least 3
+ gopast v gopast non-v setmark p2
+
+)
+
+define postlude as repeat (
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'I' (<- 'i')
+ '' (next)
+ ) //or next
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define undouble as (
+ test among('kk' 'dd' 'tt') [next] delete
+ )
+
+ define e_ending as (
+ unset e_found
+ ['e'] R1 test non-v delete
+ set e_found
+ undouble
+ )
+
+ define en_ending as (
+ R1 non-v and not 'gem' delete
+ undouble
+ )
+
+ define standard_suffix as (
+ do (
+ [substring] among(
+ 'heden'
+ ( R1 <- 'heid'
+ )
+ 'en' 'ene'
+ ( en_ending
+ )
+ 's' 'se'
+ ( R1 non-v_j delete
+ )
+ )
+ )
+ do e_ending
+
+ do ( ['heid'] R2 not 'c' delete
+ ['en'] en_ending
+ )
+
+ do (
+ [substring] among(
+ 'end' 'ing'
+ ( R2 delete
+ (['ig'] R2 not 'e' delete) or undouble
+ )
+ 'ig'
+ ( R2 not 'e' delete
+ )
+ 'lijk'
+ ( R2 delete e_ending
+ )
+ 'baar'
+ ( R2 delete
+ )
+ 'bar'
+ ( R2 e_found delete
+ )
+ )
+ )
+ do (
+ non-v_I
+ test (
+ among ('aa' 'ee' 'oo' 'uu')
+ non-v
+ )
+ [next] delete
+ )
+ )
+)
+
+define stem as (
+
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..fe18d7a91
--- /dev/null
+++ b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl
@@ -0,0 +1,229 @@
+integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+ prelude postlude
+ mark_regions
+ shortv
+ R1 R2
+ Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5
+ exception1
+ exception2
+)
+
+externals ( stem )
+
+groupings ( v v_WXY valid_LI )
+
+stringescapes {}
+
+define v 'aeiouy'
+define v_WXY v + 'wxY'
+
+define valid_LI 'cdeghkmnrt'
+
+define prelude as ( // forward-mode clean-up; Y_found records whether any y was rewritten as 'Y'
+ unset Y_found
+ do ( ['{'}'] delete) // drop an apostrophe at the very start of the word
+ do ( ['y'] <-'Y' set Y_found) // a word-initial y becomes 'Y'
+ do repeat(goto (v ['y']) <-'Y' set Y_found) // every y directly after a vowel becomes 'Y'
+)
+
+define mark_regions as ( // sets the marks p1 and p2 used by R1 and R2; both default to the end of the word
+ $p1 = limit
+ $p2 = limit
+ do(
+ among (
+ 'gener'
+ 'commun' // added May 2005
+ 'arsen' // added Nov 2006 (arsenic/arsenal)
+ // ... extensions possible here ...
+ ) or (gopast v gopast non-v) // a listed prefix puts p1 right after itself; otherwise scan past a vowel, then past a non-vowel
+ setmark p1
+ gopast v gopast non-v setmark p2 // p2: the same scan repeated from p1
+ )
+)
+
+backwardmode (
+
+ define shortv as ( // backward-mode test on the characters that precede the cursor
+ ( non-v_WXY v non-v ) // a character outside v_WXY, preceded by a vowel, preceded by a non-vowel
+ or
+ ( non-v v atlimit ) // or a non-vowel preceded by a vowel that begins the word
+ )
+
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define Step_1a as ( // apostrophe endings, then plural-like -s endings
+ try (
+ [substring] among (
+ '{'}' '{'}s' '{'}s{'}'
+ (delete)
+ )
+ )
+ [substring] among (
+ 'sses' (<-'ss')
+ 'ied' 'ies'
+ ((hop 2 <-'i') or <-'ie') // 'i' when at least two characters precede the suffix, otherwise 'ie'
+ 's' (next gopast v delete) // delete only if a vowel occurs somewhere before the character preceding the 's'
+ 'us' 'ss' // matched but given no command, so these endings are left as they are
+ )
+ )
+
+ define Step_1b as ( // -eed(ly), -ed(ly), -ing(ly)
+ [substring] among (
+ 'eed' 'eedly'
+ (R1 <-'ee')
+ 'ed' 'edly' 'ing' 'ingly'
+ (
+ test gopast v delete // delete only when the part before the suffix contains a vowel
+ test substring among( // then repair the new ending without moving the cursor
+ 'at' 'bl' 'iz'
+ (<+ 'e') // insert an 'e' at the cursor
+ 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+ // ignoring double c, h, j, k, q, v, w, and x
+ ([next] delete) // remove the last letter of the double
+ '' (atmark p1 test shortv <+ 'e') // otherwise insert 'e' when the cursor is at p1 and shortv holds
+ )
+ )
+ )
+ )
+
+ define Step_1c as ( // final y/Y -> i
+ ['y' or 'Y']
+ non-v not atlimit // the preceding character is a non-vowel that is not the first letter of the word
+ <-'i'
+ )
+
+ define Step_2 as ( // rewrites the longest matching suffix, provided it lies in R1
+ [substring] R1 among (
+ 'tional' (<-'tion')
+ 'enci' (<-'ence')
+ 'anci' (<-'ance')
+ 'abli' (<-'able')
+ 'entli' (<-'ent')
+ 'izer' 'ization'
+ (<-'ize')
+ 'ational' 'ation' 'ator'
+ (<-'ate')
+ 'alism' 'aliti' 'alli'
+ (<-'al')
+ 'fulness' (<-'ful')
+ 'ousli' 'ousness'
+ (<-'ous')
+ 'iveness' 'iviti'
+ (<-'ive')
+ 'biliti' 'bli'
+ (<-'ble')
+ 'ogi' ('l' <-'og') // only when preceded by 'l'
+ 'fulli' (<-'ful')
+ 'lessli' (<-'less')
+ 'li' (valid_LI delete) // only when preceded by a letter from valid_LI
+ )
+ )
+
+ define Step_3 as ( // second round of suffix rewrites, again restricted to R1
+ [substring] R1 among (
+ 'tional' (<- 'tion')
+ 'ational' (<- 'ate')
+ 'alize' (<-'al')
+ 'icate' 'iciti' 'ical'
+ (<-'ic')
+ 'ful' 'ness'
+ (delete)
+ 'ative'
+ (R2 delete) // 'R2' added Dec 2001
+ )
+ )
+
+ define Step_4 as ( // deletes the longest matching suffix when it lies in R2
+ [substring] R2 among (
+ 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
+ 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
+ (delete)
+ 'ion' ('s' or 't' delete) // only when preceded by 's' or 't'
+ )
+ )
+
+ define Step_5 as ( // final -e and -l
+ [substring] among (
+ 'e' (R2 or (R1 not shortv) delete) // delete in R2, or in R1 when shortv does not hold before it
+ 'l' (R2 'l' delete) // delete in R2 when preceded by another 'l'
+ )
+ )
+
+ define exception2 as ( // succeeds when the whole word (as it stands after Step_1a) is one of the listed forms
+
+ [substring] atlimit among( // 'atlimit' requires the match to reach the start of the word
+ 'inning' 'outing' 'canning' 'herring' 'earring'
+ 'proceed' 'exceed' 'succeed'
+
+ // ... extensions possible here ...
+
+ )
+ )
+)
+
+define exception1 as ( // forward-mode whole-word exceptions, tried by stem before anything else
+
+ [substring] atlimit among( // 'atlimit' requires the match to cover the entire word
+
+ /* special changes: */
+
+ 'skis' (<-'ski')
+ 'skies' (<-'sky')
+ 'dying' (<-'die')
+ 'lying' (<-'lie')
+ 'tying' (<-'tie')
+
+ /* special -LY cases */
+
+ 'idly' (<-'idl')
+ 'gently' (<-'gentl')
+ 'ugly' (<-'ugli')
+ 'early' (<-'earli')
+ 'only' (<-'onli')
+ 'singly' (<-'singl')
+
+ // ... extensions possible here ...
+
+ /* invariant forms: */
+
+ 'sky'
+ 'news'
+ 'howe'
+
+ 'atlas' 'cosmos' 'bias' 'andes' // not plural forms
+
+ // ... extensions possible here ...
+ )
+)
+
+define postlude as (Y_found repeat(goto (['Y']) <-'y')) // only when prelude set Y_found: turn every 'Y' back into 'y'
+
+define stem as ( // external entry point
+
+ exception1 or // whole-word exceptions are fully handled here
+ not hop 3 or ( // words with fewer than three characters are left unchanged
+ do prelude
+ do mark_regions
+ backwards ( // the steps below scan from the end of the word
+
+ do Step_1a
+
+ exception2 or ( // the exception2 words skip all remaining steps
+
+ do Step_1b
+ do Step_1c
+
+ do Step_2
+ do Step_3
+ do Step_4
+
+ do Step_5
+ )
+ )
+ do postlude
+ )
+)
diff --git a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..9ac74f292
--- /dev/null
+++ b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl
@@ -0,0 +1,196 @@
+
+/* Finnish stemmer.
+
+ Numbers in square brackets refer to the sections in
+ Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
+ ISBN 0-415-20705-3
+
+*/
+
+routines (
+ mark_regions
+ R2
+ particle_etc possessive
+ LONG VI
+ case_ending
+ i_plural
+ t_plural
+ other_endings
+ tidy
+)
+
+externals ( stem )
+
+integers ( p1 p2 )
+strings ( x )
+booleans ( ending_removed )
+groupings ( AEI V1 V2 particle_end )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef o" hex 'F6'
+
+define AEI 'a{a"}ei'
+define V1 'aeiouy{a"}{o"}'
+define V2 'aeiou{a"}{o"}'
+define particle_end V1 + 'nt'
+
+define mark_regions as ( // sets the marks p1 and p2; both default to the end of the word
+
+ $p1 = limit
+ $p2 = limit
+
+ goto V1 gopast non-V1 setmark p1 // move to the first V1 vowel, then just past the next non-vowel
+ goto V1 gopast non-V1 setmark p2 // the same scan repeated from p1
+)
+
+backwardmode (
+
+ define R2 as $p2 <= cursor
+
+ define particle_etc as ( // deletes a trailing particle or the adverb ending 'sti'
+ setlimit tomark p1 for ([substring]) // the suffix search may not reach back beyond p1
+ among(
+ 'kin'
+ 'kaan' 'k{a"}{a"}n'
+ 'ko' 'k{o"}'
+ 'han' 'h{a"}n'
+ 'pa' 'p{a"}' // Particles [91]
+ (particle_end) // the preceding character must belong to particle_end
+ 'sti' // Adverb [87]
+ (R2) // must lie in R2
+ )
+ delete
+ )
+ define possessive as ( // [36]
+ setlimit tomark p1 for ([substring]) // the suffix search may not reach back beyond p1
+ among(
+ 'si'
+ (not 'k' delete) // leave 'ksi' alone: it is the Translative case ending
+ 'ni'
+ (delete ['kse'] <- 'ksi') // kseni = ksi + ni
+ 'nsa' 'ns{a"}'
+ 'mme'
+ 'nne'
+ (delete)
+ /* Now for Vn possessives after case endings: [36] */
+ 'an'
+ (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) // only after one of these case endings
+ '{a"}n'
+ (among('t{a"}' 'ss{a"}' 'st{a"}'
+ 'll{a"}' 'lt{a"}' 'n{a"}') delete)
+ 'en'
+ (among('lle' 'ine') delete)
+ )
+ )
+
+ define LONG as // matches a doubled vowel
+ among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
+
+ define VI as ('i' V2) // backward mode: an 'i' that is itself preceded by a V2 vowel
+
+ define case_ending as ( // deletes a case ending found after p1 and sets ending_removed
+ setlimit tomark p1 for ([substring])
+ among(
+ 'han' ('a') //-.
+ 'hen' ('e') // |
+ 'hin' ('i') // |
+ 'hon' ('o') // |
+ 'h{a"}n' ('{a"}') // Illative [43]
+ 'h{o"}n' ('{o"}') // |
+ 'siin' VI // |
+ 'seen' LONG //-'
+
+ 'den' VI
+ 'tten' VI // Genitive plurals [34]
+ ()
+ 'n' // Genitive or Illative
+ ( try ( LONG // Illative
+ or 'ie' // Genitive
+ and next ] // after a long vowel or 'ie', widen the slice by one more character
+ )
+ /* otherwise Genitive */
+ )
+
+ 'a' '{a"}' //-.
+ (V1 non-V1) // |
+ 'tta' 'tt{a"}' // Partitive [32]
+ ('e') // |
+ 'ta' 't{a"}' //-'
+
+ 'ssa' 'ss{a"}' // Inessive [41]
+ 'sta' 'st{a"}' // Elative [42]
+
+ 'lla' 'll{a"}' // Adessive [44]
+ 'lta' 'lt{a"}' // Ablative [51]
+ 'lle' // Allative [46]
+ 'na' 'n{a"}' // Essive [49]
+ 'ksi' // Translative[50]
+ 'ine' // Comitative [51]
+
+ /* Abessive and Instructive are too rare for
+ inclusion [51] */
+
+ )
+ delete
+ set ending_removed // read by stem to choose between i_plural and t_plural
+ )
+ define other_endings as ( // deletes comparative, superlative and agent endings found after p2
+ setlimit tomark p2 for ([substring])
+ among(
+ 'mpi' 'mpa' 'mp{a"}'
+ 'mmi' 'mma' 'mm{a"}' // Comparative forms [85]
+ (not 'po') //-improves things
+ 'impi' 'impa' 'imp{a"}'
+ 'immi' 'imma' 'imm{a"}' // Superlative forms [86]
+ 'eja' 'ej{a"}' // indicates agent [93.1B]
+ )
+ delete
+ )
+ define i_plural as ( // [26]
+ setlimit tomark p1 for ([substring]) // only a final 'i' or 'j' located after p1 qualifies
+ among(
+ 'i' 'j'
+ )
+ delete
+ )
+ define t_plural as ( // [26]
+ setlimit tomark p1 for (
+ ['t'] test V1 // a final 't' after p1 that is preceded by a V1 vowel
+ delete
+ )
+ setlimit tomark p2 for ([substring]) // then look after p2 for what remains of -mmat / -immat
+ among(
+ 'mma' (not 'po') //-mmat endings
+ 'imma' //-immat endings
+ )
+ delete
+ )
+ define tidy as ( // final clean-up of the stem
+ setlimit tomark p1 for ( // these four fixes may not reach back beyond p1
+ do ( LONG and ([next] delete ) ) // undouble vowel
+ do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
+ do ( ['j'] 'o' or 'u' delete ) // final 'j' preceded by 'o' or 'u'
+ do ( ['o'] 'j' delete ) // final 'o' preceded by 'j'
+ )
+ goto non-V1 [next] -> x x delete // undouble consonant
+ )
+)
+
+define stem as ( // external entry point
+
+ do mark_regions
+ unset ending_removed
+ backwards ( // all suffix routines scan from the end of the word
+ do particle_etc
+ do possessive
+ do case_ending
+ do other_endings
+ (ending_removed do i_plural) or do t_plural // i_plural if case_ending removed something, otherwise t_plural
+ do tidy
+ )
+)
+
diff --git a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..e972f227f
--- /dev/null
+++ b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl
@@ -0,0 +1,248 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ i_verb_suffix
+ verb_suffix
+ residual_suffix
+ un_double
+ un_accent
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v keep_with_s )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a^ hex 'E2' // a-circumflex
+stringdef a` hex 'E0' // a-grave
+stringdef c, hex 'E7' // c-cedilla
+
+stringdef e" hex 'EB' // e-diaeresis (rare)
+stringdef e' hex 'E9' // e-acute
+stringdef e^ hex 'EA' // e-circumflex
+stringdef e` hex 'E8' // e-grave
+stringdef i" hex 'EF' // i-diaeresis
+stringdef i^ hex 'EE' // i-circumflex
+stringdef o^ hex 'F4' // o-circumflex
+stringdef u^ hex 'FB' // u-circumflex
+stringdef u` hex 'F9' // u-grave
+
+define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
+
+define prelude as repeat goto ( // upper-cases certain u/i/y so that grouping v no longer matches them
+
+ ( v [ ('u' ] v <- 'U') or // u with a vowel on both sides
+ ('i' ] v <- 'I') or // i with a vowel on both sides
+ ('y' ] <- 'Y') // y directly after a vowel
+ )
+ or
+ ( ['y'] v <- 'Y' ) // y directly before a vowel
+ or
+ ( 'q' ['u'] <- 'U' ) // u directly after q
+)
+
+define mark_regions as ( // sets the marks pV, p1 and p2 read by RV, R1 and R2
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do (
+ ( v v next ) // word starts with two vowels: pV goes after the third character
+ or
+ among ( // this exception list begun Nov 2006
+ 'par' // paris, parie, pari
+ 'col' // colis
+ 'tap' // tapis
+ // extensions possible here
+ )
+ or
+ ( next gopast v ) // otherwise: just past the first vowel that is not the first character
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1 = just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2 = the same scan repeated from p1
+ )
+)
+
+define postlude as repeat ( // turns the 'I', 'U', 'Y' markers back into lower case
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ 'Y' (<- 'y')
+ '' (next) // any other character is skipped unchanged
+ )
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as (
+ [substring] among(
+
+ 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
+ 'ances' 'iqUes' 'ismes' 'ables' 'istes'
+ ( R2 delete )
+ 'atrice' 'ateur' 'ation'
+ 'atrices' 'ateurs' 'ations'
+ ( R2 delete
+ try ( ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'logie'
+ 'logies'
+ ( R2 <- 'log' )
+ 'usion' 'ution'
+ 'usions' 'utions'
+ ( R2 <- 'u' )
+ 'ence'
+ 'ences'
+ ( R2 <- 'ent' )
+ 'ement'
+ 'ements'
+ (
+ RV delete
+ try (
+ [substring] among(
+ 'iv' (R2 delete ['at'] R2 delete)
+ 'eus' ((R2 delete) or (R1<-'eux'))
+ 'abl' 'iqU'
+ (R2 delete)
+ 'i{e`}r' 'I{e`}r' //)
+ (RV <-'i') //)--new 2 Sept 02
+ )
+ )
+ )
+ 'it{e'}'
+ 'it{e'}s'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil' ((R2 delete) or <-'abl')
+ 'ic' ((R2 delete) or <-'iqU')
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'if' 'ive'
+ 'ifs' 'ives'
+ (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'eaux' (<- 'eau')
+ 'aux' (R1 <- 'al')
+ 'euse'
+ 'euses'((R2 delete) or (R1<-'eux'))
+
+ 'issement'
+ 'issements'(R1 non-v delete) // verbal
+
+ // fail(...) below forces entry to verb_suffix. -ment typically
+ // follows the p.p., e.g 'confus{e'}ment'.
+
+ 'amment' (RV fail(<- 'ant'))
+ 'emment' (RV fail(<- 'ent'))
+ 'ment'
+ 'ments' (test(v RV) fail(delete))
+ // v is e,i,u,{e'},I or U
+ )
+ )
+
+ define i_verb_suffix as setlimit tomark pV for ( // the suffix search may not reach back beyond pV
+ [substring] among (
+ '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
+ 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
+ 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
+ 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
+ 'issez' 'issiez' 'issions' 'issons' 'it'
+ (non-v delete) // delete only when the suffix is preceded by a non-vowel
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // the suffix search may not reach back beyond pV
+ [substring] among (
+ 'ions'
+ (R2 delete) // only when the suffix lies in R2
+
+ '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
+ 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
+ 'erons' 'eront' 'ez' 'iez'
+
+ // 'ons' //-best omitted
+
+ (delete)
+
+ '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
+ 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
+ 'assions'
+ (delete
+ try(['e'] delete) // also drop an 'e' that directly preceded the suffix
+ )
+ )
+ )
+
+ define keep_with_s 'aiou{e`}s'
+
+ define residual_suffix as ( // fallback used by stem when no standard or verb suffix was removed
+ try(['s'] test non-keep_with_s delete) // drop a final 's' unless the preceding character is in keep_with_s
+ setlimit tomark pV for ( // the suffix search may not reach back beyond pV
+ [substring] among(
+ 'ion' (R2 's' or 't' delete) // in R2 and preceded by 's' or 't'
+ 'ier' 'i{e`}re'
+ 'Ier' 'I{e`}re' (<-'i')
+ 'e' (delete)
+ '{e"}' ('gu' delete) // only when preceded by 'gu'
+ )
+ )
+ )
+
+ define un_double as ( // if the word ends in one of the listed sequences, drop its last letter
+ test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
+ )
+
+ define un_accent as ( // e-acute or e-grave followed only by non-vowels up to the end of the word becomes plain 'e'
+ atleast 1 non-v // step back over one or more trailing non-vowels
+ [ '{e'}' or '{e`}' ] <-'e'
+ )
+)
+
+define stem as ( // external entry point
+
+ do prelude
+ do mark_regions
+ backwards ( // everything below scans from the end of the word
+
+ do (
+ (
+ ( standard_suffix or // the first of the three routines that succeeds wins
+ i_verb_suffix or
+ verb_suffix
+ )
+ and
+ try( [ ('Y' ] <- 'i' ) or // after a successful removal: final 'Y' -> 'i', or final c-cedilla -> 'c'
+ ('{c,}'] <- 'c' )
+ )
+ ) or
+ residual_suffix // used only when none of the three routines succeeded
+ )
+
+ // try(['ent'] RV delete) // is best omitted
+
+ do un_double
+ do un_accent
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..996eba1ab
--- /dev/null
+++ b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,239 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ i_verb_suffix
+ verb_suffix
+ residual_suffix
+ un_double
+ un_accent
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v keep_with_s )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a^ hex '83' // a-circumflex
+stringdef a` hex '85' // a-grave
+stringdef c, hex '87' // c-cedilla
+
+stringdef e" hex '89' // e-diaeresis (rare)
+stringdef e' hex '82' // e-acute
+stringdef e^ hex '88' // e-circumflex
+stringdef e` hex '8A' // e-grave
+stringdef i" hex '8B' // i-diaeresis
+stringdef i^ hex '8C' // i-circumflex
+stringdef o^ hex '93' // o-circumflex
+stringdef u^ hex '96' // u-circumflex
+stringdef u` hex '97' // u-grave
+
+define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
+
+define prelude as repeat goto (
+
+ ( v [ ('u' ] v <- 'U') or
+ ('i' ] v <- 'I') or
+ ('y' ] <- 'Y')
+ )
+ or
+ ( ['y'] v <- 'Y' )
+ or
+ ( 'q' ['u'] <- 'U' )
+)
+
+define mark_regions as ( // sets the marks pV, p1 and p2 read by RV, R1 and R2
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV: after the 3rd character if the word starts with two vowels, after a listed prefix, else past the first non-initial vowel
+ ( v v next ) or among ( 'par' 'col' 'tap' ) or ( next gopast v ) // prefix exceptions kept in step with the ISO 8859-1 variant
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1 = just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2 = the same scan repeated from p1
+ )
+)
+
+define postlude as repeat (
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ 'Y' (<- 'y')
+ '' (next)
+ )
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as (
+ [substring] among(
+
+ 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
+ 'ances' 'iqUes' 'ismes' 'ables' 'istes'
+ ( R2 delete )
+ 'atrice' 'ateur' 'ation'
+ 'atrices' 'ateurs' 'ations'
+ ( R2 delete
+ try ( ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'logie'
+ 'logies'
+ ( R2 <- 'log' )
+ 'usion' 'ution'
+ 'usions' 'utions'
+ ( R2 <- 'u' )
+ 'ence'
+ 'ences'
+ ( R2 <- 'ent' )
+ 'ement'
+ 'ements'
+ (
+ RV delete
+ try (
+ [substring] among(
+ 'iv' (R2 delete ['at'] R2 delete)
+ 'eus' ((R2 delete) or (R1<-'eux'))
+ 'abl' 'iqU'
+ (R2 delete)
+ 'i{e`}r' 'I{e`}r' //)
+ (RV <-'i') //)--new 2 Sept 02
+ )
+ )
+ )
+ 'it{e'}'
+ 'it{e'}s'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil' ((R2 delete) or <-'abl')
+ 'ic' ((R2 delete) or <-'iqU')
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'if' 'ive'
+ 'ifs' 'ives'
+ (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'eaux' (<- 'eau')
+ 'aux' (R1 <- 'al')
+ 'euse'
+ 'euses'((R2 delete) or (R1<-'eux'))
+
+ 'issement'
+ 'issements'(R1 non-v delete) // verbal
+
+ // fail(...) below forces entry to verb_suffix. -ment typically
+ // follows the p.p., e.g 'confus{e'}ment'.
+
+ 'amment' (RV fail(<- 'ant'))
+ 'emment' (RV fail(<- 'ent'))
+ 'ment'
+ 'ments' (test(v RV) fail(delete))
+ // v is e,i,u,{e'},I or U
+ )
+ )
+
+ define i_verb_suffix as setlimit tomark pV for (
+ [substring] among (
+ '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
+ 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
+ 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
+ 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
+ 'issez' 'issiez' 'issions' 'issons' 'it'
+ (non-v delete)
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for (
+ [substring] among (
+ 'ions'
+ (R2 delete)
+
+ '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
+ 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
+ 'erons' 'eront' 'ez' 'iez'
+
+ // 'ons' //-best omitted
+
+ (delete)
+
+ '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
+ 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
+ 'assions'
+ (delete
+ try(['e'] delete)
+ )
+ )
+ )
+
+ define keep_with_s 'aiou{e`}s'
+
+ define residual_suffix as (
+ try(['s'] test non-keep_with_s delete)
+ setlimit tomark pV for (
+ [substring] among(
+ 'ion' (R2 's' or 't' delete)
+ 'ier' 'i{e`}re'
+ 'Ier' 'I{e`}re' (<-'i')
+ 'e' (delete)
+ '{e"}' ('gu' delete)
+ )
+ )
+ )
+
+ define un_double as (
+ test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
+ )
+
+ define un_accent as (
+ atleast 1 non-v
+ [ '{e'}' or '{e`}' ] <-'e'
+ )
+)
+
+define stem as (
+
+ do prelude
+ do mark_regions
+ backwards (
+
+ do (
+ (
+ ( standard_suffix or
+ i_verb_suffix or
+ verb_suffix
+ )
+ and
+ try( [ ('Y' ] <- 'i' ) or
+ ('{c,}'] <- 'c' )
+ )
+ ) or
+ residual_suffix
+ )
+
+ // try(['ent'] RV delete) // is best omitted
+
+ do un_double
+ do un_accent
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..7069daf0d
--- /dev/null
+++ b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl
@@ -0,0 +1,139 @@
+
+/*
+ Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+ prelude postlude
+ mark_regions
+ R1 R2
+ standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef o" hex 'F6'
+stringdef u" hex 'FC'
+stringdef ss hex 'DF'
+
+define v 'aeiouy{a"}{o"}{u"}'
+
+define s_ending 'bdfghklmnrt'
+define st_ending s_ending - 'r'
+
+define prelude as ( // forward-mode normalisation run before the regions are marked
+
+ test repeat ( // walk the whole word; 'test' puts the cursor back where it started
+ (
+ ['{ss}'] <- 'ss' // the single character defined as {ss} is expanded to the two letters 'ss'
+ ) or next
+ )
+
+ repeat goto (
+ v [('u'] v <- 'U') or // u with a vowel on both sides becomes 'U'
+ ('y'] v <- 'Y') // y with a vowel on both sides becomes 'Y'
+ )
+)
+
+define mark_regions as ( // sets the marks p1 and p2 that R1 and R2 compare against
+
+ $p1 = limit
+ $p2 = limit
+
+ test(hop 3 setmark x) // x = position three characters into the word; the cursor is not moved
+
+ gopast v gopast non-v setmark p1 // p1 = just after the first non-vowel that follows a vowel
+ try($p1 < x $p1 = x) // at least 3
+ gopast v gopast non-v setmark p2 // p2 = the same scan repeated from the current cursor
+
+)
+
+define postlude as repeat ( // lower-cases the 'Y'/'U' markers and replaces the three umlaut vowels by a, o, u
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'U' (<- 'u')
+ '{a"}' (<- 'a')
+ '{o"}' (<- 'o')
+ '{u"}' (<- 'u')
+ '' (next) // any other character is skipped unchanged
+ )
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as ( // backward-mode suffix stripping in three independent 'do' steps
+ do ( // step 1: suffix must lie in R1
+ [substring] R1 among(
+ 'em' 'ern' 'er'
+ ( delete
+ )
+ 'e' 'en' 'es'
+ ( delete
+ try (['s'] 'nis' delete) // if the word now ends in 's' preceded by 'nis', drop that 's' too
+ )
+ 's'
+ ( s_ending delete // only when preceded by a letter from s_ending
+ )
+ )
+ )
+ do ( // step 2: suffix must lie in R1
+ [substring] R1 among(
+ 'en' 'er' 'est'
+ ( delete
+ )
+ 'st'
+ ( st_ending hop 3 delete // preceded by an st_ending letter that itself has at least three characters before it
+ )
+ )
+ )
+ do ( // step 3: suffix must lie in R2
+ [substring] R2 among(
+ 'end' 'ung'
+ ( delete
+ try (['ig'] not 'e' R2 delete) // also drop a preceding 'ig' that is in R2 and not after 'e'
+ )
+ 'ig' 'ik' 'isch'
+ ( not 'e' delete // not when preceded by 'e'
+ )
+ 'lich' 'heit'
+ ( delete
+ try (
+ ['er' or 'en'] R1 delete // also drop a preceding 'er' / 'en' that lies in R1
+ )
+ )
+ 'keit'
+ ( delete
+ try (
+ [substring] R2 among( // also drop a preceding 'lich' / 'ig' that lies in R2
+ 'lich' 'ig'
+ ( delete
+ )
+ )
+ )
+ )
+ )
+ )
+ )
+)
+
+define stem as ( // external entry point: normalise, mark regions, strip suffixes, restore characters
+ do prelude
+ do mark_regions
+ backwards // applies to the next command: standard_suffix scans from the end of the word
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..3effb3257
--- /dev/null
+++ b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,139 @@
+
+/*
+ Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+ prelude postlude
+ mark_regions
+ R1 R2
+ standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a" hex '84'
+stringdef o" hex '94'
+stringdef u" hex '81'
+stringdef ss hex 'E1'
+
+define v 'aeiouy{a"}{o"}{u"}'
+
+define s_ending 'bdfghklmnrt'
+define st_ending s_ending - 'r'
+
+define prelude as (
+
+ test repeat (
+ (
+ ['{ss}'] <- 'ss'
+ ) or next
+ )
+
+ repeat goto (
+ v [('u'] v <- 'U') or
+ ('y'] v <- 'Y')
+ )
+)
+
+define mark_regions as (
+
+ $p1 = limit
+ $p2 = limit
+
+ test(hop 3 setmark x)
+
+ gopast v gopast non-v setmark p1
+ try($p1 < x $p1 = x) // at least 3
+ gopast v gopast non-v setmark p2
+
+)
+
+define postlude as repeat (
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'U' (<- 'u')
+ '{a"}' (<- 'a')
+ '{o"}' (<- 'o')
+ '{u"}' (<- 'u')
+ '' (next)
+ )
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as (
+ do (
+ [substring] R1 among(
+ 'em' 'ern' 'er'
+ ( delete
+ )
+ 'e' 'en' 'es'
+ ( delete
+ try (['s'] 'nis' delete)
+ )
+ 's'
+ ( s_ending delete
+ )
+ )
+ )
+ do (
+ [substring] R1 among(
+ 'en' 'er' 'est'
+ ( delete
+ )
+ 'st'
+ ( st_ending hop 3 delete
+ )
+ )
+ )
+ do (
+ [substring] R2 among(
+ 'end' 'ung'
+ ( delete
+ try (['ig'] not 'e' R2 delete)
+ )
+ 'ig' 'ik' 'isch'
+ ( not 'e' delete
+ )
+ 'lich' 'heit'
+ ( delete
+ try (
+ ['er' or 'en'] R1 delete
+ )
+ )
+ 'keit'
+ ( delete
+ try (
+ [substring] R2 among(
+ 'lich' 'ig'
+ ( delete
+ )
+ )
+ )
+ )
+ )
+ )
+ )
+)
+
+define stem as (
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..ce6026a86
--- /dev/null
+++ b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl
@@ -0,0 +1,145 @@
+
+/*
+ Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+ prelude postlude
+ mark_regions
+ R1 R2
+ standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef o" hex 'F6'
+stringdef u" hex 'FC'
+stringdef ss hex 'DF'
+
+define v 'aeiouy{a"}{o"}{u"}'
+
+define s_ending 'bdfghklmnrt'
+define st_ending s_ending - 'r'
+
+define prelude as ( // forward-mode normalisation run before the regions are marked
+
+ test repeat goto ( // 'test' puts the cursor back at the start once the loop ends
+ v [('u'] v <- 'U') or // u with a vowel on both sides becomes 'U'
+ ('y'] v <- 'Y') // y with a vowel on both sides becomes 'Y'
+ )
+
+ repeat (
+ [substring] among(
+ '{ss}' (<- 'ss')
+ 'ae' (<- '{a"}') // the digraphs ae / oe / ue are rewritten as the corresponding umlaut character
+ 'oe' (<- '{o"}')
+ 'ue' (<- '{u"}')
+ 'qu' (hop 2) // step over 'qu' unchanged, so its 'u' cannot start a 'ue' match
+ '' (next) // any other character is skipped unchanged
+ )
+ )
+
+)
+
+define mark_regions as (
+
+ $p1 = limit
+ $p2 = limit
+
+ test(hop 3 setmark x)
+
+ gopast v gopast non-v setmark p1
+ try($p1 < x $p1 = x) // at least 3
+ gopast v gopast non-v setmark p2
+
+)
+
+define postlude as repeat (
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'U' (<- 'u')
+ '{a"}' (<- 'a')
+ '{o"}' (<- 'o')
+ '{u"}' (<- 'u')
+ '' (next)
+ )
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as (
+ do (
+ [substring] R1 among(
+ 'em' 'ern' 'er'
+ ( delete
+ )
+ 'e' 'en' 'es'
+ ( delete
+ try (['s'] 'nis' delete)
+ )
+ 's'
+ ( s_ending delete
+ )
+ )
+ )
+ do (
+ [substring] R1 among(
+ 'en' 'er' 'est'
+ ( delete
+ )
+ 'st'
+ ( st_ending hop 3 delete
+ )
+ )
+ )
+ do (
+ [substring] R2 among(
+ 'end' 'ung'
+ ( delete
+ try (['ig'] not 'e' R2 delete)
+ )
+ 'ig' 'ik' 'isch'
+ ( not 'e' delete
+ )
+ 'lich' 'heit'
+ ( delete
+ try (
+ ['er' or 'en'] R1 delete
+ )
+ )
+ 'keit'
+ ( delete
+ try (
+ [substring] R2 among(
+ 'lich' 'ig'
+ ( delete
+ )
+ )
+ )
+ )
+ )
+ )
+ )
+)
+
+define stem as (
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
new file mode 100644
index 000000000..d1a644931
--- /dev/null
+++ b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
@@ -0,0 +1,241 @@
+/*
+Hungarian Stemmer
+Removes noun inflections
+*/
+
+routines (
+ mark_regions
+ R1
+ v_ending
+ case
+ case_special
+ case_other
+ plural
+ owned
+ sing_owner
+ plur_owner
+ instrum
+ factive
+ undouble
+ double
+)
+
+externals ( stem )
+
+integers ( p1 )
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in ISO Latin 2) */
+
+stringdef a' hex 'E1' //a-acute
+stringdef e' hex 'E9' //e-acute
+stringdef i' hex 'ED' //i-acute
+stringdef o' hex 'F3' //o-acute
+stringdef o" hex 'F6' //o-umlaut
+stringdef oq hex 'F5' //o-double acute
+stringdef u' hex 'FA' //u-acute
+stringdef u" hex 'FC' //u-umlaut
+stringdef uq hex 'FB' //u-double acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
+
+define mark_regions as ( // sets the mark p1 that R1 compares against; defaults to the end of the word
+
+ $p1 = limit
+
+ (v goto non-v // word starts with a vowel: move to the first non-vowel ...
+ among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next // ... step over one listed letter group, or else one character
+ setmark p1)
+ or
+
+ (non-v gopast v setmark p1) // word starts with a non-vowel: p1 goes just past the first vowel
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+
+ define v_ending as ( // a final a-acute / e-acute lying in R1 is replaced by plain 'a' / 'e'
+ [substring] R1 among(
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define double as ( // succeeds, without moving the cursor, when the text before it ends in one of the listed doubles
+ test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
+ 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
+ )
+
+ define undouble as ( // backward mode: skip the last character, then delete the character before it
+ next [hop 1] delete
+ )
+
+ define instrum as( // removes a final 'al' / 'el' in R1 that follows a listed double, then undoubles
+ [substring] R1 among(
+ 'al' (double)
+ 'el' (double)
+ )
+ delete
+ undouble
+ )
+
+
+ define case as ( // deletes the longest listed case suffix lying in R1, then applies v_ending to the new ending
+ [substring] R1 among(
+ 'ban' 'ben'
+ 'ba' 'be'
+ 'ra' 're'
+ 'nak' 'nek'
+ 'val' 'vel'
+ 't{o'}l' 't{oq}l'
+ 'r{o'}l' 'r{oq}l'
+ 'b{o'}l' 'b{oq}l'
+ 'hoz' 'hez' 'h{o"}z'
+ 'n{a'}l' 'n{e'}l'
+ 'ig'
+ 'at' 'et' 'ot' '{o"}t'
+ '{e'}rt'
+ 'k{e'}pp' 'k{e'}ppen'
+ 'kor'
+ 'ul' '{u"}l'
+ 'v{a'}' 'v{e'}'
+ 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
+ 'k{e'}nt'
+ 'en' 'on' 'an' '{o"}n'
+ 'n'
+ 't'
+ )
+ delete
+ v_ending
+ )
+
+ define case_special as( // suffixes in R1 that begin with an accented vowel: the whole suffix becomes the plain vowel
+ [substring] R1 among(
+ '{e'}n' (<- 'e')
+ '{a'}n' (<- 'a')
+ '{a'}nk{e'}nt' (<- 'a')
+ )
+ )
+
+ define case_other as( // handles the -stul / -stül family of suffixes lying in R1
+ [substring] R1 among(
+ 'astul' 'est{u"}l' (delete)
+ 'stul' 'st{u"}l' (delete)
+ '{a'}stul' (<- 'a') // accented variants keep the vowel, without its accent
+ '{e'}st{u"}l' (<- 'e')
+ )
+ )
+
+ define factive as( // removes a final a-acute / e-acute in R1 that follows a listed double, then undoubles
+ [substring] R1 among(
+ '{a'}' (double)
+ '{e'}' (double)
+ )
+ delete
+ undouble
+ )
+
+ define plural as ( // handles endings in -k lying in R1
+ [substring] R1 among(
+ '{a'}k' (<- 'a') // accented variants keep the vowel, without its accent
+ '{e'}k' (<- 'e')
+ '{o"}k' (delete)
+ 'ak' (delete)
+ 'ok' (delete)
+ 'ek' (delete)
+ 'k' (delete)
+ )
+ )
+
+ define owned as (
+ [substring] R1 among (
+ 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
+ '{e'}k{e'}' (<- 'e')
+ '{a'}k{e'}' (<- 'a')
+ 'k{e'}' (delete)
+ '{e'}{e'}i' (<- 'e')
+ '{a'}{e'}i' (<- 'a')
+ '{e'}i' (delete)
+ '{e'}{e'}' (<- 'e')
+ '{e'}' (delete)
+ )
+ )
+
+ define sing_owner as (
+ [substring] R1 among(
+ '{u"}nk' 'unk' (delete)
+ '{a'}nk' (<- 'a')
+ '{e'}nk' (<- 'e')
+ 'nk' (delete)
+ '{a'}juk' (<- 'a')
+ '{e'}j{u"}k' (<- 'e')
+ 'juk' 'j{u"}k' (delete)
+ 'uk' '{u"}k' (delete)
+ 'em' 'om' 'am' (delete)
+ '{a'}m' (<- 'a')
+ '{e'}m' (<- 'e')
+ 'm' (delete)
+ 'od' 'ed' 'ad' '{o"}d' (delete)
+ '{a'}d' (<- 'a')
+ '{e'}d' (<- 'e')
+ 'd' (delete)
+ 'ja' 'je' (delete)
+ 'a' 'e' 'o' (delete)
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define plur_owner as (
+ [substring] R1 among(
+ 'jaim' 'jeim' (delete)
+ '{a'}im' (<- 'a')
+ '{e'}im' (<- 'e')
+ 'aim' 'eim' (delete)
+ 'im' (delete)
+ 'jaid' 'jeid' (delete)
+ '{a'}id' (<- 'a')
+ '{e'}id' (<- 'e')
+ 'aid' 'eid' (delete)
+ 'id' (delete)
+ 'jai' 'jei' (delete)
+ '{a'}i' (<- 'a')
+ '{e'}i' (<- 'e')
+ 'ai' 'ei' (delete)
+ 'i' (delete)
+ 'jaink' 'jeink' (delete)
+ 'eink' 'aink' (delete)
+ '{a'}ink' (<- 'a')
+ '{e'}ink' (<- 'e')
+ 'ink'
+ 'jaitok' 'jeitek' (delete)
+ 'aitok' 'eitek' (delete)
+ '{a'}itok' (<- 'a')
+ '{e'}itek' (<- 'e')
+ 'itek' (delete)
+ 'jeik' 'jaik' (delete)
+ 'aik' 'eik' (delete)
+ '{a'}ik' (<- 'a')
+ '{e'}ik' (<- 'e')
+ 'ik' (delete)
+ )
+ )
+)
+
+define stem as ( // external entry point
+ do mark_regions
+ backwards ( // every routine below scans from the end of the word; 'do' keeps a failing one from aborting the rest
+ do instrum
+ do case
+ do case_special
+ do case_other
+ do factive
+ do owned
+ do sing_owner
+ do plur_owner
+ do plural
+ )
+)
diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl
new file mode 100644
index 000000000..c812e055c
--- /dev/null
+++ b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl
@@ -0,0 +1,241 @@
+/*
+Hungarian Stemmer
+Removes noun inflections
+*/
+
+routines (
+ mark_regions
+ R1
+ v_ending
+ case
+ case_special
+ case_other
+ plural
+ owned
+ sing_owner
+ plur_owner
+ instrum
+ factive
+ undouble
+ double
+)
+
+externals ( stem )
+
+integers ( p1 )
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in Unicode) */
+
+stringdef a' hex 'E1' //a-acute
+stringdef e' hex 'E9' //e-acute
+stringdef i' hex 'ED' //i-acute
+stringdef o' hex 'F3' //o-acute
+stringdef o" hex 'F6' //o-umlaut
+stringdef oq hex '151' //o-double acute
+stringdef u' hex 'FA' //u-acute
+stringdef u" hex 'FC' //u-umlaut
+stringdef uq hex '171' //u-double acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
+
+define mark_regions as (
+
+ $p1 = limit
+
+ (v goto non-v
+ among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
+ setmark p1)
+ or
+
+ (non-v gopast v setmark p1)
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+
+ define v_ending as (
+ [substring] R1 among(
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define double as (
+ test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
+ 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
+ )
+
+ define undouble as (
+ next [hop 1] delete
+ )
+
+ define instrum as(
+ [substring] R1 among(
+ 'al' (double)
+ 'el' (double)
+ )
+ delete
+ undouble
+ )
+
+
+ define case as (
+ [substring] R1 among(
+ 'ban' 'ben'
+ 'ba' 'be'
+ 'ra' 're'
+ 'nak' 'nek'
+ 'val' 'vel'
+ 't{o'}l' 't{oq}l'
+ 'r{o'}l' 'r{oq}l'
+ 'b{o'}l' 'b{oq}l'
+ 'hoz' 'hez' 'h{o"}z'
+ 'n{a'}l' 'n{e'}l'
+ 'ig'
+ 'at' 'et' 'ot' '{o"}t'
+ '{e'}rt'
+ 'k{e'}pp' 'k{e'}ppen'
+ 'kor'
+ 'ul' '{u"}l'
+ 'v{a'}' 'v{e'}'
+ 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
+ 'k{e'}nt'
+ 'en' 'on' 'an' '{o"}n'
+ 'n'
+ 't'
+ )
+ delete
+ v_ending
+ )
+
+ define case_special as(
+ [substring] R1 among(
+ '{e'}n' (<- 'e')
+ '{a'}n' (<- 'a')
+ '{a'}nk{e'}nt' (<- 'a')
+ )
+ )
+
+ define case_other as(
+ [substring] R1 among(
+ 'astul' 'est{u"}l' (delete)
+ 'stul' 'st{u"}l' (delete)
+ '{a'}stul' (<- 'a')
+ '{e'}st{u"}l' (<- 'e')
+ )
+ )
+
+ define factive as(
+ [substring] R1 among(
+ '{a'}' (double)
+ '{e'}' (double)
+ )
+ delete
+ undouble
+ )
+
+ define plural as (
+ [substring] R1 among(
+ '{a'}k' (<- 'a')
+ '{e'}k' (<- 'e')
+ '{o"}k' (delete)
+ 'ak' (delete)
+ 'ok' (delete)
+ 'ek' (delete)
+ 'k' (delete)
+ )
+ )
+
+ define owned as (
+ [substring] R1 among (
+ 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
+ '{e'}k{e'}' (<- 'e')
+ '{a'}k{e'}' (<- 'a')
+ 'k{e'}' (delete)
+ '{e'}{e'}i' (<- 'e')
+ '{a'}{e'}i' (<- 'a')
+ '{e'}i' (delete)
+ '{e'}{e'}' (<- 'e')
+ '{e'}' (delete)
+ )
+ )
+
+ define sing_owner as (
+ [substring] R1 among(
+ '{u"}nk' 'unk' (delete)
+ '{a'}nk' (<- 'a')
+ '{e'}nk' (<- 'e')
+ 'nk' (delete)
+ '{a'}juk' (<- 'a')
+ '{e'}j{u"}k' (<- 'e')
+ 'juk' 'j{u"}k' (delete)
+ 'uk' '{u"}k' (delete)
+ 'em' 'om' 'am' (delete)
+ '{a'}m' (<- 'a')
+ '{e'}m' (<- 'e')
+ 'm' (delete)
+ 'od' 'ed' 'ad' '{o"}d' (delete)
+ '{a'}d' (<- 'a')
+ '{e'}d' (<- 'e')
+ 'd' (delete)
+ 'ja' 'je' (delete)
+ 'a' 'e' 'o' (delete)
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define plur_owner as (
+ [substring] R1 among(
+ 'jaim' 'jeim' (delete)
+ '{a'}im' (<- 'a')
+ '{e'}im' (<- 'e')
+ 'aim' 'eim' (delete)
+ 'im' (delete)
+ 'jaid' 'jeid' (delete)
+ '{a'}id' (<- 'a')
+ '{e'}id' (<- 'e')
+ 'aid' 'eid' (delete)
+ 'id' (delete)
+ 'jai' 'jei' (delete)
+ '{a'}i' (<- 'a')
+ '{e'}i' (<- 'e')
+ 'ai' 'ei' (delete)
+ 'i' (delete)
+ 'jaink' 'jeink' (delete)
+ 'eink' 'aink' (delete)
+ '{a'}ink' (<- 'a')
+ '{e'}ink' (<- 'e')
+ 'ink'
+ 'jaitok' 'jeitek' (delete)
+ 'aitok' 'eitek' (delete)
+ '{a'}itok' (<- 'a')
+ '{e'}itek' (<- 'e')
+ 'itek' (delete)
+ 'jeik' 'jaik' (delete)
+ 'aik' 'eik' (delete)
+ '{a'}ik' (<- 'a')
+ '{e'}ik' (<- 'e')
+ 'ik' (delete)
+ )
+ )
+)
+
+define stem as (
+ do mark_regions
+ backwards (
+ do instrum
+ do case
+ do case_special
+ do case_other
+ do factive
+ do owned
+ do sing_owner
+ do plur_owner
+ do plural
+ )
+)
diff --git a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..8d25cf64f
--- /dev/null
+++ b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
@@ -0,0 +1,195 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v AEIO CG )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a' hex 'E1'
+stringdef a` hex 'E0'
+stringdef e' hex 'E9'
+stringdef e` hex 'E8'
+stringdef i' hex 'ED'
+stringdef i` hex 'EC'
+stringdef o' hex 'F3'
+stringdef o` hex 'F2'
+stringdef u' hex 'FA'
+stringdef u` hex 'F9'
+
+define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
+
+define prelude as ( // Normalises accents and upper-cases the u/i occurrences that must not count as vowels.
+ test repeat ( // 'test' puts the cursor back at the start once the whole word has been scanned
+ [substring] among(
+ '{a'}' (<- '{a`}') // acute accent -> grave accent
+ '{e'}' (<- '{e`}')
+ '{i'}' (<- '{i`}')
+ '{o'}' (<- '{o`}')
+ '{u'}' (<- '{u`}')
+ 'qu' (<- 'qU') // u after q becomes U
+ '' (next) // nothing else matched here: step over one character
+ )
+ )
+ repeat goto (
+ v [ ('u' ] v <- 'U') or // u between two vowels -> U
+ ('i' ] v <- 'I') // i between two vowels -> I
+ )
+)
+
+define mark_regions as ( // Computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare the cursor against.
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV depends on whether each of the first two letters is a vowel
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel+consonant: just past the next vowel; vowel+vowel: just past the next consonant
+ or
+ ( non-v (non-v gopast v) or (v next) ) // consonant+consonant: just past the next vowel; consonant+vowel: after the third letter
+ setmark pV
+ )
+ do ( // if the word runs out first, the marks not yet set keep their default
+ gopast v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // Turns every upper-case I and U back into lower case.
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next) // any other character: move on by one
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // same test against mark p1
+ define R2 as $p2 <= cursor // same test against mark p2
+
+ define attached_pronoun as ( // Handles a pronoun ending that is directly preceded by ando/endo or ar/er/ir.
+ [substring] among( // no actions in this list: the longest pronoun found simply becomes the slice
+ 'ci' 'gli' 'la' 'le' 'li' 'lo'
+ 'mi' 'ne' 'si' 'ti' 'vi'
+ // the compound forms are:
+ 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
+ 'mela' 'mele' 'meli' 'melo' 'mene'
+ 'tela' 'tele' 'teli' 'telo' 'tene'
+ 'cela' 'cele' 'celi' 'celo' 'cene'
+ 'vela' 'vele' 'veli' 'velo' 'vene'
+ )
+ among( (RV) // the text before the pronoun must be one of these and must pass the RV test
+ 'ando' 'endo' (delete) // the pronoun slice is removed
+ 'ar' 'er' 'ir' (<- 'e') // the pronoun slice is replaced by 'e'
+ )
+ )
+
+ define standard_suffix as ( // Longest-match handling of the listed suffixes; every action is guarded by a region test (R1, R2 or RV).
+ [substring] among(
+
+ 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
+ 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
+ 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
+ 'atrice' 'atrici'
+ 'ante' 'anti' // Note 1
+ ( R2 delete )
+ 'azione' 'azioni' 'atore' 'atori'
+ ( R2 delete
+ try ( ['ic'] R2 delete ) // optionally drop a preceding 'ic' as well
+ )
+ 'logia' 'logie'
+ ( R2 <- 'log' )
+ 'uzione' 'uzioni' 'usione' 'usioni'
+ ( R2 <- 'u' )
+ 'enza' 'enze'
+ ( R2 <- 'ente' )
+ 'amento' 'amenti' 'imento' 'imenti'
+ ( RV delete )
+ 'amente' (
+ R1 delete
+ try ( // optional follow-up on what now ends the word: iv (plus an 'at' before it), os, ic or abil
+ [substring] R2 delete among(
+ 'iv' ( ['at'] R2 delete )
+ 'os' 'ic' 'abil'
+ )
+ )
+ )
+ 'it{a`}' (
+ R2 delete
+ try ( // optional follow-up: a now-final abil, ic or iv is deleted when it passes R2
+ [substring] among(
+ 'abil' 'ic' 'iv' (R2 delete)
+ )
+ )
+ )
+ 'ivo' 'ivi' 'iva' 'ive' (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] R2 delete ) // optionally drop a preceding 'at', and after that a preceding 'ic'
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // Deletes the longest listed ending; setlimit confines the match to the part of the word from pV onwards.
+ [substring] among(
+ 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
+ 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
+ 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
+ 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
+ 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
+ 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' // NOTE(review): 'Yamo' has an upper-case Y, which the prelude in this file never produces - confirm it is intentional
+ 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
+ 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
+ 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
+ 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
+ 'ono' 'uta' 'ute' 'uti' 'uto'
+
+ 'ar' 'ir' // but 'er' is problematical
+ (delete)
+ )
+ )
+
+ define AEIO 'aeio{a`}{e`}{i`}{o`}'
+ define CG 'cg'
+
+ define vowel_suffix as ( // Trims a final vowel, then turns a final ch/gh into c/g; both parts are optional ('try').
+ try (
+ [AEIO] RV delete // final letter from the AEIO grouping, deleted when it passes RV
+ ['i'] RV delete // then a now-final 'i', deleted on the same condition
+ )
+ try (
+ ['h'] CG RV delete // final 'h' directly preceded by a letter from the CG grouping
+ )
+ )
+)
+
+define stem as ( // Entry point. Every step runs under 'do', so a failing step never aborts the ones after it.
+ do prelude
+ do mark_regions
+ backwards ( // the suffix steps scan the word from its end
+ do attached_pronoun
+ do (standard_suffix or verb_suffix) // verb_suffix is tried only when standard_suffix fails
+ do vowel_suffix
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
+
diff --git a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..b43295c09
--- /dev/null
+++ b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,195 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v AEIO CG )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a' hex 'A0'
+stringdef a` hex '85'
+stringdef e' hex '82'
+stringdef e` hex '8A'
+stringdef i' hex 'A1'
+stringdef i` hex '8D'
+stringdef o' hex 'A2'
+stringdef o` hex '95'
+stringdef u' hex 'A3'
+stringdef u` hex '97'
+
+define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
+
+define prelude as ( // Normalises accents and upper-cases the u/i occurrences that must not count as vowels.
+ test repeat ( // 'test' puts the cursor back at the start once the whole word has been scanned
+ [substring] among(
+ '{a'}' (<- '{a`}') // acute accent -> grave accent
+ '{e'}' (<- '{e`}')
+ '{i'}' (<- '{i`}')
+ '{o'}' (<- '{o`}')
+ '{u'}' (<- '{u`}')
+ 'qu' (<- 'qU') // u after q becomes U
+ '' (next) // nothing else matched here: step over one character
+ )
+ )
+ repeat goto (
+ v [ ('u' ] v <- 'U') or // u between two vowels -> U
+ ('i' ] v <- 'I') // i between two vowels -> I
+ )
+)
+
+define mark_regions as ( // Computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare the cursor against.
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV depends on whether each of the first two letters is a vowel
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel+consonant: just past the next vowel; vowel+vowel: just past the next consonant
+ or
+ ( non-v (non-v gopast v) or (v next) ) // consonant+consonant: just past the next vowel; consonant+vowel: after the third letter
+ setmark pV
+ )
+ do ( // if the word runs out first, the marks not yet set keep their default
+ gopast v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // Turns every upper-case I and U back into lower case.
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next) // any other character: move on by one
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // same test against mark p1
+ define R2 as $p2 <= cursor // same test against mark p2
+
+ define attached_pronoun as ( // Handles a pronoun ending that is directly preceded by ando/endo or ar/er/ir.
+ [substring] among( // no actions in this list: the longest pronoun found simply becomes the slice
+ 'ci' 'gli' 'la' 'le' 'li' 'lo'
+ 'mi' 'ne' 'si' 'ti' 'vi'
+ // the compound forms are:
+ 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
+ 'mela' 'mele' 'meli' 'melo' 'mene'
+ 'tela' 'tele' 'teli' 'telo' 'tene'
+ 'cela' 'cele' 'celi' 'celo' 'cene'
+ 'vela' 'vele' 'veli' 'velo' 'vene'
+ )
+ among( (RV) // the text before the pronoun must be one of these and must pass the RV test
+ 'ando' 'endo' (delete) // the pronoun slice is removed
+ 'ar' 'er' 'ir' (<- 'e') // the pronoun slice is replaced by 'e'
+ )
+ )
+
+ define standard_suffix as ( // Longest-match handling of the listed suffixes; every action is guarded by a region test (R1, R2 or RV).
+ [substring] among(
+
+ 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
+ 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
+ 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
+ 'atrice' 'atrici'
+ 'ante' 'anti' // Note 1
+ ( R2 delete )
+ 'azione' 'azioni' 'atore' 'atori'
+ ( R2 delete
+ try ( ['ic'] R2 delete ) // optionally drop a preceding 'ic' as well
+ )
+ 'logia' 'logie'
+ ( R2 <- 'log' )
+ 'uzione' 'uzioni' 'usione' 'usioni'
+ ( R2 <- 'u' )
+ 'enza' 'enze'
+ ( R2 <- 'ente' )
+ 'amento' 'amenti' 'imento' 'imenti'
+ ( RV delete )
+ 'amente' (
+ R1 delete
+ try ( // optional follow-up on what now ends the word: iv (plus an 'at' before it), os, ic or abil
+ [substring] R2 delete among(
+ 'iv' ( ['at'] R2 delete )
+ 'os' 'ic' 'abil'
+ )
+ )
+ )
+ 'it{a`}' (
+ R2 delete
+ try ( // optional follow-up: a now-final abil, ic or iv is deleted when it passes R2
+ [substring] among(
+ 'abil' 'ic' 'iv' (R2 delete)
+ )
+ )
+ )
+ 'ivo' 'ivi' 'iva' 'ive' (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] R2 delete ) // optionally drop a preceding 'at', and after that a preceding 'ic'
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // Deletes the longest listed ending; setlimit confines the match to the part of the word from pV onwards.
+ [substring] among(
+ 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
+ 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
+ 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
+ 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
+ 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
+ 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' // NOTE(review): 'Yamo' has an upper-case Y, which the prelude in this file never produces - confirm it is intentional
+ 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
+ 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
+ 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
+ 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
+ 'ono' 'uta' 'ute' 'uti' 'uto'
+
+ 'ar' 'ir' // but 'er' is problematical
+ (delete)
+ )
+ )
+
+ define AEIO 'aeio{a`}{e`}{i`}{o`}'
+ define CG 'cg'
+
+ define vowel_suffix as ( // Trims a final vowel, then turns a final ch/gh into c/g; both parts are optional ('try').
+ try (
+ [AEIO] RV delete // final letter from the AEIO grouping, deleted when it passes RV
+ ['i'] RV delete // then a now-final 'i', deleted on the same condition
+ )
+ try (
+ ['h'] CG RV delete // final 'h' directly preceded by a letter from the CG grouping
+ )
+ )
+)
+
+define stem as ( // Entry point. Every step runs under 'do', so a failing step never aborts the ones after it.
+ do prelude
+ do mark_regions
+ backwards ( // the suffix steps scan the word from its end
+ do attached_pronoun
+ do (standard_suffix or verb_suffix) // verb_suffix is tried only when standard_suffix fails
+ do vowel_suffix
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
+
diff --git a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..cd79d1280
--- /dev/null
+++ b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl
@@ -0,0 +1,245 @@
+strings ( ch )
+integers ( x p1 p2 )
+booleans ( Y_found stemmed GE_removed )
+
+routines (
+
+ R1 R2
+ C V VX
+ lengthen_V
+ Step_1 Step_2 Step_3 Step_4 Step_7
+ Step_6 Step_1c
+ Lose_prefix
+ Lose_infix
+ measure
+)
+
+externals ( stem )
+
+groupings ( v v_WX AOU AIOU )
+
+stringescapes {}
+
+stringdef ' hex '27' // yuk
+
+define v 'aeiouy'
+define v_WX v + 'wx'
+define AOU 'aou'
+define AIOU 'aiou'
+
+backwardmode (
+
+ define R1 as (setmark x $x >= p1) // stores the cursor in x, then succeeds when it is at or beyond p1
+ define R2 as (setmark x $x >= p2) // same test against p2
+
+ define V as test (v or 'ij') // look-behind only: the text before the cursor ends in a vowel or in 'ij'
+ define VX as test (next v or 'ij') // as V, but after first skipping one character
+ define C as test (not 'ij' non-v) // look-behind only: not 'ij', and the preceding character is a non-vowel
+
+ define lengthen_V as do ( // Doubles a vowel that sits before a final consonant; wrapped in 'do', so it never fails.
+ non-v_WX [ (AOU] test (non-v or atlimit)) or // final letter outside v_WX; a/o/u before it, itself preceded by a non-vowel or the word start
+ ('e'] test (non-v or atlimit // ... or 'e' before it, under the same condition plus two exclusions:
+ not AIOU // not preceded by a/i/o/u
+ not (next AIOU non-v))) // and not by: any character, then a/i/o/u, then a non-vowel (read right to left)
+ ->ch insert ch // copy the marked vowel into ch and insert that copy
+ )
+
+ define Step_1 as // Handles the endings 's, s, ies, es, aus, en and nde.
+ (
+ [among ( (]) // the shared '(])' closes the slice over the matched ending before the entry's own action runs
+
+ '{'}s' (delete)
+ 's' (R1 not ('t' R1) C delete)
+ 'ies' (R1 <-'ie')
+ 'es'
+ (('ar' R1 C ] delete lengthen_V) or // a further ']' re-marks the slice start at the current cursor
+ ('er' R1 C ] delete) or
+ (R1 C <-'e'))
+
+ 'aus' (R1 V <-'au')
+ 'en' (('hed' R1 ] <-'heid') or // alternatives are tried in this order; the first that succeeds wins
+ ('nd' delete) or
+ ('d' R1 C ] delete) or
+ ('i' or 'j' V delete) or
+ (R1 C delete lengthen_V))
+ 'nde' (<-'nd')
+ )
+ )
+
+ define Step_2 as // Handles the listed endings in -e (je, ge, lijke, ische, de, te, se, re, le, ene, ieve).
+ (
+ [among ( (]) // the shared '(])' closes the slice over the matched ending before the entry's own action runs
+ 'je' (('{'}t' ] delete) or // alternatives are tried in this order; each ']' re-marks the slice start at the current cursor
+ ('et' ] R1 C delete) or
+ ('rnt' ] <-'rn') or
+ ('t' ] R1 VX delete) or
+ ('ink' ] <-'ing') or
+ ('mp' ] <-'m') or
+ ('{'}' ] R1 delete) or
+ (] R1 C delete))
+ 'ge' (R1 <-'g')
+ 'lijke'(R1 <-'lijk')
+ 'ische'(R1 <-'isch')
+ 'de' (R1 C delete)
+ 'te' (R1 <-'t')
+ 'se' (R1 <-'s')
+ 're' (R1 <-'r')
+ 'le' (R1 delete attach 'l' lengthen_V) // delete the ending, put the consonant back, then let lengthen_V double the vowel if it applies
+ 'ene' (R1 C delete attach 'en' lengthen_V)
+ 'ieve' (R1 C <-'ief')
+ )
+ )
+
+ define Step_3 as // Handles the listed derivational endings (atie, iteit, heid, sel, ster, ...).
+ (
+ [among ( (]) // the shared '(])' closes the slice over the matched ending before the entry's own action runs
+ 'atie' (R1 <-'eer')
+ 'iteit' (R1 delete lengthen_V)
+ 'heid' // entries without an action share the next action in the list
+ 'sel'
+ 'ster' (R1 delete)
+ 'rder' (<-'r')
+ 'ing'
+ 'isme'
+ 'erij' (R1 delete lengthen_V)
+ 'arij' (R1 C <-'aar')
+ 'fie' (R2 delete attach 'f' lengthen_V)
+ 'gie' (R2 delete attach 'g' lengthen_V)
+ 'tst' (R1 C <-'t')
+ 'dst' (R1 C <-'d')
+ )
+ )
+
+ define Step_4 as // Two suffix tables: the second (iger/igst/ig) is tried only when the first one fails.
+ (
+ ( [among ( (]) // the shared '(])' closes the slice over the matched ending before the entry's own action runs
+ 'ioneel' (R1 <-'ie')
+ 'atief' (R1 <-'eer')
+ 'baar' (R1 delete)
+ 'naar' (R1 V <-'n')
+ 'laar' (R1 V <-'l')
+ 'raar' (R1 V <-'r')
+ 'tant' (R1 <-'teer')
+ 'lijker' // entries without an action share the next action in the list
+ 'lijkst' (R1 <-'lijk')
+ 'achtig'
+ 'achtiger'
+ 'achtigst'(R1 delete)
+ 'eriger'
+ 'erigst'
+ 'erig'
+ 'end' (R1 C delete lengthen_V)
+ )
+ )
+ or
+ ( [among ( (])
+ 'iger'
+ 'igst'
+ 'ig' (R1 C delete lengthen_V)
+ )
+ )
+ )
+
+ define Step_7 as // Drops the final t of a word ending in kt, ft or pt.
+ (
+ [among ( (]) // the shared '(])' closes the slice over the matched ending
+ 'kt' (<-'k')
+ 'ft' (<-'f')
+ 'pt' (<-'p')
+ )
+ )
+
+ define Step_6 as // Reduces a final doubled consonant to a single one, and rewrites final v as f and final z as s.
+ (
+ [among ( (]) // the shared '(])' closes the slice over the matched ending
+ 'bb' (<-'b')
+ 'cc' (<-'c')
+ 'dd' (<-'d')
+ 'ff' (<-'f')
+ 'gg' (<-'g')
+ 'hh' (<-'h')
+ 'jj' (<-'j')
+ 'kk' (<-'k')
+ 'll' (<-'l')
+ 'mm' (<-'m')
+ 'nn' (<-'n')
+ 'pp' (<-'p')
+ 'qq' (<-'q')
+ 'rr' (<-'r')
+ 'ss' (<-'s')
+ 'tt' (<-'t')
+ 'vv' (<-'v')
+ 'ww' (<-'w')
+ 'xx' (<-'x')
+ 'zz' (<-'z')
+ 'v' (<-'f') // single final v -> f (the longer 'vv' entry takes precedence when it matches)
+ 'z' (<-'s') // single final z -> s
+ )
+ )
+
+ define Step_1c as // Deletes a final d or t, subject to the R1 and C tests shared by both entries.
+ (
+ [among ( (] R1 C) // shared prefix: close the slice, then require R1 and C
+ 'd' (not ('n' R1) delete) // kept when preceded by an 'n' that itself passes R1
+ 't' (not ('h' R1) delete) // kept when preceded by an 'h' that itself passes R1
+ )
+ )
+)
+
+define Lose_prefix as ( // Removes a leading 'ge' and records that in GE_removed.
+ ['ge'] test hop 3 (goto v goto non-v) // needs at least 3 more characters, and after the 'ge' a vowel with a non-vowel somewhere later
+ set GE_removed
+ delete
+)
+
+define Lose_infix as ( // Removes the first 'ge' found after the first character and records that in GE_removed.
+ next // skip the first character, so a leading 'ge' is not considered here
+ gopast (['ge']) test hop 3 (goto v goto non-v) // same follow-up conditions as Lose_prefix
+ set GE_removed
+ delete
+)
+
+define measure as ( // (Re)computes the marks p1 and p2 used by R1 and R2; both parts run under 'do', so it always succeeds.
+ do ( // defaults: both marks at the end of the word
+ tolimit
+ setmark p1
+ setmark p2
+ )
+ do(
+ repeat non-v atleast 1 ('ij' or v) non-v setmark p1 // p1: after leading non-vowels, one or more vowels/'ij', and one non-vowel
+ repeat non-v atleast 1 ('ij' or v) non-v setmark p2 // p2: the same pattern matched again from p1
+ )
+
+)
+define stem as ( // Entry point: mark y, measure, run the suffix steps, try removing 'ge', then clean up.
+
+ unset Y_found
+ unset stemmed
+ do ( ['y'] <-'Y' set Y_found ) // a leading y becomes Y
+ do repeat(goto (v ['y'])<-'Y' set Y_found ) // every y directly after a vowel becomes Y
+
+ measure
+
+ backwards (
+ do (Step_1 set stemmed ) // 'stemmed' records that at least one step succeeded
+ do (Step_2 set stemmed )
+ do (Step_3 set stemmed )
+ do (Step_4 set stemmed )
+ )
+ unset GE_removed
+ do (Lose_prefix and measure) // marks are recomputed after a successful removal
+ backwards (
+ do (GE_removed Step_1c) // Step_1c runs only when 'ge' was actually removed
+ )
+ unset GE_removed
+ do (Lose_infix and measure)
+ backwards (
+ do (GE_removed Step_1c)
+ )
+ backwards (
+ do (Step_7 set stemmed )
+ do (stemmed or GE_removed Step_6) // Step_6 runs only if something was changed earlier
+ )
+ do(Y_found repeat(goto (['Y']) <-'y')) // turn every Y back into y, if any was created
+)
+
diff --git a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..3f69f1572
--- /dev/null
+++ b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl
@@ -0,0 +1,208 @@
+
+stringescapes {}
+
+routines (
+ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
+
+ endings
+
+ undouble respell
+)
+
+externals ( stem )
+
+backwardmode (
+
+ /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
+ a test for a two letter prefix ('test hop 2') is implicitly
+ assumed. Note that 'e' next 'u' corresponds to her u*e because
+ Snowball is scanning backwards. */
+
+ define A as ( hop 2 ) // stem of at least 2 letters
+ define B as ( hop 3 ) // stem of at least 3 letters
+ define C as ( hop 4 ) // stem of at least 4 letters
+ define D as ( hop 5 ) // stem of at least 5 letters
+ define E as ( test hop 2 not 'e' ) // stem does not end in e
+ define F as ( test hop 3 not 'e' ) // at least 3 letters, not ending in e
+ define G as ( test hop 3 'f' ) // at least 3 letters, ending in f
+ define H as ( test hop 2 't' or 'll' ) // ends in t or ll
+ define I as ( test hop 2 not 'o' not 'e' ) // does not end in o or e
+ define J as ( test hop 2 not 'a' not 'e' ) // does not end in a or e
+ define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) // at least 3 letters, ending in l, i, or u-any-e
+ define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) // not ending in u or x, nor in s unless that s follows o
+ define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) // does not end in a, c, e or m
+ define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) // at least 3 letters; at least 4 when the third letter from the end is s
+ define O as ( test hop 2 'l' or 'i' ) // ends in l or i
+ define P as ( test hop 2 not 'c' ) // does not end in c
+ define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) // at least 3 letters, not ending in l or n
+ define R as ( test hop 2 'n' or 'r' ) // ends in n or r
+ define S as ( test hop 2 'dr' or ('t' not 't') ) // ends in dr, or in t not preceded by t
+ define T as ( test hop 2 's' or ('t' not 'o') ) // ends in s, or in t not preceded by o
+ define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) // ends in l, m, n or r
+ define V as ( test hop 2 'c' ) // ends in c
+ define W as ( test hop 2 not 's' not 'u' ) // does not end in s or u
+ define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) // ends in l, i, or u-any-e
+ define Y as ( test hop 2 'in' ) // ends in in
+ define Z as ( test hop 2 not 'f' ) // does not end in f
+ define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' // ends in one of the listed strings
+ 'es' 't' ) )
+ define BB as ( test hop 3 not 'met' not 'ryst' ) // at least 3 letters, not ending in met or ryst
+ define CC as ( test hop 2 'l' ) // ends in l
+
+
+ /* The system of endings, as given in Appendix A. */
+
+ define endings as ( // Removes the longest listed ending whose condition routine (named right after it) succeeds.
+ [substring] among( // among entries of the form 'ending' Routine: the routine must succeed for that entry to match
+ 'alistically' B 'arizability' A 'izationally' B
+
+ 'antialness' A 'arisations' A 'arizations' A 'entialness' A
+
+ 'allically' C 'antaneous' A 'antiality' A 'arisation' A
+ 'arization' A 'ationally' B 'ativeness' A 'eableness' E
+ 'entations' A 'entiality' A 'entialize' A 'entiation' A
+ 'ionalness' A 'istically' A 'itousness' A 'izability' A
+ 'izational' A
+
+ 'ableness' A 'arizable' A 'entation' A 'entially' A
+ 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A
+ 'ionality' A 'ionalize' A 'iousness' A 'izations' A
+ 'lessness' A
+
+ 'ability' A 'aically' A 'alistic' B 'alities' A
+ 'ariness' E 'aristic' A 'arizing' A 'ateness' A
+ 'atingly' A 'ational' B 'atively' A 'ativism' A
+ 'elihood' E 'encible' A 'entally' A 'entials' A
+ 'entiate' A 'entness' A 'fulness' A 'ibility' A
+ 'icalism' A 'icalist' A 'icality' A 'icalize' A
+ 'ication' G 'icianry' A 'ination' A 'ingness' A
+ 'ionally' A 'isation' A 'ishness' A 'istical' A
+ 'iteness' A 'iveness' A 'ivistic' A 'ivities' A
+ 'ization' F 'izement' A 'oidally' A 'ousness' A
+
+ 'aceous' A 'acious' B 'action' G 'alness' A
+ 'ancial' A 'ancies' A 'ancing' B 'ariser' A
+ 'arized' A 'arizer' A 'atable' A 'ations' B
+ 'atives' A 'eature' Z 'efully' A 'encies' A
+ 'encing' A 'ential' A 'enting' C 'entist' A
+ 'eously' A 'ialist' A 'iality' A 'ialize' A
+ 'ically' A 'icance' A 'icians' A 'icists' A
+ 'ifully' A 'ionals' A 'ionate' D 'ioning' A
+ 'ionist' A 'iously' A 'istics' A 'izable' E
+ 'lessly' A 'nesses' A 'oidism' A
+
+ 'acies' A 'acity' A 'aging' B 'aical' A
+ 'alist' A 'alism' B 'ality' A 'alize' A
+ 'allic'BB 'anced' B 'ances' B 'antic' C
+ 'arial' A 'aries' A 'arily' A 'arity' B
+ 'arize' A 'aroid' A 'ately' A 'ating' I
+ 'ation' B 'ative' A 'ators' A 'atory' A
+ 'ature' E 'early' Y 'ehood' A 'eless' A
+ 'elity' A 'ement' A 'enced' A 'ences' A
+ 'eness' E 'ening' E 'ental' A 'ented' C
+ 'ently' A 'fully' A 'ially' A 'icant' A
+ 'ician' A 'icide' A 'icism' A 'icist' A
+ 'icity' A 'idine' I 'iedly' A 'ihood' A
+ 'inate' A 'iness' A 'ingly' B 'inism' J
+ 'inity'CC 'ional' A 'ioned' A 'ished' A
+ 'istic' A 'ities' A 'itous' A 'ively' A
+ 'ivity' A 'izers' F 'izing' F 'oidal' A
+ 'oides' A 'otide' A 'ously' A
+
+ 'able' A 'ably' A 'ages' B 'ally' B
+ 'ance' B 'ancy' B 'ants' B 'aric' A
+ 'arly' K 'ated' I 'ates' A 'atic' B
+ 'ator' A 'ealy' Y 'edly' E 'eful' A
+ 'eity' A 'ence' A 'ency' A 'ened' E
+ 'enly' E 'eous' A 'hood' A 'ials' A
+ 'ians' A 'ible' A 'ibly' A 'ical' A
+ 'ides' L 'iers' A 'iful' A 'ines' M
+ 'ings' N 'ions' B 'ious' A 'isms' B
+ 'ists' A 'itic' H 'ized' F 'izer' F
+ 'less' A 'lily' A 'ness' A 'ogen' A
+ 'ward' A 'wise' A 'ying' B 'yish' A
+
+ 'acy' A 'age' B 'aic' A 'als'BB
+ 'ant' B 'ars' O 'ary' F 'ata' A
+ 'ate' A 'eal' Y 'ear' Y 'ely' E
+ 'ene' E 'ent' C 'ery' E 'ese' A
+ 'ful' A 'ial' A 'ian' A 'ics' A
+ 'ide' L 'ied' A 'ier' A 'ies' P
+ 'ily' A 'ine' M 'ing' N 'ion' Q
+ 'ish' C 'ism' B 'ist' A 'ite'AA
+ 'ity' A 'ium' A 'ive' A 'ize' F
+ 'oid' A 'one' R 'ous' A
+
+ 'ae' A 'al'BB 'ar' X 'as' B
+ 'ed' E 'en' F 'es' E 'ia' A
+ 'ic' A 'is' A 'ly' B 'on' S
+ 'or' T 'um' U 'us' V 'yl' R
+ '{'}s' A 's{'}' A
+
+ 'a' A 'e' A 'i' A 'o' A
+ 's' W 'y' B
+
+ (delete) // the single action shared by every entry above
+ )
+ )
+
+ /* Undoubling is rule 1 of appendix C. */
+
+ define undouble as ( // If the word ends in one of the listed double consonants, removes its last letter.
+ test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' // 'test': only checks, the cursor stays at the end
+ 'tt')
+ [next] delete // slice exactly one character (the final one) and delete it
+ )
+
+ /* The other appendix C rules can be done together. */
+
+ define respell as ( // Rewrites the longest listed word ending; entries with 'not' tests apply only when those tests pass.
+ [substring] among (
+ 'iev' (<-'ief')
+ 'uct' (<-'uc')
+ 'umpt' (<-'um')
+ 'rpt' (<-'rb')
+ 'urs' (<-'ur')
+ 'istr' (<-'ister')
+ 'metr' (<-'meter')
+ 'olv' (<-'olut')
+ 'ul' (not 'a' not 'i' not 'o' <-'l') // unless preceded by a, i or o
+ 'bex' (<-'bic')
+ 'dex' (<-'dic')
+ 'pex' (<-'pic')
+ 'tex' (<-'tic')
+ 'ax' (<-'ac')
+ 'ex' (<-'ec')
+ 'ix' (<-'ic')
+ 'lux' (<-'luc')
+ 'uad' (<-'uas')
+ 'vad' (<-'vas')
+ 'cid' (<-'cis')
+ 'lid' (<-'lis')
+ 'erid' (<-'eris')
+ 'pand' (<-'pans')
+ 'end' (not 's' <-'ens') // unless preceded by s
+ 'ond' (<-'ons')
+ 'lud' (<-'lus')
+ 'rud' (<-'rus')
+ 'her' (not 'p' not 't' <-'hes') // unless preceded by p or t
+ 'mit' (<-'mis')
+ 'ent' (not 'm' <-'ens') // unless preceded by m
+ /* 'ent' was 'end' in the 1968 paper - a typo. */
+ 'ert' (<-'ers')
+ 'et' (not 'n' <-'es') // unless preceded by n
+ 'yt' (<-'ys')
+ 'yz' (<-'ys')
+ )
+ )
+)
+
+define stem as ( // Entry point: the three steps run from the end of the word, each under 'do' so a failure does not stop the rest.
+
+ backwards (
+ do endings
+ do undouble
+ do respell
+ )
+)
+
diff --git a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..94a071653
--- /dev/null
+++ b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl
@@ -0,0 +1,80 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef ae hex 'E6'
+stringdef ao hex 'E5'
+stringdef o/ hex 'F8'
+
+define v 'aeiouy{ae}{ao}{o/}'
+
+define s_ending 'bcdfghjlmnoprtvyz'
+
+define mark_regions as ( // Computes p1, forcing it to lie at least three characters into the word.
+
+ $p1 = limit
+
+ test ( hop 3 setmark x ) // x = position after three characters; the routine fails here if the word is shorter
+ goto v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ try ( $p1 < x $p1 = x ) // never let p1 fall before x
+)
+
+backwardmode (
+
+ define main_suffix as ( // Longest-match handling of the listed endings, searched only in the part of the word from p1 onwards.
+ setlimit tomark p1 for ([substring])
+ among(
+
+ 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
+ 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
+ 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
+ (delete)
+ 's'
+ (s_ending or ('k' non-v) delete) // only after a letter from s_ending, or after k preceded by a non-vowel
+ 'erte' 'ert'
+ (<-'er')
+ )
+ )
+
+ define consonant_pair as ( // If the word ends in dt or vt (found from p1 onwards), removes its last letter.
+ test ( // 'test': only checks, the cursor returns to the end of the word
+ setlimit tomark p1 for ([substring])
+ among(
+ 'dt' 'vt'
+ )
+ )
+ next] delete // step back over one character, close the slice there and delete it
+ )
+
+ define other_suffix as ( // Deletes the longest listed ending, searched only in the part of the word from p1 onwards.
+ setlimit tomark p1 for ([substring])
+ among(
+ 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
+ 'hetslov'
+ (delete)
+ )
+ )
+)
+
+define stem as ( // Entry point. Every step runs under 'do', so a failing step never aborts the ones after it.
+
+ do mark_regions
+ backwards ( // the suffix steps scan the word from its end
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..f57483354
--- /dev/null
+++ b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,80 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef ae hex '91'
+stringdef ao hex '86'
+stringdef o/ hex '9B'
+
+define v 'aeiouy{ae}{ao}{o/}'
+
+define s_ending 'bcdfghjlmnoprtvyz'
+
+define mark_regions as ( // Computes p1, forcing it to lie at least three characters into the word.
+
+ $p1 = limit
+
+ test ( hop 3 setmark x ) // x = position after three characters; the routine fails here if the word is shorter
+ goto v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ try ( $p1 < x $p1 = x ) // never let p1 fall before x
+)
+
+backwardmode (
+
+ define main_suffix as ( // Longest-match handling of the listed endings, searched only in the part of the word from p1 onwards.
+ setlimit tomark p1 for ([substring])
+ among(
+
+ 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
+ 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
+ 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
+ (delete)
+ 's'
+ (s_ending or ('k' non-v) delete) // only after a letter from s_ending, or after k preceded by a non-vowel
+ 'erte' 'ert'
+ (<-'er')
+ )
+ )
+
+ define consonant_pair as ( // If the word ends in dt or vt (found from p1 onwards), removes its last letter.
+ test ( // 'test': only checks, the cursor returns to the end of the word
+ setlimit tomark p1 for ([substring])
+ among(
+ 'dt' 'vt'
+ )
+ )
+ next] delete // step back over one character, close the slice there and delete it
+ )
+
+ define other_suffix as ( // Deletes the longest listed ending, searched only in the part of the word from p1 onwards.
+ setlimit tomark p1 for ([substring])
+ among(
+ 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
+ 'hetslov'
+ (delete)
+ )
+ )
+)
+
+define stem as ( // Entry point. Every step runs under 'do', so a failing step never aborts the ones after it.
+
+ do mark_regions
+ backwards ( // the suffix steps scan the word from its end
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..9533b7932
--- /dev/null
+++ b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl
@@ -0,0 +1,139 @@
+integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+ shortv
+ R1 R2
+ Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
+)
+
+externals ( stem )
+
+groupings ( v v_WXY )
+
+define v 'aeiouy'
+define v_WXY v + 'wxY'
+
+backwardmode (
+
+ define shortv as ( non-v_WXY v non-v ) // read right to left: a letter outside v_WXY, before it a vowel, before that a non-vowel
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
+ define R2 as $p2 <= cursor // same test against mark p2
+
+ define Step_1a as ( // Handles the endings sses, ies, ss and s (longest match wins).
+ [substring] among (
+ 'sses' (<-'ss')
+ 'ies' (<-'i')
+ 'ss' () // matched but left unchanged, which keeps the 's' rule below from firing on 'ss'
+ 's' (delete)
+ )
+ )
+
+ define Step_1b as ( // Handles the endings eed, ed and ing, with a follow-up repair after ed/ing is removed.
+ [substring] among (
+ 'eed' (R1 <-'ee')
+ 'ed'
+ 'ing' (
+ test gopast v delete // removed only if a vowel occurs somewhere before the ending
+ test substring among( // then inspect what now ends the word
+ 'at' 'bl' 'iz'
+ (<+ 'e') // append an e
+ 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+ // ignoring double c, h, j, k, q, v, w, and x
+ ([next] delete) // drop the last letter of the double consonant
+ '' (atmark p1 test shortv <+ 'e') // otherwise: append an e when the cursor is exactly at p1 and shortv matches
+ )
+ )
+ )
+ )
+
+ define Step_1c as ( // Replaces a final y or Y by i when a vowel occurs somewhere before it.
+ ['y' or 'Y']
+ gopast v
+ <-'i'
+ )
+
+ define Step_2 as ( // Rewrites the longest listed suffix; the match must pass the R1 test.
+ [substring] R1 among (
+ 'tional' (<-'tion')
+ 'enci' (<-'ence')
+ 'anci' (<-'ance')
+ 'abli' (<-'able')
+ 'entli' (<-'ent')
+ 'eli' (<-'e')
+ 'izer' 'ization'
+ (<-'ize')
+ 'ational' 'ation' 'ator'
+ (<-'ate')
+ 'alli' (<-'al')
+ 'alism' 'aliti'
+ (<-'al')
+ 'fulness' (<-'ful')
+ 'ousli' 'ousness'
+ (<-'ous')
+ 'iveness' 'iviti'
+ (<-'ive')
+ 'biliti' (<-'ble')
+ )
+ )
+
+ define Step_3 as ( // Rewrites or deletes the longest listed suffix; the match must pass the R1 test.
+ [substring] R1 among (
+ 'alize' (<-'al')
+ 'icate' 'iciti' 'ical'
+ (<-'ic')
+ 'ative' 'ful' 'ness'
+ (delete)
+ )
+ )
+
+ define Step_4 as ( // Deletes the longest listed suffix; the match must pass the R2 test.
+ [substring] R2 among (
+ 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
+ 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
+ (delete)
+ 'ion' ('s' or 't' delete) // only when directly preceded by s or t
+ )
+ )
+
+ define Step_5a as ( // Deletes a final e that passes R2, or that passes R1 and is not preceded by a shortv match.
+ ['e']
+ R2 or (R1 not shortv)
+ delete
+ )
+
+ define Step_5b as ( // Deletes a final l that passes R2 and is directly preceded by another l.
+ ['l']
+ R2 'l'
+ delete
+ )
+)
+
+define stem as ( // Entry point: mark consonantal y, compute p1/p2, run the steps from the word end, restore y.
+
+ unset Y_found
+ do ( ['y'] <-'Y' set Y_found) // a leading y becomes Y
+ do repeat(goto (v ['y']) <-'Y' set Y_found) // every y directly after a vowel becomes Y
+
+ $p1 = limit
+ $p2 = limit
+ do( // if the word runs out first, the marks not yet set stay at the end of the word
+ gopast v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+
+ backwards ( // every step runs under 'do', so a failing step never aborts the ones after it
+ do Step_1a
+ do Step_1b
+ do Step_1c
+ do Step_2
+ do Step_3
+ do Step_4
+ do Step_5a
+ do Step_5b
+ )
+
+ do(Y_found repeat(goto (['Y']) <-'y')) // turn every Y back into y, if any was created
+
+)
diff --git a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..3e7da08d4
--- /dev/null
+++ b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl
@@ -0,0 +1,218 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ verb_suffix
+ residual_suffix
+ residual_form
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a' hex 'E1' // a-acute
+stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico'
+stringdef e' hex 'E9' // e-acute
+stringdef e^ hex 'EA' // e-circumflex
+stringdef i' hex 'ED' // i-acute
+stringdef o^ hex 'F4' // o-circumflex
+stringdef o' hex 'F3' // o-acute
+stringdef u' hex 'FA' // u-acute
+stringdef c, hex 'E7' // c-cedilla
+
+stringdef a~ hex 'E3' // a-tilde
+stringdef o~ hex 'F5' // o-tilde
+
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
+
+define prelude as repeat ( // Rewrites each a-tilde / o-tilde character as the two-character sequence a~ / o~.
+ [substring] among(
+ '{a~}' (<- 'a~')
+ '{o~}' (<- 'o~')
+ '' (next) // any other character: move on by one
+ ) //or next
+)
+
+define mark_regions as ( // Computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare the cursor against.
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV depends on whether each of the first two letters is a vowel
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel+consonant: just past the next vowel; vowel+vowel: just past the next consonant
+ or
+ ( non-v (non-v gopast v) or (v next) ) // consonant+consonant: just past the next vowel; consonant+vowel: after the third letter
+ setmark pV
+ )
+ do ( // if the word runs out first, the marks not yet set keep their default
+ gopast v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // Rewrites each two-character sequence a~ / o~ as the single a-tilde / o-tilde character.
+ [substring] among(
+ 'a~' (<- '{a~}')
+ 'o~' (<- '{o~}')
+ '' (next) // any other character: move on by one
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // same test against mark p1
+ define R2 as $p2 <= cursor // same test against mark p2
+
+ define standard_suffix as ( // Longest-match handling of the listed suffixes; every action is guarded by a region test (R1, R2 or RV).
+ [substring] among(
+
+ 'eza' 'ezas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ '{a'}vel'
+ '{i'}vel'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amento' 'amentos'
+ 'imento' 'imentos'
+
+ 'adora' 'ador' 'a{c,}a~o'
+ 'adoras' 'adores' 'a{c,}o~es' // no -ic test
+ 'ante' 'antes' '{a^}ncia' // Note 1
+ (
+ R2 delete
+ )
+ 'logia' // Portuguese spelling (no accent); the accented 'log{i'}a' listed before is the Spanish form
+ 'logias'
+ (
+ R2 <- 'log'
+ )
+ 'u{c,}a~o' 'u{c,}o~es' // Portuguese forms, written with a~ / o~ like 'a{c,}a~o' above; replaces Spanish 'uci{o'}n' 'uciones'
+ (
+ R2 <- 'u'
+ )
+ '{e^}ncia' '{e^}ncias'
+ (
+ R2 <- 'ente'
+ )
+ 'amente'
+ (
+ R1 delete
+ try ( // optional follow-up on what now ends the word: iv (plus an 'at' before it), os, ic or ad
+ [substring] R2 delete among(
+ 'iv' (['at'] R2 delete)
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete
+ try ( // optional follow-up: a now-final listed ending is deleted when it passes R2
+ [substring] among(
+ 'ante' // Note 1
+ 'avel'
+ '{i'}vel' (R2 delete)
+ )
+ )
+ )
+ 'idade'
+ 'idades'
+ (
+ R2 delete
+ try ( // optional follow-up: a now-final abil, ic or iv is deleted when it passes R2
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ 'ira' 'iras'
+ (
+ RV 'e' // -eira -eiras usually non-verbal
+ <- 'ir'
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // Deletes the longest listed ending; setlimit confines the match to the part of the word from pV onwards.
+ [substring] among(
+ 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
+ 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
+ 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
+ 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
+ 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
+ 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
+ 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
+ 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
+ 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
+ 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
+ '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
+ '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
+ '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
+ 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
+ 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
+ '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
+
+ 'ira' 'iras'
+ (delete) // the single action shared by every ending above
+ )
+ )
+
+ define residual_suffix as ( // Deletes a final os, a, i, o, or acute-accented a/i/o, when it passes the RV test.
+ [substring] among(
+ 'os'
+ 'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete )
+ )
+ )
+
+ define residual_form as ( // Final clean-up of a trailing e (plain, acute or circumflex) or c-cedilla.
+ [substring] among(
+ 'e' '{e'}' '{e^}'
+ ( RV delete [('u'] test 'g') or // after deleting the e: a now-final u preceded by g ...
+ ('i'] test 'c') RV delete ) // ... or i preceded by c is deleted as well, when it passes RV
+ '{c,}' (<-'c') // c-cedilla becomes plain c
+ )
+ )
+)
+
+define stem as ( // Entry point. Every step runs under 'do', so a failing step never aborts the ones after it.
+ do prelude
+ do mark_regions
+ backwards ( // the suffix steps scan the word from its end
+ do (
+ ( ( standard_suffix or verb_suffix ) // verb_suffix is tried only when standard_suffix fails
+ and do ( ['i'] test 'c' RV delete ) // after either succeeds: a now-final i preceded by c is deleted when it passes RV
+ )
+ or residual_suffix // tried only when neither suffix routine succeeded
+ )
+ do residual_form
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..4d6c85214
--- /dev/null
+++ b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,218 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ verb_suffix
+ residual_suffix
+ residual_form
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a' hex 'A0' // a-acute
+stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico'
+stringdef e' hex '82' // e-acute
+stringdef e^ hex '88' // e-circumflex
+stringdef i' hex 'A1' // i-acute
+stringdef o^ hex '93' // o-circumflex
+stringdef o' hex 'A2' // o-acute
+stringdef u' hex 'A3' // u-acute
+stringdef c, hex '87' // c-cedilla
+
+stringdef a~ hex 'C6' // a-tilde
+stringdef o~ hex 'E4' // o-tilde
+
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
+
+define prelude as repeat ( // Rewrites each a-tilde / o-tilde character as the two-character sequence a~ / o~.
+ [substring] among(
+ '{a~}' (<- 'a~')
+ '{o~}' (<- 'o~')
+ '' (next) // any other character: move on by one
+ ) //or next
+)
+
+define mark_regions as ( // Computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare the cursor against.
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV depends on whether each of the first two letters is a vowel
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel+consonant: just past the next vowel; vowel+vowel: just past the next consonant
+ or
+ ( non-v (non-v gopast v) or (v next) ) // consonant+consonant: just past the next vowel; consonant+vowel: after the third letter
+ setmark pV
+ )
+ do ( // if the word runs out first, the marks not yet set keep their default
+ gopast v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // Rewrites each two-character sequence a~ / o~ as the single a-tilde / o-tilde character.
+ [substring] among(
+ 'a~' (<- '{a~}')
+ 'o~' (<- '{o~}')
+ '' (next) // any other character: move on by one
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // same test against mark p1
+ define R2 as $p2 <= cursor // same test against mark p2
+
+ define standard_suffix as ( // Longest-match handling of the listed suffixes; every action is guarded by a region test (R1, R2 or RV).
+ [substring] among(
+
+ 'eza' 'ezas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ '{a'}vel'
+ '{i'}vel'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amento' 'amentos'
+ 'imento' 'imentos'
+
+ 'adora' 'ador' 'a{c,}a~o'
+ 'adoras' 'adores' 'a{c,}o~es' // no -ic test
+ 'ante' 'antes' '{a^}ncia' // Note 1
+ (
+ R2 delete
+ )
+ 'logia' // Portuguese spelling (no accent); the accented 'log{i'}a' listed before is the Spanish form
+ 'logias'
+ (
+ R2 <- 'log'
+ )
+ 'u{c,}a~o' 'u{c,}o~es' // Portuguese forms, written with a~ / o~ like 'a{c,}a~o' above; replaces Spanish 'uci{o'}n' 'uciones'
+ (
+ R2 <- 'u'
+ )
+ '{e^}ncia' '{e^}ncias'
+ (
+ R2 <- 'ente'
+ )
+ 'amente'
+ (
+ R1 delete
+ try ( // optional follow-up on what now ends the word: iv (plus an 'at' before it), os, ic or ad
+ [substring] R2 delete among(
+ 'iv' (['at'] R2 delete)
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete
+ try ( // optional follow-up: a now-final listed ending is deleted when it passes R2
+ [substring] among(
+ 'ante' // Note 1
+ 'avel'
+ '{i'}vel' (R2 delete)
+ )
+ )
+ )
+ 'idade'
+ 'idades'
+ (
+ R2 delete
+ try ( // optional follow-up: a now-final abil, ic or iv is deleted when it passes R2
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ 'ira' 'iras'
+ (
+ RV 'e' // -eira -eiras usually non-verbal
+ <- 'ir'
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // Deletes the longest listed ending; setlimit confines the match to the part of the word from pV onwards.
+ [substring] among(
+ 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
+ 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
+ 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
+ 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
+ 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
+ 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
+ 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
+ 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
+ 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
+ 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
+ '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
+ '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
+ '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
+ 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
+ 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
+ '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
+
+ 'ira' 'iras'
+ (delete) // the single action shared by every ending above
+ )
+ )
+
+ define residual_suffix as ( // Deletes a final os, a, i, o, or acute-accented a/i/o, when it passes the RV test.
+ [substring] among(
+ 'os'
+ 'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete )
+ )
+ )
+
+ define residual_form as ( // Final clean-up of a trailing e (plain, acute or circumflex) or c-cedilla.
+ [substring] among(
+ 'e' '{e'}' '{e^}'
+ ( RV delete [('u'] test 'g') or // after deleting the e: a now-final u preceded by g ...
+ ('i'] test 'c') RV delete ) // ... or i preceded by c is deleted as well, when it passes RV
+ '{c,}' (<-'c') // c-cedilla becomes plain c
+ )
+ )
+)
+
+define stem as ( // Entry point. Every step runs under 'do', so a failing step never aborts the ones after it.
+ do prelude
+ do mark_regions
+ backwards ( // the suffix steps scan the word from its end
+ do (
+ ( ( standard_suffix or verb_suffix ) // verb_suffix is tried only when standard_suffix fails
+ and do ( ['i'] test 'c' RV delete ) // after either succeeds: a now-final i preceded by c is deleted when it passes RV
+ )
+ or residual_suffix // tried only when neither suffix routine succeeded
+ )
+ do residual_form
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl
new file mode 100644
index 000000000..48a148321
--- /dev/null
+++ b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl
@@ -0,0 +1,236 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ step_0
+ standard_suffix combo_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+booleans ( standard_suffix_removed )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a^ hex 'E2' // a circumflex
+stringdef i^ hex 'EE' // i circumflex
+stringdef a+ hex 'E3' // a breve
+stringdef s, hex 'BA' // s cedilla
+stringdef t, hex 'FE' // t cedilla
+
+define v 'aeiou{a^}{i^}{a+}'
+
+define prelude as ( // upper-case every u or i that stands between two vowels
+ repeat goto (
+ v [ ('u' ] v <- 'U') or
+ ('i' ] v <- 'I')
+ )
+)
+
+define mark_regions as ( // compute the marks pV, p1 and p2 read by the RV, R1 and R2 tests
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV: chosen from the vowel / non-vowel pattern of the first two letters
+ ( v (non-v gopast v) or (v gopast non-v) )
+ or
+ ( non-v (non-v gopast v) or (v next) )
+ setmark pV
+ )
+ do ( // p1: past the next vowel and then the next non-vowel; p2: the same search continued from p1
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+define postlude as repeat ( // turn every I and U back to lower case, stepping over all other characters
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next)
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
+
+ define step_0 as ( // remove or shorten the longest listed ending
+ [substring] R1 among( // the R1 test must succeed after the ending has been matched
+ 'ul' 'ului'
+ ( delete )
+ 'aua'
+ ( <-'a' )
+ 'ea' 'ele' 'elor'
+ ( <-'e' )
+ 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
+ ( <-'i')
+ 'ile'
+ ( not 'ab' <- 'i' )
+ 'atei'
+ ( <- 'at' )
+ 'a{t,}ie' 'a{t,}ia'
+ ( <- 'a{t,}i' )
+ )
+ )
+
+ define combo_suffix as test ( // replace a compound ending by a shorter one; test restores the cursor afterwards
+ [substring] R1 ( // the R1 test must succeed after the ending has been matched
+ among(
+ /* 'IST'. alternative: include the following
+ 'alism' 'alisme'
+ 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
+ <- 'al'
+ )
+ */
+ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
+ <- 'abil'
+ )
+ 'ibilitate' (
+ <- 'ibil'
+ )
+ 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
+ <- 'iv'
+ )
+ 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
+ 'icator' 'icatori'
+ 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
+ 'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
+ <- 'ic'
+ )
+ 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
+ 'atoare' 'ator' 'atori'
+ '{a+}toare' '{a+}tor' '{a+}tori' (
+ <- 'at'
+ )
+ 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
+ 'itoare' 'itor' 'itori' (
+ <- 'it'
+ )
+ )
+ set standard_suffix_removed // record that a replacement was made
+ )
+ )
+
+ define standard_suffix as ( // apply combo_suffix repeatedly, then handle one listed ending under the R2 test
+ unset standard_suffix_removed // the flag starts cleared on every call
+ repeat combo_suffix // keep going until combo_suffix fails
+ [substring] R2 ( // the R2 test must succeed after the ending has been matched
+ among(
+
+ // past participle is treated here, rather than
+ // as a verb ending:
+ 'at' 'ata' 'at{a+}' 'ati' 'ate'
+ 'ut' 'uta' 'ut{a+}' 'uti' 'ute'
+ 'it' 'ita' 'it{a+}' 'iti' 'ite'
+
+ 'ic' 'ica' 'ice' 'ici' 'ic{a+}'
+ 'abil' 'abila' 'abile' 'abili' 'abil{a+}'
+ 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
+ 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
+ 'ant' 'anta' 'ante' 'anti' 'ant{a+}'
+ 'ator' 'atori'
+ 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
+ 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
+ delete
+ )
+ 'iune' 'iuni' (
+ '{t,}'] <- 't'
+ )
+ 'ism' 'isme'
+ 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
+ <- 'ist'
+ /* 'IST'. alternative: remove with <- '' */
+ )
+ )
+ set standard_suffix_removed // read by stem to decide whether verb_suffix is tried
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // the limit is set to mark pV while this routine runs
+ [substring] among(
+ // 'long' infinitive:
+ 'are' 'ere' 'ire' '{a^}re'
+
+ // gerund:
+ 'ind' '{a^}nd'
+ 'indu' '{a^}ndu'
+
+ 'eze'
+ 'easc{a+}'
+ // present:
+ 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
+ 'e{s,}te'
+ '{a+}sc' '{a+}{s,}ti'
+ '{a+}{s,}te'
+
+ // imperfect:
+ 'am' 'ai' 'au'
+ 'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
+ 'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
+
+ // past: // (not 'ii')
+ 'ui'
+ 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
+ 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
+ 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
+ '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
+
+ // pluperfect:
+ 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
+ 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
+ '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
+ '{a^}ser{a+}'
+ 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
+
+ ( non-v or 'u' delete ) // endings above are removed only after a non-vowel or u
+
+ // present:
+ '{a+}m' 'a{t,}i'
+ 'em' 'e{t,}i'
+ 'im' 'i{t,}i'
+ '{a^}m' '{a^}{t,}i'
+
+ // past:
+ 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
+ 'sei' 'se'
+
+ // pluperfect:
+ 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
+ (delete) // endings of this second group are removed unconditionally
+ )
+ )
+
+ define vowel_suffix as ( // remove one of the listed final vowel endings
+ [substring] RV among ( // the RV test must succeed after the ending has been matched
+ 'a' 'e' 'i' 'ie' '{a+}' ( delete )
+ )
+ )
+)
+
+define stem as ( // external entry point; each stage runs under do, so a failing stage does not stop the rest
+ do prelude
+ do mark_regions
+ backwards ( // the suffix stages run in backward mode
+ do step_0
+ do standard_suffix
+ do ( standard_suffix_removed or verb_suffix ) // verb_suffix is tried only when the flag is not set
+ do vowel_suffix
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl
new file mode 100644
index 000000000..09aec6429
--- /dev/null
+++ b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl
@@ -0,0 +1,236 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ step_0
+ standard_suffix combo_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+booleans ( standard_suffix_removed )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a^ hex '0E2' // a circumflex
+stringdef i^ hex '0EE' // i circumflex
+stringdef a+ hex '103' // a breve
+stringdef s, hex '15F' // s cedilla
+stringdef t, hex '163' // t cedilla
+
+define v 'aeiou{a^}{i^}{a+}'
+
+define prelude as ( // upper-case every u or i that stands between two vowels
+ repeat goto (
+ v [ ('u' ] v <- 'U') or
+ ('i' ] v <- 'I')
+ )
+)
+
+define mark_regions as ( // compute the marks pV, p1 and p2 read by the RV, R1 and R2 tests
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV: chosen from the vowel / non-vowel pattern of the first two letters
+ ( v (non-v gopast v) or (v gopast non-v) )
+ or
+ ( non-v (non-v gopast v) or (v next) )
+ setmark pV
+ )
+ do ( // p1: past the next vowel and then the next non-vowel; p2: the same search continued from p1
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+define postlude as repeat ( // turn every I and U back to lower case, stepping over all other characters
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next)
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
+
+ define step_0 as ( // remove or shorten the longest listed ending
+ [substring] R1 among( // the R1 test must succeed after the ending has been matched
+ 'ul' 'ului'
+ ( delete )
+ 'aua'
+ ( <-'a' )
+ 'ea' 'ele' 'elor'
+ ( <-'e' )
+ 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
+ ( <-'i')
+ 'ile'
+ ( not 'ab' <- 'i' )
+ 'atei'
+ ( <- 'at' )
+ 'a{t,}ie' 'a{t,}ia'
+ ( <- 'a{t,}i' )
+ )
+ )
+
+ define combo_suffix as test ( // replace a compound ending by a shorter one; test restores the cursor afterwards
+ [substring] R1 ( // the R1 test must succeed after the ending has been matched
+ among(
+ /* 'IST'. alternative: include the following
+ 'alism' 'alisme'
+ 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
+ <- 'al'
+ )
+ */
+ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
+ <- 'abil'
+ )
+ 'ibilitate' (
+ <- 'ibil'
+ )
+ 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
+ <- 'iv'
+ )
+ 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
+ 'icator' 'icatori'
+ 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
+ 'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
+ <- 'ic'
+ )
+ 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
+ 'atoare' 'ator' 'atori'
+ '{a+}toare' '{a+}tor' '{a+}tori' (
+ <- 'at'
+ )
+ 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
+ 'itoare' 'itor' 'itori' (
+ <- 'it'
+ )
+ )
+ set standard_suffix_removed // record that a replacement was made
+ )
+ )
+
+ define standard_suffix as ( // apply combo_suffix repeatedly, then handle one listed ending under the R2 test
+ unset standard_suffix_removed // the flag starts cleared on every call
+ repeat combo_suffix // keep going until combo_suffix fails
+ [substring] R2 ( // the R2 test must succeed after the ending has been matched
+ among(
+
+ // past participle is treated here, rather than
+ // as a verb ending:
+ 'at' 'ata' 'at{a+}' 'ati' 'ate'
+ 'ut' 'uta' 'ut{a+}' 'uti' 'ute'
+ 'it' 'ita' 'it{a+}' 'iti' 'ite'
+
+ 'ic' 'ica' 'ice' 'ici' 'ic{a+}'
+ 'abil' 'abila' 'abile' 'abili' 'abil{a+}'
+ 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
+ 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
+ 'ant' 'anta' 'ante' 'anti' 'ant{a+}'
+ 'ator' 'atori'
+ 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
+ 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
+ delete
+ )
+ 'iune' 'iuni' (
+ '{t,}'] <- 't'
+ )
+ 'ism' 'isme'
+ 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
+ <- 'ist'
+ /* 'IST'. alternative: remove with <- '' */
+ )
+ )
+ set standard_suffix_removed // read by stem to decide whether verb_suffix is tried
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // the limit is set to mark pV while this routine runs
+ [substring] among(
+ // 'long' infinitive:
+ 'are' 'ere' 'ire' '{a^}re'
+
+ // gerund:
+ 'ind' '{a^}nd'
+ 'indu' '{a^}ndu'
+
+ 'eze'
+ 'easc{a+}'
+ // present:
+ 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
+ 'e{s,}te'
+ '{a+}sc' '{a+}{s,}ti'
+ '{a+}{s,}te'
+
+ // imperfect:
+ 'am' 'ai' 'au'
+ 'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
+ 'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
+
+ // past: // (not 'ii')
+ 'ui'
+ 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
+ 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
+ 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
+ '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
+
+ // pluperfect:
+ 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
+ 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
+ '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
+ '{a^}ser{a+}'
+ 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
+
+ ( non-v or 'u' delete ) // endings above are removed only after a non-vowel or u
+
+ // present:
+ '{a+}m' 'a{t,}i'
+ 'em' 'e{t,}i'
+ 'im' 'i{t,}i'
+ '{a^}m' '{a^}{t,}i'
+
+ // past:
+ 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
+ 'sei' 'se'
+
+ // pluperfect:
+ 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
+ (delete) // endings of this second group are removed unconditionally
+ )
+ )
+
+ define vowel_suffix as ( // remove one of the listed final vowel endings
+ [substring] RV among ( // the RV test must succeed after the ending has been matched
+ 'a' 'e' 'i' 'ie' '{a+}' ( delete )
+ )
+ )
+)
+
+define stem as ( // external entry point; each stage runs under do, so a failing stage does not stop the rest
+ do prelude
+ do mark_regions
+ backwards ( // the suffix stages run in backward mode
+ do step_0
+ do standard_suffix
+ do ( standard_suffix_removed or verb_suffix ) // verb_suffix is tried only when the flag is not set
+ do vowel_suffix
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl
new file mode 100644
index 000000000..cdacb19c4
--- /dev/null
+++ b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl
@@ -0,0 +1,217 @@
+stringescapes {}
+
+/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented
+ in Latin characters following the conventions of the standard Library
+ of Congress transliteration: */
+
+stringdef a hex 'C1'
+stringdef b hex 'C2'
+stringdef v hex 'D7'
+stringdef g hex 'C7'
+stringdef d hex 'C4'
+stringdef e hex 'C5'
+stringdef zh hex 'D6'
+stringdef z hex 'DA'
+stringdef i hex 'C9'
+stringdef i` hex 'CA'
+stringdef k hex 'CB'
+stringdef l hex 'CC'
+stringdef m hex 'CD'
+stringdef n hex 'CE'
+stringdef o hex 'CF'
+stringdef p hex 'D0'
+stringdef r hex 'D2'
+stringdef s hex 'D3'
+stringdef t hex 'D4'
+stringdef u hex 'D5'
+stringdef f hex 'C6'
+stringdef kh hex 'C8'
+stringdef ts hex 'C3'
+stringdef ch hex 'DE'
+stringdef sh hex 'DB'
+stringdef shch hex 'DD'
+stringdef " hex 'DF'
+stringdef y hex 'D9'
+stringdef ' hex 'D8'
+stringdef e` hex 'DC'
+stringdef iu hex 'C0'
+stringdef ia hex 'D1'
+
+routines ( mark_regions R2
+ perfective_gerund
+ adjective
+ adjectival
+ reflexive
+ verb
+ noun
+ derivational
+ tidy_up
+)
+
+externals ( stem )
+
+integers ( pV p2 )
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
+
+define mark_regions as ( // compute the marks pV and p2; both default to limit
+
+ $pV = limit
+ $p2 = limit
+ do (
+ gopast v setmark pV gopast non-v // pV: just past the first vowel; then move past the next non-vowel
+ gopast v gopast non-v setmark p2 // p2: past the next vowel and then the next non-vowel
+ )
+)
+
+backwardmode (
+
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
+
+ define perfective_gerund as ( // remove the longest matching perfective gerund ending
+ [substring] among (
+ '{v}'
+ '{v}{sh}{i}'
+ '{v}{sh}{i}{s}{'}'
+ ('{a}' or '{ia}' delete) // first group: removed only when preceded by {a} or {ia}
+ '{i}{v}'
+ '{i}{v}{sh}{i}'
+ '{i}{v}{sh}{i}{s}{'}'
+ '{y}{v}'
+ '{y}{v}{sh}{i}'
+ '{y}{v}{sh}{i}{s}{'}'
+ (delete) // second group: removed unconditionally
+ )
+ )
+
+ define adjective as ( // remove the longest matching adjective ending
+ [substring] among (
+ '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
+ '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
+ '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
+ '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
+ '{ia}{ia}'
+ // and -
+ '{o}{iu}' // - which is somewhat archaic
+ '{e}{iu}' // - soft form of {o}{iu}
+ (delete)
+ )
+ )
+
+ define adjectival as ( // an adjective ending must be removed first; a participle ending is then optional
+ adjective
+
+ /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
+ nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
+ errors. Removing im, uem, enn creates too many errors.
+ */
+
+ try ( // failure to find a participle ending does not fail the routine
+ [substring] among (
+ '{e}{m}' // present passive participle
+ '{n}{n}' // adjective from past passive participle
+ '{v}{sh}' // past active participle
+ '{iu}{shch}' '{shch}' // present active participle
+ ('{a}' or '{ia}' delete)
+
+ //but not '{i}{m}' '{u}{e}{m}' // present passive participle
+ //or '{e}{n}{n}' // adjective from past passive participle
+
+ '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
+ '{u}{iu}{shch}' // present active participle
+ (delete)
+ )
+ )
+
+ )
+
+ define reflexive as ( // remove one of the two reflexive endings listed below
+ [substring] among (
+ '{s}{ia}'
+ '{s}{'}'
+ (delete)
+ )
+ )
+
+ define verb as ( // remove the longest matching verb ending
+ [substring] among (
+ '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
+ '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
+ '{n}{y}' '{t}{'}' '{e}{sh}{'}'
+
+ '{n}{n}{o}'
+ ('{a}' or '{ia}' delete) // first group: removed only when preceded by {a} or {ia}
+
+ '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
+ '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
+ '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
+ '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
+ '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
+ '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
+ (delete) // second group: removed unconditionally
+ /* note the short passive participle tests:
+ '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
+ */
+ )
+ )
+
+ define noun as ( // remove the longest matching noun ending
+ [substring] among (
+ '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
+ '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
+ '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
+ '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
+ '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
+ '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
+ (delete)
+ /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
+ omitted - they only occur on 12 words.
+ */
+ )
+ )
+
+ define derivational as ( // remove one of the two listed derivational endings
+ [substring] R2 among ( // the R2 test must succeed after the ending has been matched
+ '{o}{s}{t}'
+ '{o}{s}{t}{'}'
+ (delete)
+ )
+ )
+
+ define tidy_up as ( // final clean-up of whatever ending is left
+ [substring] among (
+
+ '{e}{i`}{sh}'
+ '{e}{i`}{sh}{e}' // superlative forms
+ (delete
+ ['{n}'] '{n}' delete
+ )
+ '{n}'
+ ('{n}' delete) // e.g. -nno endings
+ '{'}'
+ (delete) // with some slight false conflations
+ )
+ )
+)
+
+define stem as ( // external entry point
+
+ do mark_regions
+ backwards setlimit tomark pV for ( // backward mode, with the limit set to mark pV
+ do (
+ perfective_gerund or // the other ending classes are tried only if this fails
+ ( try reflexive
+ adjectival or verb or noun
+ )
+ )
+ try([ '{i}' ] delete)
+ // because noun ending -i{iu} is being treated as verb ending -{iu}
+
+ do derivational
+ do tidy_up
+ )
+)
diff --git a/contrib/snowball/algorithms/russian/stem_Unicode.sbl b/contrib/snowball/algorithms/russian/stem_Unicode.sbl
new file mode 100644
index 000000000..9e1a93f93
--- /dev/null
+++ b/contrib/snowball/algorithms/russian/stem_Unicode.sbl
@@ -0,0 +1,215 @@
+stringescapes {}
+
+/* the 32 Cyrillic letters in Unicode */
+
+stringdef a hex '430'
+stringdef b hex '431'
+stringdef v hex '432'
+stringdef g hex '433'
+stringdef d hex '434'
+stringdef e hex '435'
+stringdef zh hex '436'
+stringdef z hex '437'
+stringdef i hex '438'
+stringdef i` hex '439'
+stringdef k hex '43A'
+stringdef l hex '43B'
+stringdef m hex '43C'
+stringdef n hex '43D'
+stringdef o hex '43E'
+stringdef p hex '43F'
+stringdef r hex '440'
+stringdef s hex '441'
+stringdef t hex '442'
+stringdef u hex '443'
+stringdef f hex '444'
+stringdef kh hex '445'
+stringdef ts hex '446'
+stringdef ch hex '447'
+stringdef sh hex '448'
+stringdef shch hex '449'
+stringdef " hex '44A'
+stringdef y hex '44B'
+stringdef ' hex '44C'
+stringdef e` hex '44D'
+stringdef iu hex '44E'
+stringdef ia hex '44F'
+
+routines ( mark_regions R2
+ perfective_gerund
+ adjective
+ adjectival
+ reflexive
+ verb
+ noun
+ derivational
+ tidy_up
+)
+
+externals ( stem )
+
+integers ( pV p2 )
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
+
+define mark_regions as ( // compute the marks pV and p2; both default to limit
+
+ $pV = limit
+ $p2 = limit
+ do (
+ gopast v setmark pV gopast non-v // pV: just past the first vowel; then move past the next non-vowel
+ gopast v gopast non-v setmark p2 // p2: past the next vowel and then the next non-vowel
+ )
+)
+
+backwardmode (
+
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
+
+ define perfective_gerund as ( // remove the longest matching perfective gerund ending
+ [substring] among (
+ '{v}'
+ '{v}{sh}{i}'
+ '{v}{sh}{i}{s}{'}'
+ ('{a}' or '{ia}' delete) // first group: removed only when preceded by {a} or {ia}
+ '{i}{v}'
+ '{i}{v}{sh}{i}'
+ '{i}{v}{sh}{i}{s}{'}'
+ '{y}{v}'
+ '{y}{v}{sh}{i}'
+ '{y}{v}{sh}{i}{s}{'}'
+ (delete) // second group: removed unconditionally
+ )
+ )
+
+ define adjective as ( // remove the longest matching adjective ending
+ [substring] among (
+ '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
+ '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
+ '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
+ '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
+ '{ia}{ia}'
+ // and -
+ '{o}{iu}' // - which is somewhat archaic
+ '{e}{iu}' // - soft form of {o}{iu}
+ (delete)
+ )
+ )
+
+ define adjectival as ( // an adjective ending must be removed first; a participle ending is then optional
+ adjective
+
+ /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
+ nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
+ errors. Removing im, uem, enn creates too many errors.
+ */
+
+ try ( // failure to find a participle ending does not fail the routine
+ [substring] among (
+ '{e}{m}' // present passive participle
+ '{n}{n}' // adjective from past passive participle
+ '{v}{sh}' // past active participle
+ '{iu}{shch}' '{shch}' // present active participle
+ ('{a}' or '{ia}' delete)
+
+ //but not '{i}{m}' '{u}{e}{m}' // present passive participle
+ //or '{e}{n}{n}' // adjective from past passive participle
+
+ '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
+ '{u}{iu}{shch}' // present active participle
+ (delete)
+ )
+ )
+
+ )
+
+ define reflexive as ( // remove one of the two reflexive endings listed below
+ [substring] among (
+ '{s}{ia}'
+ '{s}{'}'
+ (delete)
+ )
+ )
+
+ define verb as ( // remove the longest matching verb ending
+ [substring] among (
+ '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
+ '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
+ '{n}{y}' '{t}{'}' '{e}{sh}{'}'
+
+ '{n}{n}{o}'
+ ('{a}' or '{ia}' delete) // first group: removed only when preceded by {a} or {ia}
+
+ '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
+ '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
+ '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
+ '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
+ '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
+ '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
+ (delete) // second group: removed unconditionally
+ /* note the short passive participle tests:
+ '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
+ */
+ )
+ )
+
+ define noun as ( // remove the longest matching noun ending
+ [substring] among (
+ '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
+ '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
+ '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
+ '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
+ '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
+ '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
+ (delete)
+ /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
+ omitted - they only occur on 12 words.
+ */
+ )
+ )
+
+ define derivational as ( // remove one of the two listed derivational endings
+ [substring] R2 among ( // the R2 test must succeed after the ending has been matched
+ '{o}{s}{t}'
+ '{o}{s}{t}{'}'
+ (delete)
+ )
+ )
+
+ define tidy_up as ( // final clean-up of whatever ending is left
+ [substring] among (
+
+ '{e}{i`}{sh}'
+ '{e}{i`}{sh}{e}' // superlative forms
+ (delete
+ ['{n}'] '{n}' delete
+ )
+ '{n}'
+ ('{n}' delete) // e.g. -nno endings
+ '{'}'
+ (delete) // with some slight false conflations
+ )
+ )
+)
+
+define stem as ( // external entry point
+
+ do mark_regions
+ backwards setlimit tomark pV for ( // backward mode, with the limit set to mark pV
+ do (
+ perfective_gerund or // the other ending classes are tried only if this fails
+ ( try reflexive
+ adjectival or verb or noun
+ )
+ )
+ try([ '{i}' ] delete)
+ // because noun ending -i{iu} is being treated as verb ending -{iu}
+
+ do derivational
+ do tidy_up
+ )
+)
diff --git a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..9dee289cc
--- /dev/null
+++ b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl
@@ -0,0 +1,230 @@
+routines (
+ postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ y_verb_suffix
+ verb_suffix
+ residual_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a' hex 'E1' // a-acute
+stringdef e' hex 'E9' // e-acute
+stringdef i' hex 'ED' // i-acute
+stringdef o' hex 'F3' // o-acute
+stringdef u' hex 'FA' // u-acute
+stringdef u" hex 'FC' // u-diaeresis
+stringdef n~ hex 'F1' // n-tilde
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
+
+define mark_regions as ( // compute the marks pV, p1 and p2 read by the RV, R1 and R2 tests
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV: chosen from the vowel / non-vowel pattern of the first two letters
+ ( v (non-v gopast v) or (v gopast non-v) )
+ or
+ ( non-v (non-v gopast v) or (v next) )
+ setmark pV
+ )
+ do ( // p1: past the next vowel and then the next non-vowel; p2: the same search continued from p1
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+define postlude as repeat ( // replace each acute-accented vowel by its plain form, stepping over all other characters
+ [substring] among(
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ '{i'}' (<- 'i')
+ '{o'}' (<- 'o')
+ '{u'}' (<- 'u')
+ // and possibly {u"}->u here, or in prelude
+ '' (next)
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
+
+ define attached_pronoun as ( // handle a pronoun attached to a gerund or infinitive ending
+ [substring] among( // first match: the pronoun itself becomes the slice
+ 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
+ 'las' 'les' 'los' 'nos'
+ )
+ substring RV among( // second match: the verb ending before the pronoun, subject to the RV test
+ 'i{e'}ndo' (] <- 'iendo')
+ '{a'}ndo' (] <- 'ando')
+ '{a'}r' (] <- 'ar')
+ '{e'}r' (] <- 'er')
+ '{i'}r' (] <- 'ir')
+ 'ando'
+ 'iendo'
+ 'ar' 'er' 'ir'
+ (delete) // unaccented endings: only the pronoun slice is removed
+ 'yendo' ('u' delete)
+ )
+ )
+
+ define standard_suffix as ( // remove or rewrite the longest matching ending, subject to the R1 and R2 tests in each branch
+ [substring] among(
+
+ 'anza' 'anzas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ 'able' 'ables'
+ 'ible' 'ibles'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amiento' 'amientos'
+ 'imiento' 'imientos'
+ (
+ R2 delete
+ )
+ 'adora' 'ador' 'aci{o'}n'
+ 'adoras' 'adores' 'aciones'
+ 'ante' 'antes' 'ancia' 'ancias'// Note 1
+ (
+ R2 delete
+ try ( ['ic'] R2 delete )
+ )
+ 'log{i'}a'
+ 'log{i'}as'
+ (
+ R2 <- 'log'
+ )
+ 'uci{o'}n' 'uciones'
+ (
+ R2 <- 'u'
+ )
+ 'encia' 'encias'
+ (
+ R2 <- 'ente'
+ )
+ 'amente'
+ (
+ R1 delete
+ try (
+ [substring] R2 delete among(
+ 'iv' (['at'] R2 delete)
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'ante' // Note 1
+ 'able'
+ 'ible' (R2 delete)
+ )
+ )
+ )
+ 'idad'
+ 'idades'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ )
+ )
+
+ define y_verb_suffix as ( // remove a verb ending that begins with y, but only when u precedes it
+ setlimit tomark pV for ([substring]) among( // the ending is matched with the limit set to mark pV
+ 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
+ 'yas' 'yes' 'yais' 'yamos'
+ ('u' delete)
+ )
+ )
+
+ define verb_suffix as ( // remove the longest matching verb ending
+ setlimit tomark pV for ([substring]) among( // the ending is matched with the limit set to mark pV
+
+ 'en' 'es' '{e'}is' 'emos'
+ (try ('u' test 'g') ] delete)
+
+ 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
+ 'ar{e'}'
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+ 'er{e'}'
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+ 'ir{e'}'
+
+ 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
+ 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+ 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+ (delete) // endings of this second group are removed unconditionally
+ )
+ )
+
+ define residual_suffix as ( // drop a final os or single final vowel; after e also drop a u that follows g
+ [substring] among(
+ 'os'
+ 'a' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete ) // removed only when the RV test succeeds
+ 'e' '{e'}'
+ ( RV delete try( ['u'] test 'g' RV delete ) )
+ )
+ )
+)
+
+define stem as ( // external entry point; each stage runs under do, so a failing stage does not stop the rest
+ do mark_regions
+ backwards ( // the suffix stages run in backward mode
+ do attached_pronoun
+ do ( standard_suffix or
+ y_verb_suffix or
+ verb_suffix
+ ) // the three routines are tried in this order until one succeeds
+ do residual_suffix
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..db7a46201
--- /dev/null
+++ b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,230 @@
+routines (
+ postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ y_verb_suffix
+ verb_suffix
+ residual_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a' hex 'A0' // a-acute
+stringdef e' hex '82' // e-acute
+stringdef i' hex 'A1' // i-acute
+stringdef o' hex 'A2' // o-acute
+stringdef u' hex 'A3' // u-acute
+stringdef u" hex '81' // u-diaeresis
+stringdef n~ hex 'A4' // n-tilde
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
+
+define mark_regions as ( // compute the marks pV, p1 and p2 read by the RV, R1 and R2 tests
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV: chosen from the vowel / non-vowel pattern of the first two letters
+ ( v (non-v gopast v) or (v gopast non-v) )
+ or
+ ( non-v (non-v gopast v) or (v next) )
+ setmark pV
+ )
+ do ( // p1: past the next vowel and then the next non-vowel; p2: the same search continued from p1
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+define postlude as repeat ( // replace each acute-accented vowel by its plain form, stepping over all other characters
+ [substring] among(
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ '{i'}' (<- 'i')
+ '{o'}' (<- 'o')
+ '{u'}' (<- 'u')
+ // and possibly {u"}->u here, or in prelude
+ '' (next)
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond mark pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond mark p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond mark p2
+
+ define attached_pronoun as ( // handle a pronoun attached to a gerund or infinitive ending
+ [substring] among( // first match: the pronoun itself becomes the slice
+ 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
+ 'las' 'les' 'los' 'nos'
+ )
+ substring RV among( // second match: the verb ending before the pronoun, subject to the RV test
+ 'i{e'}ndo' (] <- 'iendo')
+ '{a'}ndo' (] <- 'ando')
+ '{a'}r' (] <- 'ar')
+ '{e'}r' (] <- 'er')
+ '{i'}r' (] <- 'ir')
+ 'ando'
+ 'iendo'
+ 'ar' 'er' 'ir'
+ (delete) // unaccented endings: only the pronoun slice is removed
+ 'yendo' ('u' delete)
+ )
+ )
+
+ define standard_suffix as ( // remove or rewrite the longest matching ending, subject to the R1 and R2 tests in each branch
+ [substring] among(
+
+ 'anza' 'anzas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ 'able' 'ables'
+ 'ible' 'ibles'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amiento' 'amientos'
+ 'imiento' 'imientos'
+ (
+ R2 delete
+ )
+ 'adora' 'ador' 'aci{o'}n'
+ 'adoras' 'adores' 'aciones'
+ 'ante' 'antes' 'ancia' 'ancias'// Note 1
+ (
+ R2 delete
+ try ( ['ic'] R2 delete )
+ )
+ 'log{i'}a'
+ 'log{i'}as'
+ (
+ R2 <- 'log'
+ )
+ 'uci{o'}n' 'uciones'
+ (
+ R2 <- 'u'
+ )
+ 'encia' 'encias'
+ (
+ R2 <- 'ente'
+ )
+ 'amente'
+ (
+ R1 delete
+ try (
+ [substring] R2 delete among(
+ 'iv' (['at'] R2 delete)
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'ante' // Note 1
+ 'able'
+ 'ible' (R2 delete)
+ )
+ )
+ )
+ 'idad'
+ 'idades'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ )
+ )
+
+ define y_verb_suffix as ( // remove a verb ending that begins with y, but only when u precedes it
+ setlimit tomark pV for ([substring]) among( // the ending is matched with the limit set to mark pV
+ 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
+ 'yas' 'yes' 'yais' 'yamos'
+ ('u' delete)
+ )
+ )
+
+ define verb_suffix as ( // remove the longest matching verb ending
+ setlimit tomark pV for ([substring]) among( // the ending is matched with the limit set to mark pV
+
+ 'en' 'es' '{e'}is' 'emos'
+ (try ('u' test 'g') ] delete)
+
+ 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
+ 'ar{e'}'
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+ 'er{e'}'
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+ 'ir{e'}'
+
+ 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
+ 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+ 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+ (delete) // endings of this second group are removed unconditionally
+ )
+ )
+
+ define residual_suffix as ( // drop a final os or single final vowel; after e also drop a u that follows g
+ [substring] among(
+ 'os'
+ 'a' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete ) // removed only when the RV test succeeds
+ 'e' '{e'}'
+ ( RV delete try( ['u'] test 'g' RV delete ) )
+ )
+ )
+)
+
+define stem as ( // external entry point; each stage runs under do, so a failing stage does not stop the rest
+ do mark_regions
+ backwards ( // the suffix stages run in backward mode
+ do attached_pronoun
+ do ( standard_suffix or
+ y_verb_suffix or
+ verb_suffix
+ ) // the three routines are tried in this order until one succeeds
+ do residual_suffix
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..03ce1e22f
--- /dev/null
+++ b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl
@@ -0,0 +1,72 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef ao hex 'E5'
+stringdef o" hex 'F6'
+
+define v 'aeiouy{a"}{ao}{o"}'
+
+define s_ending 'bcdfghjklmnoprtvy'
+
+define mark_regions as ( // compute mark p1, which the suffix routines use as their limit
+
+ $p1 = limit
+ test ( hop 3 setmark x ) // x: three characters in; the routine fails here if hop 3 is not possible
+ goto v gopast non-v setmark p1 // p1: past the first non-vowel found from the first vowel onwards
+ try ( $p1 < x $p1 = x ) // p1 is raised to x when it would otherwise be smaller
+)
+
+backwardmode (
+
+ define main_suffix as ( // remove the longest matching main ending
+ setlimit tomark p1 for ([substring]) // the ending is matched with the limit set to mark p1
+ among(
+
+ 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
+ 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
+ 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
+ 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
+ (delete)
+ 's'
+ (s_ending delete) // a bare final s is removed only after a letter of the s_ending grouping
+ )
+ )
+
+ define consonant_pair as setlimit tomark p1 for ( // the limit is set to mark p1 while this routine runs
+ among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')
+ and ([next] delete) // when one of the pairs matched, the single character at the cursor is deleted
+ )
+
+ define other_suffix as setlimit tomark p1 for ( // the limit is set to mark p1 while this routine runs
+ [substring] among( // three endings are deleted, two are shortened by their final t
+ 'lig' 'ig' 'els' (delete)
+ 'l{o"}st' (<-'l{o"}s')
+ 'fullt' (<-'full')
+ )
+ )
+)
+
+define stem as ( // external entry point; each stage runs under do, so a failing stage does not stop the rest
+
+ do mark_regions
+ backwards ( // the suffix stages run in backward mode
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..1631f401a
--- /dev/null
+++ b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,72 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a" hex '84'
+stringdef ao hex '86'
+stringdef o" hex '94'
+
+define v 'aeiouy{a"}{ao}{o"}'
+
+define s_ending 'bcdfghjklmnoprtvy'
+
+define mark_regions as ( // compute mark p1, which the suffix routines use as their limit
+
+ $p1 = limit
+ test ( hop 3 setmark x ) // x: three characters in; the routine fails here if hop 3 is not possible
+ goto v gopast non-v setmark p1 // p1: past the first non-vowel found from the first vowel onwards
+ try ( $p1 < x $p1 = x ) // p1 is raised to x when it would otherwise be smaller
+)
+
+backwardmode (
+
+ define main_suffix as ( // remove the longest matching main ending
+ setlimit tomark p1 for ([substring]) // the ending is matched with the limit set to mark p1
+ among(
+
+ 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
+ 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
+ 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
+ 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
+ (delete)
+ 's'
+ (s_ending delete) // a bare final s is removed only after a letter of the s_ending grouping
+ )
+ )
+
+ define consonant_pair as setlimit tomark p1 for ( // the limit is set to mark p1 while this routine runs
+ among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')
+ and ([next] delete) // when one of the pairs matched, the single character at the cursor is deleted
+ )
+
+ define other_suffix as setlimit tomark p1 for ( // the limit is set to mark p1 while this routine runs
+ [substring] among( // three endings are deleted, two are shortened by their final t
+ 'lig' 'ig' 'els' (delete)
+ 'l{o"}st' (<-'l{o"}s')
+ 'fullt' (<-'full')
+ )
+ )
+)
+
+define stem as ( // external entry point; each stage runs under do, so a failing stage does not stop the rest
+
+ do mark_regions
+ backwards ( // the suffix stages run in backward mode
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl
new file mode 100644
index 000000000..16c02a51f
--- /dev/null
+++ b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl
@@ -0,0 +1,477 @@
+/* Stemmer for Turkish
+ * author: Evren (Kapusuz) Çilden
+ * email: evren.kapusuz at gmail.com
+ * version: 1.0 (15.01.2007)
+
+
+ * stems nominal verb suffixes
+ * stems nominal inflections
+ * more than one syllable word check
+ * (y,n,s,U) context check
+ * vowel harmony check
+ * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
+
+ * The stemming algorithm is based on the paper "An Affix Stripping
+ * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
+ * Eşref Adalı (Proceedings of the IASTED International Conference
+ * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
+ * Innsbruck, Austria).
+
+ * Turkish is an agglutinative language and has a very rich morphological
+ * structure. In Turkish, you can form many different words from a single stem
+ * by appending a sequence of suffixes. E.g. the word "doktoruymuşsunuz" means
+ * "You had been the doctor of him". The stem of the word is "doktor" and it
+ * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
+ * the append order of suffixes can be clearly described as FSMs.
+ * The paper referenced above defines some FSMs for right to left
+ * morphological analysis. I generated a method for constructing snowball
+ * expressions from right to left FSMs for stemming suffixes.
+*/
+
+routines (
+ append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
+ check_vowel_harmony // tests vowel harmony for suffixes
+ is_reserved_word // tests whether current string is a reserved word ('ad','soyad')
+ mark_cAsInA // nominal verb suffix
+ mark_DA // noun suffix
+ mark_DAn // noun suffix
+ mark_DUr // nominal verb suffix
+ mark_ki // noun suffix
+ mark_lAr // noun suffix, nominal verb suffix
+ mark_lArI // noun suffix
+ mark_nA // noun suffix
+ mark_ncA // noun suffix
+ mark_ndA // noun suffix
+ mark_ndAn // noun suffix
+ mark_nU // noun suffix
+ mark_nUn // noun suffix
+ mark_nUz // nominal verb suffix
+ mark_sU // noun suffix
+ mark_sUn // nominal verb suffix
+ mark_sUnUz // nominal verb suffix
+ mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
+ mark_yA // noun suffix
+ mark_ylA // noun suffix
+ mark_yU // noun suffix
+ mark_yUm // nominal verb suffix
+ mark_yUz // nominal verb suffix
+ mark_yDU // nominal verb suffix
+ mark_yken // nominal verb suffix
+ mark_ymUs_ // nominal verb suffix
+ mark_ysA // nominal verb suffix
+
+ mark_suffix_with_optional_y_consonant
+ mark_suffix_with_optional_U_vowel
+ mark_suffix_with_optional_n_consonant
+ mark_suffix_with_optional_s_consonant
+
+ more_than_one_syllable_word
+
+ post_process_last_consonants
+ postlude
+
+ stem_nominal_verb_suffixes
+ stem_noun_suffixes
+ stem_suffix_chain_before_ki
+)
+
+/* Special characters in Unicode Latin-1 and Latin Extended-A */
+stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA
+stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE
+stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT
+stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS
+stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA
+stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS
+
+stringescapes { }
+
+integers ( strlen ) // length of a string
+
+booleans ( continue_stemming_noun_suffixes )
+
+groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
+
+define vowel 'ae{i'}io{o"}u{u"}'
+define U '{i'}iu{u"}'
+
+// the vowel grouping definitions below are used for checking vowel harmony
+define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a'
+define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
+define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i''
+define vowel4 'ei' // vowels that can end with suffixes containing 'i'
+define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
+define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'
+
+externals ( stem )
+
+backwardmode (
+ // checks vowel harmony for possible suffixes,
+ // helps to detect whether the candidate for suffix applies to vowel harmony
+ // this rule is added to prevent over stemming
+ // The whole check is wrapped in 'test', so the cursor is left unchanged.
+ define check_vowel_harmony as (
+ test
+ (
+ (goto vowel) // if there is a vowel
+ (
+ // match that vowel, then require that the next vowel found going
+ // backwards belongs to the grouping that harmonises with it
+ ('a' goto vowel1) or
+ ('e' goto vowel2) or
+ ('{i'}' goto vowel3) or
+ ('i' goto vowel4) or
+ ('o' goto vowel5) or
+ ('{o"}' goto vowel6) or
+ ('u' goto vowel5) or
+ ('{u"}' goto vowel6)
+ )
+ )
+ )
+
+ // if the last consonant before suffix is vowel and n then advance and delete
+ // if the last consonant before suffix is non vowel and n do nothing
+ // if the last consonant before suffix is not n then only delete the suffix
+ // assumption: slice beginning is set correctly
+ define mark_suffix_with_optional_n_consonant as (
+ ((test 'n') next (test vowel))
+ or
+ ((not(test 'n')) test(next (test vowel)))
+
+ )
+
+ // if the last consonant before suffix is vowel and s then advance and delete
+ // if the last consonant before suffix is non vowel and s do nothing
+ // if the last consonant before suffix is not s then only delete the suffix
+ // assumption: slice beginning is set correctly
+ define mark_suffix_with_optional_s_consonant as (
+ ((test 's') next (test vowel))
+ or
+ ((not(test 's')) test(next (test vowel)))
+ )
+
+ // if the last consonant before suffix is vowel and y then advance and delete
+ // if the last consonant before suffix is non vowel and y do nothing
+ // if the last consonant before suffix is not y then only delete the suffix
+ // assumption: slice beginning is set correctly
+ define mark_suffix_with_optional_y_consonant as (
+ ((test 'y') next (test vowel))
+ or
+ ((not(test 'y')) test(next (test vowel)))
+ )
+
+ // Counterpart of the optional n/s/y consonant routines above, for an
+ // optional U vowel:
+ // - if the character before the suffix is in U, step over it and require
+ //   that a non-vowel precedes it;
+ // - otherwise leave the cursor where it is and only test that the
+ //   character one position further back is a non-vowel.
+ define mark_suffix_with_optional_U_vowel as (
+ ((test U) next (test non-vowel))
+ or
+ ((not(test U)) test(next (test non-vowel)))
+
+ )
+
+ // The mark_* routines below each recognise one suffix, reading backwards
+ // from the cursor. Most of them first call check_vowel_harmony; several
+ // finish with one of the mark_suffix_with_optional_* checks. None of them
+ // deletes anything: the callers place the slice marks and delete.
+
+ // possessive endings, followed by the optional U vowel check
+ define mark_possessives as (
+ among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
+ 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
+ (mark_suffix_with_optional_U_vowel)
+ )
+
+ // a single U vowel, followed by the optional s consonant check
+ define mark_sU as (
+ check_vowel_harmony
+ U
+ (mark_suffix_with_optional_s_consonant)
+ )
+
+ // 'leri' / 'ları' (no vowel harmony check)
+ define mark_lArI as (
+ among ('leri' 'lar{i'}')
+ )
+
+ // a single U vowel, followed by the optional y consonant check
+ define mark_yU as (
+ check_vowel_harmony
+ U
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // 'n' + U vowel
+ define mark_nU as (
+ check_vowel_harmony
+ among ('n{i'}' 'ni' 'nu' 'n{u"}')
+ )
+
+ // U vowel + 'n', followed by the optional n consonant check
+ define mark_nUn as (
+ check_vowel_harmony
+ among ('{i'}n' 'in' 'un' '{u"}n')
+ (mark_suffix_with_optional_n_consonant)
+ )
+
+ // 'a' / 'e', followed by the optional y consonant check
+ define mark_yA as (
+ check_vowel_harmony
+ among('a' 'e')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // 'na' / 'ne'
+ define mark_nA as (
+ check_vowel_harmony
+ among('na' 'ne')
+ )
+
+ // 'da' / 'de' / 'ta' / 'te'
+ define mark_DA as (
+ check_vowel_harmony
+ among('da' 'de' 'ta' 'te')
+ )
+
+ // 'nda' / 'nde'
+ define mark_ndA as (
+ check_vowel_harmony
+ among('nda' 'nde')
+ )
+
+ // 'dan' / 'den' / 'tan' / 'ten'
+ define mark_DAn as (
+ check_vowel_harmony
+ among('dan' 'den' 'tan' 'ten')
+ )
+
+ // 'ndan' / 'nden'
+ define mark_ndAn as (
+ check_vowel_harmony
+ among('ndan' 'nden')
+ )
+
+ // 'la' / 'le', followed by the optional y consonant check
+ define mark_ylA as (
+ check_vowel_harmony
+ among('la' 'le')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // the literal 'ki' (no vowel harmony check)
+ define mark_ki as (
+ 'ki'
+ )
+
+ // 'ca' / 'ce', followed by the optional n consonant check
+ define mark_ncA as (
+ check_vowel_harmony
+ among('ca' 'ce')
+ (mark_suffix_with_optional_n_consonant)
+ )
+
+ // U vowel + 'm', followed by the optional y consonant check
+ define mark_yUm as (
+ check_vowel_harmony
+ among ('{i'}m' 'im' 'um' '{u"}m')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // 's' + U vowel + 'n'
+ define mark_sUn as (
+ check_vowel_harmony
+ among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
+ )
+
+ // U vowel + 'z', followed by the optional y consonant check
+ define mark_yUz as (
+ check_vowel_harmony
+ among ('{i'}z' 'iz' 'uz' '{u"}z')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // 's' + U + 'n' + U + 'z' (no vowel harmony check)
+ define mark_sUnUz as (
+ among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
+ )
+
+ // 'ler' / 'lar'
+ define mark_lAr as (
+ check_vowel_harmony
+ among ('ler' 'lar')
+ )
+
+ // 'n' + U vowel + 'z'
+ define mark_nUz as (
+ check_vowel_harmony
+ among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
+ )
+
+ // 't' or 'd' + U vowel + 'r'
+ define mark_DUr as (
+ check_vowel_harmony
+ among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
+ )
+
+ // 'casına' / 'cesine' (no vowel harmony check)
+ define mark_cAsInA as (
+ among ('cas{i'}na' 'cesine')
+ )
+
+ // 't' or 'd' + U vowel, optionally followed by 'm', 'n' or 'k';
+ // then the optional y consonant check
+ define mark_yDU as (
+ check_vowel_harmony
+ among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
+ 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
+ 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
+ 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // does not fully obey vowel harmony
+ define mark_ysA as (
+ among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // 'm' + U vowel + s-cedilla, followed by the optional y consonant check
+ define mark_ymUs_ as (
+ check_vowel_harmony
+ among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // the literal 'ken', followed by the optional y consonant check
+ define mark_yken as (
+ 'ken' (mark_suffix_with_optional_y_consonant)
+ )
+
+ // Strips nominal verb suffixes from the end of the word.
+ // '[' and ']' mark the two ends of the slice that 'delete' removes. The
+ // alternatives are tried in order; some delete an outer suffix and then
+ // try to match a further one, which the final ']delete' removes.
+ // continue_stemming_noun_suffixes is set on entry and cleared only in the
+ // alternative that starts with mark_lAr; stem reads it afterwards to
+ // decide whether to run stem_noun_suffixes.
+ define stem_nominal_verb_suffixes as (
+ [
+ set continue_stemming_noun_suffixes
+ (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
+ or
+ (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
+ or
+ (
+ mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
+ unset continue_stemming_noun_suffixes
+ )
+ or
+ (mark_nUz (mark_yDU or mark_ysA))
+ or
+ ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
+ or
+ (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
+ ]delete
+ )
+
+ // stems noun suffix chains ending with -ki
+ // Requires 'ki' at the cursor, followed (reading backwards) by one of
+ // mark_DA, mark_nUn or mark_ndA. The matched slice is deleted and the
+ // routine then tries to strip further suffixes, calling itself again
+ // for nested -ki chains.
+ define stem_suffix_chain_before_ki as (
+ [
+ mark_ki
+ (
+ (mark_DA] delete try([
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
+ or
+ (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+
+ ))
+ or
+ (mark_nUn] delete try([
+ (mark_lArI] delete)
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
+ or
+ (mark_ndA (
+ (mark_lArI] delete)
+ or
+ ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
+ )
+ )
+
+ // Strips noun suffixes from the end of the word. The alternatives are
+ // tried in order; each starts with one of the mark_* matchers (or with
+ // stem_suffix_chain_before_ki), deletes the matched slice, and then
+ // tries to strip the suffixes that may precede it.
+ define stem_noun_suffixes as (
+ ([mark_lAr] delete try(stem_suffix_chain_before_ki))
+ or
+ ([mark_ncA] delete
+ try(
+ ([mark_lArI] delete)
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ ([mark_lAr] delete stem_suffix_chain_before_ki)
+ )
+ )
+ or
+ ([(mark_ndA or mark_nA)
+ (
+ (mark_lArI] delete)
+ or
+ (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ )
+ )
+ or
+ ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
+ or
+ ( [mark_DAn] delete try ([
+ (
+ (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
+ )
+ or
+ ([mark_nUn or mark_ylA] delete
+ try(
+ ([mark_lAr] delete stem_suffix_chain_before_ki)
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ stem_suffix_chain_before_ki
+ )
+ )
+ or
+ ([mark_lArI] delete)
+ or
+ (stem_suffix_chain_before_ki)
+ or
+ ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ )
+
+ // Replaces a final b, c, d or g-breve with p, c-cedilla, t or k
+ // respectively.
+ define post_process_last_consonants as (
+ [substring] among (
+ 'b' (<- 'p')
+ 'c' (<- '{c.}')
+ 'd' (<- 't')
+ '{g~}' (<- 'k')
+ )
+ )
+
+ // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
+ // like in 'kedim' -> 'ked'
+ // Turkish words don't usually end with 'd' or 'g'
+ // some very well known words are ignored (like 'ad' and 'soyad')
+ // appends U to stems ending with d or g, decides which vowel to add
+ // based on the last vowel in the stem
+ define append_U_to_stems_ending_with_d_or_g as (
+ // only applies when the word ends in 'd' or 'g'
+ test('d' or 'g')
+ // find the last vowel of the stem and insert the matching U vowel at
+ // the cursor; each test(...) restores the cursor before the insert
+ (test((goto vowel) 'a' or '{i'}') <+ '{i'}')
+ or
+ (test((goto vowel) 'e' or 'i') <+ 'i')
+ or
+ (test((goto vowel) 'o' or 'u') <+ 'u')
+ or
+ (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
+ )
+
+)
+
+// Tests whether the word has more than one syllable.
+// In Turkish each vowel marks a distinct syllable.
+define more_than_one_syllable_word as (
+ // succeeds when at least two vowels can be passed; test restores the cursor
+ test (atleast 2 (gopast vowel))
+)
+
+// Succeeds for the reserved words 'ad' and 'soyad'.
+// Each alternative searches for the literal, stores its length in strlen
+// and then requires strlen to equal limit, so it only succeeds when the
+// whole word has exactly that length.
+define is_reserved_word as (
+ test(gopast 'ad' ($strlen = 2) ($strlen == limit))
+ or
+ test(gopast 'soyad' ($strlen = 5) ($strlen == limit))
+)
+
+// Final clean-up of the stem; skipped (the routine fails) for reserved words.
+// Both steps run in backward mode and are wrapped in 'do', so neither can
+// make postlude fail.
+define postlude as (
+ not(is_reserved_word)
+ backwards (
+ do append_U_to_stems_ending_with_d_or_g
+ do post_process_last_consonants
+
+ )
+)
+
+// Entry point. Words with fewer than two vowels are left untouched.
+define stem as (
+ (more_than_one_syllable_word)
+ (
+ backwards (
+ do stem_nominal_verb_suffixes
+ // stop here if stem_nominal_verb_suffixes cleared the flag
+ continue_stemming_noun_suffixes
+ do stem_noun_suffixes
+ )
+
+ postlude
+ )
+)
+
+
diff --git a/contrib/snowball/compiler/analyser.c b/contrib/snowball/compiler/analyser.c
new file mode 100644
index 000000000..68c0d2d90
--- /dev/null
+++ b/contrib/snowball/compiler/analyser.c
@@ -0,0 +1,959 @@
+
+#include <stdio.h> /* printf etc */
+#include <stdlib.h> /* exit */
+#include <string.h> /* memmove */
+#include "header.h"
+
+/* recursive usage: */
+
+static void read_program_(struct analyser * a, int terminator);
+static struct node * read_C(struct analyser * a);
+static struct node * C_style(struct analyser * a, char * s, int token);
+
+
+/* Report an internal inconsistency, identified by n, on stderr and
+ * terminate the compiler with exit status 1. Does not return. */
+static void fault(int n) {
+    fprintf(stderr, "fault %d\n", n);
+    exit(1);
+}
+
+/* Dump node p and everything reachable from it to stdout, one node per line.
+ * n is the indentation depth; s is the marker printed in the last indent
+ * column, showing how p hangs off its parent. */
+static void print_node_(struct node * p, int n, const char * s) {
+
+    int column;
+    for (column = 0; column < n; column++) {
+        if (column == n - 1) fputs(s, stdout);
+        else fputs(" ", stdout);
+    }
+    printf("%s ", name_of_token(p->type));
+    if (p->name != 0) report_b(stdout, p->name->b);
+    if (p->literalstring != 0) {
+        printf("'");
+        report_b(stdout, p->literalstring);
+        printf("'");
+    }
+    printf("\n");
+
+    /* children one level deeper; the right sibling stays at this level */
+    if (p->AE != 0) print_node_(p->AE, n + 1, "# ");
+    if (p->left != 0) print_node_(p->left, n + 1, " ");
+    if (p->right != 0) print_node_(p->right, n, " ");
+    if (p->aux != 0) print_node_(p->aux, n + 1, "@ ");
+}
+
+/* Dump the whole parsed program tree of a to stdout. */
+extern void print_program(struct analyser * a) {
+    struct node * root = a->program;
+    print_node_(root, 0, " ");
+}
+
+/* Allocate a node of the given type, stamp it with the analyser's current
+ * mode and the tokeniser's current line number, and push it onto a->nodes.
+ * All link fields start out as 0. */
+static struct node * new_node(struct analyser * a, int type) {
+    NEW(node, p);
+
+    p->type = type;
+    p->mode = a->mode;
+    p->line_number = a->tokeniser->line_number;
+
+    p->left = 0;
+    p->right = 0;
+    p->aux = 0;
+    p->AE = 0;
+    p->name = 0;
+    p->literalstring = 0;
+
+    /* push onto the analyser's list of all nodes */
+    p->next = a->nodes;
+    a->nodes = p;
+    return p;
+}
+
+static const char * name_of_mode(int n) {
+ switch (n) {
+ default: fault(0);
+ case m_backward: return "string backward";
+ case m_forward: return "string forward";
+ /* case m_integer: return "integer"; */
+ }
+}
+
+/* Human-readable name of a type code, for the "not of type ..." message.
+ * The codes are the ones check_name_type() accepts. 'b' must be handled
+ * here because check_name_type() passes it on to error2() when a
+ * non-boolean name is used where a boolean is required; without the case
+ * the compiler aborted with "fault 1" instead of reporting the error.
+ * Any other code is an internal error and triggers fault(1). */
+static const char * name_of_type(int n) {
+    switch (n) {
+        default: fault(1);
+        /* not reached in practice: fault() exits */
+        case 's': return "string";
+        case 'i': return "integer";
+        case 'b': return "boolean";
+        case 'r': return "routine";
+        case 'R': return "routine or grouping";
+        case 'g': return "grouping";
+    }
+}
+
+static void count_error(struct analyser * a) {
+ struct tokeniser * t = a->tokeniser;
+ if (t->error_count >= 20) { fprintf(stderr, "... etc\n"); exit(1); }
+ t->error_count++;
+}
+
+/* Print error number n on stderr, prefixed with "file:line: ", and count it.
+ * x is an extra argument whose meaning depends on n (a line number, a mode
+ * or a type code). For n >= 30 the message starts with the text of the
+ * current token (t->b); for n <= 13 it ends with " after <previous token>"
+ * when a previous token is recorded. */
+static void error2(struct analyser * a, int n, int x) {
+ struct tokeniser * t = a->tokeniser;
+ count_error(a);
+ fprintf(stderr, "%s:%d: ", t->file, t->line_number);
+ if (n >= 30) report_b(stderr, t->b);
+ switch (n) {
+ case 0:
+ fprintf(stderr, "%s omitted", name_of_token(t->omission)); break;
+ case 3:
+ fprintf(stderr, "in among(...), ");
+ /* deliberate fall through: case 3 is case 1 with a prefix */
+ case 1:
+ fprintf(stderr, "unexpected %s", name_of_token(t->token));
+ if (t->token == c_number) fprintf(stderr, " %d", t->number);
+ if (t->token == c_name) {
+ fprintf(stderr, " ");
+ report_b(stderr, t->b);
+ } break;
+ case 2:
+ fprintf(stderr, "string omitted"); break;
+
+ case 14:
+ fprintf(stderr, "unresolved substring on line %d", x); break;
+ case 15:
+ fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break;
+ case 16:
+ fprintf(stderr, "empty grouping"); break;
+ case 17:
+ fprintf(stderr, "backwards used when already in this mode"); break;
+ case 18:
+ fprintf(stderr, "empty among(...)"); break;
+ case 19:
+ fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break;
+ case 20:
+ fprintf(stderr, "substring preceded by another substring on line %d", x); break;
+
+ /* from here on the message follows the token text printed above */
+ case 30:
+ fprintf(stderr, " re-declared"); break;
+ case 31:
+ fprintf(stderr, " undeclared"); break;
+ case 32:
+ fprintf(stderr, " declared as %s mode; used as %s mode",
+ name_of_mode(a->mode), name_of_mode(x)); break;
+ case 33:
+ fprintf(stderr, " not of type %s", name_of_type(x)); break;
+ case 34:
+ fprintf(stderr, " not of type string or integer"); break;
+ case 35:
+ fprintf(stderr, " misplaced"); break;
+ case 36:
+ fprintf(stderr, " redefined"); break;
+ case 37:
+ fprintf(stderr, " mis-used as %s mode",
+ name_of_mode(x)); break;
+ default:
+ fprintf(stderr, " error %d", n); break;
+
+ }
+ if (n <= 13 && t->previous_token > 0)
+ fprintf(stderr, " after %s", name_of_token(t->previous_token));
+ fprintf(stderr, "\n");
+}
+
+static void error(struct analyser * a, int n) { error2(a, n, 0); }
+
+/* Report that the among(...) at node p contains the string b more than
+ * once. The line number reported is that of p. */
+static void error3(struct analyser * a, struct node * p, symbol * b) {
+    const char * file = a->tokeniser->file;
+    count_error(a);
+    fprintf(stderr, "%s:%d: among(...) has repeated string '", file, p->line_number);
+    report_b(stderr, b);
+    fprintf(stderr, "'\n");
+}
+
+static void error4(struct analyser * a, struct name * q) {
+ count_error(a);
+ report_b(stderr, q->b);
+ fprintf(stderr, " undefined\n");
+}
+
+static void omission_error(struct analyser * a, int n) {
+ a->tokeniser->omission = n;
+ error(a, 0);
+}
+
+static int check_token(struct analyser * a, int code) {
+ struct tokeniser * t = a->tokeniser;
+ if (t->token != code) { omission_error(a, code); return false; }
+ return true;
+}
+
+static int get_token(struct analyser * a, int code) {
+ struct tokeniser * t = a->tokeniser;
+ read_token(t);
+ {
+ int x = check_token(a, code);
+ unless (x) t->token_held = true;
+ return x;
+ }
+}
+
+/* Search the declared names for one whose text equals the current token
+ * text. A match is marked as referenced and returned; 0 if there is none. */
+static struct name * look_for_name(struct analyser * a) {
+    symbol * wanted = a->tokeniser->b;
+    struct name * p;
+    for (p = a->names; p != 0; p = p->next) {
+        symbol * b = p->b;
+        int n = SIZE(b);
+        if (n != SIZE(wanted)) continue;
+        if (memcmp(wanted, b, n * sizeof(symbol)) != 0) continue;
+        p->referenced = true;
+        return p;
+    }
+    return 0;
+}
+
+static struct name * find_name(struct analyser * a) {
+ struct name * p = look_for_name(a);
+ if (p == 0) error(a, 31);
+ return p;
+}
+
+/* Record or verify the mode a routine is used in. A routine whose mode is
+ * still unset (negative) adopts `mode`; otherwise a different mode is
+ * reported as error 37. */
+static void check_routine_mode(struct analyser * a, struct name * p, int mode) {
+    if (p->mode < 0) {
+        p->mode = mode;
+        return;
+    }
+    if (p->mode != mode) error2(a, 37, mode);
+}
+
+/* Check that the name p has the kind requested by the one-letter code
+ * `type`: 's' string, 'i' integer, 'b' boolean, 'r' routine or external,
+ * 'g' grouping, 'R' grouping, routine or external. Any mismatch, or an
+ * unknown code, is reported as error 33. */
+static void check_name_type(struct analyser * a, struct name * p, int type) {
+    int actual = p->type;
+    int is_callable = actual == t_routine || actual == t_external;
+    int ok;
+    switch (type) {
+        case 's': ok = actual == t_string; break;
+        case 'i': ok = actual == t_integer; break;
+        case 'b': ok = actual == t_boolean; break;
+        case 'R': ok = actual == t_grouping || is_callable; break;
+        case 'r': ok = is_callable; break;
+        case 'g': ok = actual == t_grouping; break;
+        default:  ok = false; break;
+    }
+    if (!ok) error2(a, 33, type);
+}
+
+/* Read a bracketed list of names, "( name name ... )", declaring each one
+ * with the given type. A name that is already declared is reported as
+ * error 30 and skipped. New names are pushed onto the front of a->names. */
+static void read_names(struct analyser * a, int type) {
+    struct tokeniser * t = a->tokeniser;
+    if (!get_token(a, c_bra)) return;
+    while (read_token(t) == c_name) {
+        if (look_for_name(a) != 0) {
+            error(a, 30);
+            continue;
+        }
+        {
+            NEW(name, p);
+            p->b = copy_b(t->b);
+            p->type = type;
+            p->mode = -1; /* routines, externals */
+            p->count = a->name_count[type];
+            a->name_count[type] ++;
+            p->referenced = false;
+            p->used = false;
+            p->grouping = 0;
+            p->definition = 0;
+            p->next = a->names;
+            a->names = p;
+        }
+    }
+    /* the token that ended the list must be the closing bracket */
+    if (!check_token(a, c_ket)) t->token_held = true;
+}
+
+/* Copy the current token text into a new literalstring record, push the
+ * record onto a->literalstrings and return the copied text. */
+static symbol * new_literalstring(struct analyser * a) {
+    NEW(literalstring, p);
+    symbol * text = copy_b(a->tokeniser->b);
+    p->b = text;
+    p->next = a->literalstrings;
+    a->literalstrings = p;
+    return text;
+}
+
+/* Read the operator that follows "$name" for an integer name and return
+ * the node type to use for it. A plain assignment is mapped to
+ * c_mathassign; the other assignment and comparison operators map to
+ * themselves. Anything else is reported as error 1, the token is held,
+ * and c_eq is returned. */
+static int read_AE_test(struct analyser * a) {
+
+    struct tokeniser * t = a->tokeniser;
+    switch (read_token(t)) {
+        case c_assign:
+            return c_mathassign;
+
+        case c_plusassign:
+        case c_minusassign:
+        case c_multiplyassign:
+        case c_divideassign:
+        case c_eq:
+        case c_ne:
+        case c_gr:
+        case c_ge:
+        case c_ls:
+        case c_le:
+            return t->token;
+
+        default:
+            error(a, 1);
+            t->token_held = true;
+            return c_eq;
+    }
+}
+
+/* Binding strength of the arithmetic operator token t: 1 for + and -,
+ * 2 for * and /, and -2 for any token that is not a binary operator. */
+static int binding(int t) {
+    if (t == c_plus || t == c_minus) return 1;
+    if (t == c_multiply || t == c_divide) return 2;
+    return -2;
+}
+
+/* Look up the current token as a name and attach it to node p. If the name
+ * exists its type is checked against `type` and it is marked as used;
+ * otherwise p->name is set to 0. */
+static void name_to_node(struct analyser * a, struct node * p, int type) {
+    struct name * q = find_name(a);
+    if (q != 0) {
+        check_name_type(a, q, type);
+        q->used = true;
+    }
+    p->name = q;
+}
+
+/* Read an arithmetic expression by precedence climbing.
+ * B is the binding strength of the operator to the left: only operators
+ * that bind more tightly than B are consumed here. Returns the root of the
+ * expression tree, or 0 (after reporting error 1 and holding the token)
+ * when no operand can be read. */
+static struct node * read_AE(struct analyser * a, int B) {
+    struct tokeniser * t = a->tokeniser;
+    struct node * p;
+    struct node * q;
+    switch (read_token(t)) {
+        case c_minus: /* monadic */
+            p = new_node(a, c_neg);
+            p->right = read_AE(a, 100);
+            break;
+        case c_bra:
+            p = read_AE(a, 0);
+            get_token(a, c_ket);
+            break;
+        case c_name:
+            p = new_node(a, c_name);
+            name_to_node(a, p, 'i');
+            break;
+        case c_maxint:
+        case c_minint:
+        case c_cursor:
+        case c_limit:
+        case c_size:
+            p = new_node(a, t->token);
+            break;
+        case c_number:
+            p = new_node(a, c_number);
+            p->number = t->number;
+            break;
+        case c_sizeof:
+            p = C_style(a, "s", c_sizeof);
+            break;
+        default:
+            error(a, 1);
+            t->token_held = true;
+            return 0;
+    }
+    repeat {
+        int token = read_token(t);
+        int b = binding(token);
+        /* reuse b rather than calling binding() a second time */
+        unless (b > B) {
+            t->token_held = true;
+            return p;
+        }
+        q = new_node(a, token);
+        q->left = p;
+        q->right = read_AE(a, b);
+        p = q;
+    }
+}
+
+/* Read the rest of an "and" / "or" chain. q is the command already read
+ * and op the connective that followed it. Returns a node of type op whose
+ * left child is q; the further operands are linked after q through their
+ * right pointers. The first token that is not op is held. */
+static struct node * read_C_connection(struct analyser * a, struct node * q, int op) {
+    struct tokeniser * t = a->tokeniser;
+    struct node * connection = new_node(a, op);
+    struct node * tail = q;
+    connection->left = q;
+    do {
+        q = read_C(a);
+        tail->right = q;
+        tail = q;
+    } while (read_token(t) == op);
+    t->token_held = true;
+    return connection;
+}
+
+/* Read a bracketed command list; the opening bracket has already been
+ * consumed. Returns a c_bra node whose left child is the first command,
+ * with the rest linked through right pointers. Commands joined by "and" /
+ * "or" are folded into one connection node. A missing closing bracket is
+ * reported as an omission. */
+static struct node * read_C_list(struct analyser * a) {
+    struct tokeniser * t = a->tokeniser;
+    struct node * list = new_node(a, c_bra);
+    struct node * tail = 0;
+    for (;;) {
+        struct node * item;
+        int token = read_token(t);
+        if (token == c_ket) break;
+        if (token < 0) { omission_error(a, c_ket); break; }
+        t->token_held = true;
+
+        item = read_C(a);
+        for (token = read_token(t);
+             token == c_and || token == c_or;
+             token = read_token(t)) {
+            item = read_C_connection(a, item, token);
+        }
+        t->token_held = true;
+
+        if (tail == 0) list->left = item; else tail->right = item;
+        tail = item;
+    }
+    return list;
+}
+
+/* Build a node of type `token`, reading its operands as directed by the
+ * format string s, one letter per operand:
+ *   C  a command, stored in left
+ *   D  a command, stored in aux
+ *   A  an arithmetic expression, stored in AE
+ *   f  the keyword "for" (checked, nothing stored)
+ *   S  a string name or a literal string
+ *   b, s, i  a name of boolean / string / integer type
+ * Any other letter is ignored. */
+static struct node * C_style(struct analyser * a, char * s, int token) {
+    struct node * p = new_node(a, token);
+    int i;
+    for (i = 0; s[i] != 0; i++) {
+        switch (s[i]) {
+            case 'C':
+                p->left = read_C(a);
+                break;
+            case 'D':
+                p->aux = read_C(a);
+                break;
+            case 'A':
+                p->AE = read_AE(a, 0);
+                break;
+            case 'f':
+                get_token(a, c_for);
+                break;
+            case 'S':
+                {
+                    int str_token = read_token(a->tokeniser);
+                    if (str_token == c_name) {
+                        name_to_node(a, p, 's');
+                    } else if (str_token == c_literalstring) {
+                        p->literalstring = new_literalstring(a);
+                    } else {
+                        error(a, 2);
+                    }
+                }
+                break;
+            case 'b':
+            case 's':
+            case 'i':
+                if (get_token(a, c_name)) name_to_node(a, p, s[i]);
+                break;
+        }
+    }
+    return p;
+}
+
+/* Make a c_literalstring node holding a copy of the current token text. */
+static struct node * read_literalstring(struct analyser * a) {
+    struct node * lit = new_node(a, c_literalstring);
+    lit->literalstring = new_literalstring(a);
+    return lit;
+}
+
+/* Reverse the symbols of b in place. */
+static void reverse_b(symbol * b) {
+    int lo = 0;
+    int hi = SIZE(b) - 1;
+    while (lo < hi) {
+        int tmp = b[lo];
+        b[lo] = b[hi];
+        b[hi] = tmp;
+        lo++;
+        hi--;
+    }
+}
+
+/* qsort() comparison for struct amongvec entries: order by string contents
+ * symbol by symbol; when one string is a prefix of the other, the shorter
+ * one sorts first. */
+static int compare_amongvec(const void *pv, const void *qv) {
+    const struct amongvec * p = (const struct amongvec*)pv;
+    const struct amongvec * q = (const struct amongvec*)qv;
+    int common = p->size < q->size ? p->size : q->size;
+    int i;
+    for (i = 0; i < common; i++) {
+        if (p->b[i] == q->b[i]) continue;
+        return p->b[i] - q->b[i];
+    }
+    return p->size - q->size;
+}
+
+/* Build the among table for the among(...) node p and append it to
+ * a->amongs. `substring` is the pending substring node paired with this
+ * among, or 0. p->number must equal the number of literal strings among
+ * p's children. */
+static void make_among(struct analyser * a, struct node * p, struct node * substring) {
+
+ NEW(among, x);
+ NEWVEC(amongvec, v, p->number);
+ struct node * q = p->left;
+ struct amongvec * w0 = v;
+ struct amongvec * w1 = v;
+ int result = 1;
+
+ /* the direction comes from the substring node when there is one */
+ int direction = substring != 0 ? substring->mode : p->mode;
+ int backward = direction == m_backward;
+
+ if (a->amongs == 0) a->amongs = x; else a->amongs_end->next = x;
+ a->amongs_end = x;
+ x->next = 0;
+ x->b = v;
+ x->number = a->among_count++;
+ x->starter = 0;
+
+ /* a bracketed command in first position is the starter */
+ if (q->type == c_bra) { x->starter = q; q = q->right; }
+
+ /* w1 is the next free entry; w0 is the first entry that has not yet been
+ given a command. A bracketed command applies to all of w0..w1. */
+ until (q == 0) {
+ if (q->type == c_literalstring) {
+ symbol * b = q->literalstring;
+ w1->b = b; /* pointer to case string */
+ w1->p = 0; /* pointer to corresponding case expression */
+ w1->size = SIZE(b); /* number of characters in string */
+ w1->i = -1; /* index of longest substring */
+ w1->result = -1; /* number of corresponding case expression */
+ w1->function = q->left == 0 ? 0 : q->left->name;
+ unless (w1->function == 0)
+ check_routine_mode(a, w1->function, direction);
+ w1++;
+ }
+ else
+ if (q->left == 0) /* empty command: () */
+ w0 = w1;
+ else {
+ until (w0 == w1) {
+ w0->p = q;
+ w0->result = result;
+ w0++;
+ }
+ result++;
+ }
+ q = q->right;
+ }
+ /* internal consistency check on the string count */
+ unless (w1-v == p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); }
+ /* in backward mode the strings are sorted in reversed form, then
+ restored once the substring indexes have been computed */
+ if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b);
+ qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec);
+
+ /* the following loop is O(n squared) */
+ for (w0 = w1 - 1; w0 >= v; w0--) {
+ symbol * b = w0->b;
+ int size = w0->size;
+ struct amongvec * w;
+
+ for (w = w0 - 1; w >= v; w--) {
+ if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) {
+ w0->i = w - v; /* fill in index of longest substring */
+ break;
+ }
+ }
+ }
+ if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b);
+
+ /* after sorting, equal strings are adjacent: report duplicates */
+ for (w0 = v; w0 < w1 - 1; w0++)
+ if (w0->size == (w0 + 1)->size &&
+ memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) error3(a, p, w0->b);
+
+ x->literalstring_count = p->number;
+ x->command_count = result - 1;
+ p->among = x;
+
+ x->substring = substring;
+ if (substring != 0) substring->among = x;
+ unless (x->command_count == 0 && x->starter == 0) a->amongvar_needed = true;
+}
+
+/* Read "among ( ... )" and return its node. Literal strings (each
+ * optionally followed by a routine name) and bracketed commands become
+ * children linked through right pointers. Any pending substring node is
+ * claimed for this among. The among table is only built when no errors
+ * have been reported so far. */
+static struct node * read_among(struct analyser * a) {
+ struct tokeniser * t = a->tokeniser;
+ struct node * p = new_node(a, c_among);
+ struct node * p_end = 0;
+ int previous_token = -1;
+ struct node * substring = a->substring;
+
+ a->substring = 0;
+ p->number = 0; /* counts the number of literals */
+ unless (get_token(a, c_bra)) return p;
+ repeat {
+ struct node * q;
+ int token = read_token(t);
+ switch (token) {
+ case c_literalstring:
+ q = read_literalstring(a);
+ /* an optional routine name may follow the string */
+ if (read_token(t) == c_name) {
+ struct node * r = new_node(a, c_name);
+ name_to_node(a, r, 'r');
+ q->left = r;
+ }
+ else t->token_held = true;
+ p->number++; break;
+ case c_bra:
+ if (previous_token == c_bra) error(a, 19);
+ q = read_C_list(a); break;
+ default:
+ error(a, 3);
+ /* deliberate fall through: an unexpected token also ends the among */
+ case c_ket:
+ if (p->number == 0) error(a, 18);
+ if (t->error_count == 0) make_among(a, p, substring);
+ return p;
+ }
+ previous_token = token;
+ if (p_end == 0) p->left = q; else p_end->right = q;
+ p_end = q;
+ }
+}
+
+/* Make a c_substring node and record it as the pending substring, to be
+ * claimed by the next among. A substring that is already pending is
+ * reported as error 20, quoting its line number. */
+static struct node * read_substring(struct analyser * a) {
+
+    struct node * sub = new_node(a, c_substring);
+    struct node * pending = a->substring;
+    if (pending != 0) error2(a, 20, pending->line_number);
+    a->substring = sub;
+    return sub;
+}
+
+static void check_modifyable(struct analyser * a) {
+ unless (a->modifyable) error(a, 15);
+}
+
+/* Read one command and return its node. Dispatches on the first token;
+ * most commands are built by C_style() from a format string describing
+ * their operands. Returns 0, after reporting error 1, when the token
+ * cannot start a command. */
+static struct node * read_C(struct analyser * a) {
+    struct tokeniser * t = a->tokeniser;
+    int token = read_token(t);
+    switch (token) {
+        case c_bra:
+            return read_C_list(a);
+        case c_backwards:
+            {
+                /* switch to backward mode for the operand, then restore */
+                int mode = a->mode;
+                if (a->mode == m_backward) error(a, 17); else a->mode = m_backward;
+                {   struct node * p = C_style(a, "C", token);
+                    a->mode = mode;
+                    return p;
+                }
+            }
+        case c_reverse:
+            {
+                /* flip the mode and forbid modification for the operand */
+                int mode = a->mode;
+                int modifyable = a->modifyable;
+                a->modifyable = false;
+                a->mode = mode == m_forward ? m_backward : m_forward;
+                {
+                    struct node * p = C_style(a, "C", token);
+                    a->mode = mode;
+                    a->modifyable = modifyable;
+                    return p;
+                }
+            }
+        case c_not:
+        case c_try:
+        case c_fail:
+        case c_test:
+        case c_do:
+        case c_goto:
+        case c_gopast:
+        case c_repeat:
+            return C_style(a, "C", token);
+        case c_loop:
+        case c_atleast:
+            return C_style(a, "AC", token);
+        case c_setmark:
+            return C_style(a, "i", token);
+        case c_tomark:
+        case c_atmark:
+        case c_hop:
+            return C_style(a, "A", token);
+        case c_delete:
+            check_modifyable(a);
+            /* fall through */
+        case c_next:
+        case c_tolimit:
+        case c_atlimit:
+        case c_leftslice:
+        case c_rightslice:
+        case c_true:
+        case c_false:
+        case c_debug:
+            return C_style(a, "", token);
+        case c_assignto:
+        case c_sliceto:
+            check_modifyable(a);
+            return C_style(a, "s", token);
+        case c_assign:
+        case c_insert:
+        case c_attach:
+        case c_slicefrom:
+            check_modifyable(a);
+            return C_style(a, "S", token);
+        case c_setlimit:
+            return C_style(a, "CfD", token);
+        case c_set:
+        case c_unset:
+            return C_style(a, "b", token);
+        case c_dollar:
+            get_token(a, c_name);
+            {
+                struct node * p;
+                struct name * q = find_name(a);
+                int mode = a->mode;
+                int modifyable = a->modifyable;
+                switch (q ? q->type : t_string)
+                    /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */
+                {
+                    default: error(a, 34);
+                    /* fall through: carry on as for a string */
+                    case t_string:
+                        a->mode = m_forward;
+                        a->modifyable = true;
+                        p = new_node(a, c_dollar);
+                        p->left = read_C(a); break;
+                    case t_integer:
+                        /* a->mode = m_integer; */
+                        p = new_node(a, read_AE_test(a));
+                        p->AE = read_AE(a, 0); break;
+                }
+                p->name = q;
+                a->mode = mode;
+                a->modifyable = modifyable;
+                return p;
+            }
+        case c_name:
+            {
+                struct name * q = find_name(a);
+                struct node * p = new_node(a, c_name);
+                unless (q == 0) {
+                    q->used = true;
+                    switch (q->type) {
+                        case t_boolean:
+                            p->type = c_booltest; break;
+                        case t_integer:
+                            error(a, 35); /* integer name misplaced */
+                            /* fall through */
+                        case t_string:
+                            break;
+                        case t_routine:
+                        case t_external:
+                            p->type = c_call;
+                            check_routine_mode(a, q, a->mode);
+                            break;
+                        case t_grouping:
+                            p->type = c_grouping; break;
+                    }
+                }
+                p->name = q;
+                return p;
+            }
+        case c_non:
+            {
+                struct node * p = new_node(a, token);
+                read_token(t);
+                /* an optional '-' may separate "non" from the name */
+                if (t->token == c_minus) read_token(t);
+                /* check_token() reports the omission itself; reporting it
+                   again here would print and count the error twice */
+                unless (check_token(a, c_name)) return p;
+                name_to_node(a, p, 'g');
+                return p;
+            }
+        case c_literalstring:
+            return read_literalstring(a);
+        case c_among: return read_among(a);
+        case c_substring: return read_substring(a);
+        default: error(a, 1); return 0;
+    }
+}
+
+/* Fetch the next character at p into W[0] and return how many symbols of p
+ * it occupies. With utf8 set the character is decoded by get_utf8();
+ * otherwise it is the single symbol p[0]. */
+static int next_symbol(symbol * p, symbol * W, int utf8) {
+    int ch;
+    int width;
+    if (!utf8) {
+        W[0] = p[0];
+        return 1;
+    }
+    width = get_utf8(p, & ch);
+    W[0] = ch;
+    return width;
+}
+
+/* Apply one "+ q" or "- q" step of a grouping definition to p.
+ * With style == c_plus every character of q is appended to p; with any
+ * other style every occurrence of each character of q is removed from p.
+ * q is decoded as UTF-8 when utf8 is set. Returns the (possibly
+ * reallocated) p. */
+static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) {
+    int j = 0;
+    symbol W[1];
+    int width;
+    if (style == c_plus) {
+        while (j < SIZE(q)) {
+            width = next_symbol(q + j, W, utf8);
+            p = add_to_b(p, 1, W);
+            j += width;
+        }
+    } else {
+        while (j < SIZE(q)) {
+            int i = 0;
+            width = next_symbol(q + j, W, utf8);
+            while (i < SIZE(p)) {
+                if (p[i] == W[0]) {
+                    memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * sizeof(symbol));
+                    SIZE(p)--;
+                    /* stay on i: the element shifted into this slot must be
+                       tested too, or adjacent duplicates would survive */
+                } else {
+                    i++;
+                }
+            }
+            j += width;
+        }
+    }
+    return p;
+}
+
+/* Read the body of "define <grouping> ...": a sequence of literal strings
+ * and grouping names joined by '+' and '-'. The resulting character set is
+ * stored in a new grouping record appended to a->groupings, together with
+ * its smallest and largest character. An empty result is reported as
+ * error 16. */
+static void read_define_grouping(struct analyser * a, struct name * q) {
+    struct tokeniser * t = a->tokeniser;
+    int style = c_plus;
+    {
+        NEW(grouping, p);
+        if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p;
+        a->groupings_end = p;
+        q->grouping = p;
+        p->next = 0;
+        p->name = q;
+        p->number = q->count;
+        p->b = create_b(0);
+        repeat {
+            switch (read_token(t)) {
+                case c_name:
+                    {
+                        struct name * r = find_name(a);
+                        unless (r == 0) {
+                            check_name_type(a, r, 'g');
+                            /* r->grouping is 0 when r is not a grouping (already
+                               reported above) or is a grouping that has not been
+                               defined yet; never dereference it in that case */
+                            if (r->grouping != 0)
+                                p->b = alter_grouping(p->b, r->grouping->b, style, false);
+                            else if (r->type == t_grouping)
+                                error4(a, r);
+                        }
+                    }
+                    break;
+                case c_literalstring:
+                    p->b = alter_grouping(p->b, t->b, style, a->utf8);
+                    break;
+                default: error(a, 1); return;
+            }
+            switch (read_token(t)) {
+                case c_plus:
+                case c_minus: style = t->token; break;
+                default: goto label0;
+            }
+        }
+    label0:
+        {
+            int i;
+            int max = 0;
+            int min = 1<<16;   /* sentinel: still 1<<16 if the set is empty */
+            for (i = 0; i < SIZE(p->b); i++) {
+                if (p->b[i] > max) max = p->b[i];
+                if (p->b[i] < min) min = p->b[i];
+            }
+            p->largest_ch = max;
+            p->smallest_ch = min;
+            if (min == 1<<16) error(a, 16);
+        }
+        /* the token that ended the definition belongs to the caller */
+        t->token_held = true; return;
+    }
+}
+
+/* Parse 'define <routine> as <command>'.  A c_define node is appended to the
+ * program list; when q is a known name its type, prior definition and mode
+ * are checked and the parsed command is stored as its definition.  A
+ * 'substring' still pending at the end of the routine is reported (error 14). */
+static void read_define_routine(struct analyser * a, struct name * q) {
+    struct node * def = new_node(a, c_define);
+    a->amongvar_needed = false;
+    if (q != 0) {
+        check_name_type(a, q, 'R');
+        if (q->definition != 0) error(a, 36);
+        if (q->mode < 0) {
+            q->mode = a->mode;
+        } else if (q->mode != a->mode) {
+            error2(a, 32, q->mode);
+        }
+    }
+    def->name = q;
+    if (a->program == 0) {
+        a->program = def;
+    } else {
+        a->program_end->right = def;
+    }
+    a->program_end = def;
+    get_token(a, c_as);
+    def->left = read_C(a);
+    if (q != 0) q->definition = def->left;
+
+    if (a->substring != 0) {
+        error2(a, 14, a->substring->line_number);
+        a->substring = 0;
+    }
+    def->amongvar_needed = a->amongvar_needed;
+}
+
+/* Handle 'define': the name that follows decides whether a grouping or a
+ * routine definition is parsed.  Unknown names go to the routine parser. */
+static void read_define(struct analyser * a) {
+    struct name * q;
+    unless (get_token(a, c_name)) return;
+    q = find_name(a);
+    if (q == 0 || q->type != t_grouping) {
+        read_define_routine(a, q);
+    } else {
+        read_define_grouping(a, q);
+    }
+}
+
+/* Parse 'backwardmode ( ... )': the bracketed section is read with a->mode
+ * set to m_backward, and the previous mode is put back afterwards. */
+static void read_backwardmode(struct analyser * a) {
+    const int saved_mode = a->mode;
+    a->mode = m_backward;
+    if (get_token(a, c_bra)) {
+        read_program_(a, c_ket);
+        check_token(a, c_ket);
+    }
+    a->mode = saved_mode;
+}
+
+/* Read declarations and definitions until `terminator` is reached.
+ * read_backwardmode() passes c_ket (stop at the closing bracket);
+ * read_program() passes -1 (run until read_token() yields -1). */
+static void read_program_(struct analyser * a, int terminator) {
+ struct tokeniser * t = a->tokeniser;
+ repeat {
+ switch (read_token(t)) {
+ case c_strings: read_names(a, t_string); break;
+ case c_booleans: read_names(a, t_boolean); break;
+ case c_integers: read_names(a, t_integer); break;
+ case c_routines: read_names(a, t_routine); break;
+ case c_externals: read_names(a, t_external); break;
+ case c_groupings: read_names(a, t_grouping); break;
+ case c_define: read_define(a); break;
+ case c_backwardmode:read_backwardmode(a); break;
+ case c_ket:
+ if (terminator == c_ket) return;
+ /* a closing bracket nobody was waiting for: fall through to the error */
+ default:
+ error(a, 1); break;
+ case -1:
+ /* -1 from read_token (presumably end of input): a closing bracket
+ that was still expected is reported as missing */
+ unless (terminator < 0) omission_error(a, c_ket);
+ return;
+ }
+ }
+}
+
+/* Read the whole program, then run whole-program checks over a->names:
+ *  - routines/externals and groupings that are used but never defined are
+ *    reported through error4();
+ *  - if the tokeniser has recorded no errors, two warning lists are printed
+ *    on stderr: names never referenced, and routines/groupings never used. */
+extern void read_program(struct analyser * a) {
+    struct name * q;
+    int warned;
+
+    read_program_(a, -1);
+
+    for (q = a->names; q != 0; q = q->next) {
+        switch (q->type) {
+            case t_external:
+            case t_routine:
+                if (q->used && q->definition == 0) error4(a, q);
+                break;
+            case t_grouping:
+                if (q->used && q->grouping == 0) error4(a, q);
+                break;
+        }
+    }
+
+    if (a->tokeniser->error_count != 0) return;
+
+    warned = false;
+    for (q = a->names; q != 0; q = q->next) {
+        if (q->referenced) continue;
+        unless (warned) {
+            fprintf(stderr, "Declared but not used:");
+            warned = true;
+        }
+        fprintf(stderr, " "); report_b(stderr, q->b);
+    }
+    if (warned) fprintf(stderr, "\n");
+
+    warned = false;
+    for (q = a->names; q != 0; q = q->next) {
+        if (q->used) continue;
+        if (q->type != t_routine && q->type != t_grouping) continue;
+        unless (warned) {
+            fprintf(stderr, "Declared and defined but not used:");
+            warned = true;
+        }
+        fprintf(stderr, " "); report_b(stderr, q->b);
+    }
+    if (warned) fprintf(stderr, "\n");
+}
+
+/* Allocate an analyser bound to tokeniser t, with empty lists, forward mode,
+ * zeroed per-type name counts and no pending substring. */
+extern struct analyser * create_analyser(struct tokeniser * t) {
+    NEW(analyser, a);
+    int type;
+
+    a->tokeniser = t;
+
+    /* nothing parsed yet */
+    a->nodes = 0;
+    a->names = 0;
+    a->literalstrings = 0;
+    a->program = 0;
+    a->amongs = 0;
+    a->among_count = 0;
+    a->groupings = 0;
+    a->substring = 0;
+
+    a->mode = m_forward;
+    a->modifyable = true;
+
+    for (type = 0; type < t_size; type++) {
+        a->name_count[type] = 0;
+    }
+    return a;
+}
+
+/* Release everything the analyser owns: its nodes, names, literal strings,
+ * amongs and groupings (each with the block hanging off it), then the
+ * analyser itself. */
+extern void close_analyser(struct analyser * a) {
+    {
+        struct node * cur = a->nodes;
+        while (cur != 0) {
+            struct node * following = cur->next;
+            FREE(cur);
+            cur = following;
+        }
+    }
+    {
+        struct name * cur = a->names;
+        while (cur != 0) {
+            struct name * following = cur->next;
+            lose_b(cur->b);
+            FREE(cur);
+            cur = following;
+        }
+    }
+    {
+        struct literalstring * cur = a->literalstrings;
+        while (cur != 0) {
+            struct literalstring * following = cur->next;
+            lose_b(cur->b);
+            FREE(cur);
+            cur = following;
+        }
+    }
+    {
+        struct among * cur = a->amongs;
+        while (cur != 0) {
+            struct among * following = cur->next;
+            FREE(cur->b);   /* released with FREE, not lose_b */
+            FREE(cur);
+            cur = following;
+        }
+    }
+    {
+        struct grouping * cur = a->groupings;
+        while (cur != 0) {
+            struct grouping * following = cur->next;
+            lose_b(cur->b);
+            FREE(cur);
+            cur = following;
+        }
+    }
+    FREE(a);
+}
+
diff --git a/contrib/snowball/compiler/driver.c b/contrib/snowball/compiler/driver.c
new file mode 100644
index 000000000..fbb1e9cae
--- /dev/null
+++ b/contrib/snowball/compiler/driver.c
@@ -0,0 +1,257 @@
+#include <stdio.h> /* for fprintf etc */
+#include <stdlib.h> /* for free etc */
+#include <string.h> /* for strlen */
+#include "header.h"
+
+#define DEFAULT_PACKAGE "org.tartarus.snowball.ext"
+#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
+#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among"
+#define DEFAULT_STRING_CLASS "java.lang.StringBuilder"
+
+/* Return non-zero when the two NUL-terminated strings are identical.
+ * strcmp() replaces the hand-rolled length + memcmp comparison, which
+ * narrowed the size_t lengths to int. */
+static int eq(const char * s1, const char * s2) {
+    return strcmp(s1, s2) == 0;
+}
+
+/* Print the usage text on stderr and terminate with exit status 1.
+ * The Java-only options are listed only when DISABLE_JAVA is not defined. */
+static void print_arglist(void) {
+ fprintf(stderr, "Usage: snowball <file> [options]\n\n"
+ "options are: [-o[utput] file]\n"
+ " [-s[yntax]]\n"
+#ifndef DISABLE_JAVA
+ " [-j[ava]]\n"
+#endif
+ " [-c++]\n"
+ " [-w[idechars]]\n"
+ " [-u[tf8]]\n"
+ " [-n[ame] class name]\n"
+ " [-ep[refix] string]\n"
+ " [-vp[refix] string]\n"
+ " [-i[nclude] directory]\n"
+ " [-r[untime] path to runtime headers]\n"
+#ifndef DISABLE_JAVA
+ " [-p[arentclassname] fully qualified parent class name]\n"
+ " [-P[ackage] package name for stemmers]\n"
+ " [-S[tringclass] StringBuffer-compatible class]\n"
+ " [-a[mongclass] fully qualified name of the Among class]\n"
+#endif
+ );
+ exit(1);
+}
+
+/* Make sure argv[i] exists; otherwise complain and show the usage text
+ * (print_arglist() exits). */
+static void check_lim(int i, int argc) {
+    if (i < argc) return;
+    fprintf(stderr, "argument list is one short\n");
+    print_arglist();
+}
+
+/* Open the file named by block b for writing and return the stream.
+ * If it cannot be opened, a message is printed and the program exits
+ * with status 1. */
+static FILE * get_output(symbol * b) {
+    char * path = b_to_s(b);
+    FILE * stream = fopen(path, "w");
+    if (stream == 0) {
+        fprintf(stderr, "Can't open output %s\n", path);
+        exit(1);
+    }
+    free(path);
+    return stream;
+}
+
+/* Fill *o with the default settings and then apply the command-line options
+ * found in argv[2..argc-1] (argv[1], the source file, is used by main).
+ * An option whose argument is missing, or an unrecognised word, prints the
+ * usage text and exits (see check_lim / print_arglist).
+ * -w and -u are mutually exclusive: whichever comes last wins.
+ * The string-class option is accepted as "-S", as "-Stringclass" (the
+ * spelling shown in the usage text) and as the historical "-stringclass". */
+static void read_options(struct options * o, int argc, char * argv[]) {
+    char * s;
+    int i = 2;
+
+    /* set defaults: */
+
+    o->output_file = 0;
+    o->syntax_tree = false;
+    o->externals_prefix = "";
+    o->variables_prefix = 0;
+    o->runtime_path = 0;
+    o->parent_class_name = DEFAULT_BASE_CLASS;
+    o->string_class = DEFAULT_STRING_CLASS;
+    o->among_class = DEFAULT_AMONG_CLASS;
+    o->package = DEFAULT_PACKAGE;
+    o->name = "";
+    o->make_lang = LANG_C;
+    o->widechars = false;
+    o->includes = 0;
+    o->includes_end = 0;
+    o->utf8 = false;
+
+    /* read options: */
+
+    repeat {
+        if (i >= argc) break;
+        s = argv[i++];
+        {   if (eq(s, "-o") || eq(s, "-output")) {
+                check_lim(i, argc);
+                o->output_file = argv[i++];
+                continue;
+            }
+            if (eq(s, "-n") || eq(s, "-name")) {
+                check_lim(i, argc);
+                o->name = argv[i++];
+                continue;
+            }
+#ifndef DISABLE_JAVA
+            if (eq(s, "-j") || eq(s, "-java")) {
+                o->make_lang = LANG_JAVA;
+                o->widechars = true;
+                continue;
+            }
+#endif
+            if (eq(s, "-c++")) {
+                o->make_lang = LANG_CPLUSPLUS;
+                continue;
+            }
+            if (eq(s, "-w") || eq(s, "-widechars")) {
+                o->widechars = true;
+                o->utf8 = false;
+                continue;
+            }
+            if (eq(s, "-s") || eq(s, "-syntax")) {
+                o->syntax_tree = true;
+                continue;
+            }
+            if (eq(s, "-ep") || eq(s, "-eprefix")) {
+                check_lim(i, argc);
+                o->externals_prefix = argv[i++];
+                continue;
+            }
+            if (eq(s, "-vp") || eq(s, "-vprefix")) {
+                check_lim(i, argc);
+                o->variables_prefix = argv[i++];
+                continue;
+            }
+            if (eq(s, "-i") || eq(s, "-include")) {
+                check_lim(i, argc);
+
+                {
+                    /* append "<directory>/" to the list of include paths */
+                    NEW(include, p);
+                    symbol * b = add_s_to_b(0, argv[i++]);
+                    b = add_s_to_b(b, "/");
+                    p->next = 0; p->b = b;
+
+                    if (o->includes == 0) o->includes = p; else
+                                          o->includes_end->next = p;
+                    o->includes_end = p;
+                }
+                continue;
+            }
+            if (eq(s, "-r") || eq(s, "-runtime")) {
+                check_lim(i, argc);
+                o->runtime_path = argv[i++];
+                continue;
+            }
+            if (eq(s, "-u") || eq(s, "-utf8")) {
+                o->utf8 = true;
+                o->widechars = false;
+                continue;
+            }
+#ifndef DISABLE_JAVA
+            if (eq(s, "-p") || eq(s, "-parentclassname")) {
+                check_lim(i, argc);
+                o->parent_class_name = argv[i++];
+                continue;
+            }
+            if (eq(s, "-P") || eq(s, "-Package")) {
+                check_lim(i, argc);
+                o->package = argv[i++];
+                continue;
+            }
+            if (eq(s, "-S") || eq(s, "-Stringclass") || eq(s, "-stringclass")) {
+                check_lim(i, argc);
+                o->string_class = argv[i++];
+                continue;
+            }
+            if (eq(s, "-a") || eq(s, "-amongclass")) {
+                check_lim(i, argc);
+                o->among_class = argv[i++];
+                continue;
+            }
+#endif
+            fprintf(stderr, "'%s' misplaced\n", s);
+            print_arglist();
+        }
+    }
+}
+
+/* Compiler driver: read options, load and analyse the program named by
+ * argv[1], then either print its syntax tree (-syntax) or generate C/C++
+ * (".h" plus ".c"/".cc") or Java (".java") output next to the -o name.
+ * Exits with status 1 on usage errors, unreadable input or analysis errors. */
+extern int main(int argc, char * argv[]) {
+
+ NEW(options, o);
+ if (argc == 1) print_arglist();
+ read_options(o, argc, argv);
+ {
+ symbol * filename = add_s_to_b(0, argv[1]);
+ char * file;
+ symbol * u = get_input(filename, &file);
+ if (u == 0) {
+ fprintf(stderr, "Can't open input %s\n", argv[1]);
+ exit(1);
+ }
+ {
+ struct tokeniser * t = create_tokeniser(u, file);
+ struct analyser * a = create_analyser(t);
+ /* pass the relevant command-line settings on to tokeniser and analyser */
+ t->widechars = o->widechars;
+ t->includes = o->includes;
+ a->utf8 = t->utf8 = o->utf8;
+ read_program(a);
+ if (t->error_count > 0) exit(1);
+ if (o->syntax_tree) print_program(a);
+ close_tokeniser(t);
+ unless (o->syntax_tree) {
+ struct generator * g;
+
+ char * s = o->output_file;
+ unless (s) {
+ fprintf(stderr, "Please include the -o option\n");
+ print_arglist();
+ exit(1);
+ }
+ if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".h");
+ o->output_h = get_output(b);
+ /* reuse the block: turn the trailing 'h' into 'c' (".c"), and
+ append one more 'c' for C++ (".cc") */
+ b[SIZE(b) - 1] = 'c';
+ if (o->make_lang == LANG_CPLUSPLUS) {
+ b = add_s_to_b(b, "c");
+ }
+ o->output_c = get_output(b);
+ lose_b(b);
+
+ g = create_generator_c(a, o);
+ generate_program_c(g);
+ close_generator_c(g);
+ fclose(o->output_c);
+ fclose(o->output_h);
+ }
+#ifndef DISABLE_JAVA
+ if (o->make_lang == LANG_JAVA) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".java");
+ o->output_java = get_output(b);
+ lose_b(b);
+ g = create_generator_java(a, o);
+ generate_program_java(g);
+ close_generator_java(g);
+ fclose(o->output_java);
+ }
+#endif
+ }
+ close_analyser(a);
+ }
+ lose_b(u);
+ lose_b(filename);
+ }
+ /* free the list of include directories built by read_options */
+ { struct include * p = o->includes;
+ until (p == 0)
+ { struct include * q = p->next;
+ lose_b(p->b); FREE(p); p = q;
+ }
+ }
+ FREE(o);
+ /* space_count is not defined in this file; a non-zero value is reported
+ as that many blocks left unfreed */
+ unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count);
+ return 0;
+}
+
diff --git a/contrib/snowball/compiler/generator.c b/contrib/snowball/compiler/generator.c
new file mode 100644
index 000000000..7294b0471
--- /dev/null
+++ b/contrib/snowball/compiler/generator.c
@@ -0,0 +1,1465 @@
+
+#include <limits.h> /* for INT_MAX */
+#include <stdio.h> /* for fprintf etc */
+#include <stdlib.h> /* for free etc */
+#include <string.h> /* for strlen */
+#include "header.h"
+
+/* Define this to get warning messages when optimisations can't be used. */
+/* #define OPTIMISATION_WARNINGS */
+
+/* recursive use: */
+
+static void generate(struct generator * g, struct node * p);
+
+/* Special values for g->failure_label: with x_return, wf() emits
+ * "return 0;" instead of a goto to a numbered label. */
+enum special_labels {
+
+ x_return = -1
+
+};
+
+/* Hand out the next unused label number. */
+static int new_label(struct generator * g) {
+    int label = g->next_label;
+    g->next_label = label + 1;
+    return label;
+}
+
+/* Output routines */
+/* Write the contents of str to outfile, going through a temporary C string
+ * made by b_to_s(). */
+static void output_str(FILE * outfile, struct str * str) {
+    char * text = b_to_s(str_data(str));
+    fputs(text, outfile);
+    free(text);
+}
+
+/* Append the single character ch to the current output buffer. */
+static void wch(struct generator * g, int ch) {
+ str_append_ch(g->outbuf, ch); /* character */
+}
+
+/* Append a newline and count it; wsetl() and wbe() compare line_count with
+ * line_labelled to detect a label with nothing after it. */
+static void wnl(struct generator * g) {
+ str_append_ch(g->outbuf, '\n'); /* newline */
+ g->line_count++;
+}
+
+/* Append the C string s to the current output buffer. */
+static void ws(struct generator * g, const char * s) {
+ str_append_string(g->outbuf, s); /* string */
+}
+
+/* Append the integer i to the current output buffer (via str_append_int). */
+static void wi(struct generator * g, int i) {
+ str_append_int(g->outbuf, i); /* integer */
+}
+
+/* Append one upper-case hex digit: the low four bits of i. */
+static void wh_ch(struct generator * g, int i) {
+ str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */
+}
+
+/* Append i in upper-case hexadecimal without leading zeros (a single "0"
+ * for zero).  The value is shifted as unsigned: right-shifting a negative
+ * int need not ever reach zero, which would make the recursion unbounded. */
+static void wh(struct generator * g, int i) {
+    unsigned int rest = (unsigned int) i >> 4;
+    if (rest) wh(g, (int) rest);   /* higher digits first */
+    wh_ch(g, i); /* hex integer */
+}
+
+/* Append i preceded by one space character if i < 100 and by another if
+ * i < 10. */
+static void wi3(struct generator * g, int i) {
+ if (i < 100) wch(g, ' ');
+ if (i < 10) wch(g, ' ');
+ wi(g, i); /* integer (width 3) */
+}
+
+/* Write the C spelling of name p.  Strings, booleans and integers become
+ * an indexed slot (letter '[' count ']'); externals get the configured
+ * externals prefix; every other type gets its type letter and '_'.  The
+ * last two forms are followed by the name's own text. */
+static void wvn(struct generator * g, struct name * p) { /* variable name */
+
+ /* one letter per name type, indexed by p->type */
+ int ch = "SBIrxg"[p->type];
+ switch (p->type) {
+ case t_string:
+ case t_boolean:
+ case t_integer:
+ wch(g, ch); wch(g, '['); wi(g, p->count); wch(g, ']'); return;
+ case t_external:
+ ws(g, g->options->externals_prefix); break;
+ default:
+ wch(g, ch); wch(g, '_');
+ }
+ str_append_b(g->outbuf, p->b);
+}
+
+/* Write a reference to name p: types ordered before t_routine are prefixed
+ * with "z->", then the name is written by wvn(). */
+static void wv(struct generator * g, struct name * p) { /* reference to variable */
+ if (p->type < t_routine) ws(g, "z->");
+ wvn(g, p);
+}
+
+/* Write the symbols of p as a C array initialiser.  Values 32..126 are
+ * written as character constants (quote and backslash escaped), all others
+ * as 0x hex numbers. */
+static void wlitarray(struct generator * g, symbol * p) { /* write literal array */
+    int i;
+
+    ws(g, "{ ");
+    for (i = 0; i < SIZE(p); i++) {
+        int ch = p[i];
+        int printable = 32 <= ch && ch < 127;
+        if (!printable) {
+            wch(g, '0'); wch(g, 'x'); wh(g, ch);
+        } else {
+            wch(g, '\'');
+            if (ch == '\'' || ch == '\\') wch(g, '\\');
+            wch(g, ch);
+            wch(g, '\'');
+        }
+        if (i < SIZE(p) - 1) ws(g, ", ");
+    }
+    ws(g, " }");
+}
+
+/* Write a reference to the literal p.  An empty literal is written as "0";
+ * otherwise a "static const symbol s_<n>[]" array is emitted into
+ * g->declarations and "s_<n>" is written at the current position. */
+static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */
+
+ if (SIZE(p) == 0) ws(g, "0"); else {
+ /* temporarily redirect output to the declarations buffer */
+ struct str * s = g->outbuf;
+ g->outbuf = g->declarations;
+ ws(g, "static const symbol s_"); wi(g, g->literalstring_count); ws(g, "[] = ");
+ wlitarray(g, p);
+ ws(g, ";\n");
+ g->outbuf = s;
+ ws(g, "s_"); wi(g, g->literalstring_count);
+ g->literalstring_count++;
+ }
+}
+
+
+/* Write the left margin: one indentation unit per level of g->margin. */
+static void wm(struct generator * g) { /* margin */
+ int i;
+ for (i = 0; i < g->margin; i++) ws(g, " ");
+}
+
+/* Write a trailing C comment describing node p (token name, the name it
+ * refers to if any, and its source line), followed by a newline.  For the
+ * integer assignments and comparisons the form is
+ * "$name <op> <integer expression>". */
+static void wc(struct generator * g, struct node * p) { /* comment */
+
+ ws(g, " /* ");
+ switch (p->type) {
+ case c_mathassign:
+ case c_plusassign:
+ case c_minusassign:
+ case c_multiplyassign:
+ case c_divideassign:
+ case c_eq:
+ case c_ne:
+ case c_gr:
+ case c_ge:
+ case c_ls:
+ case c_le:
+ if (p->name) {
+ wch(g, '$');
+ str_append_b(g->outbuf, p->name->b);
+ wch(g, ' ');
+ }
+ ws(g, name_of_token(p->type));
+ ws(g, " <integer expression>");
+ break;
+ default:
+ ws(g, name_of_token(p->type));
+ if (p->name) {
+ wch(g, ' ');
+ str_append_b(g->outbuf, p->name->b);
+ }
+ }
+ ws(g, ", line "); wi(g, p->line_number); ws(g, " */");
+ wnl(g);
+}
+
+/* Write the margin followed by the string s. */
+static void wms(struct generator * g, const char * s) {
+ wm(g); ws(g, s); } /* margin + string */
+
+/* Open a C block at the current margin and indent one level further. */
+static void wbs(struct generator * g) { /* block start */
+ wms(g, "{ ");
+ g->margin++;
+}
+
+/* Close a C block and step the margin back one level. */
+static void wbe(struct generator * g) { /* block end */
+
+ /* a label was the last thing written: give it an empty statement so
+ that it is not directly followed by '}' */
+ if (g->line_labelled == g->line_count) { wms(g, ";"); wnl(g); }
+ g->margin--;
+ wms(g, "}"); wnl(g);
+}
+
+/* Declare a fresh numbered variable that remembers the cursor: "c<n>" holds
+ * z->c in forward mode, "m<n>" holds the distance from the limit z->l in
+ * backward mode (plus a "(void)m<n>;" use).  g->keep_count is incremented
+ * first and supplies <n>. */
+static void wk(struct generator * g, struct node * p) { /* keep c */
+ ++g->keep_count;
+ if (p->mode == m_forward) {
+ ws(g, "int c"); wi(g, g->keep_count); ws(g, " = z->c;");
+ } else {
+ ws(g, "int m"); wi(g, g->keep_count); ws(g, " = z->l - z->c; (void)m");
+ wi(g, g->keep_count); ws(g, ";");
+ }
+}
+
+/* Write the assignment that puts the cursor back from the variable numbered
+ * keep_token (the counterpart of wk()). */
+static void wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */
+ if (p->mode == m_forward) {
+ ws(g, "z->c = c");
+ } else {
+ ws(g, "z->c = z->l - m");
+ }
+ wi(g, keep_token); ws(g, ";");
+}
+
+/* Write a one-step cursor move: "z->c++;" forwards, "z->c--;" backwards. */
+static void winc(struct generator * g, struct node * p) { /* increment c */
+ ws(g, p->mode == m_forward ? "z->c++;" :
+ "z->c--;");
+}
+
+/* Write the label "lab<n>:" one margin level to the left and remember the
+ * line count reached, so wbe() can tell when the label is the last thing
+ * in its block. */
+static void wsetl(struct generator * g, int n) {
+
+ g->margin--;
+ wms(g, "lab"); wi(g, n); wch(g, ':'); wnl(g);
+ g->line_labelled = g->line_count;
+ g->margin++;
+}
+
+/* Write "goto lab<n>;" on a line of its own at the current margin. */
+static void wgotol(struct generator * g, int n) {
+ wms(g, "goto lab"); wi(g, n); wch(g, ';'); wnl(g);
+}
+
+/* Write the current failure action: "return 0;" when the failure label is
+ * x_return, otherwise a goto to that label (which is then marked as used).
+ * When a failure string is set, the result is wrapped as
+ * "{ <failure string> <action> }". */
+static void wf(struct generator * g) { /* fail */
+    const int wrapped = g->failure_string != 0;
+
+    if (wrapped) {
+        ws(g, "{ ");
+        ws(g, g->failure_string);
+        wch(g, ' ');
+    }
+
+    if (g->failure_label == x_return) {
+        ws(g, "return 0;");
+    } else {
+        ws(g, "goto lab");
+        wi(g, g->failure_label);
+        wch(g, ';');
+        g->label_used = 1;
+    }
+
+    if (wrapped) ws(g, " }");
+}
+
+/* Write a test that takes the failure action when the cursor has reached
+ * the limit: z->l in forward mode, z->lb in backward mode. */
+static void wlim(struct generator * g, struct node * p) { /* if at limit fail */
+
+ ws(g, p->mode == m_forward ? "if (z->c >= z->l) " :
+ "if (z->c <= z->lb) ");
+ wf(g);
+}
+
+/* Formatted write.  Characters of s are copied to the output, except that
+ * '~' introduces an escape handled by the switch below.  The escapes that
+ * take a digit (~S ~I ~J ~V ~W ~L ~A) read slot <digit> of g->S, g->I,
+ * g->V or g->L.  p is the node used by the escapes that need one (~C ~k ~K
+ * ~R ~i ~l ~$); w() passes 0 for it, so such escapes must go through wp()
+ * with a real node. */
+static void wp(struct generator * g, const char * s, struct node * p) { /* formatted write */
+ int i = 0;
+ int l = strlen(s);
+ until (i >= l) {
+ int ch = s[i++];
+ if (ch != '~') wch(g, ch); else
+ switch(s[i++]) {
+ /* unknown escape: the character after '~' is written as itself */
+ default: wch(g, s[i - 1]); continue;
+ case 'C': wc(g, p); continue;
+ case 'k': wk(g, p); continue;
+ case 'K': /* keep for c_test */
+ ws(g, p->mode == m_forward ? "int c_test = z->c;" :
+ "int m_test = z->l - z->c;");
+ continue;
+ case 'R': /* restore for c_test */
+ ws(g, p->mode == m_forward ? "z->c = c_test;" :
+ "z->c = z->l - m_test;");
+ continue;
+ case 'i': winc(g, p); continue;
+ case 'l': wlim(g, p); continue;
+ case 'f': wf(g); continue;
+ case 'M': wm(g); continue;
+ case 'N': wnl(g); continue;
+ case '{': wbs(g); continue;
+ case '}': wbe(g); continue;
+ case 'S': ws(g, g->S[s[i++] - '0']); continue;
+ case 'I': wi(g, g->I[s[i++] - '0']); continue;
+ case 'J': wi3(g, g->I[s[i++] - '0']); continue;
+ case 'V': wv(g, g->V[s[i++] - '0']); continue;
+ case 'W': wvn(g, g->V[s[i++] - '0']); continue;
+ case 'L': wlitref(g, g->L[s[i++] - '0']); continue;
+ case 'A': wlitarray(g, g->L[s[i++] - '0']); continue;
+ case '+': g->margin++; continue;
+ case '-': g->margin--; continue;
+ case '$': /* insert_s, insert_v etc */
+ wch(g, p->literalstring == 0 ? 'v' : 's');
+ continue;
+ case 'p': ws(g, g->options->externals_prefix); continue;
+ }
+ }
+}
+
+/* Formatted write without a node: only for formats whose escapes do not
+ * look at the node argument of wp(). */
+static void w(struct generator * g, const char * s) { wp(g, s, 0); }
+
+/* Write the C form of the arithmetic expression rooted at p.  Binary
+ * operators are written fully parenthesised as "(left op right)"; node types
+ * not listed here produce no output. */
+static void generate_AE(struct generator * g, struct node * p) {
+    const char * binary_op = 0;   /* set only for the four binary operators */
+
+    switch (p->type) {
+        case c_name:
+            wv(g, p->name); break;
+        case c_number:
+            wi(g, p->number); break;
+        case c_maxint:
+            ws(g, "MAXINT"); break;
+        case c_minint:
+            ws(g, "MININT"); break;
+        case c_neg:
+            wch(g, '-'); generate_AE(g, p->right); break;
+        case c_multiply:
+            binary_op = " * "; break;
+        case c_plus:
+            binary_op = " + "; break;
+        case c_minus:
+            binary_op = " - "; break;
+        case c_divide:
+            binary_op = " / "; break;
+        case c_sizeof:
+            g->V[0] = p->name;
+            w(g, "SIZE(~V0)"); break;
+        case c_cursor:
+            w(g, "z->c"); break;
+        case c_limit:
+            w(g, p->mode == m_forward ? "z->l" : "z->lb"); break;
+        case c_size:
+            w(g, "SIZE(z->p)"); break;
+    }
+
+    if (binary_op != 0) {
+        wch(g, '(');
+        generate_AE(g, p->left);
+        ws(g, binary_op);
+        generate_AE(g, p->right);
+        wch(g, ')');
+    }
+}
+
+/* K_needed() tests to see if we really need to keep c. Not true when the
+ command does not touch the cursor. This and repeat_score() could be
+ elaborated almost indefinitely.
+*/
+
+/* Return true if the command list starting at p (linked through ->right)
+ * contains anything other than the listed commands; calls and bracketed
+ * groups are examined recursively.
+ * NOTE(review): c_call follows the callee's definition with no visited
+ * check - confirm that recursive routines cannot loop here. */
+static int K_needed(struct generator * g, struct node * p) {
+ until (p == 0) {
+ switch (p->type) {
+ case c_dollar:
+ case c_leftslice:
+ case c_rightslice:
+ case c_mathassign:
+ case c_plusassign:
+ case c_minusassign:
+ case c_multiplyassign:
+ case c_divideassign:
+ case c_eq:
+ case c_ne:
+ case c_gr:
+ case c_ge:
+ case c_ls:
+ case c_le:
+ case c_sliceto:
+ case c_true:
+ case c_false:
+ case c_debug:
+ break;
+
+ case c_call:
+ if (K_needed(g, p->name->definition)) return true;
+ break;
+
+ case c_bra:
+ if (K_needed(g, p->left)) return true;
+ break;
+
+ /* anything else is taken to need the cursor kept */
+ default: return true;
+ }
+ p = p->right;
+ }
+ return false;
+}
+
+/* Score the command list starting at p for repeat_restore(): the first
+ * group of commands adds nothing, calls and bracketed groups add the score
+ * of their contents, the commands in the last group add 1 each, and any
+ * other command sets the running score to 2. */
+static int repeat_score(struct generator * g, struct node * p) {
+ int score = 0;
+ until (p == 0)
+ {
+ switch (p->type) {
+ case c_dollar:
+ case c_leftslice:
+ case c_rightslice:
+ case c_mathassign:
+ case c_plusassign:
+ case c_minusassign:
+ case c_multiplyassign:
+ case c_divideassign:
+ case c_eq:
+ case c_ne:
+ case c_gr:
+ case c_ge:
+ case c_ls:
+ case c_le:
+ case c_sliceto: /* case c_not: must not be included here! */
+ case c_debug:
+ break;
+
+ case c_call:
+ score += repeat_score(g, p->name->definition);
+ break;
+
+ case c_bra:
+ score += repeat_score(g, p->left);
+ break;
+
+ case c_name:
+ case c_literalstring:
+ case c_next:
+ case c_grouping:
+ case c_non:
+ case c_hop:
+ score = score + 1; break;
+
+ /* note: assigns 2 rather than adding to the running score */
+ default: score = 2; break;
+ }
+ p = p->right;
+ }
+ return score;
+}
+
+/* tests if an expression requires cursor reinstatement in a repeat */
+
+/* True when repeat_score() for the list at p reaches 2. */
+static int repeat_restore(struct generator * g, struct node * p) {
+ return repeat_score(g, p) >= 2;
+}
+
+/* Generate each command of a bracketed group in order. */
+static void generate_bra(struct generator * g, struct node * p) {
+    struct node * cmd;
+    for (cmd = p->left; cmd != 0; cmd = cmd->right) {
+        generate(g, cmd);
+    }
+}
+
+/* Generate 'and': every operand is generated in turn; when the cursor has
+ * to be kept (K_needed), it is restored between operands, i.e. after every
+ * operand except the last. */
+static void generate_and(struct generator * g, struct node * p) {
+    int keep_c = 0;
+    struct node * operand;
+
+    if (K_needed(g, p->left)) {
+        wp(g, "~{~k~C", p);
+        keep_c = g->keep_count;
+    } else {
+        wp(g, "~M~C", p);
+    }
+
+    for (operand = p->left; operand != 0; operand = operand->right) {
+        generate(g, operand);
+        if (keep_c && operand->right != 0) {
+            w(g, "~M");
+            wrestore(g, operand, keep_c);
+            w(g, "~N");
+        }
+    }
+
+    if (keep_c) w(g, "~}");
+}
+
+/* Generate 'or'.  Every alternative except the last is generated with its
+ * own failure label and no failure string: on success control jumps to
+ * out_lab, on failure it lands on the alternative's label, where the cursor
+ * is restored if it was kept.  The last alternative is generated with the
+ * failure settings that were in force on entry. */
+static void generate_or(struct generator * g, struct node * p) {
+ int keep_c = 0;
+
+ /* failure state on entry, reinstated before the last alternative */
+ int used = g->label_used;
+ int a0 = g->failure_label;
+ const char * a1 = g->failure_string;
+
+ int out_lab = new_label(g);
+
+ if (K_needed(g, p->left)) {
+ wp(g, "~{~k~C", p);
+ keep_c = g->keep_count;
+ } else {
+ wp(g, "~M~C", p);
+ }
+ p = p->left;
+ g->failure_string = 0;
+ until (p->right == 0) {
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ generate(g, p);
+ wgotol(g, out_lab);
+ if (g->label_used)
+ wsetl(g, g->failure_label);
+ if (keep_c) {
+ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
+ }
+ p = p->right;
+ }
+ g->label_used = used;
+ g->failure_label = a0;
+ g->failure_string = a1;
+
+ generate(g, p);
+ if (keep_c) w(g, "~}");
+ wsetl(g, out_lab);
+}
+
+/* Generate 'backwards': the emitted code sets z->lb to the cursor and moves
+ * the cursor to z->l, runs the operand, then sets the cursor back to z->lb. */
+static void generate_backwards(struct generator * g, struct node * p) {
+
+ wp(g,"~Mz->lb = z->c; z->c = z->l;~C~N", p);
+ generate(g, p->left);
+ w(g, "~Mz->c = z->lb;~N");
+}
+
+
+/* Generate 'not'.  The operand is generated with a fresh failure label and
+ * no failure string.  If the operand runs through, the failure action that
+ * was in force on entry is emitted; the operand's own failure label is
+ * placed just after it, followed by the cursor restore when the cursor was
+ * kept. */
+static void generate_not(struct generator * g, struct node * p) {
+ int keep_c = 0;
+
+ int used = g->label_used;
+ int a0 = g->failure_label;
+ const char * a1 = g->failure_string;
+
+ if (K_needed(g, p->left)) {
+ wp(g, "~{~k~C", p);
+ keep_c = g->keep_count;
+ } else {
+ wp(g, "~M~C", p);
+ }
+
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ g->failure_string = 0;
+ generate(g, p->left);
+
+ {
+ /* the operand's label and whether anything jumps to it */
+ int l = g->failure_label;
+ int u = g->label_used;
+
+ g->label_used = used;
+ g->failure_label = a0;
+ g->failure_string = a1;
+
+ /* ~f now refers to the failure action of the enclosing command */
+ w(g, "~M~f~N");
+ if (u)
+ wsetl(g, l);
+ }
+ if (keep_c) {
+ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}");
+ }
+}
+
+
+/* Generate 'try'.  The operand gets a fresh failure label placed right
+ * after it.  When the cursor must be kept, it is saved in c_keep / m_keep
+ * and the restore is installed as the failure string, so it is emitted at
+ * each failure point inside the operand. */
+static void generate_try(struct generator * g, struct node * p) {
+ int keep_c = K_needed(g, p->left);
+
+ if (keep_c) {
+ if (p->mode == m_forward) {
+ wp(g, "~{int c_keep = z->c;~C", p);
+ g->failure_string = "z->c = c_keep;";
+ } else {
+ wp(g, "~{int m_keep = z->l - z->c;/* (void) m_keep;*/~C", p);
+ g->failure_string = "z->c = z->l - m_keep;";
+ }
+ } else {
+ wp(g, "~M~C", p);
+ g->failure_string = 0;
+ }
+
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ generate(g, p->left);
+
+ if (g->label_used)
+ wsetl(g, g->failure_label);
+
+ if (keep_c) w(g, "~}");
+}
+
+/* Generate 'set': assign 1 to the named variable. */
+static void generate_set(struct generator * g, struct node * p) {
+ g->V[0] = p->name; wp(g, "~M~V0 = 1;~C", p);
+}
+
+/* Generate 'unset': assign 0 to the named variable. */
+static void generate_unset(struct generator * g, struct node * p) {
+ g->V[0] = p->name; wp(g, "~M~V0 = 0;~C", p);
+}
+
+/* Generate 'fail': run the operand, then take the failure action
+ * unconditionally. */
+static void generate_fail(struct generator * g, struct node * p) {
+ generate(g, p->left);
+ wp(g, "~M~f~C", p);
+}
+
+/* generate_test() also implements 'reverse' */
+
+/* Generate 'test': run the operand and, when the cursor must be kept, save
+ * it in c_test / m_test beforehand (~K) and restore it afterwards (~R). */
+static void generate_test(struct generator * g, struct node * p) {
+ int keep_c = K_needed(g, p->left);
+ if (keep_c) wp(g, "~{~K~C", p);
+ else wp(g, "~M~C", p);
+
+ generate(g, p->left);
+
+ if (keep_c) wp(g, "~M~R~N"
+ "~}", p);
+}
+
+/* Generate 'do': the operand gets a fresh failure label placed right after
+ * it and no failure string; when the cursor was kept it is restored after
+ * that label, so the restore is emitted on the success path too. */
+static void generate_do(struct generator * g, struct node * p) {
+ int keep_c = 0;
+ if (K_needed(g, p->left)) {
+ wp(g, "~{~k~C", p);
+ keep_c = g->keep_count;
+ } else {
+ wp(g, "~M~C", p);
+ }
+
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ g->failure_string = 0;
+ generate(g, p->left);
+
+ if (g->label_used)
+ wsetl(g, g->failure_label);
+ if (keep_c) {
+ w(g, "~M"); wrestore(g, p, keep_c);
+ w(g, "~N~}");
+ }
+}
+
+/* Generate 'next': with the utf8 option the emitted code calls skip_utf8()
+ * with a step of 1 or -1 and fails on a negative result; otherwise it
+ * fails at the limit (~l) and moves the cursor one step (~i). */
+static void generate_next(struct generator * g, struct node * p) {
+ if (g->options->utf8) {
+ if (p->mode == m_forward)
+ w(g, "~{int ret = skip_utf8(z->p, z->c, 0, z->l, 1");
+ else
+ w(g, "~{int ret = skip_utf8(z->p, z->c, z->lb, 0, -1");
+ wp(g, ");~N"
+ "~Mif (ret < 0) ~f~N"
+ "~Mz->c = ret;~C"
+ "~}", p);
+ } else
+ wp(g, "~M~l~N"
+ "~M~i~C", p);
+}
+
+/* Generate 'goto' / 'gopast' applied directly to a grouping (complement == 0)
+ * or to 'non' grouping (complement != 0) as one call to the
+ * in/out_grouping runtime routine with a final argument of 1.  For 'goto'
+ * only failure (negative result) is tested; for 'gopast' the result is also
+ * added to (forward) or subtracted from (backward) the cursor.
+ * Note that the in/out choice here is the opposite of generate_grouping(). */
+static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) {
+
+ struct grouping * q = p->name->grouping;
+ g->S[0] = p->mode == m_forward ? "" : "_b";
+ g->S[1] = complement ? "in" : "out";
+ g->S[2] = g->options->utf8 ? "_U" : "";
+ g->V[0] = p->name;
+ g->I[0] = q->smallest_ch;
+ g->I[1] = q->largest_ch;
+ if (is_goto) {
+ wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f /* goto */~C", p);
+ } else {
+ wp(g, "~{ /* gopast */~C"
+ "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N"
+ "~Mif (ret < 0) ~f~N", p);
+ if (p->mode == m_forward)
+ w(g, "~Mz->c += ret;~N");
+ else
+ w(g, "~Mz->c -= ret;~N");
+ w(g, "~}");
+ }
+}
+
+/* Generate 'goto' (style == 1) or 'gopast' (style == 0).
+ * A grouping or 'non' grouping operand is handed to generate_GO_grouping().
+ * Otherwise a while(1) loop is emitted: the operand is tried; on success the
+ * loop is left (for 'goto' after restoring the cursor to where the match
+ * began); on failure the cursor is restored if it was kept and moved on by
+ * one position with generate_next(), which fails the whole command at the
+ * limit.
+ *
+ * While the operand is generated the failure label points into the loop, so
+ * the failure string is cleared as well: a failure string installed by an
+ * enclosing command belongs to that command's failure exit and must not be
+ * emitted on every unsuccessful loop iteration.  The settings found on
+ * entry are put back before generate_next(). */
+static void generate_GO(struct generator * g, struct node * p, int style) {
+    int keep_c = 0;
+
+    int used = g->label_used;
+    int a0 = g->failure_label;
+    const char * a1 = g->failure_string;
+
+    if (p->left->type == c_grouping || p->left->type == c_non) {
+        /* Special case for "goto" or "gopast" when used on a grouping or an
+         * inverted grouping - the movement of c by the matching action is
+         * exactly what we want! */
+#ifdef OPTIMISATION_WARNINGS
+        printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping");
+#endif
+        generate_GO_grouping(g, p->left, style, p->left->type == c_non);
+        return;
+    }
+
+    w(g, "~Mwhile(1) {"); wp(g, "~C~+", p);
+
+    if (style == 1 || repeat_restore(g, p->left)) {
+        wp(g, "~M~k~N", p);
+        keep_c = g->keep_count;
+    }
+
+    g->failure_label = new_label(g);
+    g->label_used = 0;
+    g->failure_string = 0;
+    generate(g, p->left);
+
+    if (style == 1) {
+        /* include for goto; omit for gopast */
+        w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
+    }
+    w(g, "~Mbreak;~N");
+    if (g->label_used)
+        wsetl(g, g->failure_label);
+    if (keep_c) {
+        w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
+    }
+
+    g->label_used = used;
+    g->failure_label = a0;
+    g->failure_string = a1;
+
+/*  wp(g, "~M~l~N"
+          "~M~i~N", p);  */
+    generate_next(g, p);
+    w(g, "~}");
+}
+
+/* Generate 'loop': a C for-loop that counts i down from the value of the
+ * arithmetic expression p->AE and runs the operand on each pass. */
+static void generate_loop(struct generator * g, struct node * p) {
+ w(g, "~{int i; for (i = "); generate_AE(g, p->AE); wp(g, "; i > 0; i--)~C"
+ "~{", p);
+
+ generate(g, p->left);
+
+ w(g, "~}"
+ "~}");
+}
+
+/* Generate 'repeat' as a while(1) loop: after a successful pass the loop
+ * continues; a failure jumps to the label after the 'continue', where the
+ * cursor is restored if it was kept, and the loop is left.  With
+ * atleast_case set, each successful pass also decrements the counter i
+ * declared by generate_atleast(). */
+static void generate_repeat(struct generator * g, struct node * p, int atleast_case) {
+ int keep_c = 0;
+ wp(g, "~Mwhile(1) {~C~+", p);
+
+ if (repeat_restore(g, p->left)) {
+ wp(g, "~M~k~N", p);
+ keep_c = g->keep_count;
+ }
+
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ g->failure_string = 0;
+ generate(g, p->left);
+
+ if (atleast_case) w(g, "~Mi--;~N");
+
+ w(g, "~Mcontinue;~N");
+ if (g->label_used)
+ wsetl(g, g->failure_label);
+
+ if (keep_c) {
+ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
+ }
+
+ w(g, "~Mbreak;~N"
+ "~}");
+}
+
+/* Generate 'atleast': i starts at the value of p->AE, the operand is
+ * repeated via generate_repeat() (which decrements i per successful pass),
+ * and the command fails if i is still positive afterwards.  The failure
+ * settings are saved around generate_repeat() because it replaces them. */
+static void generate_atleast(struct generator * g, struct node * p) {
+ w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~N");
+ {
+ int used = g->label_used;
+ int a0 = g->failure_label;
+ const char * a1 = g->failure_string;
+
+ generate_repeat(g, p, true);
+
+ g->label_used = used;
+ g->failure_label = a0;
+ g->failure_string = a1;
+ }
+ w(g, "~Mif (i > 0) ~f~N"
+ "~}");
+}
+
+/* Generate 'setmark': store the cursor in the named variable. */
+static void generate_setmark(struct generator * g, struct node * p) {
+ g->V[0] = p->name;
+ wp(g, "~M~V0 = z->c;~C", p);
+}
+
+/* Generate 'tomark': fail if the cursor is already beyond the value of
+ * p->AE (greater in forward mode, smaller in backward mode), otherwise set
+ * the cursor to it.  The expression is written out twice. */
+static void generate_tomark(struct generator * g, struct node * p) {
+ g->S[0] = p->mode == m_forward ? ">" : "<";
+
+ w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); w(g, ") ~f~N");
+ w(g, "~Mz->c = "); generate_AE(g, p->AE); wp(g, ";~C", p);
+}
+
+/* Generate 'atmark': fail unless the cursor equals the value of p->AE. */
+static void generate_atmark(struct generator * g, struct node * p) {
+
+ w(g, "~Mif (z->c != "); generate_AE(g, p->AE); wp(g, ") ~f~C", p);
+}
+
+/* Generate 'hop': compute the new cursor in 'ret' - through skip_utf8()
+ * with the utf8 option, by plain addition/subtraction otherwise - fail if
+ * it is invalid, then assign it to z->c.  S0 is the sign applied to the
+ * hop count, S1 the lower bound used for the check. */
+static void generate_hop(struct generator * g, struct node * p) {
+ g->S[0] = p->mode == m_forward ? "+" : "-";
+ g->S[1] = p->mode == m_forward ? "0" : "z->lb";
+ if (g->options->utf8) {
+ w(g, "~{int ret = skip_utf8(z->p, z->c, ~S1, z->l, ~S0 ");
+ generate_AE(g, p->AE); wp(g, ");~C", p);
+ w(g, "~Mif (ret < 0) ~f~N");
+ } else {
+ w(g, "~{int ret = z->c ~S0 ");
+ generate_AE(g, p->AE); wp(g, ";~C", p);
+ w(g, "~Mif (~S1 > ret || ret > z->l) ~f~N");
+ }
+ wp(g, "~Mz->c = ret;~C"
+ "~}", p);
+}
+
+/* Generate 'delete': call slice_del() and return its result from the
+ * generated routine if it is negative. */
+static void generate_delete(struct generator * g, struct node * p) {
+ wp(g, "~{int ret = slice_del(z);~C", p);
+ wp(g, "~Mif (ret < 0) return ret;~N"
+ "~}", p);
+}
+
+/* Generate 'tolimit': move the cursor to z->l (forward) or z->lb
+ * (backward). */
+static void generate_tolimit(struct generator * g, struct node * p) {
+ g->S[0] = p->mode == m_forward ? "" : "b";
+ wp(g, "~Mz->c = z->l~S0;~C", p);
+}
+
+/* Generate 'atlimit': fail while the cursor is still short of the limit
+ * (z->c < z->l forward, z->c > z->lb backward). */
+static void generate_atlimit(struct generator * g, struct node * p) {
+ g->S[0] = p->mode == m_forward ? "" : "b";
+ g->S[1] = p->mode == m_forward ? "<" : ">";
+ wp(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p);
+}
+
+/* Generate the left slice mark: the cursor is stored in z->bra in forward
+ * mode and in z->ket in backward mode. */
+static void generate_leftslice(struct generator * g, struct node * p) {
+ g->S[0] = p->mode == m_forward ? "bra" : "ket";
+ wp(g, "~Mz->~S0 = z->c;~C", p);
+}
+
+/* Generate the right slice mark: the cursor is stored in z->ket in forward
+ * mode and in z->bra in backward mode. */
+static void generate_rightslice(struct generator * g, struct node * p) {
+ g->S[0] = p->mode == m_forward ? "ket" : "bra";
+ wp(g, "~Mz->~S0 = z->c;~C", p);
+}
+
+/* Generate 'assignto': the named variable receives the result of
+ * assign_to(); a zero result makes the generated routine return -1. */
+static void generate_assignto(struct generator * g, struct node * p) {
+ g->V[0] = p->name;
+ wp(g, "~M~V0 = assign_to(z, ~V0);~C"
+ "~Mif (~V0 == 0) return -1;~C", p);
+}
+
+/* Generate 'sliceto': the named variable receives the result of
+ * slice_to(); a zero result makes the generated routine return -1. */
+static void generate_sliceto(struct generator * g, struct node * p) {
+ g->V[0] = p->name;
+ wp(g, "~M~V0 = slice_to(z, ~V0);~C"
+ "~Mif (~V0 == 0) return -1;~C", p);
+}
+
+/* Write the data argument(s) of an insert/slice call: "<size>, <literal
+ * reference>" when the node carries a literal string, otherwise a reference
+ * to the node's named variable. */
+static void generate_data_address(struct generator * g, struct node * p) {
+
+ symbol * b = p->literalstring;
+ if (b != 0) {
+ wi(g, SIZE(b)); w(g, ", ");
+ wlitref(g, b);
+ } else
+ wv(g, p->name);
+}
+
+/* Generate 'insert' / 'attach' (style is the token type): an insert_s or
+ * insert_v call at the cursor.  The cursor is saved and put back for
+ * 'attach' in forward mode and for any other style in backward mode.  A
+ * negative result is returned from the generated routine. */
+static void generate_insert(struct generator * g, struct node * p, int style) {
+
+ int keep_c = style == c_attach;
+ if (p->mode == m_backward) keep_c = !keep_c;
+ wp(g, "~{", p);
+ if (keep_c) w(g, "int c_keep = z->c;~N~M");
+ wp(g, "int ret = insert_~$(z, z->c, z->c, ", p);
+ generate_data_address(g, p);
+ wp(g, ");~C", p);
+ if (keep_c) w(g, "~Mz->c = c_keep;~N");
+ wp(g, "~Mif (ret < 0) return ret;~N"
+ "~}", p);
+}
+
+/* Generate 'assignfrom': replace everything from the cursor to the limit
+ * (forward: z->c .. z->l, cursor kept; backward: z->lb .. z->c) with the
+ * node's data through insert_s / insert_v, returning a negative result from
+ * the generated routine.
+ * 'ret' is declared in the emitted block, as generate_insert() does;
+ * previously the emitted code assigned to an undeclared 'ret'. */
+static void generate_assignfrom(struct generator * g, struct node * p) {
+
+    int keep_c = p->mode == m_forward; /* like 'attach' */
+    wp(g, "~{", p);
+    if (keep_c) wp(g, "int c_keep = z->c;~N"
+                      "~Mint ret = insert_~$(z, z->c, z->l, ", p);
+    else wp(g, "int ret = insert_~$(z, z->lb, z->c, ", p);
+    generate_data_address(g, p);
+    wp(g, ");~C", p);
+    if (keep_c) w(g, "~Mz->c = c_keep;~N");
+    wp(g, "~Mif (ret < 0) return ret;~N"
+          "~}", p);
+}
+
+/* bugs marked <======= fixed 22/7/02. Similar fixes required for Java */
+
+/* Generate 'slicefrom': call slice_from_s or slice_from_v (chosen by ~$)
+ * with the node's data and return a negative result from the generated
+ * routine. */
+static void generate_slicefrom(struct generator * g, struct node * p) {
+
+/* w(g, "~Mslice_from_s(z, "); <============= bug! should be: */
+ wp(g, "~{int ret = slice_from_~$(z, ", p);
+ generate_data_address(g, p);
+ wp(g, ");~C", p);
+ wp(g, "~Mif (ret < 0) return ret;~N"
+ "~}", p);
+}
+
+/* Generate 'setlimit': keep the cursor, run p->left to find the new limit,
+ * save the old limit in 'mlimit' and install the new one, restore the
+ * cursor, run p->aux, and finally put the old limit back.  The statement
+ * that restores the limit is also installed as the failure string while
+ * p->aux is generated.
+ * NOTE(review): the failure string in force on entry is overwritten here,
+ * not combined with the limit restore - confirm an enclosing command's
+ * failure string is not needed for failures inside p->aux. */
+static void generate_setlimit(struct generator * g, struct node * p) {
+ int keep_c;
+ wp(g, "~{int mlimit;~C"
+ "~M~k~N"
+ , p);
+ keep_c = g->keep_count;
+ generate(g, p->left);
+ /* forward: mlimit holds the distance from the new limit to the old one;
+ backward: mlimit holds the old z->lb itself */
+ if (p->mode == m_forward) w(g, "~Mmlimit = z->l - z->c; z->l = z->c;~N");
+ else w(g, "~Mmlimit = z->lb; z->lb = z->c;~N");
+ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
+ g->failure_string = p->mode == m_forward ? "z->l += mlimit;" :
+ "z->lb = mlimit;";
+ generate(g, p->aux);
+ wms(g, g->failure_string);
+ w(g, "~N"
+ "~}");
+}
+
+/* Generate '$name command': the emitted code copies the whole environment
+ * *z aside, points z->p at the named string with cursor and limits spanning
+ * it, runs the operand, stores z->p back into the variable, restores *z and
+ * then fails if the operand failed.  The operand is generated with a fresh
+ * failure label and no failure string; the settings found on entry are
+ * back in force for the final '~f'. */
+static void generate_dollar(struct generator * g, struct node * p) {
+
+ int used = g->label_used;
+ int a0 = g->failure_label;
+ const char * a1 = g->failure_string;
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ g->failure_string = 0;
+
+ g->V[0] = p->name;
+ wp(g, "~{struct SN_env env = * z;~C"
+ "~Mint failure = 1; /* assume failure */~N"
+ "~Mz->p = ~V0;~N"
+ "~Mz->lb = z->c = 0;~N"
+ "~Mz->l = SIZE(z->p);~N", p);
+ generate(g, p->left);
+ w(g, "~Mfailure = 0; /* mark success */~N");
+ if (g->label_used)
+ wsetl(g, g->failure_label);
+ g->V[0] = p->name; /* necessary */
+
+ g->label_used = used;
+ g->failure_label = a0;
+ g->failure_string = a1;
+
+ w(g, "~M~V0 = z->p;~N"
+ "~M* z = env;~N"
+ "~Mif (failure) ~f~N~}");
+}
+
+/* Generate "<variable> <s> <expression>;" where s is the C assignment
+ * operator text supplied by the caller and the expression is p->AE. */
+static void generate_integer_assign(struct generator * g, struct node * p, char * s) {
+    g->S[0] = s;
+    g->V[0] = p->name;
+
+    w(g, "~M~V0 ~S0 ");
+    generate_AE(g, p->AE);
+    wp(g, ";~C", p);
+}
+
+/* Generate "if (!(<variable> <s> <expression>)) <failure>" where s is the
+ * C comparison operator text supplied by the caller. */
+static void generate_integer_test(struct generator * g, struct node * p, char * s) {
+    g->S[0] = s;
+    g->V[0] = p->name;
+
+    w(g, "~Mif (!(~V0 ~S0 ");
+    generate_AE(g, p->AE);
+    wp(g, ")) ~f~C", p);
+}
+
+/* Generate a call of the routine p->name.  In the generated code a result
+ * of 0 takes the failure action and a negative result is returned to the
+ * caller unchanged. */
+static void generate_call(struct generator * g, struct node * p) {
+    struct name * callee = p->name;
+
+    g->V[0] = callee;
+    wp(g, "~{int ret = ~V0(z);~C"
+          "~Mif (ret == 0) ~f~N"
+          "~Mif (ret < 0) return ret;~N~}", p);
+}
+
+/* Generate a grouping test.  The runtime function name is assembled from
+ * "in"/"out" (complement), "_grouping", "_b" for backward mode and "_U"
+ * when the utf8 option is set; it is passed the grouping table and the
+ * smallest and largest character of the grouping. */
+static void generate_grouping(struct generator * g, struct node * p, int complement) {
+    struct grouping * grp = p->name->grouping;
+
+    if (p->mode == m_forward) g->S[0] = ""; else g->S[0] = "_b";
+    if (complement) g->S[1] = "out"; else g->S[1] = "in";
+    if (g->options->utf8) g->S[2] = "_U"; else g->S[2] = "";
+
+    g->V[0] = p->name;
+    g->I[0] = grp->smallest_ch;
+    g->I[1] = grp->largest_ch;
+    wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p);
+}
+
+/* Generate a test of the text at the cursor against a string variable,
+ * using eq_v (forward) or eq_v_b (backward). */
+static void generate_namedstring(struct generator * g, struct node * p) {
+    if (p->mode == m_forward) {
+        g->S[0] = "";
+    } else {
+        g->S[0] = "_b";
+    }
+    g->V[0] = p->name;
+    wp(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p);
+}
+
+/* Generate a test of the text at the cursor against a literal string,
+ * using eq_s (forward) or eq_s_b (backward) with the literal's length. */
+static void generate_literalstring(struct generator * g, struct node * p) {
+    symbol * literal = p->literalstring;
+
+    g->L[0] = literal;
+    g->I[0] = SIZE(literal);
+    if (p->mode == m_forward) g->S[0] = ""; else g->S[0] = "_b";
+
+    wp(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p);
+}
+
+/* Generate the C function for a routine or external definition.  Routines
+ * of type t_routine are emitted as 'static', everything else as 'extern'.
+ * Label numbering and the failure context are reset for each definition;
+ * the function body falls through to "return 1". */
+static void generate_define(struct generator * g, struct node * p) {
+    struct name * routine = p->name;
+
+    g->next_label = 0;
+
+    if (routine->type == t_routine) {
+        g->S[0] = "static";
+    } else {
+        g->S[0] = "extern";
+    }
+    g->V[0] = routine;
+
+    w(g, "~N~S0 int ~V0(struct SN_env * z) {~N~+");
+    if (p->amongvar_needed) {
+        w(g, "~Mint among_var;~N");
+    }
+
+    /* fresh failure context: failing means returning from the function */
+    g->failure_string = 0;
+    g->failure_label = x_return;
+    g->label_used = 0;
+    g->keep_count = 0;
+
+    generate(g, p->left);
+    w(g, "~Mreturn 1;~N~}");
+}
+
+/* Generate the find_among call for a 'substring'/'among', preceded where
+ * possible by an inline one-character pre-check that lets the generated
+ * code skip the find_among call.
+ *
+ * The pre-check is emitted when the inspected characters of all non-empty
+ * strings either share one 32-value block (ch >> 5), tested with a bitmap
+ * of (ch & 0x1f), or comprise at most two distinct characters, tested by
+ * direct comparison. */
+static void generate_substring(struct generator * g, struct node * p) {
+
+ struct among * x = p->among;
+ /* common (ch >> 5) of the inspected characters, or -1 if they differ */
+ int block = -1;
+ /* bit (ch & 0x1f) is set for each inspected character seen */
+ unsigned int bitmap = 0;
+ struct amongvec * among_cases = x->b;
+ int c;
+ /* index of the empty-string entry, or -1 if there is none */
+ int empty_case = -1;
+ /* number of distinct inspected characters counted so far */
+ int n_cases = 0;
+ /* the first two distinct inspected characters */
+ symbol cases[2];
+ int shortest_size = INT_MAX;
+
+ g->S[0] = p->mode == m_forward ? "" : "_b";
+ g->I[0] = x->number;
+ g->I[1] = x->literalstring_count;
+
+ /* In forward mode with non-ASCII UTF-8 characters, the first character
+ * of the string will often be the same, so instead look at the last
+ * common character position.
+ *
+ * In backward mode, we can't match if there are fewer characters before
+ * the current position than the minimum length.
+ */
+ /* Pass 1: length of the shortest non-empty string. */
+ for (c = 0; c < x->literalstring_count; ++c) {
+ int size = among_cases[c].size;
+ if (size != 0 && size < shortest_size) {
+ shortest_size = size;
+ }
+ }
+
+ /* Pass 2: classify the inspected character of every non-empty string. */
+ for (c = 0; c < x->literalstring_count; ++c) {
+ symbol ch;
+ if (among_cases[c].size == 0) {
+ empty_case = c;
+ continue;
+ }
+ if (p->mode == m_forward) {
+ /* character at the last position every string is known to have */
+ ch = among_cases[c].b[shortest_size - 1];
+ } else {
+ /* last character of the string */
+ ch = among_cases[c].b[among_cases[c].size - 1];
+ }
+ if (n_cases == 0) {
+ block = ch >> 5;
+ } else if (ch >> 5 != block) {
+ block = -1;
+ if (n_cases > 2) break;
+ }
+ if (block == -1) {
+ /* no common block: only worth continuing with <= 2 distinct chars */
+ if (ch == cases[0]) continue;
+ if (n_cases < 2) {
+ cases[n_cases++] = ch;
+ } else if (ch != cases[1]) {
+ ++n_cases;
+ break;
+ }
+ } else {
+ if ((bitmap & (1u << (ch & 0x1f))) == 0) {
+ bitmap |= 1u << (ch & 0x1f);
+ if (n_cases < 2)
+ cases[n_cases] = ch;
+ ++n_cases;
+ }
+ }
+ }
+
+ if (block != -1 || n_cases <= 2) {
+ /* holds the forward-mode character expression referenced by ~S1 */
+ char buf[64];
+ g->I[2] = block;
+ g->I[3] = bitmap;
+ g->I[4] = shortest_size - 1;
+ /* First half of the condition: not enough characters available. */
+ if (p->mode == m_forward) {
+ sprintf(buf, "z->p[z->c + %d]", shortest_size - 1);
+ g->S[1] = buf;
+ if (shortest_size == 1) {
+ wp(g, "~Mif (z->c >= z->l || ", p);
+ } else {
+ wp(g, "~Mif (z->c + ~I4 >= z->l || ", p);
+ }
+ } else {
+ g->S[1] = "z->p[z->c - 1]";
+ if (shortest_size == 1) {
+ wp(g, "~Mif (z->c <= z->lb || ", p);
+ } else {
+ wp(g, "~Mif (z->c - ~I4 <= z->lb || ", p);
+ }
+ }
+ /* Second half: the inspected character cannot start/end any string. */
+ if (n_cases == 0) {
+ /* We get this for the degenerate case: among { '' }
+ * This doesn't seem to be a useful construct, but it is
+ * syntactically valid.
+ */
+ wp(g, "0", p);
+ } else if (n_cases == 1) {
+ g->I[4] = cases[0];
+ wp(g, "~S1 != ~I4", p);
+ } else if (n_cases == 2) {
+ g->I[4] = cases[0];
+ g->I[5] = cases[1];
+ wp(g, "(~S1 != ~I4 && ~S1 != ~I5)", p);
+ } else {
+ wp(g, "~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p);
+ }
+ ws(g, ") ");
+ if (empty_case != -1) {
+ /* If the among includes the empty string, it can never fail
+ * so not matching the bitmap means we match the empty string.
+ */
+ g->I[4] = among_cases[empty_case].result;
+ wp(g, "among_var = ~I4; else~C", p);
+ } else {
+ wp(g, "~f~C", p);
+ }
+ } else {
+#ifdef OPTIMISATION_WARNINGS
+ printf("Couldn't shortcut among %d\n", x->number);
+#endif
+ }
+
+ /* The result is stored in among_var only when the among has commands
+ * or a starter. */
+ if (x->command_count == 0 && x->starter == 0)
+ wp(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f~C", p);
+ else
+ wp(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);~C"
+ "~Mif (!(among_var)) ~f~N", p);
+}
+
+/* Generate code for an 'among'.  The find_among call is generated here
+ * unless the among has an associated substring.  If the among has
+ * commands or a starter, a switch on among_var follows: case 0 takes the
+ * failure action and each non-empty bracketed command becomes the next
+ * numbered case. */
+static void generate_among(struct generator * g, struct node * p) {
+    struct among * x = p->among;
+    struct node * item;
+    int case_number = 1;
+
+    if (x->substring == 0) generate_substring(g, p);
+    if (x->command_count == 0 && x->starter == 0) return;
+
+    if (x->starter != 0) generate(g, x->starter);
+
+    item = p->left;
+    if (item != 0 && item->type != c_literalstring) item = item->right;
+    w(g, "~Mswitch(among_var) {~N~+"
+         "~Mcase 0: ~f~N");
+
+    for (; item != 0; item = item->right) {
+        if (item->type != c_bra || item->left == 0) continue;
+        g->I[0] = case_number++;
+        w(g, "~Mcase ~I0:~N~+");
+        generate(g, item);
+        w(g, "~Mbreak;~N~-");
+    }
+    w(g, "~}");
+}
+
+/* Generate a test of the boolean variable p->name: the failure action is
+ * taken when the variable is false. */
+static void generate_booltest(struct generator * g, struct node * p) {
+
+ g->V[0] = p->name;
+ wp(g, "~Mif (!(~V0)) ~f~C", p);
+}
+
+/* Generate the 'false' command: an unconditional failure action (~f). */
+static void generate_false(struct generator * g, struct node * p) {
+
+ wp(g, "~M~f~C", p);
+}
+
+/* Generate a debug(z, <sequence number>, <source line>) call.  The
+ * sequence number comes from g->debug_count, which is then incremented. */
+static void generate_debug(struct generator * g, struct node * p) {
+    int sequence = g->debug_count;
+
+    g->debug_count = sequence + 1;
+    g->I[0] = sequence;
+    g->I[1] = p->line_number;
+    wp(g, "~Mdebug(z, ~I0, ~I1);~C", p);
+}
+
+/* Generate code for one syntax tree node by dispatching on p->type.
+ *
+ * The failure label and failure string in force on entry are reinstated
+ * on exit, so a command generator may change them freely for its
+ * sub-commands.  label_used is reinstated only if the failure label was
+ * changed by the command generator.  An unrecognised node type is
+ * reported on stderr and terminates the program with status 1. */
+static void generate(struct generator * g, struct node * p) {
+
+ /* failure context on entry */
+ int used = g->label_used;
+ int a0 = g->failure_label;
+ const char * a1 = g->failure_string;
+
+ switch (p->type)
+ {
+ case c_define: generate_define(g, p); break;
+ case c_bra: generate_bra(g, p); break;
+ case c_and: generate_and(g, p); break;
+ case c_or: generate_or(g, p); break;
+ case c_backwards: generate_backwards(g, p); break;
+ case c_not: generate_not(g, p); break;
+ case c_set: generate_set(g, p); break;
+ case c_unset: generate_unset(g, p); break;
+ case c_try: generate_try(g, p); break;
+ case c_fail: generate_fail(g, p); break;
+ /* 'reverse' deliberately shares the 'test' generator */
+ case c_reverse:
+ case c_test: generate_test(g, p); break;
+ case c_do: generate_do(g, p); break;
+ case c_goto: generate_GO(g, p, 1); break;
+ case c_gopast: generate_GO(g, p, 0); break;
+ case c_repeat: generate_repeat(g, p, false); break;
+ case c_loop: generate_loop(g, p); break;
+ case c_atleast: generate_atleast(g, p); break;
+ case c_setmark: generate_setmark(g, p); break;
+ case c_tomark: generate_tomark(g, p); break;
+ case c_atmark: generate_atmark(g, p); break;
+ case c_hop: generate_hop(g, p); break;
+ case c_delete: generate_delete(g, p); break;
+ case c_next: generate_next(g, p); break;
+ case c_tolimit: generate_tolimit(g, p); break;
+ case c_atlimit: generate_atlimit(g, p); break;
+ case c_leftslice: generate_leftslice(g, p); break;
+ case c_rightslice: generate_rightslice(g, p); break;
+ case c_assignto: generate_assignto(g, p); break;
+ case c_sliceto: generate_sliceto(g, p); break;
+ case c_assign: generate_assignfrom(g, p); break;
+ /* 'insert' and 'attach' share a generator; the type selects the style */
+ case c_insert:
+ case c_attach: generate_insert(g, p, p->type); break;
+ case c_slicefrom: generate_slicefrom(g, p); break;
+ case c_setlimit: generate_setlimit(g, p); break;
+ case c_dollar: generate_dollar(g, p); break;
+ case c_mathassign: generate_integer_assign(g, p, "="); break;
+ case c_plusassign: generate_integer_assign(g, p, "+="); break;
+ case c_minusassign: generate_integer_assign(g, p, "-="); break;
+ case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
+ case c_divideassign: generate_integer_assign(g, p, "/="); break;
+ case c_eq: generate_integer_test(g, p, "=="); break;
+ case c_ne: generate_integer_test(g, p, "!="); break;
+ case c_gr: generate_integer_test(g, p, ">"); break;
+ case c_ge: generate_integer_test(g, p, ">="); break;
+ case c_ls: generate_integer_test(g, p, "<"); break;
+ case c_le: generate_integer_test(g, p, "<="); break;
+ case c_call: generate_call(g, p); break;
+ case c_grouping: generate_grouping(g, p, false); break;
+ case c_non: generate_grouping(g, p, true); break;
+ case c_name: generate_namedstring(g, p); break;
+ case c_literalstring: generate_literalstring(g, p); break;
+ case c_among: generate_among(g, p); break;
+ case c_substring: generate_substring(g, p); break;
+ case c_booltest: generate_booltest(g, p); break;
+ case c_false: generate_false(g, p); break;
+ /* 'true' generates no code */
+ case c_true: break;
+ case c_debug: generate_debug(g, p); break;
+ default: fprintf(stderr, "%d encountered\n", p->type);
+ exit(1);
+ }
+
+ /* reinstate the caller's failure context */
+ if (g->failure_label != a0)
+ g->label_used = used;
+ g->failure_label = a0;
+ g->failure_string = a1;
+}
+
+/* Write the "generated automatically" banner comment that opens each
+ * output file. */
+static void generate_start_comment(struct generator * g) {
+
+ w(g, "~N/* This file was generated automatically by the Snowball to ANSI C compiler */~N");
+}
+
+/* Write the #include line for the runtime's header.h.
+ *
+ * With no runtime path the include is plain "header.h".  Otherwise the
+ * path is written first, followed by a '/' separator unless the path
+ * already ends in one.  An empty path gets no separator: indexing the
+ * character before the start of the string would be an out-of-bounds
+ * read, and a lone '/' would turn the include into an absolute path. */
+static void generate_head(struct generator * g) {
+
+    if (g->options->runtime_path == 0) {
+        w(g, "~N#include \"header.h\"~N~N");
+    } else {
+        size_t len = strlen(g->options->runtime_path);
+        w(g, "~N#include \"");
+        ws(g, g->options->runtime_path);
+        if (len > 0 && g->options->runtime_path[len - 1] != '/')
+            wch(g, '/');
+        w(g, "header.h\"~N~N");
+    }
+}
+
+/* Write a forward declaration for every routine and external in the
+ * analyser's name list: 'static' for routines, and an extern "C" guarded
+ * 'extern' declaration for externals.  Other name types are skipped. */
+static void generate_routine_headers(struct generator * g) {
+    struct name * q;
+
+    for (q = g->analyser->names; q != 0; q = q->next) {
+        g->V[0] = q;
+        if (q->type == t_routine) {
+            w(g, "static int ~W0(struct SN_env * z);~N");
+        } else if (q->type == t_external) {
+            w(g,
+              "#ifdef __cplusplus~N"
+              "extern \"C\" {~N"
+              "#endif~N"
+              "extern int ~W0(struct SN_env * z);~N"
+              "#ifdef __cplusplus~N"
+              "}~N"
+              "#endif~N"
+              );
+        }
+    }
+}
+
+/* Write the static data for one among: first a symbol array s_<n>_<i> for
+ * each non-empty string, then the table a_<n> with one entry per string
+ * giving its size, its symbol array (0 if empty), the entry's i and
+ * result fields, and its function (0 if none). */
+static void generate_among_table(struct generator * g, struct among * x) {
+    struct amongvec * entries = x->b;
+    int count = x->literalstring_count;
+    int i;
+
+    g->I[0] = x->number;
+
+    /* the string data */
+    for (i = 0; i < count; i++) {
+        struct amongvec * v = &entries[i];
+        g->I[1] = i;
+        g->I[2] = v->size;
+        g->L[0] = v->b;
+        if (v->size != 0)
+            w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N");
+    }
+
+    g->I[1] = count;
+    w(g, "~N~Mstatic const struct among a_~I0[~I1] =~N{~N");
+
+    /* the table itself; every entry but the last is followed by a comma */
+    for (i = 0; i < count; i++) {
+        struct amongvec * v = &entries[i];
+        g->I[1] = i;
+        g->I[2] = v->size;
+        g->I[3] = v->i;
+        g->I[4] = v->result;
+        g->S[0] = i < count - 1 ? "," : "";
+
+        w(g, "/*~J1 */ { ~I2, ");
+        if (v->size == 0) {
+            w(g, "0,");
+        } else {
+            w(g, "s_~I0_~I1,");
+        }
+        w(g, " ~I3, ~I4, ");
+        if (v->function == 0) {
+            w(g, "0");
+        } else {
+            wvn(g, v->function);
+        }
+        w(g, "}~S0~N");
+    }
+    w(g, "};~N~N");
+}
+
+/* Write the data tables for every among in the analyser's list. */
+static void generate_amongs(struct generator * g) {
+    struct among * x;
+
+    for (x = g->analyser->amongs; x != 0; x = x->next) {
+        generate_among_table(g, x);
+    }
+}
+
+/* Set bit i of the bitmap b, which holds 8 bits per element. */
+static void set_bit(symbol * b, int i) {
+    int element = i / 8;
+    int bit = i % 8;
+
+    b[element] |= 1 << bit;
+}
+
+/* Write the bitmap for one grouping as a static unsigned char array.  Bit
+ * (ch - smallest_ch) is set for each character ch of the grouping, using
+ * 8 bits per array element. */
+static void generate_grouping_table(struct generator * g, struct grouping * q) {
+    symbol * members = q->b;
+    int span = q->largest_ch - q->smallest_ch + 1;
+    int n_bytes = (span + 7) / 8;  /* assume 8 bits per symbol */
+    symbol * bits = create_b(n_bytes);
+    int i;
+
+    for (i = 0; i < n_bytes; i++) {
+        bits[i] = 0;
+    }
+    for (i = 0; i < SIZE(members); i++) {
+        set_bit(bits, members[i] - q->smallest_ch);
+    }
+
+    g->V[0] = q->name;
+    w(g, "static const unsigned char ~V0[] = { ");
+    for (i = 0; i < n_bytes; i++) {
+        if (i > 0) w(g, ", ");
+        wi(g, bits[i]);
+    }
+    w(g, " };~N~N");
+
+    lose_b(bits);
+}
+
+/* Write the bitmap table for every grouping in the analyser's list. */
+static void generate_groupings(struct generator * g) {
+    struct grouping * q;
+
+    for (q = g->analyser->groupings; q != 0; q = q->next) {
+        generate_grouping_table(g, q);
+    }
+}
+
+/* Write the create_env function, which calls SN_create_env with the
+ * analyser's counts of string, integer and boolean names. */
+static void generate_create(struct generator * g) {
+    int * counts = g->analyser->name_count;
+
+    g->I[0] = counts[t_string];
+    g->I[1] = counts[t_integer];
+    g->I[2] = counts[t_boolean];
+    w(g, "~N"
+         "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1, ~I2); }"
+         "~N");
+}
+
+/* Write the close_env function, which calls SN_close_env with the
+ * analyser's count of string names. */
+static void generate_close(struct generator * g) {
+    int string_count = g->analyser->name_count[t_string];
+
+    g->I[0] = string_count;
+    w(g, "~Nextern void ~pclose_env(struct SN_env * z) { SN_close_env(z, ~I0); }~N~N");
+}
+
+/* Write the extern declarations of the create_env and close_env
+ * functions. */
+static void generate_create_and_close_templates(struct generator * g) {
+ w(g, "~N"
+ "extern struct SN_env * ~pcreate_env(void);~N"
+ "extern void ~pclose_env(struct SN_env * z);~N"
+ "~N");
+}
+
+/* Write the body of the generated header file inside an extern "C"
+ * guard: the create/close declarations, an extern declaration for each
+ * external, and, when a variables prefix is set, a #define for each
+ * string, integer and boolean name mapping <prefix><name> to S[n], I[n]
+ * or B[n]. */
+static void generate_header_file(struct generator * g) {
+    struct name * q;
+    char * vp = g->options->variables_prefix;
+
+    g->S[0] = vp;
+
+    w(g, "~N"
+         "#ifdef __cplusplus~N"
+         "extern \"C\" {~N"
+         "#endif~N"); /* for C++ */
+
+    generate_create_and_close_templates(g);
+    for (q = g->analyser->names; q != 0; q = q->next) {
+        char * array_name = 0;  /* set for variable names only */
+
+        g->V[0] = q;
+        switch (q->type)
+        {
+            case t_external:
+                w(g, "extern int ~W0(struct SN_env * z);~N");
+                break;
+            case t_string:  array_name = "S"; break;
+            case t_integer: array_name = "I"; break;
+            case t_boolean: array_name = "B"; break;
+        }
+        if (array_name != 0) {
+            g->S[1] = array_name;
+            if (vp) {
+                g->I[0] = q->count;
+                w(g, "#define ~S0");
+                str_append_b(g->outbuf, q->b);
+                w(g, " (~S1[~I0])~N");
+            }
+        }
+    }
+
+    w(g, "~N"
+         "#ifdef __cplusplus~N"
+         "}~N"
+         "#endif~N"); /* for C++ */
+
+    w(g, "~N");
+}
+
+/* Generate the C source file and the header file for the program.
+ *
+ * The preamble (banner, include, prototypes, among and grouping tables)
+ * is collected in g->declarations and the routine bodies plus
+ * create/close functions in a second buffer; both are written to
+ * options->output_c in that order.  The second buffer is then reused for
+ * the header, which goes to options->output_h. */
+extern void generate_program_c(struct generator * g) {
+    struct node * p;
+
+    /* preamble */
+    g->outbuf = str_new();
+    generate_start_comment(g);
+    generate_head(g);
+    generate_routine_headers(g);
+    w(g, "#ifdef __cplusplus~N"
+         "extern \"C\" {~N"
+         "#endif~N"
+         "~N");
+    generate_create_and_close_templates(g);
+    w(g, "~N"
+         "#ifdef __cplusplus~N"
+         "}~N"
+         "#endif~N");
+    generate_amongs(g);
+    generate_groupings(g);
+    g->declarations = g->outbuf;
+
+    /* routine bodies */
+    g->outbuf = str_new();
+    g->literalstring_count = 0;
+    for (p = g->analyser->program; p != 0; p = p->right) {
+        generate(g, p);
+    }
+    generate_create(g);
+    generate_close(g);
+
+    /* .c file: declarations first, then the bodies */
+    output_str(g->options->output_c, g->declarations);
+    str_delete(g->declarations);
+    output_str(g->options->output_c, g->outbuf);
+    str_clear(g->outbuf);
+
+    /* .h file */
+    generate_start_comment(g);
+    generate_header_file(g);
+    output_str(g->options->output_h, g->outbuf);
+    str_delete(g->outbuf);
+}
+
+/* Allocate a generator for analyser a and options o, with the margin and
+ * the debug and line counters set to zero. */
+extern struct generator * create_generator_c(struct analyser * a, struct options * o) {
+    NEW(generator, g);
+
+    g->margin = 0;
+    g->debug_count = 0;
+    g->line_count = 0;
+
+    g->analyser = a;
+    g->options = o;
+    return g;
+}
+
+/* Release a generator obtained from create_generator_c(). */
+extern void close_generator_c(struct generator * g) {
+
+ FREE(g);
+}
+
diff --git a/contrib/snowball/compiler/generator_java.c b/contrib/snowball/compiler/generator_java.c
new file mode 100644
index 000000000..07a4b8e2e
--- /dev/null
+++ b/contrib/snowball/compiler/generator_java.c
@@ -0,0 +1,1452 @@
+
+#include <stdlib.h> /* for exit */
+#include <string.h> /* for strlen */
+#include <stdio.h> /* for fprintf etc */
+#include "header.h"
+
+/* prototypes */
+
+static void generate(struct generator * g, struct node * p);
+static void w(struct generator * g, const char * s);
+static void writef(struct generator * g, const char * s, struct node * p);
+
+
+/* Failure label values that do not name a generated label.  With
+ * x_return as the failure label, write_failure() emits "return false;"
+ * rather than a break to a label. */
+enum special_labels {
+ x_return = -1
+};
+
+/* Return the next unused label number and advance the counter. */
+static int new_label(struct generator * g) {
+    int label = g->next_label;
+
+    g->next_label = label + 1;
+    return label;
+}
+
+/* Return a newly created string "v_<n>", where n is the generator's
+ * variable counter after incrementing it.  The callers in this file
+ * release the result with str_delete(). */
+static struct str * vars_newname(struct generator * g) {
+    struct str * name = str_new();
+
+    g->var_number++;
+    str_append_string(name, "v_");
+    str_append_int(name, g->var_number);
+    return name;
+}
+
+/* Output routines */
+/* Write the contents of str to outfile.  The data is converted with
+ * b_to_s() and the converted copy is freed afterwards. */
+static void output_str(FILE * outfile, struct str * str) {
+    char * text = b_to_s(str_data(str));
+
+    fputs(text, outfile);
+    free(text);
+}
+
+/* Write routines for simple entities */
+
+/* Append the single character ch to the current output buffer. */
+static void write_char(struct generator * g, int ch) {
+    struct str * out = g->outbuf;
+
+    str_append_ch(out, ch);
+}
+
+/* Append a newline to the current output buffer. */
+static void write_newline(struct generator * g) {
+    struct str * out = g->outbuf;
+
+    str_append_string(out, "\n");
+}
+
+/* Append the C string s to the current output buffer. */
+static void write_string(struct generator * g, const char * s) {
+    struct str * out = g->outbuf;
+
+    str_append_string(out, s);
+}
+
+/* Append the symbol block b to the current output buffer. */
+static void write_b(struct generator * g, symbol * b) {
+    struct str * out = g->outbuf;
+
+    str_append_b(out, b);
+}
+
+/* Append the contents of str to the current output buffer. */
+static void write_str(struct generator * g, struct str * str) {
+    struct str * out = g->outbuf;
+
+    str_append(out, str);
+}
+
+/* Append the integer i to the current output buffer. */
+static void write_int(struct generator * g, int i) {
+    struct str * out = g->outbuf;
+
+    str_append_int(out, i);
+}
+
+
+/* Write routines for items from the syntax tree */
+
+/* Write the Java identifier for a Snowball name.  Names other than
+ * externals get a prefix of one letter taken from "SBIrxg", indexed by
+ * the name's type, followed by '_'; externals are written unprefixed. */
+static void write_varname(struct generator * g, struct name * p) {
+    int type_letter = "SBIrxg"[p->type];
+
+    if (p->type != t_external) {
+        write_char(g, type_letter);
+        write_char(g, '_');
+    }
+    write_b(g, p->b);
+}
+
+/* Write a reference to the variable p.  This simply delegates to
+ * write_varname(). */
+static void write_varref(struct generator * g, struct name * p) {
+
+ /* In java, references look just the same */
+ write_varname(g, p);
+}
+
+/* Write n as one upper-case hexadecimal digit.  The caller in this file
+ * passes values 0..15. */
+static void write_hexdigit(struct generator * g, int n) {
+    int digit;
+
+    if (n < 10) {
+        digit = n + '0';
+    } else {
+        digit = n - 10 + 'A';
+    }
+    write_char(g, digit);
+}
+
+/* Write ch as a Java Unicode escape: "\u" followed by the four hex digits
+ * of its low 16 bits, most significant first. */
+static void write_hex(struct generator * g, int ch) {
+    int shift = 16;
+
+    write_string(g, "\\u");
+    while (shift > 0) {
+        shift -= 4;
+        write_hexdigit(g, (ch >> shift) & 0xf);
+    }
+}
+
+/* Write the symbol block p as a Java string literal.
+ *
+ * Printable ASCII (32..126) is written as is, with '"' and '\' preceded
+ * by a backslash.  Line feed and carriage return are written as the
+ * escape sequences \n and \r: Java translates Unicode escapes before
+ * lexing, so a \u000A or \u000D escape would act as a real line
+ * terminator and break the string literal.  Every other character,
+ * including DEL (127), is written as a Unicode escape so that no raw
+ * control character ends up in the generated source. */
+static void write_literal_string(struct generator * g, symbol * p) {
+
+    int i;
+    write_string(g, "\"");
+    for (i = 0; i < SIZE(p); i++) {
+        int ch = p[i];
+        if (32 <= ch && ch < 127) {
+            if (ch == '\"' || ch == '\\') write_string(g, "\\");
+            write_char(g, ch);
+        } else if (ch == '\n') {
+            write_string(g, "\\n");
+        } else if (ch == '\r') {
+            write_string(g, "\\r");
+        } else {
+            write_hex(g, ch);
+        }
+    }
+    write_string(g, "\"");
+}
+
+/* Write the indentation for the current margin level: one indent string
+ * per level. */
+static void write_margin(struct generator * g) {
+    int remaining = g->margin;
+
+    while (remaining-- > 0) {
+        write_string(g, " ");
+    }
+}
+
+/* Write a variable declaration. */
+/* Write a variable declaration.  The formatted declaration, followed by
+ * ";" and a newline, is appended to g->declarations rather than to the
+ * current output buffer, which is reinstated afterwards. */
+static void write_declare(struct generator * g,
+                          char * declaration,
+                          struct node * p) {
+    struct str * code_buf = g->outbuf;
+
+    /* redirect the write routines to the declarations buffer */
+    g->outbuf = g->declarations;
+
+    write_string(g, " ");
+    writef(g, declaration, p);
+    write_string(g, ";");
+    write_newline(g);
+
+    g->outbuf = code_buf;
+}
+
+/* Write a "// <token>[ <name>], line <n>" comment line for node p at the
+ * current margin. */
+static void write_comment(struct generator * g, struct node * p) {
+    struct name * subject = p->name;
+
+    write_margin(g);
+    write_string(g, "// ");
+    write_string(g, name_of_token(p->type));
+    if (subject != 0) {
+        write_string(g, " ");
+        write_b(g, subject->b);
+    }
+    write_string(g, ", line ");
+    write_int(g, p->line_number);
+    write_newline(g);
+}
+
+/* Open a block: write "{" at the current margin, then indent one level. */
+static void write_block_start(struct generator * g) {
+    write_margin(g);
+    write_string(g, "{");
+    g->margin++;
+    write_newline(g);
+}
+
+/* Close a block: unindent one level, then write "}" at that margin. */
+static void write_block_end(struct generator * g) {
+    g->margin--;
+    write_margin(g);
+    write_string(g, "}");
+    write_newline(g);
+}
+
+/* Declare the int variable named by savevar and generate the statement
+ * that stores the cursor in it.  In forward mode the cursor itself is
+ * stored, otherwise "limit - cursor". */
+static void write_savecursor(struct generator * g, struct node * p,
+                             struct str * savevar) {
+    g->B[0] = str_data(savevar);
+    if (p->mode != m_forward) {
+        g->S[1] = "limit - ";
+    } else {
+        g->S[1] = "";
+    }
+    write_declare(g, "int ~B0", p);
+    writef(g, "~M~B0 = ~S1cursor;~N" , p);
+}
+
+/* Replace the contents of out with the statement that restores the
+ * cursor from savevar: "cursor = <savevar>;" in forward mode, otherwise
+ * "cursor = limit - <savevar>;". */
+static void restore_string(struct node * p, struct str * out, struct str * savevar) {
+    int forward = (p->mode == m_forward);
+
+    str_clear(out);
+    str_append_string(out, "cursor = ");
+    if (!forward) {
+        str_append_string(out, "limit - ");
+    }
+    str_append(out, savevar);
+    str_append_string(out, ";");
+}
+
+/* Write, on a line of its own at the current margin, the statement built
+ * by restore_string() that restores the cursor from savevar. */
+static void write_restorecursor(struct generator * g, struct node * p,
+                                struct str * savevar) {
+    struct str * statement = str_new();
+
+    write_margin(g);
+    restore_string(p, statement, savevar);
+    write_str(g, statement);
+    write_newline(g);
+
+    str_delete(statement);
+}
+
+/* Write the statement that moves the cursor one place in the direction
+ * of travel: "cursor++;" in forward mode, otherwise "cursor--;". */
+static void write_inc_cursor(struct generator * g, struct node * p) {
+    write_margin(g);
+    if (p->mode == m_forward) {
+        write_string(g, "cursor++;");
+    } else {
+        write_string(g, "cursor--;");
+    }
+    write_newline(g);
+}
+
+/* Open the labelled block "lab<n>: do {" and indent one level.  Generated
+ * code leaves the block with "break lab<n>;". */
+static void wsetlab_begin(struct generator * g, int n) {
+    write_margin(g);
+    write_string(g, "lab");
+    write_int(g, n);
+    w(g, ": do {~+~N");
+}
+
+/* Close a block opened by wsetlab_begin(): unindent one level and write
+ * "} while (false);". */
+static void wsetlab_end(struct generator * g) {
+    g->margin--;
+    write_margin(g);
+    write_string(g, "} while (false);");
+    write_newline(g);
+}
+
+/* Write "break lab<n>;" on a line of its own at the current margin. */
+static void wgotol(struct generator * g, int n) {
+    int label = n;
+
+    write_margin(g);
+    write_string(g, "break lab");
+    write_int(g, label);
+    write_string(g, ";");
+    write_newline(g);
+}
+
+/* Write the current failure action: the failure string on a line of its
+ * own if it is non-empty, then either "return false;" (failure label
+ * x_return) or "break lab<n>;".  In the break case g->unreachable is set
+ * to true. */
+static void write_failure(struct generator * g) {
+    if (str_len(g->failure_str) != 0) {
+        write_margin(g);
+        write_str(g, g->failure_str);
+        write_newline(g);
+    }
+
+    write_margin(g);
+    if (g->failure_label == x_return) {
+        write_string(g, "return false;");
+    } else {
+        write_string(g, "break lab");
+        write_int(g, g->failure_label);
+        write_string(g, ";");
+        g->unreachable = true;
+    }
+    write_newline(g);
+}
+
+/* Write "if (<s>)" followed by a block containing the failure action.
+ * s is a writef() format.  Because the failure is conditional,
+ * g->unreachable is reset to false afterwards. */
+static void write_failure_if(struct generator * g, char * s, struct node * p) {
+    /* the condition line */
+    writef(g, "~Mif (", p);
+    writef(g, s, p);
+    writef(g, ")~N", p);
+
+    /* the guarded failure */
+    write_block_start(g);
+    write_failure(g);
+    write_block_end(g);
+
+    g->unreachable = false;
+}
+
+/* if at limit fail */
+/* Generate "fail if the cursor is at the limit": the forward test is
+ * against limit, the backward test against limit_backward. */
+static void write_check_limit(struct generator * g, struct node * p) {
+    char * at_limit;
+
+    if (p->mode == m_forward) {
+        at_limit = "cursor >= limit";
+    } else {
+        at_limit = "cursor <= limit_backward";
+    }
+    write_failure_if(g, at_limit, p);
+}
+
+/* Formatted write. */
+/* Formatted write.
+ *
+ * Copies input to the output buffer, expanding '~' escapes:
+ *   ~C  comment line for node p          ~f  failure action in a block
+ *   ~M  margin                           ~N  newline
+ *   ~{  block start                      ~}  block end
+ *   ~+  increase margin                  ~-  decrease margin
+ *   ~n  options->name
+ *   ~Sd ~Bd ~Id ~Vd ~Wd ~Ld  element d (one digit) of g->S, g->B, g->I,
+ *       g->V (as reference), g->V (as name) or g->L (as string literal)
+ * Any other character after '~' is written as is.  p is used by ~C only. */
+static void writef(struct generator * g, const char * input, struct node * p) {
+
+ int i = 0;
+ int l = strlen(input);
+
+ while (i < l) {
+ int ch = input[i++];
+ if (ch == '~') {
+ /* i now indexes the escape letter; the indexed escapes consume one
+ * further character as the array index digit */
+ switch(input[i++]) {
+ default: write_char(g, input[i - 1]); continue;
+ case 'C': write_comment(g, p); continue;
+ case 'f': write_block_start(g);
+ write_failure(g);
+ g->unreachable = false;
+ write_block_end(g);
+ continue;
+ case 'M': write_margin(g); continue;
+ case 'N': write_newline(g); continue;
+ case '{': write_block_start(g); continue;
+ case '}': write_block_end(g); continue;
+ case 'S': write_string(g, g->S[input[i++] - '0']); continue;
+ case 'B': write_b(g, g->B[input[i++] - '0']); continue;
+ case 'I': write_int(g, g->I[input[i++] - '0']); continue;
+ case 'V': write_varref(g, g->V[input[i++] - '0']); continue;
+ case 'W': write_varname(g, g->V[input[i++] - '0']); continue;
+ case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue;
+ case '+': g->margin++; continue;
+ case '-': g->margin--; continue;
+ case 'n': write_string(g, g->options->name); continue;
+ }
+ } else {
+ write_char(g, ch);
+ }
+ }
+}
+
+/* Formatted write without a node: writef() with p == 0, so the format
+ * must not use the ~C escape, which reads fields of the node. */
+static void w(struct generator * g, const char * s) {
+ writef(g, s, 0);
+}
+
+/* Write the Java expression for the arithmetic expression tree p.  Binary
+ * operations are written fully parenthesised; node types not listed here
+ * write nothing. */
+static void generate_AE(struct generator * g, struct node * p) {
+    char * op = 0;
+
+    /* binary operators share one output shape: (left op right) */
+    switch (p->type) {
+        case c_multiply: op = " * "; break;
+        case c_plus:     op = " + "; break;
+        case c_minus:    op = " - "; break;
+        case c_divide:   op = " / "; break;
+        default: break;
+    }
+    if (op != 0) {
+        write_string(g, "(");
+        generate_AE(g, p->left);
+        write_string(g, op);
+        generate_AE(g, p->right);
+        write_string(g, ")");
+        return;
+    }
+
+    switch (p->type) {
+        case c_name:
+            write_varref(g, p->name);
+            break;
+        case c_number:
+            write_int(g, p->number);
+            break;
+        case c_maxint:
+            write_string(g, "MAXINT");
+            break;
+        case c_minint:
+            write_string(g, "MININT");
+            break;
+        case c_neg:
+            write_string(g, "-");
+            generate_AE(g, p->right);
+            break;
+        case c_sizeof:
+            g->V[0] = p->name;
+            w(g, "(~V0.length())");
+            break;
+        case c_cursor:
+            w(g, "cursor");
+            break;
+        case c_limit:
+            w(g, p->mode == m_forward ? "limit" : "limit_backward");
+            break;
+        case c_size:
+            w(g, "(current.length())");
+            break;
+        default:
+            break;
+    }
+}
+
+/* K_needed() tests to see if we really need to keep c. Not true when the
+ command does not touch the cursor. This and repeat_score() could be
+ elaborated almost indefinitely.
+*/
+
+/* Return true if any command in the list starting at p is not one of the
+ * types listed below as leaving the cursor alone.  Calls and bracketed
+ * groups are examined recursively; any unlisted type counts as needing
+ * the cursor kept. */
+static int K_needed(struct generator * g, struct node * p) {
+    for (; p != 0; p = p->right) {
+        switch (p->type) {
+            /* listed as not touching the cursor */
+            case c_dollar:
+            case c_leftslice:
+            case c_rightslice:
+            case c_mathassign:
+            case c_plusassign:
+            case c_minusassign:
+            case c_multiplyassign:
+            case c_divideassign:
+            case c_eq:
+            case c_ne:
+            case c_gr:
+            case c_ge:
+            case c_ls:
+            case c_le:
+            case c_sliceto:
+            case c_booltest:
+            case c_true:
+            case c_false:
+            case c_debug:
+                break;
+
+            case c_call:
+                if (K_needed(g, p->name->definition)) return true;
+                break;
+
+            case c_bra:
+                if (K_needed(g, p->left)) return true;
+                break;
+
+            default:
+                return true;
+        }
+    }
+    return false;
+}
+
+/* Score the command list starting at p for repeat_restore().  Commands in
+ * the first group leave the score alone, calls and bracketed groups add
+ * the score of their contents, the simple matching/moving commands add 1,
+ * and any other command sets the score to 2. */
+static int repeat_score(struct generator * g, struct node * p) {
+    int score = 0;
+
+    for (; p != 0; p = p->right) {
+        switch (p->type) {
+            case c_dollar:
+            case c_leftslice:
+            case c_rightslice:
+            case c_mathassign:
+            case c_plusassign:
+            case c_minusassign:
+            case c_multiplyassign:
+            case c_divideassign:
+            case c_eq:
+            case c_ne:
+            case c_gr:
+            case c_ge:
+            case c_ls:
+            case c_le:
+            case c_sliceto:   /* case c_not: must not be included here! */
+            case c_debug:
+                break;
+
+            case c_call:
+                score += repeat_score(g, p->name->definition);
+                break;
+
+            case c_bra:
+                score += repeat_score(g, p->left);
+                break;
+
+            case c_name:
+            case c_literalstring:
+            case c_next:
+            case c_grouping:
+            case c_non:
+            case c_hop:
+                score++;
+                break;
+
+            default:
+                /* note: an assignment, not an increment */
+                score = 2;
+                break;
+        }
+    }
+    return score;
+}
+
+/* tests if an expression requires cursor reinstatement in a repeat */
+
+/* Return true if the command list p needs the cursor reinstated in a
+ * repeat, i.e. if its repeat_score() is 2 or more. */
+static int repeat_restore(struct generator * g, struct node * p) {
+    int score = repeat_score(g, p);
+
+    return score >= 2;
+}
+
+/* Generate a bracketed command list: a comment line, then each command in
+ * order. */
+static void generate_bra(struct generator * g, struct node * p) {
+    struct node * command;
+
+    write_comment(g, p);
+    for (command = p->left; command != 0; command = command->right) {
+        generate(g, command);
+    }
+}
+
+/* Generate code for 'and': each operand is generated in turn, with the
+ * cursor restored to its saved value between operands when K_needed()
+ * says the operands may move it.  Generation stops early once the code
+ * position becomes unreachable. */
+static void generate_and(struct generator * g, struct node * p) {
+    struct str * cursor_var = vars_newname(g);
+    int keep_c = K_needed(g, p->left);
+    struct node * operand;
+
+    write_comment(g, p);
+
+    if (keep_c) write_savecursor(g, p, cursor_var);
+
+    for (operand = p->left; operand != 0; operand = operand->right) {
+        generate(g, operand);
+        if (g->unreachable) break;
+        /* no restore after the last operand */
+        if (keep_c && operand->right != 0) {
+            write_restorecursor(g, operand, cursor_var);
+        }
+    }
+    str_delete(cursor_var);
+}
+
+/* Generate code for 'or'.  All operands but the last are generated inside
+ * their own labelled block with an empty failure string: success breaks
+ * out of the enclosing block, failure falls out of the operand's block,
+ * restores the cursor if needed and tries the next operand.  The last
+ * operand is generated with the enclosing failure label and string. */
+static void generate_or(struct generator * g, struct node * p) {
+    struct str * cursor_var = vars_newname(g);
+    int keep_c = K_needed(g, p->left);
+
+    int outer_label = g->failure_label;
+    struct str * outer_failure = str_copy(g->failure_str);
+
+    int done_label = new_label(g);
+    struct node * operand;
+
+    write_comment(g, p);
+    wsetlab_begin(g, done_label);
+
+    if (keep_c) write_savecursor(g, p, cursor_var);
+
+    operand = p->left;
+    str_clear(g->failure_str);
+
+    if (operand == 0) {
+        /* p should never be 0 after an or: there should be at least two
+         * sub nodes. */
+        fprintf(stderr, "Error: \"or\" node without children nodes.");
+        exit (1);
+    }
+    for (; operand->right != 0; operand = operand->right) {
+        g->failure_label = new_label(g);
+        wsetlab_begin(g, g->failure_label);
+        generate(g, operand);
+        if (!g->unreachable) wgotol(g, done_label);
+        wsetlab_end(g);
+        g->unreachable = false;
+        if (keep_c) write_restorecursor(g, operand, cursor_var);
+    }
+
+    /* the last operand fails to the enclosing context */
+    g->failure_label = outer_label;
+    str_delete(g->failure_str);
+    g->failure_str = outer_failure;
+
+    generate(g, operand);
+    wsetlab_end(g);
+    str_delete(cursor_var);
+}
+
+/* Generate code for 'backwards': set limit_backward to the cursor, move
+ * the cursor to the limit, run the sub-command, then move the cursor back
+ * to limit_backward.
+ *
+ * The closing statement ends with a newline like every other generated
+ * statement; without it the output that follows (margin, comment or next
+ * statement) was appended to the same line. */
+static void generate_backwards(struct generator * g, struct node * p) {
+
+    write_comment(g, p);
+    writef(g,"~Mlimit_backward = cursor; cursor = limit;~N", p);
+    generate(g, p->left);
+    w(g, "~Mcursor = limit_backward;~N");
+}
+
+
+/* Generate code for 'not'.  The operand is generated inside a labelled
+ * block with an empty failure string; if the operand succeeds, the
+ * enclosing failure action is taken inside that block, and if it fails
+ * control leaves the block and continues, with the cursor restored when
+ * K_needed() requires it. */
+static void generate_not(struct generator * g, struct node * p) {
+    struct str * cursor_var = vars_newname(g);
+    int keep_c = K_needed(g, p->left);
+
+    int outer_label = g->failure_label;
+    struct str * outer_failure = str_copy(g->failure_str);
+
+    write_comment(g, p);
+    if (keep_c) {
+        write_block_start(g);
+        write_savecursor(g, p, cursor_var);
+    }
+
+    g->failure_label = new_label(g);
+    str_clear(g->failure_str);
+
+    wsetlab_begin(g, g->failure_label);
+
+    generate(g, p->left);
+
+    /* operand succeeded, so 'not' fails to the enclosing context */
+    g->failure_label = outer_label;
+    str_delete(g->failure_str);
+    g->failure_str = outer_failure;
+
+    if (!g->unreachable) write_failure(g);
+
+    wsetlab_end(g);
+    g->unreachable = false;
+
+    if (keep_c) {
+        write_restorecursor(g, p, cursor_var);
+        write_block_end(g);
+    }
+    str_delete(cursor_var);
+}
+
+
+/* Generate code for 'try': the operand is generated inside a labelled
+ * block, and a failure inside it restores the cursor (when K_needed()
+ * says the operand may move it) and leaves the block, so 'try' itself
+ * always succeeds.
+ *
+ * The failure string is cleared before the operand is generated, as
+ * generate_do() and generate_repeat() do.  Previously, when the cursor
+ * did not need saving, the enclosing failure string was left in place and
+ * its code was emitted on a failure that 'try' was supposed to absorb. */
+static void generate_try(struct generator * g, struct node * p) {
+
+    struct str * savevar = vars_newname(g);
+    int keep_c = K_needed(g, p->left);
+
+    write_comment(g, p);
+    if (keep_c) write_savecursor(g, p, savevar);
+
+    g->failure_label = new_label(g);
+    str_clear(g->failure_str);
+    if (keep_c) restore_string(p, g->failure_str, savevar);
+
+    wsetlab_begin(g, g->failure_label);
+    generate(g, p->left);
+    wsetlab_end(g);
+    g->unreachable = false;
+
+    str_delete(savevar);
+}
+
+/* Generate "<variable> = true;" for the 'set' command. */
+static void generate_set(struct generator * g, struct node * p) {
+    struct name * flag = p->name;
+
+    write_comment(g, p);
+    g->V[0] = flag;
+    writef(g, "~M~V0 = true;~N", p);
+}
+
+/* Generate "<variable> = false;" for the 'unset' command. */
+static void generate_unset(struct generator * g, struct node * p) {
+    struct name * flag = p->name;
+
+    write_comment(g, p);
+    g->V[0] = flag;
+    writef(g, "~M~V0 = false;~N", p);
+}
+
+/* Generate code for 'fail': the operand is generated, followed by the
+ * failure action unless that position is already unreachable. */
+static void generate_fail(struct generator * g, struct node * p) {
+    write_comment(g, p);
+    generate(g, p->left);
+    if (g->unreachable) return;
+    write_failure(g);
+}
+
+/* generate_test() also implements 'reverse' */
+
+/* Generate code for 'test' (and 'reverse'): the operand is generated with
+ * the cursor saved beforehand and restored afterwards when K_needed()
+ * requires it.  No restore is written if the code position after the
+ * operand is unreachable. */
+static void generate_test(struct generator * g, struct node * p) {
+    struct str * cursor_var = vars_newname(g);
+    int keep_c = K_needed(g, p->left);
+
+    write_comment(g, p);
+
+    if (keep_c) write_savecursor(g, p, cursor_var);
+
+    generate(g, p->left);
+
+    if (!g->unreachable && keep_c) {
+        write_restorecursor(g, p, cursor_var);
+    }
+    str_delete(cursor_var);
+}
+
+/* Generate code for 'do': the operand is generated inside a labelled
+ * block with an empty failure string, so a failure merely leaves the
+ * block; afterwards the cursor is restored when K_needed() requires it. */
+static void generate_do(struct generator * g, struct node * p) {
+    struct str * cursor_var = vars_newname(g);
+    int keep_c = K_needed(g, p->left);
+
+    write_comment(g, p);
+    if (keep_c) {
+        write_savecursor(g, p, cursor_var);
+    }
+
+    g->failure_label = new_label(g);
+    str_clear(g->failure_str);
+
+    wsetlab_begin(g, g->failure_label);
+    generate(g, p->left);
+    wsetlab_end(g);
+    g->unreachable = false;
+
+    if (keep_c) {
+        write_restorecursor(g, p, cursor_var);
+    }
+    str_delete(cursor_var);
+}
+
+/* Generate code for 'goto' (style 1) and 'gopast' (style 0): a
+ * "golab<n>: while(true)" loop that tries the operand at each cursor
+ * position.  On success the loop is left (for 'goto' the cursor is first
+ * restored to where the match started); on failure the cursor is restored
+ * if it was saved, the limit check fails the whole command, and otherwise
+ * the cursor advances one place.
+ *
+ * The failure string is cleared while the operand is generated, as
+ * generate_or() and generate_not() do after saving it.  Previously it was
+ * saved and reinstated but never cleared, so the enclosing failure code
+ * was emitted for every failed attempt inside the loop.  The enclosing
+ * label and string are reinstated before the limit check, which is the
+ * real failure of the command. */
+static void generate_GO(struct generator * g, struct node * p, int style) {
+
+    int end_unreachable = false;
+    struct str * savevar = vars_newname(g);
+    int keep_c = style == 1 || repeat_restore(g, p->left);
+
+    int a0 = g->failure_label;
+    struct str * a1 = str_copy(g->failure_str);
+
+    int golab = new_label(g);
+    g->I[0] = golab;
+    write_comment(g, p);
+    w(g, "~Mgolab~I0: while(true)~N");
+    w(g, "~{");
+
+    if (keep_c) write_savecursor(g, p, savevar);
+
+    g->failure_label = new_label(g);
+    str_clear(g->failure_str);
+    wsetlab_begin(g, g->failure_label);
+    generate(g, p->left);
+
+    if (g->unreachable) {
+        /* Cannot break out of this loop: therefore the code after the
+         * end of the loop is unreachable.*/
+        end_unreachable = true;
+    } else {
+        /* include for goto; omit for gopast */
+        if (style == 1) write_restorecursor(g, p, savevar);
+        g->I[0] = golab;
+        w(g, "~Mbreak golab~I0;~N");
+    }
+    g->unreachable = false;
+    wsetlab_end(g);
+    if (keep_c) write_restorecursor(g, p, savevar);
+
+    g->failure_label = a0;
+    str_delete(g->failure_str);
+    g->failure_str = a1;
+
+    write_check_limit(g, p);
+    write_inc_cursor(g, p);
+    write_block_end(g);
+    str_delete(savevar);
+    g->unreachable = end_unreachable;
+}
+
+/* Generate code for 'loop': a for loop counting a newly declared int
+ * variable down from the value of p->AE to 1, with the operand as its
+ * body. */
+static void generate_loop(struct generator * g, struct node * p) {
+    struct str * counter = vars_newname(g);
+
+    write_comment(g, p);
+    g->B[0] = str_data(counter);
+    write_declare(g, "int ~B0", p);
+    w(g, "~Mfor (~B0 = ");
+    generate_AE(g, p->AE);
+    /* set again in case generating the expression changed B[0] */
+    g->B[0] = str_data(counter);
+    writef(g, "; ~B0 > 0; ~B0--)~N", p);
+    writef(g, "~{", p);
+
+    generate(g, p->left);
+
+    w(g, "~}");
+    str_delete(counter);
+    g->unreachable = false;
+}
+
+/* Generate code for 'repeat': a "replab<n>: while(true)" loop that runs
+ * the operand until it fails.  Each success continues the loop, first
+ * decrementing loopvar if one is given (used by 'atleast'); a failure
+ * restores the cursor when repeat_restore() requires it and leaves the
+ * loop. */
+static void generate_repeat(struct generator * g, struct node * p, struct str * loopvar) {
+    struct str * cursor_var = vars_newname(g);
+    int keep_c = repeat_restore(g, p->left);
+    int loop_label = new_label(g);
+
+    g->I[0] = loop_label;
+    write_comment(g, p);
+    writef(g, "~Mreplab~I0: while(true)~N~{", p);
+
+    if (keep_c) write_savecursor(g, p, cursor_var);
+
+    g->failure_label = new_label(g);
+    str_clear(g->failure_str);
+    wsetlab_begin(g, g->failure_label);
+    generate(g, p->left);
+
+    if (!g->unreachable) {
+        /* success path: count the iteration and go round again */
+        if (loopvar != 0) {
+            g->B[0] = str_data(loopvar);
+            w(g, "~M~B0--;~N");
+        }
+
+        g->I[0] = loop_label;
+        w(g, "~Mcontinue replab~I0;~N");
+    }
+
+    wsetlab_end(g);
+    g->unreachable = false;
+
+    /* failure path */
+    if (keep_c) write_restorecursor(g, p, cursor_var);
+
+    g->I[0] = loop_label;
+    w(g, "~Mbreak replab~I0;~N~}");
+    str_delete(cursor_var);
+}
+
+ /* Generate code for 'atleast': declare a counter initialised from p->AE,
+  * run the repeat loop (which decrements the counter), then signal failure
+  * if the counter is still positive. */
+ static void generate_atleast(struct generator * g, struct node * p) {
+
+     struct str * counter = vars_newname(g);
+     int outer_label;
+     struct str * outer_str;
+
+     write_comment(g, p);
+     w(g, "~{");
+     g->B[0] = str_data(counter);
+     w(g, "~Mint ~B0 = ");
+     generate_AE(g, p->AE);
+     w(g, ";~N");
+
+     /* generate_repeat replaces the failure label and clears the failure
+      * string, so both are saved here and put back afterwards */
+     outer_label = g->failure_label;
+     outer_str = str_copy(g->failure_str);
+
+     generate_repeat(g, p, counter);
+
+     g->failure_label = outer_label;
+     str_delete(g->failure_str);
+     g->failure_str = outer_str;
+
+     g->B[0] = str_data(counter);
+     write_failure_if(g, "~B0 > 0", p);
+     w(g, "~}");
+     str_delete(counter);
+ }
+
+ /* Generate code for 'setmark': the variable p->name is assigned the
+  * current value of 'cursor'. */
+ static void generate_setmark(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     g->V[0] = p->name;
+     writef(g, "~M~V0 = cursor;~N", p);
+ }
+
+ /* Generate code for 'tomark': fail if the cursor is already beyond the
+  * mark given by p->AE (in the direction of travel), otherwise move the
+  * cursor to the mark. */
+ static void generate_tomark(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = ">";
+     } else {
+         g->S[0] = "<";
+     }
+
+     w(g, "~Mif (cursor ~S0 ");
+     generate_AE(g, p->AE);
+     w(g, ")~N");
+
+     write_block_start(g);
+     write_failure(g);
+     write_block_end(g);
+     g->unreachable = false;
+
+     w(g, "~Mcursor = ");
+     generate_AE(g, p->AE);
+     writef(g, ";~N", p);
+ }
+
+ /* Generate code for 'atmark': fail unless the cursor equals the value of
+  * the expression p->AE. */
+ static void generate_atmark(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     w(g, "~Mif (cursor != ");
+     generate_AE(g, p->AE);
+     writef(g, ")~N", p);
+
+     write_block_start(g);
+     write_failure(g);
+     write_block_end(g);
+     g->unreachable = false;
+ }
+
+
+ /* Generate code for 'hop': compute the target position c as the cursor
+  * plus (forward) or minus (backward) p->AE, fail if c lies outside the
+  * permitted range, otherwise assign c to the cursor. */
+ static void generate_hop(struct generator * g, struct node * p) {
+
+     int forward;
+
+     write_comment(g, p);
+     forward = p->mode == m_forward;
+
+     g->S[0] = forward ? "+" : "-";
+     w(g, "~{~Mint c = cursor ~S0 ");
+     generate_AE(g, p->AE);
+     w(g, ";~N");
+
+     /* lower bound of the range check depends on the direction */
+     g->S[0] = forward ? "0" : "limit_backward";
+     write_failure_if(g, "~S0 > c || c > limit", p);
+
+     writef(g, "~Mcursor = c;~N", p);
+     writef(g, "~}", p);
+ }
+
+ /* Generate code for 'delete': a call to slice_del(). */
+ static void generate_delete(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     writef(g, "~Mslice_del();~N", p);
+ }
+
+
+ /* Generate code for 'next': a limit check followed by a cursor step, both
+  * produced by the shared helpers. */
+ static void generate_next(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     write_check_limit(g, p);
+     write_inc_cursor(g, p);
+ }
+
+ /* Generate code for 'tolimit': set the cursor to 'limit' (forward mode)
+  * or 'limit_backward' (backward mode). */
+ static void generate_tolimit(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = "limit";
+     } else {
+         g->S[0] = "limit_backward";
+     }
+     writef(g, "~Mcursor = ~S0;~N", p);
+ }
+
+ /* Generate code for 'atlimit': fail while the cursor has not yet reached
+  * the limit for the current direction. */
+ static void generate_atlimit(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = "limit";
+         g->S[1] = "<";
+     } else {
+         g->S[0] = "limit_backward";
+         g->S[1] = ">";
+     }
+     write_failure_if(g, "cursor ~S1 ~S0", p);
+ }
+
+ /* Generate code for '[': record the cursor in 'bra' when moving forward,
+  * in 'ket' when moving backward. */
+ static void generate_leftslice(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = "bra";
+     } else {
+         g->S[0] = "ket";
+     }
+     writef(g, "~M~S0 = cursor;~N", p);
+ }
+
+ /* Generate code for ']': record the cursor in 'ket' when moving forward,
+  * in 'bra' when moving backward. */
+ static void generate_rightslice(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = "ket";
+     } else {
+         g->S[0] = "bra";
+     }
+     writef(g, "~M~S0 = cursor;~N", p);
+ }
+
+ /* Generate code for '=>': the variable p->name receives the result of
+  * assign_to() applied to itself. */
+ static void generate_assignto(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     g->V[0] = p->name;
+     writef(g, "~M~V0 = assign_to(~V0);~N", p);
+ }
+
+ /* Generate code for '->': the variable p->name receives the result of
+  * slice_to() applied to itself. */
+ static void generate_sliceto(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     g->V[0] = p->name;
+     writef(g, "~M~V0 = slice_to(~V0);~N", p);
+ }
+
+ /* Write the operand of a string command: the node's literal string when
+  * it has one, otherwise a reference to the variable p->name. */
+ static void generate_address(struct generator * g, struct node * p) {
+
+     symbol * literal = p->literalstring;
+
+     if (literal == 0) {
+         write_varref(g, p->name);
+         return;
+     }
+     write_literal_string(g, literal);
+ }
+
+ /* Generate code for 'insert' and 'attach' (style is the node type).
+  * The cursor is saved and restored around the insert() call for 'attach'
+  * in forward mode and for 'insert' in backward mode. */
+ static void generate_insert(struct generator * g, struct node * p, int style) {
+
+     /* attach XOR backward */
+     int preserve_cursor = (style == c_attach) != (p->mode == m_backward);
+
+     write_comment(g, p);
+     if (preserve_cursor) w(g, "~{~Mint c = cursor;~N");
+
+     writef(g, "~Minsert(cursor, cursor, ", p);
+     generate_address(g, p);
+     writef(g, ");~N", p);
+
+     if (preserve_cursor) w(g, "~Mcursor = c;~N~}");
+ }
+
+ /* Generate code for '=' on the current string: replace everything from
+  * the cursor to the limit (forward) or from limit_backward to the cursor
+  * (backward). In forward mode the cursor is saved and restored, like
+  * 'attach'. */
+ static void generate_assignfrom(struct generator * g, struct node * p) {
+
+     int forward = p->mode == m_forward;
+
+     write_comment(g, p);
+     if (forward) {
+         writef(g, "~{~Mint c = cursor;~N", p);
+         writef(g, "~Minsert(cursor, limit, ", p);
+     } else {
+         writef(g, "~Minsert(limit_backward, cursor, ", p);
+     }
+     generate_address(g, p);
+     writef(g, ");~N", p);
+     if (forward) w(g, "~Mcursor = c;~N~}");
+ }
+
+
+ /* Generate code for '<-': a slice_from() call whose argument is written
+  * by generate_address. */
+ static void generate_slicefrom(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     w(g, "~Mslice_from(");
+     generate_address(g, p);
+     writef(g, ");~N", p);
+ }
+
+ /* Generate code for 'setlimit': p->left moves the cursor to the new limit
+  * position, the old limit is remembered in a fresh variable, the cursor
+  * is restored and p->aux is generated with the tightened limit. The
+  * statement that puts the old limit back is installed as the failure
+  * string and is also written after p->aux when that code can complete. */
+ static void generate_setlimit(struct generator * g, struct node * p) {
+
+     struct str * cursor_save = vars_newname(g);
+     struct str * limit_save = vars_newname(g);
+
+     write_comment(g, p);
+     write_savecursor(g, p, cursor_save);
+     generate(g, p->left);
+
+     if (!g->unreachable) {
+         int forward = p->mode == m_forward;
+
+         g->B[0] = str_data(limit_save);
+         write_declare(g, "int ~B0", p);
+         if (forward) {
+             w(g, "~M~B0 = limit - cursor;~N");
+             w(g, "~Mlimit = cursor;~N");
+         } else {
+             w(g, "~M~B0 = limit_backward;~N");
+             w(g, "~Mlimit_backward = cursor;~N");
+         }
+         write_restorecursor(g, p, cursor_save);
+
+         /* build "<restore old limit>;" as the failure action */
+         str_assign(g->failure_str, forward ? "limit += " : "limit_backward = ");
+         str_append(g->failure_str, limit_save);
+         str_append_ch(g->failure_str, ';');
+
+         generate(g, p->aux);
+
+         if (!g->unreachable) {
+             write_margin(g);
+             write_str(g, g->failure_str);
+             write_newline(g);
+         }
+     }
+     str_delete(limit_save);
+     str_delete(cursor_save);
+ }
+
+ /* '$' makes snowball operate on a string variable as though it were the
+  * current string. The emitted code keeps a reference to the enclosing
+  * object in a fresh variable, switches 'current', 'cursor' and 'limit' to
+  * the variable p->name, runs p->left, and finally calls copy_from() on
+  * the saved reference (also used as the failure action). */
+ static void generate_dollar(struct generator * g, struct node * p) {
+
+     struct str * outer = vars_newname(g);
+
+     write_comment(g, p);
+     g->V[0] = p->name;
+
+     str_assign(g->failure_str, "copy_from(");
+     str_append(g->failure_str, outer);
+     str_append_string(g->failure_str, ");");
+
+     g->B[0] = str_data(outer);
+     writef(g, "~{~M~n ~B0 = this;~N"
+               "~Mcurrent = new StringBuffer(~V0.toString());~N"
+               "~Mcursor = 0;~N"
+               "~Mlimit = (current.length());~N", p);
+
+     generate(g, p->left);
+
+     if (!g->unreachable) {
+         write_margin(g);
+         write_str(g, g->failure_str);
+         write_newline(g);
+     }
+     w(g, "~}");
+     str_delete(outer);
+ }
+
+ /* Generate an integer assignment: variable p->name, the operator text s
+  * (for example "=" or "+="), then the expression p->AE. */
+ static void generate_integer_assign(struct generator * g, struct node * p, char * s) {
+
+     g->V[0] = p->name;
+     g->S[0] = s;
+
+     w(g, "~M~V0 ~S0 ");
+     generate_AE(g, p->AE);
+     w(g, ";~N");
+ }
+
+ /* Generate an integer comparison: fail unless 'p->name s p->AE' holds,
+  * where s is the comparison operator text. */
+ static void generate_integer_test(struct generator * g, struct node * p, char * s) {
+
+     g->V[0] = p->name;
+     g->S[0] = s;
+
+     w(g, "~Mif (!(~V0 ~S0 ");
+     generate_AE(g, p->AE);
+     w(g, "))~N");
+
+     write_block_start(g);
+     write_failure(g);
+     write_block_end(g);
+     g->unreachable = false;
+ }
+
+ /* Generate a routine call: fail if the method named by p->name returns
+  * false. */
+ static void generate_call(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     g->V[0] = p->name;
+     write_failure_if(g, "!~V0()", p);
+ }
+
+ /* Generate a grouping test (complement selects the 'out' variant used by
+  * 'non'). A grouping flagged no_gaps is tested with the *_range helper
+  * on its smallest and largest character; any other grouping goes through
+  * the *_grouping helper with its table. */
+ static void generate_grouping(struct generator * g, struct node * p, int complement) {
+
+     struct grouping * grp = p->name->grouping;
+
+     if (p->mode == m_forward) g->S[0] = ""; else g->S[0] = "_b";
+     if (complement) g->S[1] = "out"; else g->S[1] = "in";
+     g->V[0] = p->name;
+     g->I[0] = grp->smallest_ch;
+     g->I[1] = grp->largest_ch;
+
+     if (grp->no_gaps) {
+         write_failure_if(g, "!(~S1_range~S0(~I0, ~I1))", p);
+     } else {
+         write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p);
+     }
+ }
+
+ /* Generate a test against a string variable: fail unless eq_v (or eq_v_b
+  * in backward mode) succeeds for p->name. */
+ static void generate_namedstring(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = "";
+     } else {
+         g->S[0] = "_b";
+     }
+     g->V[0] = p->name;
+     write_failure_if(g, "!(eq_v~S0(~V0))", p);
+ }
+
+ /* Generate a test against a literal string: fail unless eq_s (or eq_s_b
+  * in backward mode) succeeds for the literal and its length. */
+ static void generate_literalstring(struct generator * g, struct node * p) {
+
+     symbol * literal = p->literalstring;
+
+     write_comment(g, p);
+     if (p->mode == m_forward) {
+         g->S[0] = "";
+     } else {
+         g->S[0] = "_b";
+     }
+     g->I[0] = SIZE(literal);
+     g->L[0] = literal;
+     write_failure_if(g, "!(eq_s~S0(~I0, ~L0))", p);
+ }
+
+ /* Generate the Java method for a routine definition. Routines become
+  * 'private', everything else 'public'. The method body and its variable
+  * declarations are collected in temporary buffers so the declarations
+  * can be appended to the real output ahead of the body. */
+ static void generate_define(struct generator * g, struct node * p) {
+
+     struct name * routine = p->name;
+
+     struct str * outer_output = g->outbuf;
+     struct str * outer_declarations = g->declarations;
+
+     if (routine->type == t_routine) {
+         g->S[0] = "private";
+     } else {
+         g->S[0] = "public";
+     }
+     g->V[0] = routine;
+     w(g, "~+~+~N~M~S0 boolean ~V0() {~+~N");
+
+     g->outbuf = str_new();
+     g->declarations = str_new();
+
+     /* labels and generated variable names restart for every method */
+     g->next_label = 0;
+     g->var_number = 0;
+
+     if (p->amongvar_needed) write_declare(g, "int among_var", p);
+     str_clear(g->failure_str);
+     g->failure_label = x_return;
+     g->unreachable = false;
+     generate(g, p->left);
+     if (!g->unreachable) w(g, "~Mreturn true;~N");
+     w(g, "~}~-~-");
+
+     /* declarations first, then the body */
+     str_append(outer_output, g->declarations);
+     str_append(outer_output, g->outbuf);
+     str_delete(g->declarations);
+     str_delete(g->outbuf);
+     g->declarations = outer_declarations;
+     g->outbuf = outer_output;
+ }
+
+ /* Generate the find_among call of an among. When the among has neither
+  * commands nor a starter, only success or failure matters; otherwise the
+  * result is kept in among_var for the later switch. */
+ static void generate_substring(struct generator * g, struct node * p) {
+
+     struct among * x = p->among;
+     int result_unused;
+
+     write_comment(g, p);
+
+     if (p->mode == m_forward) g->S[0] = ""; else g->S[0] = "_b";
+     g->I[0] = x->number;
+     g->I[1] = x->literalstring_count;
+
+     result_unused = x->command_count == 0 && x->starter == 0;
+     if (result_unused) {
+         write_failure_if(g, "find_among~S0(a_~I0, ~I1) == 0", p);
+         return;
+     }
+     writef(g, "~Mamong_var = find_among~S0(a_~I0, ~I1);~N", p);
+     write_failure_if(g, "among_var == 0", p);
+ }
+
+ /* Generate code for 'among': the find_among call (unless an earlier
+  * 'substring' already produced it), the optional starter, and a Java
+  * switch on among_var with one numbered case per bracketed command. */
+ static void generate_among(struct generator * g, struct node * p) {
+
+     struct among * x = p->among;
+     struct node * item;
+     int next_case = 1;
+
+     if (x->substring == 0) generate_substring(g, p);
+     if (x->command_count == 0 && x->starter == 0) return;
+
+     if (x->starter != 0) generate(g, x->starter);
+
+     item = p->left;
+     /* step over a leading non-literal node */
+     if (item != 0 && item->type != c_literalstring) item = item->right;
+
+     w(g, "~Mswitch(among_var) {~N~+");
+     w(g, "~Mcase 0:~N~+");
+     write_failure(g);
+     g->unreachable = false;
+     w(g, "~-");
+
+     for (; item != 0; item = item->right) {
+         if (item->type != c_bra || item->left == 0) continue;
+
+         g->I[0] = next_case++;
+         w(g, "~Mcase ~I0:~N~+");
+         generate(g, item);
+         if (!g->unreachable) w(g, "~Mbreak;~N");
+         w(g, "~-");
+         g->unreachable = false;
+     }
+     write_block_end(g);
+ }
+
+ /* Generate a boolean test: fail when the variable p->name is false. */
+ static void generate_booltest(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     g->V[0] = p->name;
+     write_failure_if(g, "!(~V0)", p);
+ }
+
+ /* Generate code for 'false': an unconditional failure. */
+ static void generate_false(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     write_failure(g);
+ }
+
+ /* Generate code for '?': a debug() call carrying a running counter and
+  * the source line number of the node. */
+ static void generate_debug(struct generator * g, struct node * p) {
+
+     write_comment(g, p);
+
+     g->I[0] = g->debug_count;
+     g->debug_count++;
+     g->I[1] = p->line_number;
+     writef(g, "~Mdebug(~I0, ~I1);~N", p);
+ }
+
+ static void generate(struct generator * g, struct node * p) {
+ /* Dispatch on p->type to the generate_* routine for that node type.
+  * Nothing is generated while g->unreachable is set. The failure label
+  * and a copy of the failure string are taken before the dispatch and
+  * put back afterwards, so changes made by the callee do not leak to the
+  * caller. An unrecognised node type is reported on stderr and ends the
+  * program with exit status 1. */
+ int a0;
+ struct str * a1;
+
+ if (g->unreachable) return;
+
+ a0 = g->failure_label;
+ a1 = str_copy(g->failure_str);
+
+ switch (p->type)
+ {
+ case c_define: generate_define(g, p); break;
+ case c_bra: generate_bra(g, p); break;
+ case c_and: generate_and(g, p); break;
+ case c_or: generate_or(g, p); break;
+ case c_backwards: generate_backwards(g, p); break;
+ case c_not: generate_not(g, p); break;
+ case c_set: generate_set(g, p); break;
+ case c_unset: generate_unset(g, p); break;
+ case c_try: generate_try(g, p); break;
+ case c_fail: generate_fail(g, p); break;
+ case c_reverse: /* falls through: handled by generate_test as well */
+ case c_test: generate_test(g, p); break;
+ case c_do: generate_do(g, p); break;
+ case c_goto: generate_GO(g, p, 1); break;
+ case c_gopast: generate_GO(g, p, 0); break;
+ case c_repeat: generate_repeat(g, p, 0); break;
+ case c_loop: generate_loop(g, p); break;
+ case c_atleast: generate_atleast(g, p); break;
+ case c_setmark: generate_setmark(g, p); break;
+ case c_tomark: generate_tomark(g, p); break;
+ case c_atmark: generate_atmark(g, p); break;
+ case c_hop: generate_hop(g, p); break;
+ case c_delete: generate_delete(g, p); break;
+ case c_next: generate_next(g, p); break;
+ case c_tolimit: generate_tolimit(g, p); break;
+ case c_atlimit: generate_atlimit(g, p); break;
+ case c_leftslice: generate_leftslice(g, p); break;
+ case c_rightslice: generate_rightslice(g, p); break;
+ case c_assignto: generate_assignto(g, p); break;
+ case c_sliceto: generate_sliceto(g, p); break;
+ case c_assign: generate_assignfrom(g, p); break;
+ case c_insert: /* falls through: generate_insert receives the node type */
+ case c_attach: generate_insert(g, p, p->type); break;
+ case c_slicefrom: generate_slicefrom(g, p); break;
+ case c_setlimit: generate_setlimit(g, p); break;
+ case c_dollar: generate_dollar(g, p); break;
+ case c_mathassign: generate_integer_assign(g, p, "="); break;
+ case c_plusassign: generate_integer_assign(g, p, "+="); break;
+ case c_minusassign: generate_integer_assign(g, p, "-="); break;
+ case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
+ case c_divideassign: generate_integer_assign(g, p, "/="); break;
+ case c_eq: generate_integer_test(g, p, "=="); break;
+ case c_ne: generate_integer_test(g, p, "!="); break;
+ case c_gr: generate_integer_test(g, p, ">"); break;
+ case c_ge: generate_integer_test(g, p, ">="); break;
+ case c_ls: generate_integer_test(g, p, "<"); break;
+ case c_le: generate_integer_test(g, p, "<="); break;
+ case c_call: generate_call(g, p); break;
+ case c_grouping: generate_grouping(g, p, false); break;
+ case c_non: generate_grouping(g, p, true); break;
+ case c_name: generate_namedstring(g, p); break;
+ case c_literalstring: generate_literalstring(g, p); break;
+ case c_among: generate_among(g, p); break;
+ case c_substring: generate_substring(g, p); break;
+ case c_booltest: generate_booltest(g, p); break;
+ case c_false: generate_false(g, p); break;
+ case c_true: break; /* nothing to emit */
+ case c_debug: generate_debug(g, p); break;
+ default: fprintf(stderr, "%d encountered\n", p->type);
+ exit(1);
+ }
+
+ g->failure_label = a0;
+ str_delete(g->failure_str);
+ g->failure_str = a1;
+ }
+
+ /* Write the banner comment that opens every generated Java file. */
+ static void generate_start_comment(struct generator * g) {
+
+     w(g, "// This file was generated automatically by the Snowball to Java compiler~N");
+
+     w(g, "~N");
+ }
+
+ /* Write the package line, the import of the among class, the class
+  * comment and the opening of the class declaration, using the package,
+  * among_class and parent_class_name options. */
+ static void generate_class_begin(struct generator * g) {
+
+     struct options * opts = g->options;
+
+     w(g, "package " );
+     w(g, opts->package);
+     w(g, ";~N~N" );
+
+     w(g, "import ");
+     w(g, opts->among_class );
+     w(g, ";~N"
+          "~N"
+          " /**~N"
+          " * This class was automatically generated by a Snowball to Java compiler ~N"
+          " * It implements the stemming algorithm defined by a snowball script.~N"
+          " */~N"
+          "~N"
+          "public class ~n extends ");
+
+     w(g, opts->parent_class_name);
+     w(g, " {~N"
+          "~N"
+          "private static final long serialVersionUID = 1L;~N"
+          "~N"
+          "~+~+~Mprivate final static ~n methodObject = new ~n ();~N"
+          "~N");
+ }
+
+ /* Write the closing brace of the generated class and trailing newlines. */
+ static void generate_class_end(struct generator * g) {
+
+     w(g, "~N}");
+
+     w(g, "~N~N");
+ }
+
+ /* Write the Java equals() and hashCode() methods. Both are expressed in
+  * terms of the class name taken from the 'name' option. */
+ static void generate_equals(struct generator * g) {
+
+     char * class_name = g->options->name;
+
+     w(g, "~N"
+          "~Mpublic boolean equals( Object o ) {~N"
+          "~+~Mreturn o instanceof ");
+     w(g, class_name);
+     w(g, ";~N~-~M}~N"
+          "~N"
+          "~Mpublic int hashCode() {~N"
+          "~+~Mreturn ");
+     w(g, class_name);
+     w(g, ".class.getName().hashCode();~N"
+          "~-~M}~N");
+     w(g, "~N~N");
+ }
+
+ /* Write the static Among array a_<number> for one among: one
+  * 'new Among(...)' entry per literal string, with the name of the
+  * entry's function (or an empty string) as the fourth argument. */
+ static void generate_among_table(struct generator * g, struct among * x) {
+
+     struct amongvec * entry = x->b;
+     int last = x->literalstring_count - 1;
+     int i;
+
+     g->I[0] = x->number;
+     g->I[1] = x->literalstring_count;
+
+     w(g, "~+~+~Mprivate final static Among a_~I0[] = {~N~+");
+
+     for (i = 0; i < x->literalstring_count; i++, entry++) {
+         g->I[0] = i;
+         g->I[1] = entry->i;
+         g->I[2] = entry->result;
+         g->L[0] = entry->b;
+         /* every entry but the last is followed by a comma */
+         g->S[0] = i < last ? "," : "";
+
+         w(g, "~Mnew Among ( ~L0, ~I1, ~I2, \"");
+         if (entry->function != 0) {
+             write_varname(g, entry->function);
+         }
+         w(g, "\", methodObject )~S0~N");
+     }
+
+     w(g, "~-~M};~-~-~N~N");
+ }
+
+ /* Write the table of every among known to the analyser, in list order. */
+ static void generate_amongs(struct generator * g) {
+
+     struct among * x;
+
+     for (x = g->analyser->amongs; x != 0; x = x->next) {
+         generate_among_table(g, x);
+     }
+ }
+
+ /* Set bit i of the bitmap b (8 bits are used per element). */
+ static void set_bit(symbol * b, int i) {
+     b[i / 8] |= 1 << (i % 8);
+ }
+
+ /* Return non-zero when bit i of the bitmap b is set. */
+ static int bit_is_set(symbol * b, int i) {
+     return b[i / 8] & (1 << (i % 8));
+ }
+
+ /* Build a bitmap of the characters in grouping q, relative to its
+  * smallest character, and record in q->no_gaps whether every character
+  * of the range is present. Only a grouping with gaps gets a Java char[]
+  * table written out. */
+ static void generate_grouping_table(struct generator * g, struct grouping * q) {
+
+     int span = q->largest_ch - q->smallest_ch + 1;
+     int nbytes = (span + 7) / 8; /* assume 8 bits per symbol */
+     symbol * chars = q->b;
+     symbol * bitmap = create_b(nbytes);
+     int i;
+
+     for (i = 0; i < nbytes; i++) bitmap[i] = 0;
+
+     /* Using unicode would require revision here */
+
+     for (i = 0; i < SIZE(chars); i++) {
+         set_bit(bitmap, chars[i] - q->smallest_ch);
+     }
+
+     q->no_gaps = true;
+     for (i = 0; i < span; i++) {
+         if (!bit_is_set(bitmap, i)) q->no_gaps = false;
+     }
+
+     if (!q->no_gaps) {
+         g->V[0] = q->name;
+
+         w(g, "~+~+~Mprivate static final char ~V0[] = {");
+         for (i = 0; i < nbytes; i++) {
+             write_int(g, bitmap[i]);
+             if (i < nbytes - 1) w(g, ", ");
+         }
+         w(g, " };~N~-~-~N");
+     }
+     lose_b(bitmap);
+ }
+
+ /* Process every grouping known to the analyser, in list order. */
+ static void generate_groupings(struct generator * g) {
+     struct grouping * q;
+
+     for (q = g->analyser->groupings; q != 0; q = q->next) {
+         generate_grouping_table(g, q);
+     }
+ }
+
+ /* Write a private Java field for each string, integer and boolean name;
+  * names of other types produce nothing. String fields are initialised
+  * with a new instance of the configured string class. */
+ static void generate_members(struct generator * g) {
+
+     struct name * q;
+
+     for (q = g->analyser->names; q != 0; q = q->next) {
+         g->V[0] = q;
+         switch (q->type) {
+             case t_string:
+                 w(g, " private ");
+                 w(g, g->options->string_class );
+                 w(g, " ~W0 = new ");
+                 w(g, g->options->string_class);
+                 w(g, "();~N");
+                 break;
+             case t_integer:
+                 w(g, " private int ~W0;~N");
+                 break;
+             case t_boolean:
+                 w(g, " private boolean ~W0;~N");
+                 break;
+         }
+     }
+     w(g, "~N");
+ }
+
+ /* Write the Java copy_from(other) method: one assignment per string,
+  * integer and boolean field, then a call to super.copy_from(other). */
+ static void generate_copyfrom(struct generator * g) {
+
+     struct name * q = g->analyser->names;
+
+     w(g, "~+~+~Mprivate void copy_from(~n other) {~+~N");
+     while (q != 0) {
+         g->V[0] = q;
+         if (q->type == t_string || q->type == t_integer || q->type == t_boolean) {
+             w(g, "~M~W0 = other.~W0;~N");
+         }
+         q = q->next;
+     }
+     w(g, "~Msuper.copy_from(other);~N");
+     w(g, "~-~M}~-~-~N");
+ }
+
+ /* Generate code for each top-level node of the program, clearing the
+  * unreachable flag after every one. */
+ static void generate_methods(struct generator * g) {
+
+     struct node * p;
+
+     for (p = g->analyser->program; p != 0; p = p->right) {
+         generate(g, p);
+         g->unreachable = false;
+     }
+ }
+
+ /* Entry point of the Java generator: build the whole class in a
+  * temporary output buffer, write that buffer to options->output_java and
+  * release the buffers created here. */
+ extern void generate_program_java(struct generator * g) {
+
+     g->outbuf = str_new();
+     g->failure_str = str_new();
+
+     /* file banner and class header */
+     generate_start_comment(g);
+     generate_class_begin(g);
+
+     /* static tables */
+     generate_amongs(g);
+     generate_groupings(g);
+
+     /* fields and methods */
+     generate_members(g);
+     generate_copyfrom(g);
+     generate_methods(g);
+     generate_equals(g);
+
+     generate_class_end(g);
+
+     output_str(g->options->output_java, g->outbuf);
+     str_delete(g->failure_str);
+     str_delete(g->outbuf);
+ }
+
+ /* Allocate a generator for analyser a and options o. Only the fields
+  * assigned below are initialised here; release the result with
+  * close_generator_java(). */
+ extern struct generator * create_generator_java(struct analyser * a, struct options * o) {
+
+     NEW(generator, g);
+
+     g->analyser = a;
+     g->options = o;
+
+     g->margin = 0;
+     g->debug_count = 0;
+     g->unreachable = false;
+
+     return g;
+ }
+
+ /* Release a generator obtained from create_generator_java(). */
+ extern void close_generator_java(struct generator * g) {
+
+     FREE(g);
+
+ }
+
diff --git a/contrib/snowball/compiler/header.h b/contrib/snowball/compiler/header.h
new file mode 100644
index 000000000..9baf1d917
--- /dev/null
+++ b/contrib/snowball/compiler/header.h
@@ -0,0 +1,324 @@
+
+typedef unsigned char byte;
+typedef unsigned short symbol;
+
+#define true 1
+#define false 0
+#define repeat while(true)
+#define unless(C) if(!(C))
+#define until(C) while(!(C))
+
+#define MALLOC check_malloc
+#define FREE check_free
+
+ /* NEW declares p as a pointer to one freshly allocated 'struct type';
+  * NEWVEC declares p as a pointer to n of them. Allocation goes through
+  * MALLOC. The count n is parenthesised so that an expression argument
+  * such as 'a + b' is multiplied as a whole. */
+ #define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type))
+ #define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n))
+
+#define STARTSIZE 10
+#define SIZE(p) ((int *)(p))[-1]
+#define CAPACITY(p) ((int *)(p))[-2]
+
+extern symbol * create_b(int n);
+extern void report_b(FILE * out, symbol * p);
+extern void lose_b(symbol * p);
+extern symbol * increase_capacity(symbol * p, int n);
+extern symbol * move_to_b(symbol * p, int n, symbol * q);
+extern symbol * add_to_b(symbol * p, int n, symbol * q);
+extern symbol * copy_b(symbol * p);
+extern char * b_to_s(symbol * p);
+extern symbol * add_s_to_b(symbol * p, const char * s);
+
+struct str; /* defined in space.c */
+
+extern struct str * str_new(void);
+extern void str_delete(struct str * str);
+extern void str_append(struct str * str, struct str * add);
+extern void str_append_ch(struct str * str, char add);
+extern void str_append_b(struct str * str, symbol * q);
+extern void str_append_string(struct str * str, const char * s);
+extern void str_append_int(struct str * str, int i);
+extern void str_clear(struct str * str);
+extern void str_assign(struct str * str, char * s);
+extern struct str * str_copy(struct str * old);
+extern symbol * str_data(struct str * str);
+extern int str_len(struct str * str);
+extern int get_utf8(const symbol * p, int * slot);
+extern int put_utf8(int ch, symbol * p);
+
+struct m_pair {
+
+ struct m_pair * next;
+ symbol * name;
+ symbol * value;
+
+};
+
+/* struct input must be a prefix of struct tokeniser. */
+struct input {
+
+ struct input * next;
+ symbol * p;
+ int c;
+ char * file;
+ int line_number;
+
+};
+
+struct include {
+
+ struct include * next;
+ symbol * b;
+
+};
+
+/* struct input must be a prefix of struct tokeniser. */
+struct tokeniser {
+
+ struct input * next;
+ symbol * p;
+ int c;
+ char * file;
+ int line_number;
+ symbol * b;
+ symbol * b2;
+ int number;
+ int m_start;
+ int m_end;
+ struct m_pair * m_pairs;
+ int get_depth;
+ int error_count;
+ int token;
+ int previous_token;
+ byte token_held;
+ byte widechars;
+ byte utf8;
+
+ int omission;
+ struct include * includes;
+
+};
+
+extern symbol * get_input(symbol * p, char ** p_file);
+extern struct tokeniser * create_tokeniser(symbol * b, char * file);
+extern int read_token(struct tokeniser * t);
+extern const char * name_of_token(int code);
+extern void close_tokeniser(struct tokeniser * t);
+
+enum token_codes {
+
+#include "syswords2.h"
+
+ c_mathassign,
+ c_name,
+ c_number,
+ c_literalstring,
+ c_neg,
+ c_call,
+ c_grouping,
+ c_booltest
+};
+
+extern int space_count;
+extern void * check_malloc(int n);
+extern void check_free(void * p);
+
+struct node;
+
+struct name {
+
+ struct name * next;
+ symbol * b;
+ int type; /* t_string etc */
+ int mode; /* )_ for routines, externals */
+ struct node * definition; /* ) */
+ int count; /* 0, 1, 2 for each type */
+ struct grouping * grouping; /* for grouping names */
+ byte referenced;
+ byte used;
+
+};
+
+struct literalstring {
+
+ struct literalstring * next;
+ symbol * b;
+
+};
+
+struct amongvec {
+
+ symbol * b; /* the string giving the case */
+ int size; /* - and its size */
+ struct node * p; /* the corresponding command */
+ int i; /* the amongvec index of the longest substring of b */
+ int result; /* the numeric result for the case */
+ struct name * function;
+
+};
+
+struct among {
+
+ struct among * next;
+ struct amongvec * b; /* pointer to the amongvec */
+ int number; /* amongs are numbered 0, 1, 2 ... */
+ int literalstring_count; /* in this among */
+ int command_count; /* in this among */
+ struct node * starter; /* i.e. among( (starter) 'string' ... ) */
+ struct node * substring; /* i.e. substring ... among ( ... ) */
+};
+
+struct grouping {
+
+ struct grouping * next;
+ int number; /* groupings are numbered 0, 1, 2 ... */
+ symbol * b; /* the characters of this group */
+ int largest_ch; /* character with max code */
+ int smallest_ch; /* character with min code */
+ byte no_gaps; /* not used in generator.c after 11/5/05 */
+ struct name * name; /* so g->name->grouping == g */
+};
+
+struct node {
+
+ struct node * next;
+ struct node * left;
+ struct node * aux; /* used in setlimit */
+ struct among * among; /* used in among */
+ struct node * right;
+ int type;
+ int mode;
+ struct node * AE;
+ struct name * name;
+ symbol * literalstring;
+ int number;
+ int line_number;
+ int amongvar_needed; /* used in routine definitions */
+};
+
+enum name_types {
+
+ t_size = 6,
+
+ t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4,
+ t_grouping = 5
+
+/* If this list is extended, adjust wvn in generator.c */
+};
+
+/* In name_count[i] below, remember that
+ type is
+ ----+----
+ 0 | string
+ 1 | boolean
+ 2 | integer
+ 3 | routine
+ 4 | external
+ 5 | grouping
+*/
+
+struct analyser {
+
+ struct tokeniser * tokeniser;
+ struct node * nodes;
+ struct name * names;
+ struct literalstring * literalstrings;
+ int mode;
+ byte modifyable; /* false inside reverse(...) */
+ struct node * program;
+ struct node * program_end;
+ int name_count[t_size]; /* name_count[i] counts the number of names of type i */
+ struct among * amongs;
+ struct among * amongs_end;
+ int among_count;
+ int amongvar_needed; /* used in reading routine definitions */
+ struct grouping * groupings;
+ struct grouping * groupings_end;
+ struct node * substring; /* pending 'substring' in current routine definition */
+ byte utf8;
+};
+
+enum analyser_modes {
+
+ m_forward = 0, m_backward /*, m_integer */
+
+};
+
+extern void print_program(struct analyser * a);
+extern struct analyser * create_analyser(struct tokeniser * t);
+extern void close_analyser(struct analyser * a);
+
+extern void read_program(struct analyser * a);
+
+struct generator {
+
+ struct analyser * analyser;
+ struct options * options;
+ int unreachable; /* 0 if code can be reached, 1 if current code
+ * is unreachable. */
+ int var_number; /* Number of next variable to use. */
+ struct str * outbuf; /* temporary str to store output */
+ struct str * declarations; /* str storing variable declarations */
+ int next_label;
+ int margin;
+
+ const char * failure_string; /* String to output in case of a failure. */
+#ifndef DISABLE_JAVA
+ struct str * failure_str; /* This is used by the java generator instead of failure_string */
+#endif
+
+ int label_used; /* Keep track of whether the failure label is used. */
+ int failure_label;
+ int debug_count;
+
+ const char * S[10]; /* strings */
+ symbol * B[10]; /* blocks */
+ int I[10]; /* integers */
+ struct name * V[5]; /* variables */
+ symbol * L[5]; /* literals, used in formatted write */
+
+ int line_count; /* counts number of lines output */
+ int line_labelled; /* in ANSI C, will need extra ';' if it is a block end */
+ int literalstring_count;
+ int keep_count; /* used to number keep/restore pairs to avoid compiler warnings
+ about shadowed variables */
+};
+
+struct options {
+
+ /* for the command line: */
+
+ char * output_file;
+ char * name;
+ FILE * output_c;
+ FILE * output_h;
+#ifndef DISABLE_JAVA
+ FILE * output_java;
+#endif
+ byte syntax_tree;
+ byte widechars;
+ enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang;
+ char * externals_prefix;
+ char * variables_prefix;
+ char * runtime_path;
+ char * parent_class_name;
+ char * package;
+ char * string_class;
+ char * among_class;
+ struct include * includes;
+ struct include * includes_end;
+ byte utf8;
+};
+
+/* Generator for C code. */
+extern struct generator * create_generator_c(struct analyser * a, struct options * o);
+extern void close_generator_c(struct generator * g);
+
+extern void generate_program_c(struct generator * g);
+
+#ifndef DISABLE_JAVA
+/* Generator for Java code. */
+extern struct generator * create_generator_java(struct analyser * a, struct options * o);
+extern void close_generator_java(struct generator * g);
+
+extern void generate_program_java(struct generator * g);
+#endif
diff --git a/contrib/snowball/compiler/space.c b/contrib/snowball/compiler/space.c
new file mode 100644
index 000000000..cd5fd863d
--- /dev/null
+++ b/contrib/snowball/compiler/space.c
@@ -0,0 +1,263 @@
+
+#include <stdio.h> /* for printf */
+#include <stdlib.h> /* malloc, free */
+#include <string.h> /* memmove */
+
+#include "header.h"
+
+#define HEAD 2*sizeof(int)
+#define EXTENDER 40
+
+
+ /* This module provides a simple mechanism for arbitrary length writable
+ strings, called 'blocks'. They are 'symbol *' items rather than 'char *'
+ items however.
+
+ The calls are:
+
+ symbol * b = create_b(n);
+ - create an empty block b with room for n symbols
+ b = increase_capacity(b, n);
+ - increase the capacity of block b by n symbols (b may change)
+ b2 = copy_b(b)
+ - copy block b into b2
+ lose_b(b);
+ - lose block b
+ b = move_to_b(b, n, p);
+ - set the data in b to be the n symbols at address p
+ b = add_to_b(b, n, p);
+ - add the n symbols at address p to the end of the data in b
+ SIZE(b)
+ - is the number of symbols in b
+ For example:
+
+ symbol * b = create_b(0);
+ { int i;
+ char p[10];
+ for (i = 0; i < 100; i++) {
+ sprintf(p, " %d", i);
+ b = add_s_to_b(b, p);
+ }
+ }
+
+ and b contains " 0 1 2 ... 99" spaced out as symbols.
+*/
+
+/* For a block b, SIZE(b) is the number of symbols so far written into it,
+ CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b).
+ In fact blocks have 1 extra character over the promised capacity so
+ they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of
+ overwriting.
+*/
+
+ /* Allocate an empty block with room for n symbols plus one spare symbol.
+  * The returned pointer sits HEAD bytes into the allocation; the capacity
+  * and size are kept in the header in front of it. */
+ extern symbol * create_b(int n) {
+     char * raw = (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol));
+     symbol * block = (symbol *) (HEAD + raw);
+
+     CAPACITY(block) = n;
+     SIZE(block) = 0;
+     return block;
+ }
+
+ /* Print the symbols of block p to out, one "%c" conversion per symbol. */
+ extern void report_b(FILE * out, symbol * p) {
+     int count = SIZE(p);
+     int i = 0;
+
+     while (i < count) {
+         fprintf(out, "%c", p[i]);
+         i++;
+     }
+ }
+
+ /* Free block p, including its header. A null p is ignored. */
+ extern void lose_b(symbol * p) {
+     if (p != 0) {
+         FREE((char *) p - HEAD);
+     }
+ }
+
+ /* Return a replacement for block p whose capacity is larger by
+  * n + EXTENDER symbols. The old contents and size are carried over and
+  * the old block is freed, so callers must use the returned pointer. */
+ extern symbol * increase_capacity(symbol * p, int n) {
+     int old_capacity = CAPACITY(p);
+     symbol * bigger = create_b(old_capacity + n + EXTENDER);
+
+     memmove(bigger, p, old_capacity * sizeof(symbol));
+     SIZE(bigger) = SIZE(p);
+     lose_b(p);
+     return bigger;
+ }
+
+ /* Replace the contents of block p with the n symbols at q, growing the
+  * block first when n exceeds its capacity. Returns the (possibly new)
+  * block. */
+ extern symbol * move_to_b(symbol * p, int n, symbol * q) {
+     int shortfall = n - CAPACITY(p);
+
+     if (shortfall > 0) {
+         p = increase_capacity(p, shortfall);
+     }
+     memmove(p, q, n * sizeof(symbol));
+     SIZE(p) = n;
+     return p;
+ }
+
+ /* Append the n symbols at q to block p, growing the block first when
+  * they do not fit. Returns the (possibly new) block. */
+ extern symbol * add_to_b(symbol * p, int n, symbol * q) {
+     int shortfall = SIZE(p) + n - CAPACITY(p);
+
+     if (shortfall > 0) {
+         p = increase_capacity(p, shortfall);
+     }
+     memmove(p + SIZE(p), q, n * sizeof(symbol));
+     SIZE(p) += n;
+     return p;
+ }
+
+ /* Return a new block holding a copy of the symbols of block p. */
+ extern symbol * copy_b(symbol * p) {
+     int count = SIZE(p);
+     symbol * duplicate = create_b(count);
+
+     /* the new block already has capacity 'count', so move_to_b does not
+      * need to grow it */
+     move_to_b(duplicate, count, p);
+     return duplicate;
+ }
+
+int space_count = 0;
+
+ /* Allocate n bytes and count the allocation in space_count.
+  * Callers (the NEW macros, create_b) use the result without testing it,
+  * so running out of memory is reported here and ends the program rather
+  * than being left to surface as a null pointer dereference. */
+ extern void * check_malloc(int n) {
+     void * p = malloc(n);
+
+     /* malloc(0) may legitimately return a null pointer */
+     if (p == 0 && n != 0) {
+         fprintf(stderr, "out of memory: failed to allocate %d bytes\n", n);
+         exit(1);
+     }
+     space_count++;
+     return p;
+ }
+
+ /* Release memory obtained from check_malloc and lower space_count. */
+ extern void check_free(void * p) {
+     --space_count;
+     free(p);
+ }
+
+ /* To convert a block to a zero terminated string.
+  * The result is obtained from malloc(). A symbol above 255 cannot be
+  * stored in a char: it is reported and the program exits with status 1.
+  * Failure to allocate the result is treated the same way instead of
+  * writing through a null pointer. */
+
+ extern char * b_to_s(symbol * p) {
+     int n = SIZE(p);
+     char * s = (char *)malloc(n + 1);
+     if (s == 0) {
+         fprintf(stderr, "In b_to_s, out of memory\n");
+         exit(1);
+     }
+     {
+         int i;
+         for (i = 0; i < n; i++) {
+             if (p[i] > 255) {
+                 printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
+                 exit(1);
+             }
+             s[i] = (char)p[i];
+         }
+     }
+     s[n] = 0;
+     return s;
+ }
+
+ /* To add a zero terminated string to a block. If p = 0 the
+  * block is created. Returns the (possibly new) block.
+  * Each char is converted through unsigned char, so that a byte of 0x80
+  * or above becomes the symbol 0x80..0xFF even where plain char is signed
+  * (a direct conversion would sign-extend it to 0xFF80..0xFFFF). */
+
+ extern symbol * add_s_to_b(symbol * p, const char * s) {
+     int n = (int) strlen(s);
+     int k;
+     if (p == 0) p = create_b(n);
+     k = SIZE(p);
+     {
+         int x = k + n - CAPACITY(p);
+         if (x > 0) p = increase_capacity(p, x);
+     }
+     {
+         int i;
+         for (i = 0; i < n; i++) p[i + k] = (unsigned char) s[i];
+     }
+     SIZE(p) += n;
+     return p;
+ }
+
+/* The next section defines string handling capabilities in terms
+ of the lower level block handling capabilities of space.c */
+/* -------------------------------------------------------------*/
+
+ struct str {
+ symbol * data; /* block holding the string's symbols; created by str_new, freed by str_delete */
+ };
+
+ /* Create a new, empty string. Release it with str_delete().
+  * Failure to allocate is reported and ends the program, because the
+  * result is used immediately. */
+ extern struct str * str_new(void) {
+
+     struct str * output = (struct str *) malloc(sizeof(struct str));
+     if (output == 0) {
+         fprintf(stderr, "In str_new, out of memory\n");
+         exit(1);
+     }
+     output->data = create_b(0);
+     return output;
+ }
+
+ /* Destroy a string: free its data block, then the str itself. */
+ extern void str_delete(struct str * str) {
+
+     symbol * block = str->data;
+
+     lose_b(block);
+     free(str);
+ }
+
+ /* Append the contents of 'add' to the end of 'str'. */
+ extern void str_append(struct str * str, struct str * add) {
+
+     str->data = add_to_b(str->data, SIZE(add->data), add->data);
+ }
+
+ /* Append a character to this str.
+  * The char is converted through unsigned char so that a value of 0x80 or
+  * above is stored as the symbol 0x80..0xFF rather than being
+  * sign-extended where plain char is signed. */
+ extern void str_append_ch(struct str * str, char add) {
+
+     symbol q[1];
+     q[0] = (unsigned char) add;
+     str->data = add_to_b(str->data, 1, q);
+ }
+
+ /* Append all the symbols of the low level block q to 'str'. */
+ extern void str_append_b(struct str * str, symbol * q) {
+
+     int count = SIZE(q);
+
+     str->data = add_to_b(str->data, count, q);
+ }
+
+ /* Append a null terminated (char *) string to a str. */
+ extern void str_append_string(struct str * str, const char * s) {
+
+     symbol * extended = add_s_to_b(str->data, s);
+
+     str->data = extended;
+ }
+
+ /* Append the decimal text of the integer i to a str. */
+ extern void str_append_int(struct str * str, int i) {
+
+     char digits[30];
+
+     sprintf(digits, "%d", i);
+     str_append_string(str, digits);
+ }
+
+ /* Make a string empty by resetting the size of its data block; the
+  * block itself is kept. */
+ extern void str_clear(struct str * str) {
+
+     symbol * block = str->data;
+
+     SIZE(block) = 0;
+ }
+
+ /* Replace the contents of a string with the null terminated string s. */
+ extern void str_assign(struct str * str, char * s) {
+
+     SIZE(str->data) = 0;
+     str->data = add_s_to_b(str->data, s);
+ }
+
+ /* Return a new string with the same contents as 'old'. The caller owns
+  * the result and releases it with str_delete(). */
+ extern struct str * str_copy(struct str * old) {
+
+     struct str * duplicate = str_new();
+
+     str_append(duplicate, old);
+     return duplicate;
+ }
+
+ /* Return the data block of this str. The pointer is the str's own
+  * block, not a copy. */
+ extern symbol * str_data(struct str * str) {
+
+     symbol * block = str->data;
+
+     return block;
+ }
+
+ /* Return the number of symbols held by the str. */
+ extern int str_len(struct str * str) {
+
+     symbol * block = str->data;
+
+     return SIZE(block);
+ }
+
+ /* Decode one UTF-8 sequence of up to three symbols starting at p.
+  * The decoded value is stored in *slot and the number of symbols
+  * consumed (1, 2 or 3) is returned. A lead value below 0xC0 is taken as
+  * a one-symbol sequence and one below 0xE0 as a two-symbol sequence;
+  * continuation symbols are not validated. */
+ extern int get_utf8(const symbol * p, int * slot) {
+     int lead = p[0];
+     int second;
+
+     if (lead < 0xC0) { /* 1100 0000 */
+         *slot = lead;
+         return 1;
+     }
+
+     second = p[1];
+     if (lead < 0xE0) { /* 1110 0000 */
+         *slot = (lead & 0x1F) << 6 | (second & 0x3F);
+         return 2;
+     }
+
+     *slot = (lead & 0xF) << 12 | (second & 0x3F) << 6 | (p[2] & 0x3F);
+     return 3;
+ }
+
+ /* Encode ch as UTF-8 into p, one byte value per symbol, and return the
+  * number of symbols written (1, 2 or 3). Values of 0x800 and above are
+  * always written in the three-symbol form. */
+ extern int put_utf8(int ch, symbol * p) {
+     int written;
+
+     if (ch < 0x80) {
+         p[0] = ch;
+         written = 1;
+     } else if (ch < 0x800) {
+         p[0] = (ch >> 6) | 0xC0;
+         p[1] = (ch & 0x3F) | 0x80;
+         written = 2;
+     } else {
+         p[0] = (ch >> 12) | 0xE0;
+         p[1] = ((ch >> 6) & 0x3F) | 0x80;
+         p[2] = (ch & 0x3F) | 0x80;
+         written = 3;
+     }
+     return written;
+ }
+
diff --git a/contrib/snowball/compiler/syswords.h b/contrib/snowball/compiler/syswords.h
new file mode 100644
index 000000000..917dd5751
--- /dev/null
+++ b/contrib/snowball/compiler/syswords.h
@@ -0,0 +1,84 @@
+/* Table of the system words: { length, text, token code }.
+ * Entry 0 is a header rather than a word: its code field holds the number
+ * of entries (80+1), which find_word() in tokeniser.c uses as the upper
+ * bound of its binary search. The remaining entries must stay ordered by
+ * length first and then by character value, because that is the order
+ * compare_words() tests in. */
+static const struct system_word vocab[80+1] = {
+ { 0, (const byte *)"", 80+1},
+
+ { 1, (const byte *)"$", c_dollar },
+ { 1, (const byte *)"(", c_bra },
+ { 1, (const byte *)")", c_ket },
+ { 1, (const byte *)"*", c_multiply },
+ { 1, (const byte *)"+", c_plus },
+ { 1, (const byte *)"-", c_minus },
+ { 1, (const byte *)"/", c_divide },
+ { 1, (const byte *)"<", c_ls },
+ { 1, (const byte *)"=", c_assign },
+ { 1, (const byte *)">", c_gr },
+ { 1, (const byte *)"?", c_debug },
+ { 1, (const byte *)"[", c_leftslice },
+ { 1, (const byte *)"]", c_rightslice },
+ { 2, (const byte *)"!=", c_ne },
+ { 2, (const byte *)"*=", c_multiplyassign },
+ { 2, (const byte *)"+=", c_plusassign },
+ { 2, (const byte *)"-=", c_minusassign },
+ { 2, (const byte *)"->", c_sliceto },
+ { 2, (const byte *)"/*", c_comment2 },
+ { 2, (const byte *)"//", c_comment1 },
+ { 2, (const byte *)"/=", c_divideassign },
+ { 2, (const byte *)"<+", c_insert },
+ { 2, (const byte *)"<-", c_slicefrom },
+ { 2, (const byte *)"<=", c_le },
+ { 2, (const byte *)"==", c_eq },
+ { 2, (const byte *)"=>", c_assignto },
+ { 2, (const byte *)">=", c_ge },
+ { 2, (const byte *)"as", c_as },
+ { 2, (const byte *)"do", c_do },
+ { 2, (const byte *)"or", c_or },
+ { 3, (const byte *)"and", c_and },
+ { 3, (const byte *)"for", c_for },
+ { 3, (const byte *)"get", c_get },
+ { 3, (const byte *)"hex", c_hex },
+ { 3, (const byte *)"hop", c_hop },
+ { 3, (const byte *)"non", c_non },
+ { 3, (const byte *)"not", c_not },
+ { 3, (const byte *)"set", c_set },
+ { 3, (const byte *)"try", c_try },
+ { 4, (const byte *)"fail", c_fail },
+ { 4, (const byte *)"goto", c_goto },
+ { 4, (const byte *)"loop", c_loop },
+ { 4, (const byte *)"next", c_next },
+ { 4, (const byte *)"size", c_size },
+ { 4, (const byte *)"test", c_test },
+ { 4, (const byte *)"true", c_true },
+ { 5, (const byte *)"among", c_among },
+ { 5, (const byte *)"false", c_false },
+ { 5, (const byte *)"limit", c_limit },
+ { 5, (const byte *)"unset", c_unset },
+ { 6, (const byte *)"atmark", c_atmark },
+ { 6, (const byte *)"attach", c_attach },
+ { 6, (const byte *)"cursor", c_cursor },
+ { 6, (const byte *)"define", c_define },
+ { 6, (const byte *)"delete", c_delete },
+ { 6, (const byte *)"gopast", c_gopast },
+ { 6, (const byte *)"insert", c_insert },
+ { 6, (const byte *)"maxint", c_maxint },
+ { 6, (const byte *)"minint", c_minint },
+ { 6, (const byte *)"repeat", c_repeat },
+ { 6, (const byte *)"sizeof", c_sizeof },
+ { 6, (const byte *)"tomark", c_tomark },
+ { 7, (const byte *)"atleast", c_atleast },
+ { 7, (const byte *)"atlimit", c_atlimit },
+ { 7, (const byte *)"decimal", c_decimal },
+ { 7, (const byte *)"reverse", c_reverse },
+ { 7, (const byte *)"setmark", c_setmark },
+ { 7, (const byte *)"strings", c_strings },
+ { 7, (const byte *)"tolimit", c_tolimit },
+ { 8, (const byte *)"booleans", c_booleans },
+ { 8, (const byte *)"integers", c_integers },
+ { 8, (const byte *)"routines", c_routines },
+ { 8, (const byte *)"setlimit", c_setlimit },
+ { 9, (const byte *)"backwards", c_backwards },
+ { 9, (const byte *)"externals", c_externals },
+ { 9, (const byte *)"groupings", c_groupings },
+ { 9, (const byte *)"stringdef", c_stringdef },
+ { 9, (const byte *)"substring", c_substring },
+ { 12, (const byte *)"backwardmode", c_backwardmode },
+ { 13, (const byte *)"stringescapes", c_stringescapes }
+};
diff --git a/contrib/snowball/compiler/syswords2.h b/contrib/snowball/compiler/syswords2.h
new file mode 100644
index 000000000..eb8a91241
--- /dev/null
+++ b/contrib/snowball/compiler/syswords2.h
@@ -0,0 +1,13 @@
+ c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast,
+ c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards,
+ c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug,
+ c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
+ c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get,
+ c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert,
+ c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls,
+ c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
+ c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus,
+ c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
+ c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,
+ c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring,
+ c_test, c_tolimit, c_tomark, c_true, c_try, c_unset,
diff --git a/contrib/snowball/compiler/tokeniser.c b/contrib/snowball/compiler/tokeniser.c
new file mode 100644
index 000000000..3dae5f744
--- /dev/null
+++ b/contrib/snowball/compiler/tokeniser.c
@@ -0,0 +1,470 @@
+
+#include <stdio.h> /* stderr etc */
+#include <stdlib.h> /* malloc free */
+#include <string.h> /* strlen */
+#include <ctype.h> /* isalpha etc */
+#include "header.h"
+
+/* One entry of the vocab table included below from syswords.h. */
+struct system_word {
+ int s_size; /* size of system word */
+ const byte * s; /* pointer to the system word */
+ int code; /* its internal code */
+};
+
+
+/* ASCII collating assumed in syswords.h */
+
+#include "syswords.h"
+
+/* Return the lesser of a and b. */
+static int smaller(int a, int b) {
+    if (a < b) return a;
+    return b;
+}
+
+/* Read the whole of the file named by the block p into a new block.
+ * Returns 0 if the file cannot be opened. Otherwise the file name, as a
+ * C string obtained from b_to_s(), is handed back through *p_file and
+ * the block holding the file's characters is returned. */
+extern symbol * get_input(symbol * p, char ** p_file) {
+    char * path = b_to_s(p);
+    FILE * input = fopen(path, "r");
+    symbol * u;
+    int size = 0;
+    int ch;
+
+    if (input == 0) {
+        free(path);
+        return 0;
+    }
+    *p_file = path;
+
+    u = create_b(STARTSIZE);
+    while ((ch = getc(input)) != EOF) {
+        /* grow the block before storing once it is full */
+        if (size >= CAPACITY(u)) u = increase_capacity(u, size/2);
+        u[size++] = ch;
+    }
+    fclose(input);
+    SIZE(u) = size;
+    return u;
+}
+
+/* Report a tokeniser error on stderr in the form
+ *     file:line: <s1><first n symbols of p><s2>
+ * Any of s1, p and s2 may be null, in which case that part is left out.
+ * If 20 errors have already been counted, "... etc" is printed instead
+ * and the program exits. */
+static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
+    if (t->error_count == 20) {
+        fprintf(stderr, "... etc\n");
+        exit(1);
+    }
+    fprintf(stderr, "%s:%d: ", t->file, t->line_number);
+    if (s1 != 0) fprintf(stderr, "%s", s1);
+    if (p != 0) {
+        int k = 0;
+        while (k < n) {
+            fprintf(stderr, "%c", p[k]);
+            k++;
+        }
+    }
+    if (s2 != 0) fprintf(stderr, "%s", s2);
+    fprintf(stderr, "\n");
+    t->error_count++;
+}
+
+/* Report the plain message s as an error at the current file and line. */
+static void error1(struct tokeniser * t, char * s) {
+ error(t, s, 0,0, 0);
+}
+
+/* Report that the text ended unexpectedly; s names what it ended after. */
+static void error2(struct tokeniser * t, char * s) {
+ error(t, "unexpected end of text after ", 0,0, s);
+}
+
+/* Order the word p (m symbols) against the word q (n bytes): shorter
+ * words come first, and words of equal length are ordered by their first
+ * differing character. Returns a negative, zero or positive value. */
+static int compare_words(int m, symbol * p, int n, const byte * q) {
+    int k;
+    if (m != n) return m - n;
+    for (k = 0; k < n; k++) {
+        if (p[k] != q[k]) return p[k] - q[k];
+    }
+    return 0;
+}
+
+/* Binary search of the vocab table for the n-symbol word at p.
+ * Returns the word's token code, or -1 if it is not a system word.
+ * vocab[0] is the table header; its code field gives the upper bound. */
+static int find_word(int n, symbol * p) {
+    int lo = 0;
+    int hi = vocab->code;
+    do {
+        int mid = lo + (hi - lo)/2;
+        const struct system_word * w = &vocab[mid];
+        int diff = compare_words(n, p, w->s_size, w->s);
+        if (diff == 0) return w->code;
+        if (diff < 0) hi = mid; else lo = mid;
+    } while (hi - lo != 1);
+    return -1;
+}
+
+/* Return the value of the n decimal digits starting at p. */
+static int get_number(int n, symbol * p) {
+    int value = 0;
+    int k = 0;
+    while (k < n) {
+        value = 10*value + p[k] - '0';
+        k++;
+    }
+    return value;
+}
+
+/* If the input at the current position starts with the string s, step
+ * over it and return true; otherwise leave the position alone and
+ * return false. */
+static int eq_s(struct tokeniser * t, char * s) {
+    int len = strlen(s);
+    symbol * at = t->p + t->c;
+    int k;
+    if (SIZE(t->p) - t->c < len) return false;
+    for (k = 0; k < len; k++) {
+        if (at[k] != s[k]) return false;
+    }
+    t->c += len;
+    return true;
+}
+
+/* Return true if ch is a newline, carriage return, tab or space.
+ * A newline also advances the tokeniser's line count. */
+static int white_space(struct tokeniser * t, int ch) {
+    if (ch == '\n') {
+        t->line_number++;
+        return true;
+    }
+    if (ch == '\r' || ch == '\t' || ch == ' ') return true;
+    return false;
+}
+
+/* Look up the n-symbol name at p in the list of string macros.
+ * Returns the macro's value block, or 0 if no macro has that name. */
+static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
+    struct m_pair * q;
+    for (q = t->m_pairs; q != 0; q = q->next) {
+        symbol * name = q->name;
+        if (n != SIZE(name)) continue;
+        if (memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
+    }
+    return 0;
+}
+
+/* Read the body of a literal string into t->b, starting at index c of the
+ * input (next_token() passes the index just after the opening quote).
+ * Text between the t->m_start and t->m_end characters is taken as the
+ * name of a string macro and replaced by the macro's value.
+ * Returns the index just after the closing quote, or the index reached
+ * when an error was reported. */
+static int read_literal_string(struct tokeniser * t, int c) {
+ symbol * p = t->p;
+ int ch;
+ SIZE(t->b) = 0;
+ repeat {
+ if (c >= SIZE(p)) { error2(t, "'"); return c; }
+ ch = p[c];
+ /* a newline outside an insert ends the string in error */
+ if (ch == '\n') { error1(t, "string not terminated"); return c; }
+ c++;
+ if (ch == t->m_start) {
+ /* start of an insert: scan up to the matching t->m_end */
+ int c0 = c;
+ int newlines = false; /* no newlines as yet */
+ int black_found = false; /* no printing chars as yet */
+ repeat {
+ if (c >= SIZE(p)) { error2(t, "'"); return c; }
+ ch = p[c]; c++;
+ if (ch == t->m_end) break;
+ unless (white_space(t, ch)) black_found = true;
+ if (ch == '\n') newlines = true;
+ /* an insert may span lines only if it holds nothing but white space */
+ if (newlines && black_found) {
+ error1(t, "string not terminated");
+ return c;
+ }
+ }
+ /* an insert containing a newline contributes nothing to the string */
+ unless (newlines) {
+ int n = c - c0 - 1; /* macro size */
+ int firstch = p[c0];
+ symbol * q = find_in_m(t, n, p + c0);
+ if (q == 0) {
+ /* not a declared macro: a lone quote or a lone start character
+ stands for itself */
+ if (n == 1 && (firstch == '\'' || firstch == t->m_start))
+ t->b = add_to_b(t->b, 1, p + c0);
+ else
+ error(t, "string macro '", n, p + c0, "' undeclared");
+ } else
+ t->b = add_to_b(t->b, SIZE(q), q);
+ }
+ } else {
+ /* ordinary character: a quote ends the string, anything else is kept */
+ if (ch == '\'') return c;
+ t->b = add_to_b(t->b, 1, p + c - 1);
+ }
+ }
+}
+
+/* Scan the next raw token from the input, starting at t->c.
+ * Returns the token's code with t->c moved past it, or -1 at the end of
+ * the input. A name leaves its text in t->b, a number leaves its value in
+ * t->number, and a literal string leaves its contents in t->b.
+ * A character that starts no token is reported and skipped. */
+static int next_token(struct tokeniser * t) {
+ symbol * p = t->p;
+ int c = t->c;
+ int ch;
+ int code = -1;
+ repeat {
+ if (c >= SIZE(p)) { t->c = c; return -1; }
+ ch = p[c];
+ if (white_space(t, ch)) { c++; continue; }
+ if (isalpha(ch)) {
+ /* a word: either a system word or a name */
+ int c0 = c;
+ while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
+ code = find_word(c - c0, p + c0);
+ if (code < 0) {
+ t->b = move_to_b(t->b, c - c0, p + c0);
+ code = c_name;
+ }
+ } else
+ if (isdigit(ch)) {
+ int c0 = c;
+ while (c < SIZE(p) && isdigit(p[c])) c++;
+ t->number = get_number(c - c0, p + c0);
+ code = c_number;
+ } else
+ if (ch == '\'') {
+ c = read_literal_string(t, c + 1);
+ code = c_literalstring;
+ } else
+ {
+ /* a symbol: try the two character form first, then one character */
+ int lim = smaller(2, SIZE(p) - c);
+ int i;
+ for (i = lim; i > 0; i--) {
+ code = find_word(i, p + c);
+ if (code >= 0) { c += i; break; }
+ }
+ }
+ if (code >= 0) {
+ t->c = c;
+ return code;
+ }
+ /* nothing matched: report the character and carry on after it */
+ error(t, "'", 1, p + c, "' unknown");
+ c++;
+ continue;
+ }
+}
+
+/* Return the next character of the input and step past it, or return -1
+ * without moving when the input is exhausted. */
+static int next_char(struct tokeniser * t) {
+    int ch = -1;
+    if (t->c < SIZE(t->p)) {
+        ch = t->p[t->c];
+        t->c++;
+    }
+    return ch;
+}
+
+/* Return the next character that is not white space, or -1 if the input
+ * runs out first. */
+static int next_real_char(struct tokeniser * t) {
+    int ch;
+    do {
+        ch = next_char(t);
+    } while (white_space(t, ch));
+    return ch;
+}
+
+/* Read the run of non white space characters that follows (the name given
+ * to stringdef) into t->b2. Leading white space is skipped. If the text
+ * ends before any such character is found an error is reported and t->b2
+ * is left as it was. */
+static void read_chars(struct tokeniser * t) {
+    int ch = next_real_char(t);
+    if (ch < 0) { error2(t, "stringdef"); return; }
+    {
+        int c0 = t->c - 1;   /* index of the first character of the run */
+        int end;             /* index just past the last character of the run */
+        for (;;) {
+            /* Note the position before reading: next_char() steps over a
+             * white space terminator but does not move at the end of the
+             * text, so t->c alone cannot tell where the run stopped. */
+            end = t->c;
+            ch = next_char(t);
+            if (ch < 0 || white_space(t, ch)) break;
+        }
+        t->b2 = move_to_b(t->b2, end - c0, t->p + c0);
+    }
+}
+
+/* Return the value of the decimal digit ch, or -1 if ch is not a digit. */
+static int decimal_to_num(int ch) {
+    if (ch < '0' || ch > '9') return -1;
+    return ch - '0';
+}
+
+/* Return the value of the hexadecimal digit ch (0-9 or lower case a-f),
+ * or -1 for any other character. */
+static int hex_to_num(int ch) {
+    if (ch >= '0' && ch <= '9') return ch - '0';
+    if (ch >= 'a' && ch <= 'f') return 10 + (ch - 'a');
+    return -1;
+}
+
+/* Convert a string of space separated numbers, held in the block p, into
+ * the characters those numbers stand for. The conversion is done in place
+ * and SIZE(p) is set to the new length. A base of 10 selects decimal; any
+ * other value is read as hexadecimal (either case of a-f is accepted).
+ * Values may be at most 0xffff when t->widechars or t->utf8 is set and at
+ * most 0xff otherwise; with t->utf8 set each value is stored in UTF-8 form.
+ * If a number is malformed or too large an error is reported and the
+ * function returns at once, leaving SIZE(p) unchanged. */
+static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
+    /* largest character value the current output mode can hold */
+    int limit = (t->widechars || t->utf8) ? 0xffff : 0xff;
+    int c = 0;   /* read position */
+    int d = 0;   /* write position */
+    for (;;) {
+        while (c < SIZE(p) && p[c] == ' ') c++;
+        if (c == SIZE(p)) break;
+        {
+            int number = 0;
+            /* The bound is tested before p[c] is read, so nothing past the
+             * end of the string is ever examined. */
+            while (c < SIZE(p) && p[c] != ' ') {
+                int ch = p[c];
+                if (base == 10) {
+                    ch = decimal_to_num(ch);
+                    if (ch < 0) {
+                        error1(t, "decimal string contains non-digits");
+                        return;
+                    }
+                } else {
+                    ch = hex_to_num(tolower(ch));
+                    if (ch < 0) {
+                        error1(t, "hex string contains non-hex characters");
+                        return;
+                    }
+                }
+                number = base * number + ch;
+                /* Hold an oversized value just above the limit: the range
+                 * check below still rejects it, but a long run of digits
+                 * can no longer overflow the int. */
+                if (number > limit) number = limit + 1;
+                c++;
+            }
+            if (number > limit) {
+                if (t->widechars || t->utf8)
+                    error1(t, "character values exceed 64K");
+                else
+                    error1(t, "character values exceed 256");
+                return;
+            }
+            if (t->utf8)
+                d += put_utf8(number, p + d);
+            else
+                p[d++] = number;
+        }
+    }
+    SIZE(p) = d;
+}
+
+/* Return the code of the next token the analyser should see.
+ * A token pushed back earlier (t->token_held) is returned again first.
+ * Otherwise raw tokens are taken from next_token(), and the ones that are
+ * purely the tokeniser's business are dealt with here and never returned:
+ * comments, stringescapes, stringdef and get. The end of a file opened by
+ * get resumes the file that included it. Any other code is recorded in
+ * t->token (the one before it in t->previous_token) and returned; -1 is
+ * returned at the end of the outermost text. */
+extern int read_token(struct tokeniser * t) {
+ symbol * p = t->p;
+ int held = t->token_held;
+ t->token_held = false;
+ if (held) return t->token;
+ repeat {
+ int code = next_token(t);
+ switch (code) {
+ case c_comment1: /* slash-slash comment */
+ /* skip to, but not over, the end of the line */
+ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
+ continue;
+ case c_comment2: /* slash-star comment */
+ repeat {
+ if (t->c >= SIZE(p)) {
+ error1(t, "/* comment not terminated");
+ t->token = -1;
+ return -1;
+ }
+ /* keep the line count right while skipping */
+ if (p[t->c] == '\n') t->line_number++;
+ if (eq_s(t, "*/")) break;
+ t->c++;
+ }
+ continue;
+ case c_stringescapes:
+ {
+ /* the next two non white space characters become the start and
+ end markers of inserts in literal strings */
+ int ch1 = next_real_char(t);
+ int ch2 = next_real_char(t);
+ if (ch2 < 0)
+ { error2(t, "stringescapes"); continue; }
+ if (ch1 == '\'')
+ { error1(t, "first stringescape cannot be '"); continue; }
+ t->m_start = ch1;
+ t->m_end = ch2;
+ }
+ continue;
+ case c_stringdef:
+ {
+ /* stringdef <name> [hex|decimal] '<string>': the name goes to
+ t->b2, the string to t->b */
+ int base = 0;
+ read_chars(t);
+ code = read_token(t);
+ if (code == c_hex) { base = 16; code = read_token(t); } else
+ if (code == c_decimal) { base = 10; code = read_token(t); }
+ unless (code == c_literalstring)
+ { error1(t, "string omitted after stringdef"); continue; }
+ if (base > 0) convert_numeric_string(t, t->b, base);
+ /* add the new macro at the head of the list */
+ { NEW(m_pair, q);
+ q->next = t->m_pairs;
+ q->name = copy_b(t->b2);
+ q->value = copy_b(t->b);
+ t->m_pairs = q;
+ }
+ }
+ continue;
+ case c_get:
+ code = read_token(t);
+ unless (code == c_literalstring) {
+ error1(t, "string omitted after get"); continue;
+ }
+ t->get_depth++;
+ if (t->get_depth > 10) {
+ fprintf(stderr, "get directives go 10 deep. Looping?\n");
+ exit(1);
+ }
+ {
+ char * file;
+ NEW(input, q);
+ /* try the name as given, then with each include prefix in turn */
+ symbol * u = get_input(t->b, &file);
+ if (u == 0) {
+ struct include * r = t->includes;
+ until (r == 0) {
+ symbol * b = copy_b(r->b);
+ b = add_to_b(b, SIZE(t->b), t->b);
+ u = get_input(b, &file);
+ lose_b(b);
+ unless (u == 0) break;
+ r = r->next;
+ }
+ }
+ if (u == 0) {
+ error(t, "Can't get '", SIZE(t->b), t->b, "'");
+ exit(1);
+ }
+ /* save the current input state in q, then switch to the new file */
+ memmove(q, t, sizeof(struct input));
+ t->next = q;
+ t->p = u;
+ t->c = 0;
+ t->file = file;
+ t->line_number = 1;
+ }
+ p = t->p;
+ continue;
+ case -1:
+ /* end of a file opened by get: go back to the saved input */
+ unless (t->next == 0) {
+ lose_b(p);
+ {
+ struct input * q = t->next;
+ memmove(t, q, sizeof(struct input)); p = t->p;
+ FREE(q);
+ }
+ t->get_depth--;
+ continue;
+ }
+ /* drop through */
+ default:
+ t->previous_token = t->token;
+ t->token = code;
+ return code;
+ }
+ }
+}
+
+/* Return a printable name for a token code: the text of the system word
+ * when the code belongs to one, a fixed description for the other codes
+ * known here, and "?" for anything else. */
+extern const char * name_of_token(int code) {
+    const struct system_word * w = vocab + 1;            /* skip the header entry */
+    const struct system_word * end = vocab + vocab->code;
+    for (; w < end; w++) {
+        if (w->code == code) return (const char *)w->s;
+    }
+    switch (code) {
+        case c_mathassign: return "=";
+        case c_name: return "name";
+        case c_number: return "number";
+        case c_literalstring:return "literal";
+        case c_neg: return "neg";
+        case c_grouping: return "grouping";
+        case c_call: return "call";
+        case c_booltest: return "Boolean test";
+        case -2: return "start of text";
+        case -1: return "end of text";
+        default: return "?";
+    }
+}
+
+/* Create a tokeniser that reads the text in the block p, which came from
+ * the file named by file. Both are stored in the tokeniser as given. */
+extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
+    NEW(tokeniser, t);
+
+    /* the input being read; no saved inputs yet */
+    t->next = 0;
+    t->p = p;
+    t->c = 0;
+    t->file = file;
+    t->line_number = 1;
+
+    /* working blocks, initially empty */
+    t->b = create_b(0);
+    t->b2 = create_b(0);
+
+    /* no string escape character and no string macros so far */
+    t->m_start = -1;
+    t->m_pairs = 0;
+
+    t->get_depth = 0;
+    t->error_count = 0;
+
+    /* -2 is the code name_of_token() reports as "start of text" */
+    t->token_held = false;
+    t->token = -2;
+    t->previous_token = -2;
+
+    return t;
+}
+
+/* Release a tokeniser: its two working blocks, every string macro with
+ * its name and value blocks, every saved input record, the file name and
+ * finally the tokeniser itself. */
+extern void close_tokeniser(struct tokeniser * t) {
+    struct m_pair * pair;
+    struct input * saved;
+
+    lose_b(t->b);
+    lose_b(t->b2);
+
+    pair = t->m_pairs;
+    while (pair != 0) {
+        struct m_pair * following = pair->next;
+        lose_b(pair->name);
+        lose_b(pair->value);
+        FREE(pair);
+        pair = following;
+    }
+
+    saved = t->next;
+    while (saved != 0) {
+        struct input * following = saved->next;
+        FREE(saved);
+        saved = following;
+    }
+
+    free(t->file);
+    FREE(t);
+}
diff --git a/contrib/snowball/doc/TODO b/contrib/snowball/doc/TODO
new file mode 100644
index 000000000..0cfa1b1a8
--- /dev/null
+++ b/contrib/snowball/doc/TODO
@@ -0,0 +1,15 @@
+Things to do:
+
+ - Write documentation for how to use libstemmer (as opposed to how stemming
+ algorithms themselves work).
+ Currently, the documentation in the include/libstemmer.h header file is
+ pretty clear and comprehensive, but an overview document wouldn't go amiss.
+
+Things that would be nice to include at some point.
+
+ - Add version numbers to each stemming algorithm, and allow the interface to
+ request a specific version of the stemming algorithms. Default to providing
+ the latest version of the algorithm.
+ - Make mkmodules.pl generate the build system, instead of being called from it.
+ This would allow it to generate the list of modules to be built, so that it's
+ not necessary to change things in more than one place to add a new algorithm.
diff --git a/contrib/snowball/doc/libstemmer_c_README b/contrib/snowball/doc/libstemmer_c_README
new file mode 100644
index 000000000..9d3af8bec
--- /dev/null
+++ b/contrib/snowball/doc/libstemmer_c_README
@@ -0,0 +1,125 @@
+libstemmer_c
+============
+
+This document pertains to the C version of the libstemmer distribution,
+available for download from:
+
+http://snowball.tartarus.org/dist/libstemmer_c.tgz
+
+
+Compiling the library
+=====================
+
+A simple makefile is provided for Unix style systems. On such systems, it
+should be possible simply to run "make", and the file "libstemmer.o"
+and the example program "stemwords" will be generated.
+
+If this doesn't work on your system, you need to write your own build
+system (or call the compiler directly). The files to compile are
+all contained in the "libstemmer", "runtime" and "src_c" directories,
+and the public header file is contained in the "include" directory.
+
+The library comes in two flavours: UTF-8 only, and UTF-8 plus other character
+sets. To use the UTF-8 only flavour, compile "libstemmer_utf8.c" instead of
+"libstemmer.c".
+
+For convenience "mkinc.mak" is a makefile fragment listing the source files and
+header files used to compile the standard version of the library.
+"mkinc_utf8.mak" is a comparable makefile fragment listing just the source
+files for the UTF-8 only version of the library.
+
+
+Using the library
+=================
+
+The library provides a simple C API. Essentially, a new stemmer can
+be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then
+used to stem a word, "sb_stemmer_length" returns the stemmed
+length of the last word processed, and "sb_stemmer_delete" is
+used to delete a stemmer.
+
+Creating a stemmer is a relatively expensive operation - the expected
+usage pattern is that a new stemmer is created when needed, used
+to stem many words, and deleted after some time.
+
+Stemmers are re-entrant, but not threadsafe. In other words, if
+you wish to access the same stemmer object from multiple threads,
+you must ensure that all access is protected by a mutex or similar
+device.
+
+libstemmer does not currently incorporate any mechanism for caching the results
+of stemming operations. Such caching can greatly increase the performance of a
+stemmer under certain situations, so suitable patches will be considered for
+inclusion.
+
+The standard libstemmer sources contain an algorithm for each of the supported
+languages. The algorithm may be selected using the English name of the
+language, or using the 2 or 3 letter ISO 639 language codes. In addition,
+the traditional "Porter" stemming algorithm for English is included for
+backwards compatibility purposes, but we recommend use of the "English"
+stemmer in preference for new projects.
+
+(Some minor algorithms which are included only as curiosities in the snowball
+website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not
+included in the standard libstemmer sources. These are not really supported by
+the snowball project, but it would be possible to compile a modified libstemmer
+library containing these if desired.)
+
+
+The stemwords example
+=====================
+
+The stemwords example program allows you to run any of the stemmers
+compiled into the libstemmer library on a sample vocabulary. For
+details on how to use it, run it with the "-h" command line option.
+
+
+Using the library in a larger system
+====================================
+
+If you are incorporating the library into the build system of a larger
+program, I recommend copying the unpacked tarball without modification into
+a subdirectory of the sources of your program. Future versions of the
+library are intended to keep the same structure, so this will keep the
+work required to move to a new version of the library to a minimum.
+
+As an additional convenience, the list of source and header files used
+in the library is detailed in mkinc.mak - a file which is in a suitable
+format for inclusion by a Makefile. By including this file in your build
+system, you can link the snowball system into your program with a few
+extra rules.
+
+Using the library in a system using GNU autotools
+=================================================
+
+The libstemmer_c library can be integrated into a larger system which uses the
+GNU autotool framework (and in particular, automake and autoconf) as follows:
+
+1) Unpack libstemmer_c.tgz in the top level project directory so that there is
+ a libstemmer_c subdirectory of the top level directory of the project.
+
+2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing:
+
+noinst_LTLIBRARIES = libstemmer.la
+include $(srcdir)/mkinc.mak
+noinst_HEADERS = $(snowball_headers)
+libstemmer_la_SOURCES = $(snowball_sources)
+
+(You may also need to add other lines to this, for example, if you are using
+compiler options which are not compatible with compiling the libstemmer
+library.)
+
+3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's
+ configure.ac file.
+
+4) Add to the top level makefile the following lines (or modify existing
+ assignments to these variables appropriately):
+
+AUTOMAKE_OPTIONS = subdir-objects
+AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include
+SUBDIRS=libstemmer_c
+<name>_LIBADD = libstemmer_c/libstemmer.la
+
+(Where <name> is the name of the library or executable which links against
+libstemmer.)
+
diff --git a/contrib/snowball/doc/libstemmer_java_README b/contrib/snowball/doc/libstemmer_java_README
new file mode 100644
index 000000000..38b1af693
--- /dev/null
+++ b/contrib/snowball/doc/libstemmer_java_README
@@ -0,0 +1,40 @@
+libstemmer_java
+===============
+
+This document pertains to the Java version of the libstemmer distribution,
+available for download from:
+
+http://snowball.tartarus.org/dist/libstemmer_java.tgz
+
+
+Compiling the library
+=====================
+
+Simply run the Java compiler on all the Java source files under the java
+directory. For example, this can be done under Unix by changing directory into
+the java directory, and running:
+
+ javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java
+
+This will compile the library and also an example program "TestApp" which
+provides a command line interface to the library.
+
+
+Using the library
+=================
+
+There is currently no formal documentation on the use of the Java version
+of the library. Additionally, its interface is not guaranteed to be
+stable.
+
+The best documentation of the library is the source of the TestApp example
+program.
+
+
+The TestApp example
+===================
+
+The TestApp example program allows you to run any of the stemmers
+compiled into the libstemmer library on a sample vocabulary. For
+details on how to use it, run it with no command line parameters.
+
diff --git a/contrib/snowball/examples/stemwords.c b/contrib/snowball/examples/stemwords.c
new file mode 100644
index 000000000..995bf40dc
--- /dev/null
+++ b/contrib/snowball/examples/stemwords.c
@@ -0,0 +1,209 @@
+/* This is a simple program which uses libstemmer to provide a command
+ * line interface for stemming using any of the algorithms provided.
+ */
+
+#include <stdio.h>
+#include <stdlib.h> /* for malloc, free */
+#include <string.h> /* for memmove */
+#include <ctype.h> /* for isupper, tolower */
+
+#include "libstemmer.h"
+
+const char * progname;
+static int pretty = 1;
+
+/* Read words from f_in, one per line, stem each one with the given stemmer
+ * and write the results to f_out, one per line. Characters for which
+ * isupper() holds are converted with tolower() before stemming.
+ * The global `pretty` selects the layout: 1 writes "word -> stem"; 2 writes
+ * the word followed by the stem in a second column that starts 30
+ * characters in (on a fresh line when the word is that long); any other
+ * value writes the stem alone.
+ * Returns at the end of the input. If memory runs out, "Out of memory" is
+ * written to stderr and the program exits with status 1. */
+static void
+stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
+{
+#define INC 10
+    int lim = INC;                       /* capacity of b */
+    sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
+    /* Without this check a failed allocation would only show up as a
+     * store through a null pointer on the first character read. */
+    if (b == NULL) goto error;
+
+    while(1) {
+        int i = 0;                       /* bytes of the word stored in b */
+        int inlen = 0;                   /* characters in the word */
+        const sb_symbol * stemmed;
+        int ch = getc(f_in);
+        if (ch == EOF) {
+            free(b); return;
+        }
+
+        while (ch != '\n' && ch != EOF) {
+            if (i == lim) {
+                sb_symbol * newb = (sb_symbol *)
+                    realloc(b, (lim + INC) * sizeof(sb_symbol));
+                /* Report the failure rather than returning quietly, which
+                 * would leave the output cut short with a zero exit status. */
+                if (newb == NULL) goto error;
+                b = newb;
+                lim = lim + INC;
+            }
+            /* Update count of utf-8 characters. */
+            if (ch < 0x80 || ch > 0xBF) inlen += 1;
+            /* force lower case: */
+            if (isupper(ch)) ch = tolower(ch);
+
+            b[i] = ch;
+            i++;
+            ch = getc(f_in);
+        }
+
+        stemmed = sb_stemmer_stem(stemmer, b, i);
+        if (stemmed == NULL) goto error;
+
+        if (pretty == 1) {
+            fwrite(b, i, 1, f_out);
+            fputs(" -> ", f_out);
+        } else if (pretty == 2) {
+            fwrite(b, i, 1, f_out);
+            if (sb_stemmer_length(stemmer) > 0) {
+                int j;
+                if (inlen < 30) {
+                    for (j = 30 - inlen; j > 0; j--)
+                        fputs(" ", f_out);
+                } else {
+                    fputs("\n", f_out);
+                    for (j = 30; j > 0; j--)
+                        fputs(" ", f_out);
+                }
+            }
+        }
+
+        fputs((const char *)stemmed, f_out);
+        putc('\n', f_out);
+    }
+
+error:
+    fprintf(stderr, "Out of memory");
+    exit(1);
+}
+
+/** Display the command line syntax, and then exit.
+ * The text is written to standard output, with the global progname
+ * substituted for the program's name. This function does not return.
+ * @param n The value to exit with.
+ */
+static void
+usage(int n)
+{
+ printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
+ "\n"
+ "The input file consists of a list of words to be stemmed, one per\n"
+ "line. Words should be in lower case, but (for English) A-Z letters\n"
+ "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
+ "used.\n"
+ "\n"
+ "If -c is given, the argument is the character encoding of the input\n"
+ "and output files. If it is omitted, the UTF-8 encoding is used.\n"
+ "\n"
+ "If -p is given the output file consists of each word of the input\n"
+ "file followed by \"->\" followed by its stemmed equivalent.\n"
+ "If -p2 is given the output file is a two column layout containing\n"
+ "the input words in the first column and the stemmed equivalents in\n"
+ "the second column.\n"
+ "Otherwise, the output file consists of the stemmed words, one per\n"
+ "line.\n"
+ "\n"
+ "-h displays this help\n",
+ progname);
+ exit(n);
+}
+
+/* Entry point: parse the command line, open the input and output files
+ * (stdin and stdout when none are named), create a stemmer for the chosen
+ * language and encoding, and stem the input with stem_file().
+ * Returns 0 on success; every failure detected here exits with status 1. */
+int
+main(int argc, char * argv[])
+{
+ char * in = 0;
+ char * out = 0;
+ FILE * f_in;
+ FILE * f_out;
+ struct sb_stemmer * stemmer;
+
+ char * language = "english";
+ char * charenc = NULL;
+
+ char * s;
+ int i = 1;
+ /* plain output unless -p or -p2 is given */
+ pretty = 0;
+
+ progname = argv[0];
+
+ while(i < argc) {
+ s = argv[i++];
+ if (s[0] == '-') {
+ /* -o, -i, -l and -c each take the following argument as their value */
+ if (strcmp(s, "-o") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ out = argv[i++];
+ } else if (strcmp(s, "-i") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ in = argv[i++];
+ } else if (strcmp(s, "-l") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ language = argv[i++];
+ } else if (strcmp(s, "-c") == 0) {
+ if (i >= argc) {
+ fprintf(stderr, "%s requires an argument\n", s);
+ exit(1);
+ }
+ charenc = argv[i++];
+ } else if (strcmp(s, "-p2") == 0) {
+ pretty = 2;
+ } else if (strcmp(s, "-p") == 0) {
+ pretty = 1;
+ } else if (strcmp(s, "-h") == 0) {
+ usage(0);
+ } else {
+ fprintf(stderr, "option %s unknown\n", s);
+ usage(1);
+ }
+ } else {
+ /* arguments that are not options are not accepted */
+ fprintf(stderr, "unexpected parameter %s\n", s);
+ usage(1);
+ }
+ }
+
+ /* prepare the files */
+ f_in = (in == 0) ? stdin : fopen(in, "r");
+ if (f_in == 0) {
+ fprintf(stderr, "file %s not found\n", in);
+ exit(1);
+ }
+ f_out = (out == 0) ? stdout : fopen(out, "w");
+ if (f_out == 0) {
+ fprintf(stderr, "file %s cannot be opened\n", out);
+ exit(1);
+ }
+
+ /* do the stemming process: */
+ stemmer = sb_stemmer_new(language, charenc);
+ if (stemmer == 0) {
+ if (charenc == NULL) {
+ fprintf(stderr, "language `%s' not available for stemming\n", language);
+ exit(1);
+ } else {
+ fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
+ exit(1);
+ }
+ }
+ stem_file(stemmer, f_in, f_out);
+ sb_stemmer_delete(stemmer);
+
+ /* only close streams that were opened here, not stdin or stdout */
+ if (in != 0) (void) fclose(f_in);
+ if (out != 0) (void) fclose(f_out);
+
+ return 0;
+}
+
diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h
new file mode 100644
index 000000000..9d86b8581
--- /dev/null
+++ b/contrib/snowball/include/libstemmer.h
@@ -0,0 +1,79 @@
+
+/* Make header file work when included from C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sb_stemmer;
+typedef unsigned char sb_symbol;
+
+/* FIXME - should be able to get a version number for each stemming
+ * algorithm (which will be incremented each time the output changes). */
+
+/** Returns an array of the names of the available stemming algorithms.
+ * Note that these are the canonical names - aliases (ie, other names for
+ * the same algorithm) will not be included in the list.
+ * The list is terminated with a null pointer.
+ *
+ * The list must not be modified in any way.
+ */
+const char ** sb_stemmer_list(void);
+
+/** Create a new stemmer object, using the specified algorithm, for the
+ * specified character encoding.
+ *
+ * All algorithms will usually be available in UTF-8, but may also be
+ * available in other character encodings.
+ *
+ * @param algorithm The algorithm name. This is either the English
+ * name of the algorithm, or the 2 or 3 letter ISO 639 code for the
+ * language. Note that case is significant in this parameter - the
+ * value should be supplied in lower case.
+ *
+ * @param charenc The character encoding. NULL may be passed as
+ * this value, in which case UTF-8 encoding will be assumed. Otherwise,
+ * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
+ * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that
+ * case is significant in this parameter.
+ *
+ * @return NULL if the specified algorithm is not recognised, or the
+ * algorithm is not available for the requested encoding. Otherwise,
+ * returns a pointer to a newly created stemmer for the requested algorithm.
+ * The returned pointer must be deleted by calling sb_stemmer_delete().
+ *
+ * @note NULL will also be returned if an out of memory error occurs.
+ */
+struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc);
+
+/** Delete a stemmer object.
+ *
+ * This frees all resources allocated for the stemmer. After calling
+ * this function, the supplied stemmer may no longer be used in any way.
+ *
+ * It is safe to pass a null pointer to this function - this will have
+ * no effect.
+ */
+void sb_stemmer_delete(struct sb_stemmer * stemmer);
+
+/** Stem a word.
+ *
+ * The return value is owned by the stemmer - it must not be freed or
+ * modified, and it will become invalid when the stemmer is called again,
+ * or if the stemmer is freed.
+ *
+ * The length of the return value can be obtained using sb_stemmer_length().
+ *
+ * If an out-of-memory error occurs, this will return NULL.
+ */
+const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
+ const sb_symbol * word, int size);
+
+/** Get the length of the result of the last stemmed word.
+ * This should not be called before sb_stemmer_stem() has been called.
+ */
+int sb_stemmer_length(struct sb_stemmer * stemmer);
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/contrib/snowball/java/org/tartarus/snowball/Among.java b/contrib/snowball/java/org/tartarus/snowball/Among.java
new file mode 100644
index 000000000..5ed37b503
--- /dev/null
+++ b/contrib/snowball/java/org/tartarus/snowball/Among.java
@@ -0,0 +1,31 @@
+package org.tartarus.snowball;
+
+import java.lang.reflect.Method;
+
+/**
+ * One entry of an "among" table: a search string, the index of its longest
+ * matching substring, the result of the lookup, and optionally a method to
+ * call on a given object when the string matches.
+ */
+public class Among {
+    public final int s_size; /* search string */
+    public final char[] s; /* search string */
+    public final int substring_i; /* index to longest matching substring */
+    public final int result; /* result of the lookup */
+    public final Method method; /* method to use if substring matches */
+    public final SnowballProgram methodobject; /* object to invoke method on */
+
+    /**
+     * @param methodname name of a no-argument method declared by the class
+     *        of methodobject, or the empty string for no method
+     * @throws RuntimeException wrapping NoSuchMethodException if the
+     *         named method cannot be found
+     */
+    public Among (String s, int substring_i, int result,
+                  String methodname, SnowballProgram methodobject) {
+        this.s_size = s.length();
+        this.s = s.toCharArray();
+        this.substring_i = substring_i;
+        this.result = result;
+        this.methodobject = methodobject;
+
+        Method resolved = null;
+        if (methodname.length() != 0) {
+            try {
+                resolved = methodobject.getClass().
+                    getDeclaredMethod(methodname, new Class[0]);
+            } catch (NoSuchMethodException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        this.method = resolved;
+    }
+};
diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java
new file mode 100644
index 000000000..52d6baa78
--- /dev/null
+++ b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java
@@ -0,0 +1,432 @@
+
+package org.tartarus.snowball;
+import java.lang.reflect.InvocationTargetException;
+
+public class SnowballProgram {
+ /**
+  * Create a program with an empty working buffer; setCurrent("") resets
+  * cursor, limits and slice markers for the empty string.
+  */
+ protected SnowballProgram()
+ {
+ current = new StringBuffer();
+ setCurrent("");
+ }
+
+    /**
+     * Load a new string into the working buffer and reset every position
+     * marker: the cursor and backward limit go to 0, the forward limit to
+     * the new length, and the slice [bra, ket) spans the whole string.
+     *
+     * @param value text that replaces the entire buffer content
+     */
+    public void setCurrent(String value)
+    {
+        current.replace(0, current.length(), value);
+        final int length = current.length();
+        cursor = 0;
+        limit_backward = 0;
+        limit = length;
+        bra = 0;
+        ket = length;
+    }
+
+    /**
+     * Return the working buffer as a String and start a fresh buffer.
+     *
+     * Note that only the buffer is replaced; cursor, limit and the slice
+     * markers are left as they were, so call setCurrent() before reusing
+     * this object.
+     */
+    public String getCurrent()
+    {
+        final String snapshot = current.toString();
+        // Drop the old StringBuffer instead of emptying it: if a caller
+        // still holds storage shared with it, reusing it would keep that
+        // (possibly large) capacity alive indefinitely.
+        // Thanks to Wolfram Esser for spotting this problem.
+        current = new StringBuffer();
+        return snapshot;
+    }
+
+ // current string
+ protected StringBuffer current;
+
+ protected int cursor;
+ protected int limit;
+ protected int limit_backward;
+ protected int bra;
+ protected int ket;
+
+    /**
+     * Adopt the state of another program. The buffer is shared by
+     * reference, not copied, so later edits are visible to both objects.
+     */
+    protected void copy_from(SnowballProgram other)
+    {
+        current = other.current;
+
+        cursor = other.cursor;
+        limit = other.limit;
+        limit_backward = other.limit_backward;
+
+        bra = other.bra;
+        ket = other.ket;
+    }
+
+ protected boolean in_grouping(char [] s, int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current.charAt(cursor);
+ if (ch > max || ch < min) return false;
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
+ cursor++;
+ return true;
+ }
+
+ protected boolean in_grouping_b(char [] s, int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current.charAt(cursor - 1);
+ if (ch > max || ch < min) return false;
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
+ cursor--;
+ return true;
+ }
+
+ protected boolean out_grouping(char [] s, int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current.charAt(cursor);
+ if (ch > max || ch < min) {
+ cursor++;
+ return true;
+ }
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
+ cursor ++;
+ return true;
+ }
+ return false;
+ }
+
+ protected boolean out_grouping_b(char [] s, int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current.charAt(cursor - 1);
+ if (ch > max || ch < min) {
+ cursor--;
+ return true;
+ }
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
+ cursor--;
+ return true;
+ }
+ return false;
+ }
+
+ protected boolean in_range(int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current.charAt(cursor);
+ if (ch > max || ch < min) return false;
+ cursor++;
+ return true;
+ }
+
+ protected boolean in_range_b(int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current.charAt(cursor - 1);
+ if (ch > max || ch < min) return false;
+ cursor--;
+ return true;
+ }
+
+ protected boolean out_range(int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current.charAt(cursor);
+ if (!(ch > max || ch < min)) return false;
+ cursor++;
+ return true;
+ }
+
+ protected boolean out_range_b(int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current.charAt(cursor - 1);
+ if(!(ch > max || ch < min)) return false;
+ cursor--;
+ return true;
+ }
+
+    /**
+     * Test whether the first s_size characters of s occur at the cursor;
+     * on success the cursor is moved past them.
+     *
+     * @return true if the text matched and the cursor moved
+     */
+    protected boolean eq_s(int s_size, String s)
+    {
+        if (limit - cursor < s_size) return false;
+        int offset = 0;
+        while (offset != s_size) {
+            if (current.charAt(cursor + offset) != s.charAt(offset)) {
+                return false;
+            }
+            offset++;
+        }
+        cursor += s_size;
+        return true;
+    }
+
+ protected boolean eq_s_b(int s_size, String s)
+ {
+ if (cursor - limit_backward < s_size) return false;
+ int i;
+ for (i = 0; i != s_size; i++) {
+ if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false;
+ }
+ cursor -= s_size;
+ return true;
+ }
+
+ protected boolean eq_v(CharSequence s)
+ {
+ return eq_s(s.length(), s.toString());
+ }
+
+ protected boolean eq_v_b(CharSequence s)
+ { return eq_s_b(s.length(), s.toString());
+ }
+
+ /**
+  * Look up the text starting at the cursor in the table v (entries
+  * 0 .. v_size-1) and return the result of the matching entry, or 0 if no
+  * entry matches. On a match the cursor is placed just after the matched
+  * string.
+  *
+  * The first loop is a binary search, so v is presumably sorted by search
+  * string (TODO confirm against the table generator). The second loop walks
+  * the substring_i chain from the entry found, accepting the first entry
+  * that is fully matched and whose method, if any, returns true.
+  */
+ protected int find_among(Among v[], int v_size)
+ {
+ int i = 0;
+ int j = v_size;
+
+ int c = cursor;
+ int l = limit;
+
+ // number of leading characters known to match v[i] and v[j]
+ int common_i = 0;
+ int common_j = 0;
+
+ boolean first_key_inspected = false;
+
+ while(true) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j; // smaller
+ Among w = v[k];
+ int i2;
+ for (i2 = common; i2 < w.s_size; i2++) {
+ // running out of input counts as "input sorts before w.s"
+ if (c + common == l) {
+ diff = -1;
+ break;
+ }
+ diff = current.charAt(c + common) - w.s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
+ if (j - i <= 1) {
+ if (i > 0) break; // v->s has been inspected
+ if (j == i) break; // only one item in v
+
+ // - but now we need to go round once more to get
+ // v->s inspected. This looks messy, but is actually
+ // the optimal approach.
+
+ if (first_key_inspected) break;
+ first_key_inspected = true;
+ }
+ }
+ while(true) {
+ Among w = v[i];
+ if (common_i >= w.s_size) {
+ cursor = c + w.s_size;
+ if (w.method == null) return w.result;
+ boolean res;
+ try {
+ Object resobj = w.method.invoke(w.methodobject,
+ new Object[0]);
+ // the method's return value is read through its string form
+ res = resobj.toString().equals("true");
+ } catch (InvocationTargetException e) {
+ // a failing method call is treated as "condition not met"
+ res = false;
+ // FIXME - debug message
+ } catch (IllegalAccessException e) {
+ res = false;
+ // FIXME - debug message
+ }
+ // the method may have moved the cursor; put it back after the match
+ cursor = c + w.s_size;
+ if (res) return w.result;
+ }
+ i = w.substring_i;
+ if (i < 0) return 0;
+ }
+ }
+
+ // find_among_b is for backwards processing. Same comments apply
+ /**
+  * Backward counterpart of find_among(): compares the text ending at the
+  * cursor against the entries of v from their last character towards their
+  * first, stopping at limit_backward. Returns the result of the matching
+  * entry, or 0 if none matches. On a match the cursor is placed just before
+  * the matched string.
+  */
+ protected int find_among_b(Among v[], int v_size)
+ {
+ int i = 0;
+ int j = v_size;
+
+ int c = cursor;
+ int lb = limit_backward;
+
+ // number of trailing characters known to match v[i] and v[j]
+ int common_i = 0;
+ int common_j = 0;
+
+ boolean first_key_inspected = false;
+
+ while(true) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j;
+ Among w = v[k];
+ int i2;
+ for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
+ // reaching the backward limit counts as "input sorts before w.s"
+ if (c - common == lb) {
+ diff = -1;
+ break;
+ }
+ diff = current.charAt(c - 1 - common) - w.s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
+ if (j - i <= 1) {
+ if (i > 0) break;
+ if (j == i) break;
+ if (first_key_inspected) break;
+ first_key_inspected = true;
+ }
+ }
+ while(true) {
+ Among w = v[i];
+ if (common_i >= w.s_size) {
+ cursor = c - w.s_size;
+ if (w.method == null) return w.result;
+
+ boolean res;
+ try {
+ Object resobj = w.method.invoke(w.methodobject,
+ new Object[0]);
+ // the method's return value is read through its string form
+ res = resobj.toString().equals("true");
+ } catch (InvocationTargetException e) {
+ // a failing method call is treated as "condition not met"
+ res = false;
+ // FIXME - debug message
+ } catch (IllegalAccessException e) {
+ res = false;
+ // FIXME - debug message
+ }
+ // the method may have moved the cursor; put it back before the match
+ cursor = c - w.s_size;
+ if (res) return w.result;
+ }
+ i = w.substring_i;
+ if (i < 0) return 0;
+ }
+ }
+
+ /* to replace chars between c_bra and c_ket in current by the
+ * chars in s.
+ */
+ protected int replace_s(int c_bra, int c_ket, String s)
+ {
+ int adjustment = s.length() - (c_ket - c_bra);
+ current.replace(c_bra, c_ket, s);
+ limit += adjustment;
+ if (cursor >= c_ket) cursor += adjustment;
+ else if (cursor > c_bra) cursor = c_bra;
+ return adjustment;
+ }
+
+ protected void slice_check()
+ {
+ if (bra < 0 ||
+ bra > ket ||
+ ket > limit ||
+ limit > current.length()) // this line could be removed
+ {
+ System.err.println("faulty slice operation");
+ // FIXME: report error somehow.
+ /*
+ fprintf(stderr, "faulty slice operation:\n");
+ debug(z, -1, 0);
+ exit(1);
+ */
+ }
+ }
+
+ protected void slice_from(String s)
+ {
+ slice_check();
+ replace_s(bra, ket, s);
+ }
+
+ protected void slice_from(CharSequence s)
+ {
+ slice_from(s.toString());
+ }
+
+ protected void slice_del()
+ {
+ slice_from("");
+ }
+
+ protected void insert(int c_bra, int c_ket, String s)
+ {
+ int adjustment = replace_s(c_bra, c_ket, s);
+ if (c_bra <= bra) bra += adjustment;
+ if (c_bra <= ket) ket += adjustment;
+ }
+
+ protected void insert(int c_bra, int c_ket, CharSequence s)
+ {
+ insert(c_bra, c_ket, s.toString());
+ }
+
+    /**
+     * Copy the current slice [bra, ket) into the supplied StringBuffer,
+     * replacing whatever it held.
+     *
+     * @param s destination buffer
+     * @return s, for call chaining
+     */
+    protected StringBuffer slice_to(StringBuffer s)
+    {
+        slice_check();
+        s.replace(0, s.length(), current.substring(bra, ket));
+        return s;
+    }
+
+    /**
+     * Copy the current slice [bra, ket) into the supplied StringBuilder,
+     * replacing whatever it held.
+     *
+     * @param s destination builder
+     * @return s, for call chaining
+     */
+    protected StringBuilder slice_to(StringBuilder s)
+    {
+        slice_check();
+        s.replace(0, s.length(), current.substring(bra, ket));
+        return s;
+    }
+
+    /**
+     * Overwrite s with the buffer content from index 0 up to limit.
+     *
+     * @return s, for call chaining
+     */
+    protected StringBuffer assign_to(StringBuffer s)
+    {
+        return s.replace(0, s.length(), current.substring(0, limit));
+    }
+
+    /**
+     * Overwrite s with the buffer content from index 0 up to limit.
+     *
+     * @return s, for call chaining
+     */
+    protected StringBuilder assign_to(StringBuilder s)
+    {
+        return s.replace(0, s.length(), current.substring(0, limit));
+    }
+
+/*
+extern void debug(struct SN_env * z, int number, int line_count)
+{ int i;
+ int limit = SIZE(z->p);
+ //if (number >= 0) printf("%3d (line %4d): '", number, line_count);
+ if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
+ for (i = 0; i <= limit; i++)
+ { if (z->lb == i) printf("{");
+ if (z->bra == i) printf("[");
+ if (z->c == i) printf("|");
+ if (z->ket == i) printf("]");
+ if (z->l == i) printf("}");
+ if (i < limit)
+ { int ch = z->p[i];
+ if (ch == 0) ch = '#';
+ printf("%c", ch);
+ }
+ }
+ printf("'\n");
+}
+*/
+
+};
diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java
new file mode 100644
index 000000000..960bd55f6
--- /dev/null
+++ b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java
@@ -0,0 +1,7 @@
+
+package org.tartarus.snowball;
+import java.lang.reflect.InvocationTargetException;
+
+/**
+ * Base class for stemmers: a SnowballProgram with a single stem() entry
+ * point. TestApp loads the word with setCurrent(), calls stem(), then reads
+ * the result with getCurrent().
+ */
+public abstract class SnowballStemmer extends SnowballProgram {
+ /** Stem the current string in place; the meaning of the return value is
+  *  defined by the implementing stemmer. */
+ public abstract boolean stem();
+};
diff --git a/contrib/snowball/java/org/tartarus/snowball/TestApp.java b/contrib/snowball/java/org/tartarus/snowball/TestApp.java
new file mode 100644
index 000000000..38803f673
--- /dev/null
+++ b/contrib/snowball/java/org/tartarus/snowball/TestApp.java
@@ -0,0 +1,77 @@
+
+package org.tartarus.snowball;
+
+import java.lang.reflect.Method;
+import java.io.Reader;
+import java.io.Writer;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.OutputStream;
+import java.io.FileOutputStream;
+
+public class TestApp {
+ private static void usage()
+ {
+ System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]");
+ }
+
+ public static void main(String [] args) throws Throwable {
+ if (args.length < 2) {
+ usage();
+ return;
+ }
+
+ Class stemClass = Class.forName("org.tartarus.snowball.ext." +
+ args[0] + "Stemmer");
+ SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
+
+ Reader reader;
+ reader = new InputStreamReader(new FileInputStream(args[1]));
+ reader = new BufferedReader(reader);
+
+ StringBuffer input = new StringBuffer();
+
+ OutputStream outstream;
+
+ if (args.length > 2) {
+ if (args.length >= 4 && args[2].equals("-o")) {
+ outstream = new FileOutputStream(args[3]);
+ } else {
+ usage();
+ return;
+ }
+ } else {
+ outstream = System.out;
+ }
+ Writer output = new OutputStreamWriter(outstream);
+ output = new BufferedWriter(output);
+
+ int repeat = 1;
+ if (args.length > 4) {
+ repeat = Integer.parseInt(args[4]);
+ }
+
+ Object [] emptyArgs = new Object[0];
+ int character;
+ while ((character = reader.read()) != -1) {
+ char ch = (char) character;
+ if (Character.isWhitespace((char) ch)) {
+ if (input.length() > 0) {
+ stemmer.setCurrent(input.toString());
+ for (int i = repeat; i != 0; i--) {
+ stemmer.stem();
+ }
+ output.write(stemmer.getCurrent());
+ output.write('\n');
+ input.delete(0, input.length());
+ }
+ } else {
+ input.append(Character.toLowerCase(ch));
+ }
+ }
+ output.flush();
+ }
+}
diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c
new file mode 100644
index 000000000..40039ef4a
--- /dev/null
+++ b/contrib/snowball/runtime/api.c
@@ -0,0 +1,66 @@
+
+#include <stdlib.h> /* for calloc, free */
+#include "header.h"
+
+/* Allocate a stemmer environment with a working string plus S_size string
+ * variables, I_size integers and B_size booleans (zero-initialised).
+ * Returns NULL if any allocation fails; everything allocated so far is
+ * released through SN_close_env().
+ */
+extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
+{
+    struct SN_env * env = (struct SN_env *) calloc(1, sizeof(struct SN_env));
+    if (env == NULL) return NULL;
+
+    env->p = create_s();
+    if (env->p == NULL) goto fail;
+
+    if (S_size != 0) {
+        int k = 0;
+        env->S = (symbol * *) calloc(S_size, sizeof(symbol *));
+        if (env->S == NULL) goto fail;
+        while (k < S_size) {
+            env->S[k] = create_s();
+            if (env->S[k] == NULL) goto fail;
+            k++;
+        }
+    }
+
+    if (I_size != 0) {
+        env->I = (int *) calloc(I_size, sizeof(int));
+        if (env->I == NULL) goto fail;
+    }
+
+    if (B_size != 0) {
+        env->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
+        if (env->B == NULL) goto fail;
+    }
+
+    return env;
+
+fail:
+    SN_close_env(env, S_size);
+    return NULL;
+}
+
+/* Release an environment created by SN_create_env().
+ *
+ * z      - environment to free; NULL is accepted and ignored.
+ * S_size - number of string variables, as passed to SN_create_env().
+ *
+ * Also used on SN_create_env()'s error path, where the environment may be
+ * only partly built: z->S can still be NULL although S_size is non-zero,
+ * and trailing z->S[i] entries can be NULL (lose_s() accepts NULL).
+ */
+extern void SN_close_env(struct SN_env * z, int S_size)
+{
+    if (z == NULL) return;
+    if (z->S != NULL)
+    {
+        int i;
+        for (i = 0; i < S_size; i++)
+        {
+            lose_s(z->S[i]);
+        }
+        free(z->S);
+    }
+    free(z->I);
+    free(z->B);
+    lose_s(z->p);
+    free(z);
+}
+
+/* Replace the whole current string (positions 0 .. z->l) with the size
+ * symbols at s and move the cursor to the start.
+ * Returns the status of replace_s(): 0 on success, -1 on error.
+ */
+extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
+{
+    const int status = replace_s(z, 0, z->l, size, s, NULL);
+    z->c = 0;
+    return status;
+}
+
diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h
new file mode 100644
index 000000000..8b997f0c2
--- /dev/null
+++ b/contrib/snowball/runtime/api.h
@@ -0,0 +1,26 @@
+
+typedef unsigned char symbol;
+
+/* Or replace 'char' above with 'short' for 16 bit characters.
+
+ More precisely, replace 'char' with whatever type guarantees the
+ character width you need. Note however that sizeof(symbol) should divide
+ HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise
+ there is an alignment problem. In the unlikely event of a problem here,
+ consult Martin Porter.
+
+*/
+
+/* State of one stemmer instance. */
+struct SN_env {
+ /* working string; its size and capacity are stored in the header in
+    front of it (see SIZE/CAPACITY in header.h) */
+ symbol * p;
+ /* c: cursor; l: forward limit; lb: backward limit;
+    bra, ket: start and end of the current slice */
+ int c; int l; int lb; int bra; int ket;
+ /* string variables, allocated by SN_create_env (S_size entries) */
+ symbol * * S;
+ /* integer variables (I_size entries) */
+ int * I;
+ /* boolean variables (B_size entries) */
+ unsigned char * B;
+};
+
+/* Allocate an environment with S_size strings, I_size ints and B_size
+   booleans; returns NULL if an allocation fails. */
+extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
+/* Free an environment; S_size is the string count it was created with. */
+extern void SN_close_env(struct SN_env * z, int S_size);
+
+/* Replace the current string with the size symbols at s and reset the
+   cursor to 0; returns the result of replace_s(). */
+extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
+
diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h
new file mode 100644
index 000000000..4d3078f50
--- /dev/null
+++ b/contrib/snowball/runtime/header.h
@@ -0,0 +1,58 @@
+
+#include <limits.h>
+
+#include "api.h"
+
+#define MAXINT INT_MAX
+#define MININT INT_MIN
+
+/* Every string buffer is preceded by a header of two ints:
+ * [capacity][size][symbols...]. The macros below take the pointer to the
+ * first symbol. Each expansion is fully parenthesised so it behaves as a
+ * single operand in any expression (e.g. x / HEAD, x % HEAD).
+ */
+#define HEAD (2*sizeof(int))
+
+#define SIZE(p) (((int *)(p))[-1])
+#define SET_SIZE(p, n) (((int *)(p))[-1] = (n))
+#define CAPACITY(p) (((int *)(p))[-2])
+
+/* One entry of a table searched by find_among() / find_among_b(). */
+struct among
+{ int s_size; /* number of chars in string */
+ const symbol * s; /* search string */
+ int substring_i;/* index to longest matching substring */
+ int result; /* result of the lookup */
+ /* optional routine called when s matches; the entry is accepted only if
+    it returns non-zero. 0 means the entry has no routine. */
+ int (* function)(struct SN_env *);
+};
+
+extern symbol * create_s(void);
+extern void lose_s(symbol * p);
+
+extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);
+
+extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+
+extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
+
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s);
+extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s);
+extern int eq_v(struct SN_env * z, const symbol * p);
+extern int eq_v_b(struct SN_env * z, const symbol * p);
+
+extern int find_among(struct SN_env * z, const struct among * v, int v_size);
+extern int find_among_b(struct SN_env * z, const struct among * v, int v_size);
+
+extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment);
+extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s);
+extern int slice_from_v(struct SN_env * z, const symbol * p);
+extern int slice_del(struct SN_env * z);
+
+extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s);
+extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
+
+extern symbol * slice_to(struct SN_env * z, symbol * p);
+extern symbol * assign_to(struct SN_env * z, symbol * p);
+
+extern void debug(struct SN_env * z, int number, int line_count);
+
diff --git a/contrib/snowball/runtime/utilities.c b/contrib/snowball/runtime/utilities.c
new file mode 100644
index 000000000..1840f0280
--- /dev/null
+++ b/contrib/snowball/runtime/utilities.c
@@ -0,0 +1,478 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "header.h"
+
+#define unless(C) if(!(C))
+
+#define CREATE_SIZE 1
+
+/* Allocate a new, empty string buffer.
+ *
+ * The block holds the HEAD-sized header (capacity, size) followed by room
+ * for CREATE_SIZE symbols plus one spare. The returned pointer addresses
+ * the first symbol; release it with lose_s().
+ *
+ * The size starts at 0: no symbol has been written yet, so reporting a
+ * non-zero size would make eq_v(), slice_from_v() or insert_v() read an
+ * uninitialised symbol from a string that was never assigned.
+ *
+ * Returns NULL if memory cannot be allocated.
+ */
+extern symbol * create_s(void) {
+    symbol * p;
+    void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol));
+    if (mem == NULL) return NULL;
+    p = (symbol *) (HEAD + (char *) mem);
+    CAPACITY(p) = CREATE_SIZE;
+    SET_SIZE(p, 0);
+    return p;
+}
+
+/* Free a buffer obtained from create_s(); NULL is ignored. The allocation
+ * starts HEAD bytes before the first symbol.
+ */
+extern void lose_s(symbol * p) {
+    if (p != NULL) {
+        char * block = (char *) p - HEAD;
+        free(block);
+    }
+}
+
+/*
+ new_c = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
+ if n is positive, or -n characters backwards from p + c - 1 if n is
+ negative. new_c is the new position, or -1 on failure (the limit l or lb
+ was reached before n characters could be skipped).
+
+ -- used to implement hop and next in the utf8 case.
+*/
+
+extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
+    if (n >= 0) {
+        while (n > 0) {
+            int lead;
+            if (c >= l) return -1;
+            lead = p[c++];
+            if (lead >= 0xC0) {   /* 11------ starts a multi-byte character */
+                /* swallow the continuation bytes 10------ that follow */
+                while (c < l && p[c] >= 0x80 && p[c] < 0xC0) c++;
+            }
+            n--;
+        }
+    } else {
+        while (n < 0) {
+            if (c <= lb) return -1;
+            c--;
+            if (p[c] >= 0x80) {   /* inside a multi-byte character */
+                /* back up to its lead byte 11------ (or to lb) */
+                while (c > lb && p[c] < 0xC0) c--;
+            }
+            n++;
+        }
+    }
+    return c;
+}
+
+/* Code for character groupings: utf8 cases */
+
+/* Decode the UTF-8 character starting at p[c] (not reading at or beyond l)
+ * into *slot. Returns the number of bytes consumed (1, 2 or 3), or 0 if
+ * c is already at the limit. Sequences longer than 3 bytes are not handled.
+ */
+static int get_utf8(const symbol * p, int c, int l, int * slot) {
+    int lead, second;
+    if (c >= l) return 0;
+    lead = p[c];
+    c++;
+    if (lead < 0xC0 || c == l) {   /* 1100 0000 */
+        *slot = lead;
+        return 1;
+    }
+    second = p[c];
+    c++;
+    if (lead < 0xE0 || c == l) {   /* 1110 0000 */
+        *slot = ((lead & 0x1F) << 6) | (second & 0x3F);
+        return 2;
+    }
+    *slot = ((lead & 0xF) << 12) | ((second & 0x3F) << 6) | (p[c] & 0x3F);
+    return 3;
+}
+
+/* Decode the UTF-8 character ending just before p[c] (not reading below
+ * lb) into *slot. Returns the number of bytes consumed (1, 2 or 3), or 0
+ * if c is already at the backward limit. Sequences longer than 3 bytes are
+ * not handled.
+ */
+static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
+    int b0, b1;
+    if (c <= lb) return 0;
+    b0 = p[--c];                    /* last byte of the character */
+    if (b0 < 0x80 || c == lb) {   /* 1000 0000 */
+        * slot = b0; return 1;
+    }
+    b1 = p[--c];                    /* byte before it */
+    if (b1 >= 0xC0 || c == lb) {   /* 1100 0000 */
+        * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
+    }
+    /* Three-byte character: b1 is the middle byte, so the lead byte is the
+       one before it. c > lb holds here, so p[c - 1] is within bounds. */
+    * slot = (p[c - 1] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
+}
+
+/* UTF-8: move the cursor forward over characters that are in the grouping
+ * (bitmap s covering codes min..max); once if repeat is 0, otherwise as
+ * many as possible.
+ * Returns 0 when repeat is 0 and the character was in the grouping,
+ * the byte width of the first character not in the grouping (cursor left
+ * before it), or -1 if the limit was reached.
+ */
+extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        const int w = get_utf8(z->p, z->c, z->l, & ch);
+        if (w == 0) return -1;
+        if (ch > max || ch < min) return w;
+        ch -= min;
+        if ((s[ch >> 3] & (1 << (ch & 7))) == 0) return w;
+        z->c += w;
+        if (!repeat) return 0;
+    }
+}
+
+/* UTF-8, backward: move the cursor back over characters that are in the
+ * grouping; once if repeat is 0, otherwise as many as possible.
+ * Returns 0, the byte width of the first character not in the grouping,
+ * or -1 if the backward limit was reached.
+ */
+extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        const int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+        if (w == 0) return -1;
+        if (ch > max || ch < min) return w;
+        ch -= min;
+        if ((s[ch >> 3] & (1 << (ch & 7))) == 0) return w;
+        z->c -= w;
+        if (!repeat) return 0;
+    }
+}
+
+/* UTF-8: move the cursor forward over characters that are NOT in the
+ * grouping; once if repeat is 0, otherwise as many as possible.
+ * Returns 0, the byte width of the first character that is in the
+ * grouping (cursor left before it), or -1 if the limit was reached.
+ */
+extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        const int w = get_utf8(z->p, z->c, z->l, & ch);
+        if (w == 0) return -1;
+        if (ch <= max && ch >= min) {
+            ch -= min;
+            if ((s[ch >> 3] & (1 << (ch & 7))) != 0) return w;
+        }
+        z->c += w;
+        if (!repeat) return 0;
+    }
+}
+
+/* UTF-8, backward: move the cursor back over characters that are NOT in
+ * the grouping; once if repeat is 0, otherwise as many as possible.
+ * Returns 0, the byte width of the first character that is in the
+ * grouping, or -1 if the backward limit was reached.
+ */
+extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        const int w = get_b_utf8(z->p, z->c, z->lb, & ch);
+        if (w == 0) return -1;
+        if (ch <= max && ch >= min) {
+            ch -= min;
+            if ((s[ch >> 3] & (1 << (ch & 7))) != 0) return w;
+        }
+        z->c -= w;
+        if (!repeat) return 0;
+    }
+}
+
+/* Code for character groupings: non-utf8 cases */
+
+/* Fixed-width symbols: move the cursor forward over symbols that are in
+ * the grouping (bitmap s covering codes min..max); once if repeat is 0,
+ * otherwise as many as possible.
+ * Returns 0, 1 if a symbol not in the grouping stopped the scan, or -1 if
+ * the limit was reached.
+ */
+extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        if (z->c >= z->l) return -1;
+        ch = z->p[z->c];
+        if (ch > max || ch < min) return 1;
+        ch -= min;
+        if ((s[ch >> 3] & (1 << (ch & 7))) == 0) return 1;
+        z->c++;
+        if (!repeat) return 0;
+    }
+}
+
+/* Fixed-width symbols, backward: move the cursor back over symbols that
+ * are in the grouping; once if repeat is 0, otherwise as many as possible.
+ * Returns 0, 1 if a symbol not in the grouping stopped the scan, or -1 if
+ * the backward limit was reached.
+ */
+extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        if (z->c <= z->lb) return -1;
+        ch = z->p[z->c - 1];
+        if (ch > max || ch < min) return 1;
+        ch -= min;
+        if ((s[ch >> 3] & (1 << (ch & 7))) == 0) return 1;
+        z->c--;
+        if (!repeat) return 0;
+    }
+}
+
+/* Fixed-width symbols: move the cursor forward over symbols that are NOT
+ * in the grouping; once if repeat is 0, otherwise as many as possible.
+ * Returns 0, 1 if a symbol in the grouping stopped the scan, or -1 if the
+ * limit was reached.
+ */
+extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        if (z->c >= z->l) return -1;
+        ch = z->p[z->c];
+        if (ch <= max && ch >= min) {
+            ch -= min;
+            if ((s[ch >> 3] & (1 << (ch & 7))) != 0) return 1;
+        }
+        z->c++;
+        if (!repeat) return 0;
+    }
+}
+
+/* Fixed-width symbols, backward: move the cursor back over symbols that
+ * are NOT in the grouping; once if repeat is 0, otherwise as many as
+ * possible.
+ * Returns 0, 1 if a symbol in the grouping stopped the scan, or -1 if the
+ * backward limit was reached.
+ */
+extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
+    for (;;) {
+        int ch;
+        if (z->c <= z->lb) return -1;
+        ch = z->p[z->c - 1];
+        if (ch <= max && ch >= min) {
+            ch -= min;
+            if ((s[ch >> 3] & (1 << (ch & 7))) != 0) return 1;
+        }
+        z->c--;
+        if (!repeat) return 0;
+    }
+}
+
+/* If the s_size symbols at s occur at the cursor, move the cursor past
+ * them and return 1; otherwise return 0 and leave the cursor alone.
+ */
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
+    const int available = z->l - z->c;
+    if (available < s_size) return 0;
+    if (memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
+    z->c += s_size;
+    return 1;
+}
+
+/* If the s_size symbols at s occur immediately before the cursor, move
+ * the cursor back over them and return 1; otherwise return 0.
+ */
+extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
+    const int available = z->c - z->lb;
+    if (available < s_size) return 0;
+    if (memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
+    z->c -= s_size;
+    return 1;
+}
+
+/* eq_s() for a string buffer p, whose length is stored in its header. */
+extern int eq_v(struct SN_env * z, const symbol * p) {
+    const int size = SIZE(p);
+    return eq_s(z, size, p);
+}
+
+/* eq_s_b() for a string buffer p, whose length is stored in its header. */
+extern int eq_v_b(struct SN_env * z, const symbol * p) {
+    const int size = SIZE(p);
+    return eq_s_b(z, size, p);
+}
+
+/* Look up the text starting at the cursor in the table v (v_size entries)
+ * and return the result of the matching entry, or 0 if none matches. On a
+ * match the cursor is placed just after the matched string.
+ *
+ * The first loop is a binary search, so v is presumably sorted by search
+ * string (TODO confirm against the table generator). The second loop walks
+ * the substring_i chain from the entry found, accepting the first entry
+ * that is fully matched and whose function, if any, returns non-zero.
+ */
+extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
+
+ int i = 0;
+ int j = v_size;
+
+ int c = z->c; int l = z->l;
+ symbol * q = z->p + c;
+
+ const struct among * w;
+
+ /* number of leading symbols known to match v[i] and v[j] */
+ int common_i = 0;
+ int common_j = 0;
+
+ int first_key_inspected = 0;
+
+ while(1) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j; /* smaller */
+ w = v + k;
+ {
+ /* running out of input counts as "input sorts before w->s" */
+ int i2; for (i2 = common; i2 < w->s_size; i2++) {
+ if (c + common == l) { diff = -1; break; }
+ diff = q[common] - w->s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ }
+ if (diff < 0) { j = k; common_j = common; }
+ else { i = k; common_i = common; }
+ if (j - i <= 1) {
+ if (i > 0) break; /* v->s has been inspected */
+ if (j == i) break; /* only one item in v */
+
+ /* - but now we need to go round once more to get
+ v->s inspected. This looks messy, but is actually
+ the optimal approach. */
+
+ if (first_key_inspected) break;
+ first_key_inspected = 1;
+ }
+ }
+ while(1) {
+ w = v + i;
+ if (common_i >= w->s_size) {
+ z->c = c + w->s_size;
+ if (w->function == 0) return w->result;
+ {
+ int res = w->function(z);
+ /* the function may have moved the cursor; put it back */
+ z->c = c + w->s_size;
+ if (res) return w->result;
+ }
+ }
+ i = w->substring_i;
+ if (i < 0) return 0;
+ }
+}
+
+/* find_among_b is for backwards processing. Same comments apply */
+
+/* Backward counterpart of find_among(): compares the text ending at the
+ * cursor against the entries of v from their last symbol towards their
+ * first, stopping at z->lb. Returns the result of the matching entry, or 0
+ * if none matches. On a match the cursor is placed just before the matched
+ * string.
+ */
+extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
+
+ int i = 0;
+ int j = v_size;
+
+ int c = z->c; int lb = z->lb;
+ /* q addresses the symbol just before the cursor */
+ symbol * q = z->p + c - 1;
+
+ const struct among * w;
+
+ /* number of trailing symbols known to match v[i] and v[j] */
+ int common_i = 0;
+ int common_j = 0;
+
+ int first_key_inspected = 0;
+
+ while(1) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j;
+ w = v + k;
+ {
+ /* reaching the backward limit counts as "input sorts before w->s" */
+ int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
+ if (c - common == lb) { diff = -1; break; }
+ diff = q[- common] - w->s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ }
+ if (diff < 0) { j = k; common_j = common; }
+ else { i = k; common_i = common; }
+ if (j - i <= 1) {
+ if (i > 0) break;
+ if (j == i) break;
+ if (first_key_inspected) break;
+ first_key_inspected = 1;
+ }
+ }
+ while(1) {
+ w = v + i;
+ if (common_i >= w->s_size) {
+ z->c = c - w->s_size;
+ if (w->function == 0) return w->result;
+ {
+ int res = w->function(z);
+ /* the function may have moved the cursor; put it back */
+ z->c = c - w->s_size;
+ if (res) return w->result;
+ }
+ }
+ i = w->substring_i;
+ if (i < 0) return 0;
+ }
+}
+
+
+/* Grow the buffer p so that it can hold at least n symbols (20 symbols of
+ * slack are added). Returns the possibly relocated buffer with its
+ * capacity updated; the stored size is carried over unchanged.
+ * If memory runs out, the old buffer is freed and NULL is returned.
+ */
+static symbol * increase_size(symbol * p, int n) {
+    const int grown = n + 20;
+    char * block = (char *) realloc((char *) p - HEAD,
+                                    HEAD + (grown + 1) * sizeof(symbol));
+    symbol * resized;
+    if (block == NULL) {
+        lose_s(p);
+        return NULL;
+    }
+    resized = (symbol *) (block + HEAD);
+    CAPACITY(resized) = grown;
+    return resized;
+}
+
+/* to replace symbols between c_bra and c_ket in z->p by the
+ s_size symbols at s.
+ Returns 0 on success, -1 on error.
+ Also, frees z->p (and sets it to NULL) on error.
+
+ z->l is adjusted by the change in length. A cursor at or beyond c_ket is
+ shifted by the same amount; a cursor strictly inside the replaced region
+ is moved to c_bra. If adjptr is not NULL it receives the change in length.
+*/
+extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr)
+{
+ int adjustment;
+ int len;
+ /* z->p is NULL after an earlier allocation failure; start afresh */
+ if (z->p == NULL) {
+ z->p = create_s();
+ if (z->p == NULL) return -1;
+ }
+ adjustment = s_size - (c_ket - c_bra);
+ len = SIZE(z->p);
+ if (adjustment != 0) {
+ if (adjustment + len > CAPACITY(z->p)) {
+ /* increase_size frees the old buffer on failure, so z->p
+    becomes NULL rather than dangling */
+ z->p = increase_size(z->p, adjustment + len);
+ if (z->p == NULL) return -1;
+ }
+ /* shift the tail after c_ket to its new position */
+ memmove(z->p + c_ket + adjustment,
+ z->p + c_ket,
+ (len - c_ket) * sizeof(symbol));
+ SET_SIZE(z->p, adjustment + len);
+ z->l += adjustment;
+ if (z->c >= c_ket)
+ z->c += adjustment;
+ else
+ if (z->c > c_bra)
+ z->c = c_bra;
+ }
+ /* copy the replacement into the gap */
+ unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
+ if (adjptr != NULL)
+ *adjptr = adjustment;
+ return 0;
+}
+
+/* Validate the slice markers before a slice operation:
+ * 0 <= bra <= ket <= l, a non-NULL buffer, and l <= SIZE(p).
+ * Returns 0 if they are consistent, -1 otherwise.
+ */
+static int slice_check(struct SN_env * z) {
+    const int consistent = z->bra >= 0
+        && z->bra <= z->ket
+        && z->ket <= z->l
+        && z->p != NULL
+        && z->l <= SIZE(z->p);   /* this clause could be removed */
+    return consistent ? 0 : -1;
+}
+
+/* Replace the current slice [bra, ket) with the s_size symbols at s.
+ * Returns 0 on success, -1 if the slice is invalid or memory runs out.
+ */
+extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) {
+    if (slice_check(z) != 0) {
+        return -1;
+    }
+    return replace_s(z, z->bra, z->ket, s_size, s, NULL);
+}
+
+/* slice_from_s() for a string buffer p, whose length is in its header. */
+extern int slice_from_v(struct SN_env * z, const symbol * p) {
+    const int size = SIZE(p);
+    return slice_from_s(z, size, p);
+}
+
+/* Delete the current slice by replacing it with an empty string. */
+extern int slice_del(struct SN_env * z) {
+    return slice_from_s(z, 0, NULL);
+}
+
+/* Replace [bra, ket) of the current string with the s_size symbols at s,
+ * then shift the slice markers z->bra / z->ket that lie at or after bra
+ * by the change in length. Returns 0 on success, -1 on error.
+ */
+extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) {
+    int delta;
+    if (replace_s(z, bra, ket, s_size, s, &delta) != 0) {
+        return -1;
+    }
+    if (z->bra >= bra) z->bra += delta;
+    if (z->ket >= bra) z->ket += delta;
+    return 0;
+}
+
+/* insert_s() for a string buffer p, whose length is in its header. */
+extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
+    return insert_s(z, bra, ket, SIZE(p), p);
+}
+
+/* Copy the current slice [bra, ket) into the buffer p, growing p if
+ * needed, and return the (possibly relocated) buffer.
+ * On an invalid slice or allocation failure p is freed and NULL returned,
+ * so the caller must always store the result back.
+ */
+extern symbol * slice_to(struct SN_env * z, symbol * p) {
+    int len;
+    if (slice_check(z) != 0) {
+        lose_s(p);
+        return NULL;
+    }
+    len = z->ket - z->bra;
+    if (len > CAPACITY(p)) {
+        p = increase_size(p, len);
+        if (p == NULL) return NULL;
+    }
+    memmove(p, z->p + z->bra, len * sizeof(symbol));
+    SET_SIZE(p, len);
+    return p;
+}
+
+/* Copy the current string (positions 0 .. z->l) into the buffer p, growing
+ * p if needed, and return the (possibly relocated) buffer.
+ * On allocation failure p has been freed and NULL is returned.
+ */
+extern symbol * assign_to(struct SN_env * z, symbol * p) {
+    const int len = z->l;
+    if (len > CAPACITY(p)) {
+        p = increase_size(p, len);
+        if (p == NULL) return NULL;
+    }
+    memmove(p, z->p, len * sizeof(symbol));
+    SET_SIZE(p, len);
+    return p;
+}
+
+#if 0
+/* Debugging aid (compiled out by the surrounding #if 0): print the current
+ * string with markers at the positions of lb '{', bra '[', cursor '|',
+ * ket ']' and l '}'. Zero symbols are shown as '#'. When number is
+ * non-negative the line is prefixed with number, line_count and the
+ * buffer size.
+ */
+extern void debug(struct SN_env * z, int number, int line_count) {
+ int i;
+ int limit = SIZE(z->p);
+ /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/
+ if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
+ /* i runs one past the last symbol so markers at the very end are shown */
+ for (i = 0; i <= limit; i++) {
+ if (z->lb == i) printf("{");
+ if (z->bra == i) printf("[");
+ if (z->c == i) printf("|");
+ if (z->ket == i) printf("]");
+ if (z->l == i) printf("}");
+ if (i < limit)
+ { int ch = z->p[i];
+ if (ch == 0) ch = '#';
+ printf("%c", ch);
+ }
+ }
+ printf("'\n");
+}
+#endif