aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/snowball
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2020-02-25 09:55:31 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2020-02-25 09:55:31 +0000
commitb87995255fa2ef0de97d509b8cd27860f014e90f (patch)
treeff7fcc84aa85fcd4cd129d94f6fb23ac5f91d4cb /contrib/snowball
parent52154a6c1dd7e46c174d4aab782494b92f955df5 (diff)
downloadrspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.tar.gz
rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.zip
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Diffstat (limited to 'contrib/snowball')
-rw-r--r--contrib/snowball/CMakeLists.txt35
-rw-r--r--contrib/snowball/GNUmakefile300
-rw-r--r--contrib/snowball/NEWS407
-rw-r--r--contrib/snowball/algorithms/arabic.sbl561
-rw-r--r--contrib/snowball/algorithms/basque.sbl149
-rw-r--r--contrib/snowball/algorithms/catalan.sbl202
-rw-r--r--contrib/snowball/algorithms/danish.sbl (renamed from contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl)14
-rw-r--r--contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl91
-rw-r--r--contrib/snowball/algorithms/dutch.sbl (renamed from contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl)24
-rw-r--r--contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl164
-rw-r--r--contrib/snowball/algorithms/english.sbl (renamed from contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl)0
-rw-r--r--contrib/snowball/algorithms/finnish.sbl (renamed from contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl)15
-rw-r--r--contrib/snowball/algorithms/french.sbl (renamed from contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl)42
-rw-r--r--contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl239
-rw-r--r--contrib/snowball/algorithms/german.sbl (renamed from contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl)10
-rw-r--r--contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl139
-rw-r--r--contrib/snowball/algorithms/german2.sbl (renamed from contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl)10
-rw-r--r--contrib/snowball/algorithms/greek.sbl706
-rw-r--r--contrib/snowball/algorithms/hindi.sbl323
-rw-r--r--contrib/snowball/algorithms/hungarian.sbl (renamed from contrib/snowball/algorithms/hungarian/stem_Unicode.sbl)22
-rw-r--r--contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl241
-rw-r--r--contrib/snowball/algorithms/indonesian.sbl192
-rw-r--r--contrib/snowball/algorithms/irish.sbl151
-rw-r--r--contrib/snowball/algorithms/italian.sbl (renamed from contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl)24
-rw-r--r--contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl195
-rw-r--r--contrib/snowball/algorithms/kraaij_pohlmann.sbl (renamed from contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl)31
-rw-r--r--contrib/snowball/algorithms/lithuanian.sbl373
-rw-r--r--contrib/snowball/algorithms/lovins.sbl (renamed from contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl)0
-rw-r--r--contrib/snowball/algorithms/nepali.sbl92
-rw-r--r--contrib/snowball/algorithms/norwegian.sbl (renamed from contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl)8
-rw-r--r--contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl80
-rw-r--r--contrib/snowball/algorithms/porter.sbl (renamed from contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl)0
-rw-r--r--contrib/snowball/algorithms/portuguese.sbl (renamed from contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl)30
-rw-r--r--contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl218
-rw-r--r--contrib/snowball/algorithms/romanian.sbl (renamed from contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl)10
-rw-r--r--contrib/snowball/algorithms/romanian/stem_Unicode.sbl236
-rw-r--r--contrib/snowball/algorithms/russian.sbl (renamed from contrib/snowball/algorithms/russian/stem_KOI8_R.sbl)76
-rw-r--r--contrib/snowball/algorithms/russian/stem_Unicode.sbl215
-rw-r--r--contrib/snowball/algorithms/serbian.sbl2378
-rw-r--r--contrib/snowball/algorithms/spanish.sbl (renamed from contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl)18
-rw-r--r--contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl230
-rw-r--r--contrib/snowball/algorithms/swedish.sbl (renamed from contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl)8
-rw-r--r--contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl72
-rw-r--r--contrib/snowball/algorithms/tamil.sbl405
-rw-r--r--contrib/snowball/algorithms/turkish.sbl (renamed from contrib/snowball/algorithms/turkish/stem_Unicode.sbl)201
-rw-r--r--contrib/snowball/charsets/ISO-8859-2.sbl98
-rw-r--r--contrib/snowball/charsets/KOI8-R.sbl74
-rw-r--r--contrib/snowball/charsets/cp850.sbl130
-rw-r--r--contrib/snowball/compiler/analyser.c866
-rw-r--r--contrib/snowball/compiler/driver.c466
-rw-r--r--contrib/snowball/compiler/generator.c1158
-rw-r--r--contrib/snowball/compiler/generator_java.c1452
-rw-r--r--contrib/snowball/compiler/header.h215
-rw-r--r--contrib/snowball/compiler/space.c72
-rw-r--r--contrib/snowball/compiler/syswords.h6
-rw-r--r--contrib/snowball/compiler/syswords2.h4
-rw-r--r--contrib/snowball/compiler/tokeniser.c407
-rw-r--r--contrib/snowball/examples/stemwords.c209
-rw-r--r--contrib/snowball/include/libstemmer.h9
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/Among.java31
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java432
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java7
-rw-r--r--contrib/snowball/java/org/tartarus/snowball/TestApp.java77
-rw-r--r--contrib/snowball/libstemmer/libstemmer_c.in15
-rwxr-xr-xcontrib/snowball/libstemmer/mkmodules.pl25
-rw-r--r--contrib/snowball/libstemmer/modules.txt44
-rw-r--r--contrib/snowball/runtime/api.c10
-rw-r--r--contrib/snowball/runtime/api.h10
-rw-r--r--contrib/snowball/runtime/header.h3
-rw-r--r--contrib/snowball/runtime/utilities.c191
70 files changed, 8903 insertions, 6045 deletions
diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt
index 7910c7b3a..015f75d1b 100644
--- a/contrib/snowball/CMakeLists.txt
+++ b/contrib/snowball/CMakeLists.txt
@@ -1,20 +1,14 @@
# End of configuration
-SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian
- italian norwegian porter portuguese romanian
- russian spanish swedish turkish)
-SET(KOI8_ALGORITHMS russian)
-SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian
- norwegian porter portuguese spanish swedish)
-SET(ISO_8859_2_ALGORITHMS hungarian romanian)
-SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins)
-SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS})
+SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian
+ indonesian italian lithuanian nepali norwegian porter portuguese romanian
+ russian serbian spanish swedish tamil turkish)
+SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS})
SET(COMPILER_SOURCES compiler/space.c
compiler/tokeniser.c
compiler/analyser.c
compiler/generator.c
- compiler/driver.c
- compiler/generator_java.c)
+ compiler/driver.c)
SET(SNOWBALL_RUNTIME runtime/api.c
runtime/utilities.c)
@@ -24,9 +18,15 @@ SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c)
#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
-SET(STEMWORDS_SOURCES examples/stemwords.c)
SET(MODULES_H "modules.h")
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY)
+ADD_DEFINITIONS("-DDISABLE_JS")
+ADD_DEFINITIONS("-DDISABLE_GO")
+ADD_DEFINITIONS("-DDISABLE_JAVA")
+ADD_DEFINITIONS("-DDISABLE_PYTHON")
+ADD_DEFINITIONS("-DDISABLE_CSHARP")
+ADD_DEFINITIONS("-DDISABLE_PASCAL")
+ADD_DEFINITIONS("-DDISABLE_RUST")
MACRO(gen_stem IN ENCODING)
FOREACH(_it ${IN})
@@ -34,7 +34,7 @@ MACRO(gen_stem IN ENCODING)
SET(_header "${_base}.h")
SET(_source "${_base}.c")
STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}")
- SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl")
+ SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl")
IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input})
ADD_CUSTOM_COMMAND(OUTPUT ${_source}
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u
@@ -57,7 +57,7 @@ INCLUDE_DIRECTORIES("include")
ADD_EXECUTABLE(snowball ${COMPILER_SOURCES})
ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h
- COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt libstemmer/mkinc.mak)
+ COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak)
ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h")
SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c")
@@ -65,13 +65,6 @@ ADD_CUSTOM_TARGET(stemmer_deps ALL)
ADD_DEPENDENCIES(stemmer_deps modules)
gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8")
-gen_stem("${KOI8_ALGORITHMS}" "KOI8_R")
-gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1")
-gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2")
-
ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES})
ADD_DEPENDENCIES(stemmer stemmer_deps)
-
-ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES})
-TARGET_LINK_LIBRARIES(stemwords stemmer)
diff --git a/contrib/snowball/GNUmakefile b/contrib/snowball/GNUmakefile
deleted file mode 100644
index a30cafd89..000000000
--- a/contrib/snowball/GNUmakefile
+++ /dev/null
@@ -1,300 +0,0 @@
-# -*- makefile -*-
-
-c_src_dir = src_c
-java_src_main_dir = java/org/tartarus/snowball
-java_src_dir = $(java_src_main_dir)/ext
-
-libstemmer_algorithms = danish dutch english finnish french german hungarian \
- italian \
- norwegian porter portuguese romanian \
- russian spanish swedish turkish
-
-KOI8_R_algorithms = russian
-ISO_8859_1_algorithms = danish dutch english finnish french german italian \
- norwegian porter portuguese spanish swedish
-ISO_8859_2_algorithms = hungarian romanian
-
-other_algorithms = german2 kraaij_pohlmann lovins
-
-all_algorithms = $(libstemmer_algorithms) $(other_algorithms)
-
-COMPILER_SOURCES = compiler/space.c \
- compiler/tokeniser.c \
- compiler/analyser.c \
- compiler/generator.c \
- compiler/driver.c \
- compiler/generator_java.c
-COMPILER_HEADERS = compiler/header.h \
- compiler/syswords.h \
- compiler/syswords2.h
-
-RUNTIME_SOURCES = runtime/api.c \
- runtime/utilities.c
-RUNTIME_HEADERS = runtime/api.h \
- runtime/header.h
-
-JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
- java/org/tartarus/snowball/SnowballProgram.java \
- java/org/tartarus/snowball/SnowballStemmer.java \
- java/org/tartarus/snowball/TestApp.java
-
-LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
-LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
-LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
-LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in
-
-STEMWORDS_SOURCES = examples/stemwords.c
-
-ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
-C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
- $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
- $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \
- $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c)
-C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
- $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \
- $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \
- $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h)
-C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
-C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
-JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)
-
-COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
-RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
-LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o)
-LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o)
-STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o)
-C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o)
-C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
-JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
-JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)
-
-CFLAGS=-Iinclude -O2
-CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations
-
-all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS)
-
-clean:
- rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
- $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \
- libstemmer.o stemwords \
- libstemmer/modules.h \
- libstemmer/modules_utf8.h \
- snowball.splint \
- $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
- $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
- $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
- libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
- libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
- rm -rf dist
- rmdir $(c_src_dir) || true
-
-snowball: $(COMPILER_OBJECTS)
- $(CC) -o $@ $^
-
-$(COMPILER_OBJECTS): $(COMPILER_HEADERS)
-
-libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
- sed 's/@MODULES_H@/modules.h/' $^ >$@
-
-libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
- sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@
-
-libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt
- libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak
-
-libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt
- libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8
-
-libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)
-
-libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
- $(AR) -cru $@ $^
-
-stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
- $(CC) -o $@ $^
-
-algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
- cp $^ $@
-
-$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_UTF_8_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
- ./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u
-
-$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
- ./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime
-
-$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
- ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime
-
-$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball
- @mkdir -p $(c_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \
- o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
- echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
- ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime
-
-$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
- $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<
-
-$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
- @mkdir -p $(java_src_dir)
- @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
- o="$(java_src_dir)/$${l}Stemmer"; \
- echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
- ./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer
-
-splint: snowball.splint
-snowball.splint: $(COMPILER_SOURCES)
- splint $^ >$@ -weak
-
-# Make a full source distribution
-dist: dist_snowball dist_libstemmer_c dist_libstemmer_java
-
-# Make a distribution of all the sources involved in snowball
-dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
- $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
- $(LIBSTEMMER_SOURCES) \
- $(LIBSTEMMER_UTF8_SOURCES) \
- $(LIBSTEMMER_HEADERS) \
- $(LIBSTEMMER_EXTRA) \
- $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \
- GNUmakefile README doc/TODO libstemmer/mkmodules.pl
- destname=snowball_code; \
- dest=dist/$${destname}; \
- rm -rf $${dest} && \
- rm -f $${dest}.tgz && \
- for file in $^; do \
- dir=`dirname $$file` && \
- mkdir -p $${dest}/$${dir} && \
- cp -a $${file} $${dest}/$${dir} || exit 1 ; \
- done && \
- (cd dist && tar zcf $${destname}.tgz $${destname}) && \
- rm -rf $${dest}
-
-# Make a distribution of all the sources required to compile the C library.
-dist_libstemmer_c: \
- $(RUNTIME_SOURCES) \
- $(RUNTIME_HEADERS) \
- $(LIBSTEMMER_SOURCES) \
- $(LIBSTEMMER_UTF8_SOURCES) \
- $(LIBSTEMMER_HEADERS) \
- $(LIBSTEMMER_EXTRA) \
- $(C_LIB_SOURCES) \
- $(C_LIB_HEADERS) \
- libstemmer/mkinc.mak \
- libstemmer/mkinc_utf8.mak
- destname=libstemmer_c; \
- dest=dist/$${destname}; \
- rm -rf $${dest} && \
- rm -f $${dest}.tgz && \
- mkdir -p $${dest} && \
- cp -a doc/libstemmer_c_README $${dest}/README && \
- mkdir -p $${dest}/examples && \
- cp -a examples/stemwords.c $${dest}/examples && \
- mkdir -p $${dest}/$(c_src_dir) && \
- cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
- mkdir -p $${dest}/runtime && \
- cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
- mkdir -p $${dest}/libstemmer && \
- cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
- mkdir -p $${dest}/include && \
- mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
- (cd $${dest} && \
- echo "README" >> MANIFEST && \
- ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
- ls runtime/*.c runtime/*.h >> MANIFEST && \
- ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
- ls include/*.h >> MANIFEST) && \
- cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
- echo 'include mkinc.mak' >> $${dest}/Makefile && \
- echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \
- echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \
- echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
- echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
- echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \
- echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \
- echo 'clean:' >> $${dest}/Makefile && \
- echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
- (cd dist && tar zcf $${destname}.tgz $${destname}) && \
- rm -rf $${dest}
-
-# Make a distribution of all the sources required to compile the Java library.
-dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
- $(LIBSTEMMER_EXTRA) \
- $(JAVA_SOURCES)
- destname=libstemmer_java; \
- dest=dist/$${destname}; \
- rm -rf $${dest} && \
- rm -f $${dest}.tgz && \
- mkdir -p $${dest} && \
- cp -a doc/libstemmer_java_README $${dest}/README && \
- mkdir -p $${dest}/$(java_src_dir) && \
- cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
- mkdir -p $${dest}/$(java_src_main_dir) && \
- cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
- (cd $${dest} && \
- echo "README" >> MANIFEST && \
- ls $(java_src_dir)/*.java >> MANIFEST && \
- ls $(java_src_main_dir)/*.java >> MANIFEST) && \
- (cd dist && tar zcf $${destname}.tgz $${destname}) && \
- rm -rf $${dest}
-
-check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r
-
-check_utf8: $(libstemmer_algorithms:%=check_utf8_%)
-
-check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)
-
-check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)
-
-check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)
-
-# Where the data files are located - assumed their repo is checked out as
-# a sibling to this one.
-STEMMING_DATA = ../snowball-data
-
-check_utf8_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
- @./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt
- @diff -u $</output.txt tmp.txt
- @if [ -e $</diffs.txt ] ; \
- then \
- ./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \
- diff -u $</diffs.txt tmp.txt; \
- fi
- @rm tmp.txt
-
-check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
- @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
- ./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
- @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
- diff -u - tmp.txt
- @rm tmp.txt
-
-check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
- @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
- ./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
- @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
- diff -u - tmp.txt
- @rm tmp.txt
-
-check_koi8r_%: $(STEMMING_DATA)/% stemwords
- @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
- @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
- ./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
- @python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
- diff -u - tmp.txt
- @rm tmp.txt
diff --git a/contrib/snowball/NEWS b/contrib/snowball/NEWS
new file mode 100644
index 000000000..c71c12dd3
--- /dev/null
+++ b/contrib/snowball/NEWS
@@ -0,0 +1,407 @@
+Snowball 2.0.0 (2019-10-02)
+===========================
+
+C/C++
+-----
+
+* Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled
+ sequences of any length, but commands which look at the character value only
+ handled sequences up to length 3. Fixes #89.
+
+* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`.
+
+Java
+----
+
+* TestApp.java:
+
+ - Always use UTF-8 for I/O. Patch from David Corbett (#80).
+
+ - Allow reading input from stdin.
+
+ - Remove rather pointless "stem n times" feature.
+
+ - Only lower case ASCII to match stemwords.c.
+
+ - Stem empty lines too to match stemwords.c.
+
+Code Quality Improvements
+-------------------------
+
+* Fix various warnings from newer compilers.
+
+* Improve use of `const`.
+
+* Share common functions between compiler backends rather than having multiple
+ copies of the same code.
+
+* Assorted code clean-up.
+
+* Initialise line_labelled member of struct generator to 0. Previously we were
+ invoking undefined behaviour, though in practice it'll be zero initialised on
+ most platforms.
+
+New Code Generators
+-------------------
+
+* Add Python generator (#24). Originally written by Yoshiki Shibukawa, with
+ additional updates by Dmitry Shachnev.
+
+* Add Javascript generator. Based on JSX generator (#26) written by Yoshiki
+ Shibukawa.
+
+* Add Rust generator from Jakob Demler (#51).
+
+* Add Go generator from Marty Schoch (#57).
+
+* Add C# generator. Based on patch from Cesar Souza (#16, #17).
+
+* Add Pascal generator. Based on Delphi backend from stemming.zip file on old
+ website (#75).
+
+New Language Features
+---------------------
+
+* Add `len` and `lenof` to measure Unicode length. These are similar to `size`
+ and `sizeof` (respectively), but `size` and `sizeof` return the length in
+ bytes under `-utf8`, whereas these new commands give the same result whether
+ using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in
+ the length of the string). For compatibility with existing code which might
+ use these as variable or function names, they stop being treated as tokens if
+ declared to be a variable or function.
+
+* New `{U+1234}` stringdef notation for Unicode codepoints.
+
+* More versatile integer tests. Now you can compare any two arithmetic
+ expressions with a relational operator in parentheses after the `$`, so for
+ example `$(len > 3)` can now be used when previously a temporary variable was
+ required: `$tmp = len $tmp > 3`
+
+Code generation improvements
+----------------------------
+
+* General:
+
+ + Avoid unnecessary saving and restoring of the cursor for more commands -
+ `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always
+ restore its value, and for C `booltest` (which other languages already
+ handled).
+
+ + Special case handling for `setlimit tomark AE`. All uses of setlimit in
+ the current stemmers we ship follow this pattern, and by special-casing we
+ can avoid having to save and restore the cursor (#74).
+
+ + Merge duplicate actions in the same `among`. This reduces the size of the
+ switch/if-chain in the generated code which dispatch the among for many of
+ the stemmers.
+
+ + Generate simpler code for `among`. We always check for a zero return value
+ when we call the among, so there's no point also checking for that in the
+ switch/if-chain. We can also avoid the switch/if-chain entirely when
+ there's only one possible outcome (besides the zero return).
+
+ + Optimise code generated for `do <function call>`. This speeds up "make
+ check_python" by about 2%, and should speed up other interpreted languages
+ too (#110).
+
+ + Generate more and better comments referencing snowball source.
+
+ + Add homepage URL and compiler version as comments in generated files.
+
+* C/C++:
+
+ + Fix `size` and `sizeof` to not report one too high (reported by Assem
+ Chelli in #32).
+
+ + If signal `f` from a function call would lead to return from the current
+ function then handle this and bailing out on an error together with a
+ simple `if (ret <= 0) return ret;`
+
+ + Inline testing for single character literals.
+
+ + Avoid generating `|| 0` in corner case - this can result in a compiler
+ warning when building the generated code.
+
+ + Implement `insert_v()` in terms of `insert_s()`.
+
+ + Add conditional `extern "C"` so `runtime/api.h` can be included from C++
+ code. Closes #90, reported by vvarma.
+
+* Java:
+
+ + Fix functions in `among` to work in Java. We seem to need to make the
+ methods called from among `public` instead of `private`, and to call them
+ on `this` instead of the `methodObject` (which is cleaner anyway). No
+ revision in version control seems to generate working code for this case,
+ but Richard says it definitely used to work - possibly older JVMs failed to
+ correctly enforce the access controls when methods were invoked by
+ reflection.
+
+ + Code after handling `f` by returning from the current function is
+ unreachable too.
+
+ + Previously we incorrectly decided that code after an `or` was
+ unreachable in certain cases. None of the current stemmers in the
+ distribution triggered this, but Martin Porter's snowball version
+ of the Schinke Latin stemmer does. Fixes #58, reported by Alexander
+ Myltsev.
+
+ + The reachability logic was failing to consider reachability from
+ the final command in an `or`. Fixes #82, reported by David Corbett.
+
+ + Fix `maxint` and `minint`. Patch from David Corbett in #31.
+
+ + Fix `$` on strings. The previous generated code was just wrong. This
+ doesn't affect any of the included algorithms, but for example breaks
+ Martin Porter's snowball implementation of Schinke's Latin Stemmer.
+ Issue noted by Jakob Demler while working on the Rust backend in #51,
+ and reported in the Schinke's Latin Stemmer by Alexander Myltsev
+ in #58.
+
+ + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43.
+
+ + Eliminate range-check implementation for groupings. This was removed from
+ the C generator 10 years earlier, isn't used for any of the existing
+ algorithms, and it doesn't seem likely it would be - the grouping would
+ have to consist entirely of a contiguous block of Unicode code-points.
+
+ + Simplify code generated for `repeat` and `atleast`.
+
+ + Eliminate unused return values and variables from runtime functions.
+
+ + Only import the `among` and `SnowballProgram` classes if they're actually
+ used.
+
+ + Only generate `copy_from()` method if it's used.
+
+ + Merge runtime functions `eq_s` and `eq_v` functions.
+
+ + Java arrays know their own length so stop storing it separately.
+
+ + Escape char 127 (DEL) in generated Java code. It's unlikely that this
+ character would actually be used in a real stemmer, so this was more of a
+ theoretical bug.
+
+ + Drop unused import of InvocationTargetException from SnowballStemmer.
+ Reported by GerritDeMeulder in #72.
+
+ + Fix lint check issues in generated Java code. The stemmer classes are only
+ referenced in the example app via reflection, so add
+ @SuppressWarnings("unused") for them. The stemmer classes override
+ equals() and hashCode() methods from the standard java Object class, so
+ mark these with @Override. Both suggested by GerritDeMeulder in #72.
+
+ + Declare Java variables at point of use in generated code. Putting all
+ declarations at the top of the function was adding unnecessary complexity
+ to the Java generator code for no benefit.
+
+ + Improve formatting of generated code.
+
+New stemming algorithms
+-----------------------
+
+* Add Tamil stemmer from Damodharan Rajalingam (#2, #3).
+
+* Add Arabic stemmer from Assem Chelli (#32, #50).
+
+* Add Irish stemmer from Jim O'Regan (#48).
+
+* Add Nepali stemmer from Arthur Zakirov (#70).
+
+* Add Indonesian stemmer from Olly Betts (#71).
+
+* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review.
+
+* Add Lithuanian stemmer from Dainius Jocas (#22, #76).
+
+* Add Greek stemmer from Oleg Smirnov (#44).
+
+* Add Catalan and Basque stemmers from Israel Olalla (#104).
+
+Behavioural changes to existing algorithms
+------------------------------------------
+
+* Portuguese:
+
+ + Replace incorrect Spanish suffixes by Portuguese suffixes (#1).
+
+* French:
+
+ + The MSDOS CP850 version of the French algorithm was missing changes present
+ in the ISO8859-1 and Unicode versions. There's now a single version of
+ each algorithm which was based on the Unicode version.
+
+ + Recognize French suffixes even when they begin with diaereses. Patch from
+ David Corbett in #78.
+
+* Russian:
+
+ + We now normalise 'ё' to 'е' before stemming. The documentation has long
+ said "we assume ['ё'] is mapped into ['е']" but it's more convenient for
+ the stemmer to actually perform this normalisation. This change has no
+ effect if the caller is already normalising as we recommend. It's a change
+ in behaviour if they aren't, but 'ё' occurs rarely (there are currently no
+ instances in our test vocabulary) and this improves behaviour when it does
+ occur. Patch from Eugene Mirotin (#65, #68).
+
+* Finnish:
+
+ + Adjust the Finnish algorithm not to mangle numbers. This change also
+ means it tends to leave foreign words alone. Fixes #66.
+
+* Danish:
+
+ + Adjust Danish algorithm not to mangle alphanumeric codes. In particular
+ alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000,
+ space1999) are no longer mangled. See #81.
+
+Optimisations to existing algorithms
+------------------------------------
+
+* Turkish:
+
+ + Simplify uses of `test` in stemmer code.
+
+ + Check for 'ad' or 'soyad' more efficiently, and without needing the
+ strlen variable. This speeds up "make check_utf8_turkish" by 11%
+ on x86 Linux.
+
+* Kraaij-Pohlmann:
+
+ + Eliminate variable x - `$p1 <= cursor` is simpler and a little more efficient
+ than `setmark x $x >= p1`.
+
+Code clarity improvements to existing algorithms
+------------------------------------------------
+
+* Turkish:
+
+ + Use `,` for cedilla to match the conventions used in other stemmers.
+
+* Kraaij-Pohlmann:
+
+ + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same
+ `[substring] among (` ... `)` construct we do in other stemmers.
+
+Compiler
+--------
+
+* Support conventional --help and --version options.
+
+* Warn if -r or -ep used with backend other than C/C++.
+
+* Warn if encoding command line options are specified when generating code in a
+ language with a fixed encoding.
+
+* The default classname is now set based on the output filename, so `-n` is now
+ often no longer needed. Fixes #64.
+
+* Avoid potential one byte buffer over-read when parsing snowball code.
+
+* Avoid comparing with uninitialised array element during compilation.
+
+* Improve `-syntax` output for `setlimit L for C`.
+
+* Optimise away double negation so generators don't have to worry about
+ generating `--` (decrement operator in many languages). Fixes #52, reported
+ by David Corbett.
+
+* Improved compiler error and warning messages:
+
+ - We now report FILE:LINE: before each diagnostic message.
+
+ - Improve warnings for unused declarations/definitions.
+
+ - Warn for variables which are used, but either never initialised
+ or never read.
+
+ - Flag non-ASCII literal strings. This is an error for wide Unicode, but
+ only a warning for single-byte and UTF-8 which work so long as the source
+ encoding matches the encoding used in the generated stemmer code.
+
+ - Improve error recovery after an undeclared `define`. We now sniff the
+ token after the identifier and if it is `as` we parse as a routine,
+ otherwise we parse as a grouping. Previously we always just assumed it was
+ a routine, which gave a confusing second error if it was a grouping.
+
+ - Improve error recovery after an unexpected token in `among`. Previously
+ we acted as if the unexpected token closed the `among` (this probably
+ wasn't intended but just a missing `break;` in a switch statement). Now we
+ issue an error and try the next token.
+
+* Report error instead of silently truncating character values (e.g. `hex 123`
+ previously silently became byte 0x23 which is `#` rather than a
+ g-with-cedilla).
+
+* Enlarge the initial input buffer size to 8192 bytes and double each time we
+ hit the end. Snowball programs are typically a few KB in size (with the
+ current largest we ship being the Greek stemmer at 27KB) so the previous
+ approach of starting with a 10 byte input buffer and increasing its size by
+ 50% plus 40 bytes each time it filled was inefficient, needing up to 15
+ reallocations to load greek.sbl.
+
+* Identify variables only used by one `routine`/`external`. This information
+ isn't yet used, but such variables which are also always written to before
+ being read can be emitted as local variables in most target languages.
+
+* We now allow multiple source files on command line, and allow them to be
+ after (or even interspersed with) options to better match modern Unix
+ conventions. Support for multiple source files allows specifying a single
+ byte character set mapping via a source file of `stringdef`.
+
+* Avoid infinite recursion in compiler when optimising a recursive snowball
+ function. Recursive functions aren't typical in snowball programs, but
+ the compiler shouldn't crash for any input, especially not a valid one.
+ We now simply limit how deep the compiler will recurse and make the
+ pessimistic assumption in the unlikely event we hit this limit.
+
+Build system:
+
+* `make clean` in C libstemmer_c distribution now removes `examples/*.o`.
+ (#59)
+
+* Fix all the places which previously had to have a list of stemmers to work
+ dynamically or be generated, so now only modules.txt needs updating to add
+ a new stemmer.
+
+* Add check_java make target which runs tests for java.
+
+* Support gzipped test data (the uncompressed arabic test data is too big for
+ github).
+
+* GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball
+ invocations for Java - these are only meaningful when generating C code.
+
+* Pass CFLAGS when linking which matches convention (e.g. automake does it) and
+ facilitates use of tools such as ASan. Fixes #84, reported by Thomas
+ Pointhuber.
+
+* Add CI builds with -std=c90 to check compiler and generated code are C90
+ (#54)
+
+libstemmer stuff:
+
+* Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords.
+
+* Add -O2 to CFLAGS.
+
+* Make generated tables of encodings and modules const.
+
+* Fix clang static analyzer memory leak warning (in practice this code path
+ can never actually be taken). Patch from Patrick O. Perry (#56)
+
+Documentation:
+
+* Added copyright and licensing details (#10).
+
+* Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian
+ and romanian are available in ISO_8859_2.
+
+* Remove documentation falsely claiming that libstemmer supports CP850
+ encoding.
+
+* CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and
+ new language backends.
+
+* Overhaul libstemmer_python_README. Most notably, replace the benchmark data
+ which was very out of date.
diff --git a/contrib/snowball/algorithms/arabic.sbl b/contrib/snowball/algorithms/arabic.sbl
new file mode 100644
index 000000000..d827ee7d8
--- /dev/null
+++ b/contrib/snowball/algorithms/arabic.sbl
@@ -0,0 +1,561 @@
+/*
+ * Authors:
+ * - Assem Chelli, < assem [dot] ch [at] gmail >
+ * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
+ *
+*/
+
+stringescapes { }
+
+/* the Arabic letters in Unicode */
+// Hamza
+stringdef o '{U+0621}' // Hamza
+stringdef ao '{U+0623}' // Hamza above Alef
+stringdef ao_ '{U+0625}' // Hamza below Alef
+stringdef a~ '{U+0622}' // Alef madda
+stringdef wo '{U+0624}' // Hamza above waw
+stringdef yo '{U+0626}' // Hamza above yeh
+
+// Letters
+stringdef a '{U+0627}' // Alef
+stringdef a_ '{U+0649}' // Alef Maksura
+stringdef b '{U+0628}' // Beh
+stringdef t_ '{U+0629}' // Teh_Marbuta
+stringdef t '{U+062A}' // Teh
+stringdef th '{U+062B}' // Theh
+stringdef j '{U+062C}' // Jeem
+stringdef h '{U+062D}' // Hah
+stringdef x '{U+062E}' // Khah
+stringdef d '{U+062F}' // Dal
+stringdef dz '{U+0630}' // Thal
+stringdef r '{U+0631}' // Reh
+stringdef z '{U+0632}' // Zain
+stringdef s '{U+0633}' // Seen
+stringdef sh '{U+0634}' // Sheen
+stringdef c '{U+0635}' // Sad
+stringdef dh '{U+0636}' // Dad
+stringdef tt '{U+0637}' // Tah
+stringdef zh '{U+0638}' // Zah
+stringdef i '{U+0639}' // Ain
+stringdef gh '{U+063A}' // Ghain
+stringdef f '{U+0641}' // Feh
+stringdef q '{U+0642}' // Qaf
+stringdef k '{U+0643}' // Kaf
+stringdef l '{U+0644}' // Lam
+stringdef m '{U+0645}' // Meem
+stringdef n '{U+0646}' // Noon
+stringdef e '{U+0647}' // Heh
+stringdef w '{U+0648}' // Waw
+stringdef y '{U+064A}' // Yeh
+
+// Diacritics
+stringdef aan '{U+064B}' // FatHatan
+stringdef uun '{U+064C}' // Dammatan
+stringdef iin '{U+064D}' // Kasratan
+stringdef aa '{U+064E}' // FatHa
+stringdef uu '{U+064F}' // Damma
+stringdef ii '{U+0650}' // Kasra
+stringdef oo '{U+0652}' // Sukun
+stringdef ~ '{U+0651}' // Shadda
+
+// Hindu–Arabic numerals
+stringdef 0 '{U+0660}'
+stringdef 1 '{U+0661}'
+stringdef 2 '{U+0662}'
+stringdef 3 '{U+0663}'
+stringdef 4 '{U+0664}'
+stringdef 5 '{U+0665}'
+stringdef 6 '{U+0666}'
+stringdef 7 '{U+0667}'
+stringdef 8 '{U+0668}'
+stringdef 9 '{U+0669}'
+
+
+// Kasheeda
+stringdef _ '{U+0640}' // Kasheeda, Tatweel
+
+// Shaped forms
+stringdef o1 '{U+FE80}' // HAMZA
+stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE
+stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE
+stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW
+stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW
+stringdef yo1 '{U+FE8B}' // YEH_HAMZA
+stringdef yo2 '{U+FE8C}' // YEH_HAMZA
+stringdef yo3 '{U+FE89}' // YEH_HAMZA
+stringdef yo4 '{U+FE8A}' // YEH_HAMZA
+stringdef a~1 '{U+FE81}' // ALEF_MADDA
+stringdef a~2 '{U+FE82}' // ALEF_MADDA
+stringdef wo1 '{U+FE85}' // WAW_HAMZA
+stringdef wo2 '{U+FE86}' // WAW_HAMZA
+stringdef a1 '{U+FE8D}' // ALEF
+stringdef a2 '{U+FE8E}' // ALEF
+stringdef b1 '{U+FE8F}' // BEH
+stringdef b2 '{U+FE90}' // BEH
+stringdef b3 '{U+FE91}' // BEH
+stringdef b4 '{U+FE92}' // BEH
+stringdef t_1 '{U+FE93}' // TEH_MARBUTA
+stringdef t_2 '{U+FE94}' // TEH_MARBUTA
+stringdef t1 '{U+FE97}' // TEH
+stringdef t2 '{U+FE98}' // TEH
+stringdef t3 '{U+FE95}' // TEH
+stringdef t4 '{U+FE96}' // TEH
+stringdef th1 '{U+FE9B}' // THEH
+stringdef th2 '{U+FE9C}' // THEH
+stringdef th3 '{U+FE9A}' // THEH
+stringdef th4 '{U+FE99}' // THEH
+stringdef j1 '{U+FE9F}' // JEEM
+stringdef j2 '{U+FEA0}' // JEEM
+stringdef j3 '{U+FE9D}' // JEEM
+stringdef j4 '{U+FE9E}' // JEEM
+stringdef h1 '{U+FEA3}' // HAH
+stringdef h2 '{U+FEA4}' // HAH
+stringdef h3 '{U+FEA1}' // HAH
+stringdef h4 '{U+FEA2}' // HAH
+stringdef x1 '{U+FEA7}' // KHAH
+stringdef x2 '{U+FEA8}' // KHAH
+stringdef x3 '{U+FEA5}' // KHAH
+stringdef x4 '{U+FEA6}' // KHAH
+stringdef d1 '{U+FEA9}' // DAL
+stringdef d2 '{U+FEAA}' // DAL
+stringdef dz1 '{U+FEAB}' // THAL
+stringdef dz2 '{U+FEAC}' // THAL
+stringdef r1 '{U+FEAD}' // REH
+stringdef r2 '{U+FEAE}' // REH
+stringdef z1 '{U+FEAF}' // ZAIN
+stringdef z2 '{U+FEB0}' // ZAIN
+stringdef s1 '{U+FEB3}' // SEEN
+stringdef s2 '{U+FEB4}' // SEEN
+stringdef s3 '{U+FEB1}' // SEEN
+stringdef s4 '{U+FEB2}' // SEEN
+stringdef sh1 '{U+FEB7}' // SHEEN
+stringdef sh2 '{U+FEB8}' // SHEEN
+stringdef sh3 '{U+FEB5}' // SHEEN
+stringdef sh4 '{U+FEB6}' // SHEEN
+stringdef c1 '{U+FEBB}' // SAD
+stringdef c2 '{U+FEBC}' // SAD
+stringdef c3 '{U+FEB9}' // SAD
+stringdef c4 '{U+FEBA}' // SAD
+stringdef dh1 '{U+FEBF}' // DAD
+stringdef dh2 '{U+FEC0}' // DAD
+stringdef dh3 '{U+FEBD}' // DAD
+stringdef dh4 '{U+FEBE}' // DAD
+stringdef tt1 '{U+FEC3}' // TAH
+stringdef tt2 '{U+FEC4}' // TAH
+stringdef tt3 '{U+FEC1}' // TAH
+stringdef tt4 '{U+FEC2}' // TAH
+stringdef zh1 '{U+FEC7}' // ZAH
+stringdef zh2 '{U+FEC8}' // ZAH
+stringdef zh3 '{U+FEC5}' // ZAH
+stringdef zh4 '{U+FEC6}' // ZAH
+stringdef i1 '{U+FECB}' // AIN
+stringdef i2 '{U+FECC}' // AIN
+stringdef i3 '{U+FEC9}' // AIN
+stringdef i4 '{U+FECA}' // AIN
+stringdef gh1 '{U+FECF}' // GHAIN
+stringdef gh2 '{U+FED0}' // GHAIN
+stringdef gh3 '{U+FECD}' // GHAIN
+stringdef gh4 '{U+FECE}' // GHAIN
+stringdef f1 '{U+FED3}' // FEH
+stringdef f2 '{U+FED4}' // FEH
+stringdef f3 '{U+FED1}' // FEH
+stringdef f4 '{U+FED2}' // FEH
+stringdef q1 '{U+FED7}' // QAF
+stringdef q2 '{U+FED8}' // QAF
+stringdef q3 '{U+FED5}' // QAF
+stringdef q4 '{U+FED6}' // QAF
+stringdef k1 '{U+FEDB}' // KAF
+stringdef k2 '{U+FEDC}' // KAF
+stringdef k3 '{U+FED9}' // KAF
+stringdef k4 '{U+FEDA}' // KAF
+stringdef l1 '{U+FEDF}' // LAM
+stringdef l2 '{U+FEE0}' // LAM
+stringdef l3 '{U+FEDD}' // LAM
+stringdef l4 '{U+FEDE}' // LAM
+stringdef m1 '{U+FEE3}' // MEEM
+stringdef m2 '{U+FEE4}' // MEEM
+stringdef m3 '{U+FEE1}' // MEEM
+stringdef m4 '{U+FEE2}' // MEEM
+stringdef n1 '{U+FEE7}' // NOON
+stringdef n2 '{U+FEE8}' // NOON
+stringdef n3 '{U+FEE5}' // NOON
+stringdef n4 '{U+FEE6}' // NOON
+stringdef e1 '{U+FEEB}' // HEH
+stringdef e2 '{U+FEEC}' // HEH
+stringdef e3 '{U+FEE9}' // HEH
+stringdef e4 '{U+FEEA}' // HEH
+stringdef w1 '{U+FEED}' // WAW
+stringdef w2 '{U+FEEE}' // WAW
+stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA
+stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA
+stringdef y1 '{U+FEF3}' // YEH
+stringdef y2 '{U+FEF4}' // YEH
+stringdef y3 '{U+FEF1}' // YEH
+stringdef y4 '{U+FEF2}' // YEH
+
+// Ligatures Lam-Alef
+stringdef la '{U+FEFB}' // LAM_ALEF
+stringdef la2 '{U+FEFC}' // LAM_ALEF
+stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE
+stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE
+stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW
+stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW
+stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE
+stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE
+
+
+booleans (
+ is_noun
+ is_verb
+ is_defined
+ )
+
+routines (
+ Prefix_Step1
+ Prefix_Step2
+ Prefix_Step3a_Noun
+ Prefix_Step3b_Noun
+ Prefix_Step3_Verb
+ Prefix_Step4_Verb
+
+ Suffix_All_alef_maqsura
+ Suffix_Noun_Step1a
+ Suffix_Noun_Step1b
+ Suffix_Noun_Step2a
+ Suffix_Noun_Step2b
+ Suffix_Noun_Step2c1
+ Suffix_Noun_Step2c2
+ Suffix_Noun_Step3
+ Suffix_Verb_Step1
+ Suffix_Verb_Step2a
+ Suffix_Verb_Step2b
+ Suffix_Verb_Step2c
+
+ Normalize_post
+ Normalize_pre
+
+ Checks1
+)
+
+externals ( stem )
+
+groupings ( )
+
+
+// Normalizations
+// Normalize_pre: scan the word from left to right and canonicalise it before
+// any affix is examined. At each position the longest matching entry of the
+// table below is rewritten; when nothing matches, `next` advances one
+// character. The `repeat` ends when `next` fails at the end of the word, and
+// the surrounding `do` restores the cursor and makes the routine always
+// succeed.
+define Normalize_pre as (
+ do repeat (
+ (
+ [substring] among (
+ '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization
+ '{_}' ( delete ) // strip kasheeda
+
+ // Hindu–Arabic numerals
+ // (U+0660..U+0669 are rewritten to the ASCII digits '0'..'9')
+ '{0}' ( <- '0')
+ '{1}' ( <- '1')
+ '{2}' ( <- '2')
+ '{3}' ( <- '3')
+ '{4}' ( <- '4')
+ '{5}' ( <- '5')
+ '{6}' ( <- '6')
+ '{7}' ( <- '7')
+ '{8}' ( <- '8')
+ '{9}' ( <- '9')
+
+ // Shaped forms
+ // (each U+FExx form declared above is folded onto its base letter)
+ '{o1}' ( <- '{o}' ) // HAMZA
+ '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
+ '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
+ '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
+ '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
+ '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
+ '{a1}' '{a2}' ( <- '{a}' ) // ALEF
+ '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
+ '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
+ '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
+ '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
+ '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
+ '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
+ '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
+ '{d1}' '{d2}' ( <- '{d}' ) // DAL
+ '{dz1}''{dz2}' ( <- '{dz}' ) // THAL
+ '{r1}' '{r2}'( <- '{r}' ) // REH
+ '{z1}' '{z2}' ( <- '{z}' ) // ZAIN
+ '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
+ '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
+ '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
+ '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
+ '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
+ '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
+ '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
+ '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
+ '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
+ '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
+ '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
+ '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
+ '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
+ '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
+ '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
+ '{w1}' '{w2}' ( <- '{w}' ) // WAW
+ '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
+ '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH
+
+ // Ligatures Lam-Alef
+ // (one ligature character is expanded into the two letters lam + alef variant)
+ '{la}' '{la2}' (<- '{l}{a}')
+ '{lao}' '{lao2}' (<- '{l}{ao}')
+ '{lao_}' '{lao_2}' (<- '{l}{ao_}')
+ '{la~}' '{la~2}' (<- '{l}{a~}')
+
+ )
+ )
+ or
+ next
+ )
+)
+
+// Normalize_post: hamza normalisation applied to the stemmed word, in two
+// independent passes. Each pass is wrapped in `do`, so the routine succeeds
+// whether or not anything is rewritten.
+define Normalize_post as (
+
+ do (
+ // normalize last hamza
+ // `backwards` starts matching at the end of the word, so only a final
+ // hamza-carrying letter is affected; it becomes the bare hamza {o}.
+ backwards (
+ [substring] among (
+ '{ao}''{ao_}' '{a~}' ( <- '{o}')
+ '{wo}' ( <- '{o}')
+ '{yo}' ( <- '{o}')
+ )
+ )
+ )
+
+ // Forward scan over the whole word: each remaining hamza carrier is
+ // replaced by its plain carrier letter (alef, waw or yeh); `next` steps
+ // over every other character until the end of the word is reached.
+ do repeat (
+ (
+ // normalize other hamza's
+ [substring] among (
+ '{ao}''{ao_}' '{a~}' ( <- '{a}')
+ '{wo}' ( <- '{w}')
+ '{yo}' ( <- '{y}')
+ )
+ )
+ or
+ next
+ )
+)
+
+// Checks
+// Checks1: classify the word from the prefix found at the cursor, without
+// modifying the text. When one of the listed prefixes matches and the word
+// length (`len`) exceeds the threshold, the word is flagged as a noun
+// (is_noun set, is_verb unset) that is defined (is_defined set).
+define Checks1 as (
+ [substring] among (
+ '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined)
+ '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined)
+ )
+)
+
+
+//prefixes
+// Prefix_Step1: collapse a two-letter prefix starting with hamza-above-alef
+// {ao} into a single letter, provided the word is longer than 3.
+// NOTE(review): every entry keeps the second letter of the pair except
+// '{ao}{wo}', which is replaced by '{ao}' rather than '{wo}' - confirm that
+// this is intended.
+define Prefix_Step1 as (
+ [substring] among (
+ '{ao}{ao}' ($(len > 3) <- '{ao}' )
+ '{ao}{a~}' ($(len > 3) <- '{a~}' )
+ '{ao}{wo}' ($(len > 3) <- '{ao}' )
+ '{ao}{a}' ($(len > 3) <- '{a}' )
+ '{ao}{ao_}' ($(len > 3) <- '{ao_}' )
+ // '{ao}' ($(len > 3) delete) //rare case
+ )
+)
+
+// Prefix_Step2: delete a leading feh {f} or waw {w} when the word is longer
+// than 3. The two `not` tests make the routine fail, leaving the word
+// untouched, when that letter is immediately followed by alef {a}.
+define Prefix_Step2 as (
+ not '{f}{a}'
+ not '{w}{a}'
+ [substring] among (
+ '{f}' ($(len > 3) delete)
+ '{w}' ($(len > 3) delete)
+ )
+)
+
+// Prefix_Step3a_Noun: delete one of the prefixes that Checks1 also looks for.
+// The length thresholds here (5 and 4) are one higher than in Checks1, so a
+// word can be flagged as a defined noun and still keep its prefix.
+define Prefix_Step3a_Noun as ( // it is noun and defined
+ [substring] among (
+ '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete)
+ '{l}{l}' '{a}{l}' ($(len > 4) delete)
+ )
+)
+
+// Prefix_Step3b_Noun: for words longer than 3, delete a leading beh {b}, or
+// reduce a doubled leading {b}{b} / {k}{k} to a single letter. `among` takes
+// the longest match, so '{b}{b}' is preferred over '{b}'. The routine fails
+// without changing anything when the word starts with {b}{a}.
+define Prefix_Step3b_Noun as ( // probably noun and defined
+ not '{b}{a}' // exception
+ [substring] among (
+ '{b}' ($(len > 3) delete)
+ // '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion
+ '{b}{b}' ($(len > 3) <- '{b}' )
+ '{k}{k}' ($(len > 3) <- '{k}' )
+ )
+
+)
+
+// Prefix_Step3_Verb: for words longer than 4, drop a leading seen {s} when it
+// is followed by {y}, {t}, {n} or {ao}; the following letter is kept.
+define Prefix_Step3_Verb as (
+ [substring] among (
+ //'{s}' ($(len > 4) delete)// BUG: cause confusion
+ '{s}{y}' ($(len > 4) <- '{y}' )
+ '{s}{t}' ($(len > 4) <- '{t}')
+ '{s}{n}' ($(len > 4) <- '{n}')
+ '{s}{ao}' ($(len > 4) <- '{ao}')
+ )
+)
+
+// Prefix_Step4_Verb: for words longer than 4, rewrite a leading {y}{s}{t},
+// {n}{s}{t} or {t}{s}{t} to {a}{s}{t}. A match also marks the word as a verb
+// (is_verb set, is_noun unset).
+define Prefix_Step4_Verb as (
+ [substring] among (
+ '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' )
+ )
+)
+
+// suffixes
+// Suffix routines. Everything in this section is compiled in backward mode,
+// so each `among` matches a suffix ending at the cursor. With one exception
+// (Suffix_All_alef_maqsura) every entry deletes the matched suffix, and only
+// when the word length (`len`) satisfies the guard written next to it; if the
+// guard fails the routine fails and nothing is deleted.
+backwardmode (
+
+ // Noun step 1a: one-, two- and three-letter endings, each with a higher
+ // minimum word length (4, 5, 6).
+ define Suffix_Noun_Step1a as (
+ [substring] among (
+ '{y}' '{k}' '{e}' ($(len >= 4) delete)
+ '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete)
+ '{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete)
+ )
+ )
+ // Noun step 1b: final noon {n}, only for words longer than 5.
+ define Suffix_Noun_Step1b as (
+ [substring] among (
+ '{n}' ($(len > 5) delete)
+ )
+ )
+
+ // Noun step 2a: final alef, yeh or waw, only for words longer than 4.
+ define Suffix_Noun_Step2a as (
+ [substring] among (
+ '{a}' '{y}' '{w}' ($(len > 4) delete)
+ )
+ )
+
+ // Noun step 2b: final {a}{t}, word length at least 5.
+ define Suffix_Noun_Step2b as (
+ [substring] among (
+ '{a}{t}' ($(len >= 5) delete)
+ )
+ )
+
+ // Noun step 2c1: final teh {t}, word length at least 4.
+ define Suffix_Noun_Step2c1 as (
+ [substring] among (
+ '{t}' ($(len >= 4) delete)
+ )
+ )
+ define Suffix_Noun_Step2c2 as ( // feminine t_
+ [substring] among (
+ '{t_}' ($(len >= 4) delete)
+ )
+ )
+ define Suffix_Noun_Step3 as ( // ya' nisbiya
+ [substring] among (
+ '{y}' ($(len >= 3) delete)
+ )
+ )
+
+ // Verb step 1: one-, two- and three-letter endings, each with a higher
+ // minimum word length (4, 5, 6).
+ define Suffix_Verb_Step1 as (
+ [substring] among (
+ '{e}' '{k}' ($(len >= 4) delete)
+ '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete)
+ '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete)
+ )
+ )
+ // Verb step 2a: further endings; note that the "present" group uses a
+ // strict `len > 5` guard while the others use `>=`.
+ define Suffix_Verb_Step2a as (
+ [substring] among (
+ '{t}' ($(len >= 4) delete)
+ '{a}' '{n}' '{y}' ($(len >= 4) delete)
+ '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past
+ '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present
+ '{t}{m}{a}' ($(len >= 6) delete)
+ )
+ )
+
+ // Verb step 2b: final {w}{a} or {t}{m}, word length at least 5.
+ define Suffix_Verb_Step2b as (
+ [substring] among (
+ '{w}{a}' '{t}{m}' ($(len >= 5) delete)
+ )
+ )
+
+
+ // Verb step 2c: final {w} (length at least 4) or {t}{m}{w} (at least 6).
+ define Suffix_Verb_Step2c as (
+ [substring] among (
+ '{w}' ($(len >= 4) delete)
+ '{t}{m}{w}' ($(len >= 6) delete)
+ )
+ )
+
+ // Final alef maksura {a_} is rewritten to yeh {y}; there is no length
+ // guard on this rule.
+ define Suffix_All_alef_maqsura as (
+ [substring] among (
+ '{a_}' ( <- '{y}' ) // spell error
+ // '{a_}' ( delete ) // if noun > 3
+ // '{a_}' ( <- '{a}') // if verb
+ )
+ )
+)
+
+// stem: entry point of the Arabic stemmer. Order of operations:
+//   1. reset the is_noun / is_verb / is_defined flags,
+//   2. Checks1 (prefix-based guess of the word type),
+//   3. Normalize_pre,
+//   4. suffix removal in backward mode,
+//   5. prefix removal,
+//   6. Normalize_post.
+// Every stage is wrapped in `do`, so a failing stage does not abort the rest.
+// NOTE(review): Checks1 runs before Normalize_pre, so it inspects the input
+// before diacritics and shaped forms have been normalised - confirm that
+// this ordering is intended.
+define stem as (
+ // set initial values
+ set is_noun
+ set is_verb
+ unset is_defined
+
+ // guess type and properties
+ do Checks1
+
+ // normalization pre-stemming
+ do Normalize_pre
+
+
+ backwards (
+
+ do (
+ //Suffixes for verbs
+ // Tried only while is_verb is still set: either one or more
+ // applications of Suffix_Verb_Step1 followed by Step2a, Step2c or
+ // `next`; or else Step2b; or else Step2a.
+ (
+ is_verb
+ (
+ (
+ (atleast 1 Suffix_Verb_Step1)
+ ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next)
+ )
+ or Suffix_Verb_Step2b
+ or Suffix_Verb_Step2a
+ )
+ )
+ //Suffixes for nouns
+ // Tried when the verb branch fails and is_noun is set. The `try`
+ // group is optional, but Suffix_Noun_Step3 must succeed for this
+ // branch to succeed. Alternatives guarded by `not is_defined` are
+ // skipped for words that Checks1 flagged as defined.
+ or (
+ is_noun
+ (
+
+ try (
+ Suffix_Noun_Step2c2
+ or (not is_defined Suffix_Noun_Step1a (
+ Suffix_Noun_Step2a
+ or Suffix_Noun_Step2b
+ or Suffix_Noun_Step2c1
+ or next))
+ or (Suffix_Noun_Step1b (
+ Suffix_Noun_Step2a
+ or Suffix_Noun_Step2b
+ or Suffix_Noun_Step2c1))
+ or (not is_defined Suffix_Noun_Step2a)
+ or (Suffix_Noun_Step2b)
+ )
+ Suffix_Noun_Step3
+ )
+
+ )
+
+ // Suffixes for alef maqsura
+ or Suffix_All_alef_maqsura
+ )
+ )
+
+ //Prefixes
+ // Step1 and Step2 are optional. After them exactly one alternative is
+ // attempted in order: the defined-noun prefixes, the noun prefixes (if
+ // is_noun), or the verb prefixes (if is_verb; Step3 optional, Step4
+ // required).
+ do (
+ try Prefix_Step1
+ try Prefix_Step2
+ ( Prefix_Step3a_Noun
+ or (is_noun Prefix_Step3b_Noun)
+ or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
+ )
+ )
+
+ // normalization post-stemming
+ do Normalize_post
+
+)
diff --git a/contrib/snowball/algorithms/basque.sbl b/contrib/snowball/algorithms/basque.sbl
new file mode 100644
index 000000000..267abc7e1
--- /dev/null
+++ b/contrib/snowball/algorithms/basque.sbl
@@ -0,0 +1,149 @@
+routines (
+ aditzak
+ izenak
+ adjetiboak
+ mark_regions
+ RV R2 R1
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef n~ '{U+00F1}'
+
+define v 'aeiou'
+
+// mark_regions: compute the three region markers used by the suffix rules.
+// All three default to `limit` (the end of the word), so a word in which a
+// pattern below cannot be found simply has an empty region.
+define mark_regions as (
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ // pV: depends on the first two letters.
+ //   vowel + non-vowel     -> position after the next vowel
+ //   vowel + vowel         -> position after the next non-vowel
+ //   non-vowel + non-vowel -> position after the next vowel
+ //   non-vowel + vowel     -> position after the third letter
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) )
+ or
+ ( non-v (non-v gopast v) or (v next) )
+ setmark pV
+ )
+ // p1: position after the first non-vowel that follows a vowel.
+ // p2: the same pattern applied again, starting from p1.
+ do (
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+backwardmode (
+
+ // Region tests (backward mode): each succeeds when the cursor, i.e. the
+ // start of the suffix just matched, is at or after the corresponding
+ // marker set by mark_regions.
+ define RV as $pV <= cursor
+ define R2 as $p2 <= cursor
+ define R1 as $p1 <= cursor
+
+ // aditzak (presumably Basque for "verbs" - confirm): find the longest
+ // listed suffix at the end of the word and
+ //   - delete it if it lies in RV (first group),
+ //   - delete it if it lies in R2 ('garri', 'garria', 'tza'),
+ //   - replace 'atseden', 'arabera' and 'baditu' with themselves, which
+ //     leaves the text unchanged while still counting as a match.
+ define aditzak as (
+ [substring] among(
+ 'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea'
+ 'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza'
+ 'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza'
+ 'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez'
+ 'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea'
+ 'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena'
+ 'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea'
+ 'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari'
+ 'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu'
+ 'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako'
+ ( RV delete )
+ 'garri' 'garria' 'tza'
+ (R2 delete)
+ 'atseden'
+ (<- 'atseden')
+ 'arabera'
+ (<- 'arabera')
+ 'baditu'
+ (<- 'baditu')
+
+ )
+ )
+
+ // izenak (presumably Basque for "nouns" - confirm): find the longest
+ // listed suffix at the end of the word and apply the action of its group:
+ //   - first group: delete if in RV,
+ //   - 'ora' ... 'tza': delete if in R2,
+ //   - 'tzen' 'ten' 'en' 'tatu': delete if in R1,
+ //   - 'joka', 'trako', 'minutuko': shortened to 'jok', 'tra', 'minutu',
+ //   - 'zehar', 'geldi', 'igaro', 'aurka': replaced with themselves, which
+ //     leaves the text unchanged while still counting as a match.
+ define izenak as (
+ [substring] among(
+ 'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina'
+ 'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea'
+ 'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua'
+ 'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di'
+ 'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa'
+ 'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia'
+ 'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia'
+ 'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua'
+ 'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara'
+ 'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge'
+ 'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua'
+ 'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia'
+ 'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde'
+ 'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea'
+ 'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea'
+ 'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia'
+ 'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa'
+ 'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa'
+ 'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila'
+ 'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa'
+ 'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia'
+ 'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena'
+ 'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan'
+ 'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek'
+ 'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara'
+ 'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket'
+ 'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko'
+ 'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera'
+ 'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko'
+ ( RV delete )
+ 'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza'
+ ( R2 delete )
+ 'joka'
+ (<- 'jok')
+ 'tzen' 'ten' 'en' 'tatu'
+ (R1 delete)
+ 'trako'
+ (<- 'tra')
+ 'minutuko'
+ (<- 'minutu')
+ 'zehar'
+ (<- 'zehar')
+ 'geldi'
+ (<- 'geldi')
+ 'igaro'
+ (<- 'igaro')
+ 'aurka'
+ (<- 'aurka')
+ )
+ )
+
+ // adjetiboak (presumably Basque for "adjectives" - confirm): delete the
+ // longest listed suffix if it lies in RV; a final 'zlea' is instead
+ // reduced to 'z', with no region test.
+ define adjetiboak as (
+ [substring] among(
+ 'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria'
+ 'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik'
+ ( RV delete )
+ 'zlea'
+ (<- 'z')
+ )
+ )
+
+)
+
+// stem: entry point of the Basque stemmer. Marks the regions, then works
+// backwards from the end of the word: aditzak is applied repeatedly until it
+// fails, then izenak likewise, and finally adjetiboak is attempted once.
+define stem as (
+ do mark_regions
+ backwards (
+ repeat aditzak
+ repeat izenak
+ do adjetiboak
+ )
+
+)
+
+/*
+ Note 1: additions of 21 Jul 2010
+*/
diff --git a/contrib/snowball/algorithms/catalan.sbl b/contrib/snowball/algorithms/catalan.sbl
new file mode 100644
index 000000000..0a1e3a536
--- /dev/null
+++ b/contrib/snowball/algorithms/catalan.sbl
@@ -0,0 +1,202 @@
+routines (
+ cleaning mark_regions
+ R1 R2
+ attached_pronoun
+ standard_suffix
+ verb_suffix
+ residual_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a' '{U+00E1}' // a-acute
+stringdef a` '{U+00E0}' // a-grave
+stringdef c, '{U+00E7}' // c-cedilla
+stringdef e' '{U+00E9}' // e-acute
+stringdef e` '{U+00E8}' // e-grave
+stringdef i' '{U+00ED}' // i-acute
+stringdef i` '{U+00EC}' // i-grave
+stringdef i" '{U+00EF}' // i-diaeresis
+stringdef o' '{U+00F3}' // o-acute
+stringdef o` '{U+00F2}' // o-grave
+stringdef u' '{U+00FA}' // u-acute
+stringdef u" '{U+00FC}' // u-diaeresis
+stringdef . '{U+00B7}' // middle dot, used for the geminated l
+
+// Vowel grouping used by mark_regions.
+// NOTE(review): {i`} is defined above and handled by `cleaning`, but it is
+// not part of this grouping - confirm whether that is intentional.
+define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}'
+
+// mark_regions: set p1 to the position after the first non-vowel that
+// follows a vowel, and p2 to the position found by applying the same pattern
+// again from p1. Both default to `limit` (the end of the word), so a marker
+// whose pattern is not found stays at the end of the word.
+define mark_regions as (
+
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do (
+ gopast v gopast non-v setmark p1
+ gopast v gopast non-v setmark p2
+ )
+)
+
+// cleaning: scan the word from left to right, replacing each listed accented
+// vowel with its unaccented letter and the middle dot {.} with a full stop.
+// The empty string '' matches everywhere, so any other character is skipped
+// with `next`; the `repeat` stops when `next` fails at the end of the word.
+define cleaning as repeat (
+ [substring] among(
+ '{a'}' (<- 'a')
+ '{a`}' (<- 'a')
+ '{e'}' (<- 'e')
+ '{e`}' (<- 'e')
+ '{i'}' (<- 'i')
+ '{i`}' (<- 'i')
+ '{o'}' (<- 'o')
+ '{o`}' (<- 'o')
+ '{u'}' (<- 'u')
+ '{u"}' (<- 'u')
+ '{i"}' (<- 'i')
+ '{.}' (<- '.')
+ '' (next)
+ )
+)
+
+backwardmode (
+
+ // Region tests (backward mode): each succeeds when the cursor, i.e. the
+ // start of the suffix just matched, is at or after the corresponding
+ // marker set by mark_regions.
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ // attached_pronoun: delete the longest listed pronoun ending if it lies
+ // in R1. The list contains bare forms as well as forms joined with a
+ // hyphen or an apostrophe (written {'} inside a string literal).
+ define attached_pronoun as (
+ [substring] among (
+ '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls'
+ '-ls' '-la' '-les' '-li'
+ 'vos' 'se' 'nos' '-nos' '-us' 'us'
+ '{'}n' '{'}ns' '-n' '-ns'
+ '{'}m' '-me' '-m'
+ '-te' '{'}t'
+ 'li' 'lo' 'los'
+ 'me' 'sela' 'selo' 'selas' 'selos' 'le'
+ 'la' 'las' 'les' 'ens' 'ho' 'hi'
+ (R1 delete)
+ )
+ )
+
+ // standard_suffix: find the longest listed suffix at the end of the word
+ // and apply the action of its group:
+ //   - first group: delete if in R1,
+ //   - 'acions' 'ada' 'ades': delete if in R2,
+ //   - the 'log...' forms: replace with 'log' if in R2,
+ //   - 'ic' 'ica' 'ics' 'iques': replace with 'ic' if in R2,
+ //   - the 'qu{i'}ssim...' forms: replace with 'c' if in R1.
+ // The routine fails if no suffix matches or the region test fails.
+ define standard_suffix as (
+ [substring] among(
+ 'ar' 'atge' 'formes' 'icte' 'ictes'
+ 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta'
+ 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls'
+ 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
+ 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
+ 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
+ '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all'
+ 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
+ '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
+ 'itar' 'ables' 'adors' 'idores' 'idors'
+ 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es'
+ 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris'
+ 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament'
+ 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes'
+ 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies'
+ '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles'
+ 'assa' 'asses' 'assos'
+ 'ent' 'ents'
+ '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin'
+ 'ims' 'ima' 'imes'
+ 'isme' 'ista' 'ismes' 'istes'
+ // NOTE(review): '{i'}inia' below has a doubled i, while its neighbour
+ // is '{i'}nies'; possibly '{i'}nia' was intended - confirm upstream.
+ 'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius'
+ 'oses' 'osos' 'ient' 'otes' 'ots'
+ (R1 delete)
+ 'acions' 'ada' 'ades'
+ (R2 delete)
+ 'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques'
+ (R2 <- 'log')
+ 'ic' 'ica' 'ics' 'iques'
+ (R2 <- 'ic')
+ 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima'
+ (R1 <- 'c')
+ )
+ )
+
+ // verb_suffix: find the longest listed verb ending at the end of the
+ // word. Every ending except 'ando' is deleted if it lies in R1; 'ando'
+ // is deleted only if it lies in R2. In `stem` this routine is the
+ // alternative tried when standard_suffix fails.
+ define verb_suffix as (
+ [substring] among(
+ 'ador' 'adora' 'adors' 'adores' 're' 'ie'
+ 'ent' 'ents' 'udes' 'ar{a`}' 'eren'
+ 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+ 'aria' 'arian' 'arien' 'aries' 'ar{a`}s'
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara'
+ 'ar{e'}' 'ar{e'}s'
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+ 'er{e'}' 'er' 'erau' 'erass'
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+ 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu'
+ 'ia' 'ies' '{i'}em' '{i`}eu' 'ien'
+ 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats'
+ 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu'
+ 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu'
+ '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem'
+ '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren'
+ 'ar{i'}em' 'ar{i'}eu'
+ 'areu' 'aren' 'ant' '{i"}m' '{i"}u'
+ '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es'
+ 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da'
+ 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its'
+ 'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+ 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as'
+ 'ieu' 'ii' 'io' 'i{a`}'
+ 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu'
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques'
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+ 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien'
+ 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu'
+ 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis'
+ 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin'
+ 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen'
+ 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim'
+ '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu'
+ '{i"}ra' '{i"}ren' '{i"}res'
+ '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x'
+ 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis'
+ 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s'
+ (R1 delete)
+ 'ando'
+ (R2 delete)
+ )
+ )
+
+ // residual_suffix: last clean-up pass. Delete the longest listed short
+ // ending if it lies in R1; a final 'iqu' in R1 is rewritten to 'ic'.
+ define residual_suffix as (
+ [substring] among(
+ 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu'
+ 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it'
+ (R1 delete)
+ 'iqu'
+ (R1 <- 'ic')
+ )
+ )
+)
+
+// stem: entry point of the Catalan stemmer. Marks R1/R2, then works
+// backwards from the end of the word: attached pronouns first, then either a
+// standard suffix or (if that fails) a verb suffix, then residual suffixes.
+// Accents are stripped by `cleaning` only at the very end, so the suffix
+// tables above match against the accented spelling. Every stage is wrapped
+// in `do`, so a failing stage does not abort the rest.
+define stem as (
+ do mark_regions
+ backwards (
+ do attached_pronoun
+ do ( standard_suffix or
+ verb_suffix
+ )
+ do residual_suffix
+ )
+ do cleaning
+)
+
+/*
+ First works 2010/07/19
+ First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0
+ Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos
+ Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
+*/
diff --git a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/danish.sbl
index 0a8190a08..761270f7e 100644
--- a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/danish.sbl
@@ -12,15 +12,17 @@ strings ( ch )
integers ( p1 x )
-groupings ( v s_ending )
+groupings ( c v s_ending )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef ae hex 'E6'
-stringdef ao hex 'E5'
-stringdef o/ hex 'F8'
+stringdef ae '{U+00E6}'
+stringdef ao '{U+00E5}'
+stringdef o/ '{U+00F8}'
+
+define c 'bcdfghjklmnpqrstvwxz'
define v 'aeiouy{ae}{ao}{o/}'
@@ -73,7 +75,7 @@ backwardmode (
)
)
define undouble as (
- setlimit tomark p1 for ([non-v] ->ch)
+ setlimit tomark p1 for ([c] ->ch)
ch
delete
)
diff --git a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index 1131a1cb7..000000000
--- a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,91 +0,0 @@
-routines (
- mark_regions
- main_suffix
- consonant_pair
- other_suffix
- undouble
-)
-
-externals ( stem )
-
-strings ( ch )
-
-integers ( p1 x )
-
-groupings ( v s_ending )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef ae hex '91'
-stringdef ao hex '86'
-stringdef o/ hex '9B'
-
-define v 'aeiouy{ae}{ao}{o/}'
-
-define s_ending 'abcdfghjklmnoprtvyz{ao}'
-
-define mark_regions as (
-
- $p1 = limit
-
- test ( hop 3 setmark x )
- goto v gopast non-v setmark p1
- try ( $p1 < x $p1 = x )
-)
-
-backwardmode (
-
- define main_suffix as (
- setlimit tomark p1 for ([substring])
- among(
-
- 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
- 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
- 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
- 'erets' 'et' 'eret'
- (delete)
- 's'
- (s_ending delete)
- )
- )
-
- define consonant_pair as (
- test (
- setlimit tomark p1 for ([substring])
- among(
- 'gd' // significant in the call from other_suffix
- 'dt' 'gt' 'kt'
- )
- )
- next] delete
- )
-
- define other_suffix as (
- do ( ['st'] 'ig' delete )
- setlimit tomark p1 for ([substring])
- among(
- 'ig' 'lig' 'elig' 'els'
- (delete do consonant_pair)
- 'l{o/}st'
- (<-'l{o/}s')
- )
- )
- define undouble as (
- setlimit tomark p1 for ([non-v] ->ch)
- ch
- delete
- )
-)
-
-define stem as (
-
- do mark_regions
- backwards (
- do main_suffix
- do consonant_pair
- do other_suffix
- do undouble
- )
-)
diff --git a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/dutch.sbl
index 15b8718d1..f24c82de4 100644
--- a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl
+++ b/contrib/snowball/algorithms/dutch.sbl
@@ -18,21 +18,21 @@ groupings ( v v_I v_j )
stringescapes {}
-/* special characters (in MS-DOS Latin I) */
+/* special characters */
-stringdef a" hex '84'
-stringdef e" hex '89'
-stringdef i" hex '8B'
-stringdef o" hex '94'
-stringdef u" hex '81'
+stringdef a" '{U+00E4}'
+stringdef e" '{U+00EB}'
+stringdef i" '{U+00EF}'
+stringdef o" '{U+00F6}'
+stringdef u" '{U+00FC}'
-stringdef a' hex 'A0'
-stringdef e' hex '82'
-stringdef i' hex 'A1'
-stringdef o' hex 'A2'
-stringdef u' hex 'A3'
+stringdef a' '{U+00E1}'
+stringdef e' '{U+00E9}'
+stringdef i' '{U+00ED}'
+stringdef o' '{U+00F3}'
+stringdef u' '{U+00FA}'
-stringdef e` hex '8A'
+stringdef e` '{U+00E8}'
define v 'aeiouy{e`}'
define v_I v + 'I'
diff --git a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
deleted file mode 100644
index f7609f766..000000000
--- a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
+++ /dev/null
@@ -1,164 +0,0 @@
-routines (
- prelude postlude
- e_ending
- en_ending
- mark_regions
- R1 R2
- undouble
- standard_suffix
-)
-
-externals ( stem )
-
-booleans ( e_found )
-
-integers ( p1 p2 )
-
-groupings ( v v_I v_j )
-
-stringescapes {}
-
-/* special characters (in ISO Latin I) */
-
-stringdef a" hex 'E4'
-stringdef e" hex 'EB'
-stringdef i" hex 'EF'
-stringdef o" hex 'F6'
-stringdef u" hex 'FC'
-
-stringdef a' hex 'E1'
-stringdef e' hex 'E9'
-stringdef i' hex 'ED'
-stringdef o' hex 'F3'
-stringdef u' hex 'FA'
-
-stringdef e` hex 'E8'
-
-define v 'aeiouy{e`}'
-define v_I v + 'I'
-define v_j v + 'j'
-
-define prelude as (
- test repeat (
- [substring] among(
- '{a"}' '{a'}'
- (<- 'a')
- '{e"}' '{e'}'
- (<- 'e')
- '{i"}' '{i'}'
- (<- 'i')
- '{o"}' '{o'}'
- (<- 'o')
- '{u"}' '{u'}'
- (<- 'u')
- '' (next)
- ) //or next
- )
- try(['y'] <- 'Y')
- repeat goto (
- v [('i'] v <- 'I') or
- ('y'] <- 'Y')
- )
-)
-
-define mark_regions as (
-
- $p1 = limit
- $p2 = limit
-
- gopast v gopast non-v setmark p1
- try($p1 < 3 $p1 = 3) // at least 3
- gopast v gopast non-v setmark p2
-
-)
-
-define postlude as repeat (
-
- [substring] among(
- 'Y' (<- 'y')
- 'I' (<- 'i')
- '' (next)
- ) //or next
-
-)
-
-backwardmode (
-
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define undouble as (
- test among('kk' 'dd' 'tt') [next] delete
- )
-
- define e_ending as (
- unset e_found
- ['e'] R1 test non-v delete
- set e_found
- undouble
- )
-
- define en_ending as (
- R1 non-v and not 'gem' delete
- undouble
- )
-
- define standard_suffix as (
- do (
- [substring] among(
- 'heden'
- ( R1 <- 'heid'
- )
- 'en' 'ene'
- ( en_ending
- )
- 's' 'se'
- ( R1 non-v_j delete
- )
- )
- )
- do e_ending
-
- do ( ['heid'] R2 not 'c' delete
- ['en'] en_ending
- )
-
- do (
- [substring] among(
- 'end' 'ing'
- ( R2 delete
- (['ig'] R2 not 'e' delete) or undouble
- )
- 'ig'
- ( R2 not 'e' delete
- )
- 'lijk'
- ( R2 delete e_ending
- )
- 'baar'
- ( R2 delete
- )
- 'bar'
- ( R2 e_found delete
- )
- )
- )
- do (
- non-v_I
- test (
- among ('aa' 'ee' 'oo' 'uu')
- non-v
- )
- [next] delete
- )
- )
-)
-
-define stem as (
-
- do prelude
- do mark_regions
- backwards
- do standard_suffix
- do postlude
-)
diff --git a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/english.sbl
index fe18d7a91..fe18d7a91 100644
--- a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/english.sbl
diff --git a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/finnish.sbl
index 9ac74f292..3891d2295 100644
--- a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/finnish.sbl
@@ -24,16 +24,17 @@ externals ( stem )
integers ( p1 p2 )
strings ( x )
booleans ( ending_removed )
-groupings ( AEI V1 V2 particle_end )
+groupings ( AEI C V1 V2 particle_end )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef a" hex 'E4'
-stringdef o" hex 'F6'
+stringdef a" '{U+00E4}'
+stringdef o" '{U+00F6}'
define AEI 'a{a"}ei'
+define C 'bcdfghjklmnpqrstvwxz'
define V1 'aeiouy{a"}{o"}'
define V2 'aeiou{a"}{o"}'
define particle_end V1 + 'nt'
@@ -116,7 +117,7 @@ backwardmode (
)
'a' '{a"}' //-.
- (V1 non-V1) // |
+ (V1 C) // |
'tta' 'tt{a"}' // Partitive [32]
('e') // |
'ta' 't{a"}' //-'
@@ -172,11 +173,11 @@ backwardmode (
define tidy as (
setlimit tomark p1 for (
do ( LONG and ([next] delete ) ) // undouble vowel
- do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
+ do ( [AEI] C delete ) // remove trailing a, a", e, i
do ( ['j'] 'o' or 'u' delete )
do ( ['o'] 'j' delete )
)
- goto non-V1 [next] -> x x delete // undouble consonant
+ goto non-V1 [C] -> x x delete // undouble consonant
)
)
diff --git a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/french.sbl
index e972f227f..5c4f32d05 100644
--- a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/french.sbl
@@ -17,21 +17,21 @@ groupings ( v keep_with_s )
stringescapes {}
-/* special characters (in ISO Latin I) */
-
-stringdef a^ hex 'E2' // a-circumflex
-stringdef a` hex 'E0' // a-grave
-stringdef c, hex 'E7' // c-cedilla
-
-stringdef e" hex 'EB' // e-diaeresis (rare)
-stringdef e' hex 'E9' // e-acute
-stringdef e^ hex 'EA' // e-circumflex
-stringdef e` hex 'E8' // e-grave
-stringdef i" hex 'EF' // i-diaeresis
-stringdef i^ hex 'EE' // i-circumflex
-stringdef o^ hex 'F4' // o-circumflex
-stringdef u^ hex 'FB' // u-circumflex
-stringdef u` hex 'F9' // u-grave
+/* special characters */
+
+stringdef a^ '{U+00E2}' // a-circumflex
+stringdef a` '{U+00E0}' // a-grave
+stringdef c, '{U+00E7}' // c-cedilla
+
+stringdef e" '{U+00EB}' // e-diaeresis (rare)
+stringdef e' '{U+00E9}' // e-acute
+stringdef e^ '{U+00EA}' // e-circumflex
+stringdef e` '{U+00E8}' // e-grave
+stringdef i" '{U+00EF}' // i-diaeresis
+stringdef i^ '{U+00EE}' // i-circumflex
+stringdef o^ '{U+00F4}' // o-circumflex
+stringdef u^ '{U+00FB}' // u-circumflex
+stringdef u` '{U+00F9}' // u-grave
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
@@ -42,6 +42,10 @@ define prelude as repeat goto (
('y' ] <- 'Y')
)
or
+ ( [ '{e"}' ] <- 'He' )
+ or
+ ( [ '{i"}' ] <- 'Hi' )
+ or
( ['y'] v <- 'Y' )
or
( 'q' ['u'] <- 'U' )
@@ -78,6 +82,9 @@ define postlude as repeat (
'I' (<- 'i')
'U' (<- 'u')
'Y' (<- 'y')
+ 'He' (<- '{e"}')
+ 'Hi' (<- '{i"}')
+ 'H' (delete)
'' (next)
)
)
@@ -167,7 +174,7 @@ backwardmode (
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
'issez' 'issiez' 'issions' 'issons' 'it'
- (non-v delete)
+ (not 'H' non-v delete)
)
)
@@ -196,14 +203,13 @@ backwardmode (
define keep_with_s 'aiou{e`}s'
define residual_suffix as (
- try(['s'] test non-keep_with_s delete)
+ try(['s'] test ('Hi' or non-keep_with_s) delete)
setlimit tomark pV for (
[substring] among(
'ion' (R2 's' or 't' delete)
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
- '{e"}' ('gu' delete)
)
)
)
diff --git a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index 996eba1ab..000000000
--- a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,239 +0,0 @@
-routines (
- prelude postlude mark_regions
- RV R1 R2
- standard_suffix
- i_verb_suffix
- verb_suffix
- residual_suffix
- un_double
- un_accent
-)
-
-externals ( stem )
-
-integers ( pV p1 p2 )
-
-groupings ( v keep_with_s )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef a^ hex '83' // a-circumflex
-stringdef a` hex '85' // a-grave
-stringdef c, hex '87' // c-cedilla
-
-stringdef e" hex '89' // e-diaeresis (rare)
-stringdef e' hex '82' // e-acute
-stringdef e^ hex '88' // e-circumflex
-stringdef e` hex '8A' // e-grave
-stringdef i" hex '8B' // i-diaeresis
-stringdef i^ hex '8C' // i-circumflex
-stringdef o^ hex '93' // o-circumflex
-stringdef u^ hex '96' // u-circumflex
-stringdef u` hex '97' // u-grave
-
-define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
-
-define prelude as repeat goto (
-
- ( v [ ('u' ] v <- 'U') or
- ('i' ] v <- 'I') or
- ('y' ] <- 'Y')
- )
- or
- ( ['y'] v <- 'Y' )
- or
- ( 'q' ['u'] <- 'U' )
-)
-
-define mark_regions as (
-
- $pV = limit
- $p1 = limit
- $p2 = limit // defaults
-
- do (
- ( v v next ) or ( next gopast v )
- setmark pV
- )
- do (
- gopast v gopast non-v setmark p1
- gopast v gopast non-v setmark p2
- )
-)
-
-define postlude as repeat (
-
- [substring] among(
- 'I' (<- 'i')
- 'U' (<- 'u')
- 'Y' (<- 'y')
- '' (next)
- )
-)
-
-backwardmode (
-
- define RV as $pV <= cursor
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define standard_suffix as (
- [substring] among(
-
- 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
- 'ances' 'iqUes' 'ismes' 'ables' 'istes'
- ( R2 delete )
- 'atrice' 'ateur' 'ation'
- 'atrices' 'ateurs' 'ations'
- ( R2 delete
- try ( ['ic'] (R2 delete) or <-'iqU' )
- )
- 'logie'
- 'logies'
- ( R2 <- 'log' )
- 'usion' 'ution'
- 'usions' 'utions'
- ( R2 <- 'u' )
- 'ence'
- 'ences'
- ( R2 <- 'ent' )
- 'ement'
- 'ements'
- (
- RV delete
- try (
- [substring] among(
- 'iv' (R2 delete ['at'] R2 delete)
- 'eus' ((R2 delete) or (R1<-'eux'))
- 'abl' 'iqU'
- (R2 delete)
- 'i{e`}r' 'I{e`}r' //)
- (RV <-'i') //)--new 2 Sept 02
- )
- )
- )
- 'it{e'}'
- 'it{e'}s'
- (
- R2 delete
- try (
- [substring] among(
- 'abil' ((R2 delete) or <-'abl')
- 'ic' ((R2 delete) or <-'iqU')
- 'iv' (R2 delete)
- )
- )
- )
- 'if' 'ive'
- 'ifs' 'ives'
- (
- R2 delete
- try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
- )
- 'eaux' (<- 'eau')
- 'aux' (R1 <- 'al')
- 'euse'
- 'euses'((R2 delete) or (R1<-'eux'))
-
- 'issement'
- 'issements'(R1 non-v delete) // verbal
-
- // fail(...) below forces entry to verb_suffix. -ment typically
- // follows the p.p., e.g 'confus{e'}ment'.
-
- 'amment' (RV fail(<- 'ant'))
- 'emment' (RV fail(<- 'ent'))
- 'ment'
- 'ments' (test(v RV) fail(delete))
- // v is e,i,u,{e'},I or U
- )
- )
-
- define i_verb_suffix as setlimit tomark pV for (
- [substring] among (
- '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
- 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
- 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
- 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
- 'issez' 'issiez' 'issions' 'issons' 'it'
- (non-v delete)
- )
- )
-
- define verb_suffix as setlimit tomark pV for (
- [substring] among (
- 'ions'
- (R2 delete)
-
- '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
- 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
- 'erons' 'eront' 'ez' 'iez'
-
- // 'ons' //-best omitted
-
- (delete)
-
- '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
- 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
- 'assions'
- (delete
- try(['e'] delete)
- )
- )
- )
-
- define keep_with_s 'aiou{e`}s'
-
- define residual_suffix as (
- try(['s'] test non-keep_with_s delete)
- setlimit tomark pV for (
- [substring] among(
- 'ion' (R2 's' or 't' delete)
- 'ier' 'i{e`}re'
- 'Ier' 'I{e`}re' (<-'i')
- 'e' (delete)
- '{e"}' ('gu' delete)
- )
- )
- )
-
- define un_double as (
- test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
- )
-
- define un_accent as (
- atleast 1 non-v
- [ '{e'}' or '{e`}' ] <-'e'
- )
-)
-
-define stem as (
-
- do prelude
- do mark_regions
- backwards (
-
- do (
- (
- ( standard_suffix or
- i_verb_suffix or
- verb_suffix
- )
- and
- try( [ ('Y' ] <- 'i' ) or
- ('{c,}'] <- 'c' )
- )
- ) or
- residual_suffix
- )
-
- // try(['ent'] RV delete) // is best omitted
-
- do un_double
- do un_accent
- )
- do postlude
-)
-
diff --git a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german.sbl
index 7069daf0d..61f24ef92 100644
--- a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/german.sbl
@@ -18,12 +18,12 @@ groupings ( v s_ending st_ending )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef a" hex 'E4'
-stringdef o" hex 'F6'
-stringdef u" hex 'FC'
-stringdef ss hex 'DF'
+stringdef a" '{U+00E4}'
+stringdef o" '{U+00F6}'
+stringdef u" '{U+00FC}'
+stringdef ss '{U+00DF}'
define v 'aeiouy{a"}{o"}{u"}'
diff --git a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index 3effb3257..000000000
--- a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,139 +0,0 @@
-
-/*
- Extra rule for -nisse ending added 11 Dec 2009
-*/
-
-routines (
- prelude postlude
- mark_regions
- R1 R2
- standard_suffix
-)
-
-externals ( stem )
-
-integers ( p1 p2 x )
-
-groupings ( v s_ending st_ending )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef a" hex '84'
-stringdef o" hex '94'
-stringdef u" hex '81'
-stringdef ss hex 'E1'
-
-define v 'aeiouy{a"}{o"}{u"}'
-
-define s_ending 'bdfghklmnrt'
-define st_ending s_ending - 'r'
-
-define prelude as (
-
- test repeat (
- (
- ['{ss}'] <- 'ss'
- ) or next
- )
-
- repeat goto (
- v [('u'] v <- 'U') or
- ('y'] v <- 'Y')
- )
-)
-
-define mark_regions as (
-
- $p1 = limit
- $p2 = limit
-
- test(hop 3 setmark x)
-
- gopast v gopast non-v setmark p1
- try($p1 < x $p1 = x) // at least 3
- gopast v gopast non-v setmark p2
-
-)
-
-define postlude as repeat (
-
- [substring] among(
- 'Y' (<- 'y')
- 'U' (<- 'u')
- '{a"}' (<- 'a')
- '{o"}' (<- 'o')
- '{u"}' (<- 'u')
- '' (next)
- )
-
-)
-
-backwardmode (
-
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define standard_suffix as (
- do (
- [substring] R1 among(
- 'em' 'ern' 'er'
- ( delete
- )
- 'e' 'en' 'es'
- ( delete
- try (['s'] 'nis' delete)
- )
- 's'
- ( s_ending delete
- )
- )
- )
- do (
- [substring] R1 among(
- 'en' 'er' 'est'
- ( delete
- )
- 'st'
- ( st_ending hop 3 delete
- )
- )
- )
- do (
- [substring] R2 among(
- 'end' 'ung'
- ( delete
- try (['ig'] not 'e' R2 delete)
- )
- 'ig' 'ik' 'isch'
- ( not 'e' delete
- )
- 'lich' 'heit'
- ( delete
- try (
- ['er' or 'en'] R1 delete
- )
- )
- 'keit'
- ( delete
- try (
- [substring] R2 among(
- 'lich' 'ig'
- ( delete
- )
- )
- )
- )
- )
- )
- )
-)
-
-define stem as (
- do prelude
- do mark_regions
- backwards
- do standard_suffix
- do postlude
-)
diff --git a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german2.sbl
index ce6026a86..47ff61e38 100644
--- a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/german2.sbl
@@ -18,12 +18,12 @@ groupings ( v s_ending st_ending )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef a" hex 'E4'
-stringdef o" hex 'F6'
-stringdef u" hex 'FC'
-stringdef ss hex 'DF'
+stringdef a" '{U+00E4}'
+stringdef o" '{U+00F6}'
+stringdef u" '{U+00FC}'
+stringdef ss '{U+00DF}'
define v 'aeiouy{a"}{o"}{u"}'
diff --git a/contrib/snowball/algorithms/greek.sbl b/contrib/snowball/algorithms/greek.sbl
new file mode 100644
index 000000000..02df6c33f
--- /dev/null
+++ b/contrib/snowball/algorithms/greek.sbl
@@ -0,0 +1,706 @@
+// A stemmer for the Modern Greek language, based on:
+//
+// Ntais, Georgios. Development of a Stemmer for the Greek
+// Language. Diss. Royal Institute of Technology, 2006.
+// https://sais.se/mthprize/2007/ntais2007.pdf
+//
+// Saroukos, Spyridon. Enhancing a Greek language stemmer.
+// University of Tampere, 2008.
+// https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf
+
+stringescapes {}
+
+stringdef a '{U+03B1}' // alpha
+stringdef v '{U+03B2}' // beta
+stringdef g '{U+03B3}' // gamma
+stringdef d '{U+03B4}' // delta
+stringdef e '{U+03B5}' // epsilon
+stringdef z '{U+03B6}' // zeta
+stringdef i '{U+03B7}' // eta
+stringdef th '{U+03B8}' // theta
+stringdef y '{U+03B9}' // iota
+stringdef k '{U+03BA}' // kappa
+stringdef l '{U+03BB}' // lamda
+stringdef m '{U+03BC}' // mu
+stringdef n '{U+03BD}' // nu
+stringdef x '{U+03BE}' // xi
+stringdef o '{U+03BF}' // omicron
+stringdef p '{U+03C0}' // pi
+stringdef r '{U+03C1}' // rho
+stringdef ss '{U+03C2}' // sigma final
+stringdef s '{U+03C3}' // sigma
+stringdef t '{U+03C4}' // tau
+stringdef u '{U+03C5}' // upsilon
+stringdef f '{U+03C6}' // phi
+stringdef ch '{U+03C7}' // chi
+stringdef ps '{U+03C8}' // psi
+stringdef oo '{U+03C9}' // omega
+
+stringdef A '{U+0391}' // Alpha
+stringdef V '{U+0392}' // Beta
+stringdef G '{U+0393}' // Gamma
+stringdef D '{U+0394}' // Delta
+stringdef E '{U+0395}' // Epsilon
+stringdef Z '{U+0396}' // Zeta
+stringdef I '{U+0397}' // Eta
+stringdef Th '{U+0398}' // Theta
+stringdef Y '{U+0399}' // Iota
+stringdef K '{U+039A}' // Kappa
+stringdef L '{U+039B}' // Lamda
+stringdef M '{U+039C}' // Mu
+stringdef N '{U+039D}' // Nu
+stringdef X '{U+039E}' // Xi
+stringdef O '{U+039F}' // Omicron
+stringdef P '{U+03A0}' // Pi
+stringdef R '{U+03A1}' // Rho
+stringdef S '{U+03A3}' // Sigma
+stringdef T '{U+03A4}' // Tau
+stringdef U '{U+03A5}' // Upsilon
+stringdef F '{U+03A6}' // Phi
+stringdef Ch '{U+03A7}' // Chi
+stringdef Ps '{U+03A8}' // Psi
+stringdef Oo '{U+03A9}' // Omega
+
+stringdef Y: '{U+03AA}' // Iota with dialytika
+stringdef U: '{U+03AB}' // Upsilon with dialytika
+
+stringdef a' '{U+03AC}' // alpha with tonos
+stringdef e' '{U+03AD}' // epsilon with tonos
+stringdef i' '{U+03AE}' // eta with tonos
+stringdef y' '{U+03AF}' // iota with tonos
+stringdef o' '{U+03CC}' // omicron with tonos
+stringdef u' '{U+03CD}' // upsilon with tonos
+stringdef oo' '{U+03CE}' // omega with tonos
+
+stringdef i:' '{U+0390}' // iota with dialytika and tonos
+stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos
+
+stringdef i: '{U+03CA}' // iota with dialytika
+stringdef u: '{U+03CB}' // upsilon with dialytika
+
+stringdef A' '{U+0386}' // Alpha with tonos
+stringdef E' '{U+0388}' // Epsilon with tonos
+stringdef I' '{U+0389}' // Eta with tonos
+stringdef Y' '{U+038A}' // Iota with tonos
+stringdef O' '{U+038C}' // Omicron with tonos
+stringdef U' '{U+038E}' // Upsilon with tonos
+stringdef OO' '{U+038F}' // Omega with tonos
+
+externals ( stem )
+
+booleans ( test1 )
+
+groupings ( v v2 )
+
+routines ( tolower has_min_length
+ steps1 steps2 steps3 steps4 steps5 steps6 steps7
+ steps8 steps9 steps10
+ step1 step2a step2b step2c step2d step3 step4
+ step5a step5b step5c step5d step5e step5f
+ step5g step5h step5i
+ step5j step5k step5l step5m
+ step6 step7 )
+
+define v '{a}{e}{i}{y}{o}{u}{oo}'
+define v2 '{a}{e}{i}{y}{o}{oo}'
+
+backwardmode (
+ define has_min_length as ( // guard: signals true only when the word length is 3 or more
+ $(len >= 3) // NOTE(review): `len` is understood to count characters, not bytes -- confirm against the Snowball manual
+ )
+
+ define tolower as ( // normalises the whole word: case folding, accent/dialytika removal, final sigma -> sigma
+ repeat ( // one letter per iteration; the loop ends when nothing (not even '' -> next) can match
+ [substring] among (
+ '{A}' (<- '{a}') // plain capitals -> matching lower-case letter
+ '{V}' (<- '{v}')
+ '{G}' (<- '{g}')
+ '{D}' (<- '{d}')
+ '{E}' (<- '{e}')
+ '{Z}' (<- '{z}')
+ '{I}' (<- '{i}')
+ '{Th}' (<- '{th}')
+ '{Y}' (<- '{y}')
+ '{K}' (<- '{k}')
+ '{L}' (<- '{l}')
+ '{M}' (<- '{m}')
+ '{N}' (<- '{n}')
+ '{X}' (<- '{x}')
+ '{O}' (<- '{o}')
+ '{P}' (<- '{p}')
+ '{R}' (<- '{r}')
+ '{S}' (<- '{s}')
+ '{T}' (<- '{t}')
+ '{U}' (<- '{u}')
+ '{F}' (<- '{f}')
+ '{Ch}' (<- '{ch}')
+ '{Ps}' (<- '{ps}')
+ '{Oo}' (<- '{oo}')
+ '{Y:}' (<- '{y}') // capitals with dialytika -> plain lower-case vowel
+ '{U:}' (<- '{u}')
+ '{a'}' (<- '{a}') // lower-case vowels with tonos -> plain vowel
+ '{e'}' (<- '{e}')
+ '{i'}' (<- '{i}')
+ '{y'}' (<- '{y}')
+ '{o'}' (<- '{o}')
+ '{u'}' (<- '{u}')
+ '{oo'}' (<- '{oo}')
+ '{i:'}' (<- '{y}') // U+0390 is an iota, so it maps to iota {y}; {i} is eta in this file's naming
+ '{u:'}' (<- '{u}')
+ '{i:}' (<- '{y}') // U+03CA is an iota as well, consistent with '{Y:}' -> '{y}' above
+ '{u:}' (<- '{u}')
+ '{A'}' (<- '{a}') // capitals with tonos -> plain lower-case vowel
+ '{E'}' (<- '{e}')
+ '{I'}' (<- '{i}')
+ '{Y'}' (<- '{y}')
+ '{O'}' (<- '{o}')
+ '{U'}' (<- '{u}')
+ '{OO'}' (<- '{oo}')
+ '{ss}' (<- '{s}') // final sigma is folded into ordinary sigma
+ '' (next) // any other character: leave it and step over it
+ )
+ )
+ )
+
+ define step1 as ( // irregular forms: a word ending in one of the listed forms has that ending replaced by the fixed stem on the same row
+ [substring] among ( // matched as a suffix (no atlimit test), longest listed string wins
+ '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}')
+ '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}')
+ '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}')
+ '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}')
+ '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}')
+ '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}')
+ '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}')
+ '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}')
+ '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}')
+ '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}')
+ '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}')
+ )
+ unset test1 // reached only when a form above matched; the code that reads test1 is outside this view
+ )
+
+ define steps1 as ( // removes an ending of the '{y}{z}...' family, then repairs stems that became too short
+ [substring] among (
+ '{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}'
+ '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' (
+ delete // the ending is always removed first
+ unset test1 // flag consumed by code outside this view
+ ([] substring atlimit among ( // `[]` = empty slice at the new word end; atlimit = the listed string must be the whole remaining word
+ '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}'
+ (<- '{y}') // for these remainders '{y}' is appended again
+ )) or
+ ([] substring atlimit among ( // second exception list, tried only when the first one did not match
+ '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}'
+ '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}'
+ '{r}' '{p}{y}{p}{e}{r}{o}{r}'
+ (<- '{y}{z}') // for these remainders '{y}{z}' is appended again
+ ))
+ )
+ )
+ )
+
+ define steps2 as ( // removes an ending of the '{oo}{th}{i}{k}...' family, keeping '{oo}{n}' for a few short stems
+ [substring] among (
+ '{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ [] substring atlimit among ( // fires only if the remaining word is exactly one of these strings
+ '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') // append '{oo}{n}' at the empty slice marked by `[]`
+ )
+ )
+ )
+ )
+
+ define steps3 as ( // removes an ending of the '{y}{s}{a}...' family, then repairs stems that became too short
+ [substring] among (
+ '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ ('{y}{s}{a}' atlimit <- '{y}{s}') or // NOTE(review): runs after the delete and sets no new `[`/`]`; confirm which text this `<-` is meant to rewrite
+ ([] substring atlimit among ( // remaining word is exactly one of these strings
+ '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}'
+ '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}'
+ '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}'
+ (<- '{y}') // append '{y}' again
+ )) or
+ ([] substring atlimit among ( // last alternative, same whole-word test
+ '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}'
+ '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}'
+ (<- '{y}{s}') // append '{y}{s}' again
+ ))
+ )
+ )
+ )
+
+ define steps4 as ( // removes an ending of the '{y}{s}{oo}...' family, keeping '{y}' for listed short stems
+ [substring] among (
+ '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ [] substring atlimit among ( // fires only if the remaining word is exactly one of these strings
+ '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}'
+ '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}'
+ '{e}{k}{l}{e}' '{p}{e}'
+ (<- '{y}') // append '{y}' again
+ )
+ )
+ )
+ )
+
+ define steps5 as ( // removes an ending of the '{y}{s}{t}...' family, then repairs stems that became too short
+ [substring] among (
+ '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}'
+ '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ ([] substring atlimit among ( // remaining word is exactly one of these strings
+ '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}'
+ (<- '{y}') // append '{y}' again
+ )) or
+ ([] substring atlimit among ( // tried only when the first list did not match
+ '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}'
+ '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}'
+ '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}'
+ (<- '{y}{s}{t}') // append '{y}{s}{t}' again
+ ))
+ )
+ )
+ )
+
+ define steps6 as ( // removes an ending of the '{y}{s}{m}...' family, followed by one of three possible repairs
+ [substring] among (
+ '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ ([] substring atlimit among ( // remaining word is exactly one of these strings
+ '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}'
+ (<- '{y}{s}{m}') // append '{y}{s}{m}' again
+ )) or
+ ([] substring atlimit among ( // second whole-word list
+ '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}'
+ (<- '{y}') // append '{y}' again
+ )) or
+ ([substring] among ( // no atlimit here: matched as an ending; each row rewrites it without its final '{y}{k}' or '{y}{n}'
+ '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}')
+ '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}')
+ '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}')
+ '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}')
+ '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}')
+ '{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}')
+ '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}')
+ '{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}')
+ '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}')
+ '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}')
+ ))
+ )
+ )
+ )
+
+ define steps7 as ( // removes '{a}{r}{a}{k}{y}', '{a}{r}{a}{k}{y}{a}', '{o}{u}{d}{a}{k}{y}' or '{o}{u}{d}{a}{k}{y}{a}'
+ [substring] among (
+ '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ [] substring atlimit among ( // fires only if the remaining word is exactly '{s}' or '{ch}'
+ '{s}' '{ch}'
+ (<- '{a}{r}{a}{k}') // append '{a}{r}{a}{k}', whichever of the four endings was removed
+ )
+ )
+ )
+ )
+
+ define steps8 as ( // removes a '{a}{k}{y}' / '{y}{t}{s}' style ending, followed by one of three possible repairs
+ [substring] among (
+ '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ ([] substring atlimit among ( // remaining word is exactly one of these strings
+ '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}'
+ '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}'
+ '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}'
+ '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}'
+ (<- '{a}{k}') // append '{a}{k}', regardless of which ending was removed
+ )) or
+ ([] substring atlimit among ( // NOTE(review): '{k}{o}{n}' is also in the first list, which is tried first, so that entry looks unreachable here
+ '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{k}{o}{n}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}'
+ '{p}{a}{t}{e}{r}' '{p}' '{s}{k}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}'
+ (<- '{y}{t}{s}') // append '{y}{t}{s}'
+ )) or
+ ([] '{k}{o}{r}' <- '{y}{t}{s}') // no atlimit: any remainder that merely ends in '{k}{o}{r}' gets '{y}{t}{s}' appended
+ )
+ )
+ )
+
+ define steps9 as ( // removes '{y}{d}{y}{o}', '{y}{d}{y}{a}' or '{y}{d}{y}{oo}{n}', keeping '{y}{d}' for listed stems
+ [substring] among (
+ '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ ([] substring atlimit among ( // remaining word is exactly one of these strings
+ '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}')
+ )) or
+ ([] substring among ( // no atlimit: it is enough for the remaining word to end in one of these strings
+ '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}')
+ ))
+ )
+ )
+ )
+
+ define steps10 as ( // removes an ending of the '{y}{s}{k}...' family, keeping '{y}{s}{k}' for listed short stems
+ [substring] among (
+ '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' (
+ delete
+ unset test1 // flag consumed by code outside this view
+ [] substring atlimit among ( // fires only if the remaining word is exactly one of these strings
+ '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}'
+ (<- '{y}{s}{k}') // append '{y}{s}{k}' again
+ )
+ )
+ )
+ )
+
+ define step2a as ( // handles the endings '{a}{d}{e}{s}' and '{a}{d}{oo}{n}'
+ [substring] among (
+ '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) // the ending is removed first in every case
+ )
+ not ([substring] among ( // exception list: if the remainder ends in one of these, the routine fails here
+ '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}'
+ )) // NOTE(review): on that failure the delete above is presumably kept (Snowball does not roll back string edits) -- confirm
+ insert '{a}{d}' // all other words get '{a}{d}' put back after the deletion
+ )
+
+ define step2b as ( // handles the endings '{e}{d}{e}{s}' and '{e}{d}{oo}{n}'
+ [substring] among (
+ '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete)
+ )
+ [] substring among ( // no atlimit: the remainder only has to end in one of these strings
+ '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') // append '{e}{d}' at the empty slice marked by `[]`
+ )
+ )
+
+ define step2c as ( // handles the endings '{o}{u}{d}{e}{s}' and '{o}{u}{d}{oo}{n}'
+ [substring] among (
+ '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete)
+ )
+ [] substring among ( // no atlimit: the remainder only has to end in one of these strings
+ '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}'
+ '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') // append '{o}{u}{d}' at the empty slice marked by `[]`
+ )
+ )
+
+ define step2d as ( // handles the endings '{e}{oo}{s}' and '{e}{oo}{n}'
+ [substring] among (
+ '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) // remove the ending and clear test1 (read outside this view)
+ )
+ [] substring atlimit among ( // fires only if the remaining word is exactly one of these strings
+ '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') // append '{e}' again
+ )
+ )
+
+ define step3 as ( // handles the endings '{y}{a}', '{y}{o}{u}' and '{y}{oo}{n}'
+ [substring] among (
+ '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) // remove the ending and clear test1 (read outside this view)
+ )
+ ([] v <- '{y}') // if the remainder ends in a letter of grouping v, '{y}' is appended again
+ )
+
+ define step4 as ( // handles the endings '{y}{k}{a}', '{y}{k}{o}', '{y}{k}{o}{u}' and '{y}{k}{oo}{n}'
+ [substring] among (
+ '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) // remove the ending and clear test1 (read outside this view)
+ )
+ ([] v <- '{y}{k}') or // remainder ends in a letter of grouping v: append '{y}{k}' again
+ [] substring atlimit among ( // otherwise: remaining word must be exactly one of these strings
+ '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}'
+ '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}'
+ '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}'
+ '{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}'
+ (<- '{y}{k}') // append '{y}{k}' again
+ )
+ )
+
+ define step5a as ( // handles endings in '...{a}{m}{e}'
+ do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}') // whole word is '{a}{g}{a}{m}{e}'. NOTE(review): no `[`/`]` is set here, so `<-` relies on slice marks left by earlier code -- confirm
+ do ( // `do`: a miss here does not stop the routine
+ [substring] among (
+ '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1) // longer endings are removed whole
+ )
+ )
+ ['{a}{m}{e}'] // from here on the routine requires the word to (still) end in '{a}{m}{e}'
+ delete
+ unset test1 // flag consumed by code outside this view
+ [] substring atlimit among ( // fires only if the remaining word is exactly one of these strings
+ '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}'
+ (<- '{a}{m}') // append '{a}{m}' again
+ )
+ )
+
+ define step5b as ( // handles endings in '...{a}{n}{e}'
+ do ( // `do`: a miss in this first stage does not stop the routine
+ [substring] among (
+ '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}'
+ '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' (
+ delete // longer endings are removed whole
+ unset test1 // flag consumed by code outside this view
+ [] substring atlimit among ( // remaining word is exactly '{t}{r}' or '{t}{s}'
+ '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') // append '{a}{g}{a}{n}', whichever listed ending was removed
+ )
+ )
+ )
+ )
+ ['{a}{n}{e}'] // from here on the routine requires the word to (still) end in '{a}{n}{e}'
+ delete
+ unset test1
+ ([] v2 <- '{a}{n}') or // remainder ends in a letter of grouping v2: append '{a}{n}' again
+ [] substring atlimit among ( // otherwise: remaining word must be exactly one of these strings
+ '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}'
+ '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}'
+ '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}'
+ '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}'
+ '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}'
+ '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}'
+ '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}'
+ '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}'
+ '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}'
+ '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}'
+ '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}'
+ '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}'
+ '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}'
+ '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}'
+ '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}'
+ '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}'
+ (<- '{a}{n}') // append '{a}{n}' again
+ )
+ )
+
+ define step5c as (
+ do (
+ [substring] among (
+ '{i}{s}{e}{t}{e}' (delete unset test1)
+ )
+ )
+ ['{e}{t}{e}']
+ delete
+ unset test1
+ ([] v2 <- '{e}{t}') or
+ ([] substring among (
+ '{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}'
+ '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}'
+ '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}'
+ (<- '{e}{t}')
+ )) or
+ [] substring atlimit among (
+ '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}'
+ '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}'
+ '{th}{a}{r}{r}' '{th}'
+ (<- '{e}{t}')
+ )
+ )
+
+ define step5d as ( // Strip '{o}{n}{t}{a}{s}' / '{oo}{n}{t}{a}{s}', re-adding a shorter ending after two specific stems.
+ [substring] among (
+ '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' (
+ delete
+ unset test1
+ ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or // '{a}{r}{ch}' must reach the limit: append '{o}{n}{t}'
+ ([] '{k}{r}{e}' <- '{oo}{n}{t}') // otherwise, text ending in '{k}{r}{e}': append '{oo}{n}{t}'
+ )
+ )
+ )
+
+ define step5e as ( // Strip '{o}{m}{a}{s}{t}{e}' / '{y}{o}{m}{a}{s}{t}{e}'; re-add '{o}{m}{a}{s}{t}' when only '{o}{n}' is left.
+ [substring] among (
+ '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' (
+ delete
+ unset test1
+ ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}') // '{o}{n}' must reach the limit for the replacement to happen
+ )
+ )
+ )
+
+ define step5f as ( // Handle endings '{y}{e}{s}{t}{e}' (optional pass) and '{e}{s}{t}{e}'.
+ do ( // optional pass: failure of this group does not abort the routine
+ ['{y}{e}{s}{t}{e}']
+ delete
+ unset test1
+ [] substring atlimit among ( // table match that must reach the limit
+ '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}')
+ )
+ )
+ ['{e}{s}{t}{e}'] // the routine stops here unless the word still ends in '{e}{s}{t}{e}'
+ delete
+ unset test1
+ [] substring atlimit among ( // table match that must reach the limit
+ '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}'
+ (<- '{y}{e}{s}{t}') // NOTE(review): this branch also re-adds '{y}{e}{s}{t}' although '{e}{s}{t}{e}' was removed - confirm against the reference algorithm
+ )
+ )
+
+ define step5g as ( // Handle '{i}{th}{i}{k}...' (optional pass) and '{i}{k}...' endings.
+ do ( // optional pass: failure of this group does not abort the routine
+ [substring] among (
+ '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1)
+ )
+ )
+ [substring] among (
+ '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' (
+ delete
+ unset test1
+ ([] substring among ( // first alternative: remaining text ends in one of these
+ '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}')
+ )) or
+ ([] substring atlimit among ( // second alternative: the table match must reach the limit
+ '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}')
+ ))
+ )
+ )
+ )
+
+ define step5h as (
+ [substring] among (
+ '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' (
+ delete
+ unset test1
+ ([] substring among (
+ '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}'
+ '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}')
+
+ )) or
+ ([] substring atlimit among (
+ '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}'
+ '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}'
+ '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}'
+ (<- '{o}{u}{s}')
+ ))
+ )
+ )
+ )
+
+ define step5i as (
+ [substring] among (
+ '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' (
+ delete
+ unset test1
+ ([] '{k}{o}{l}{l}' <- '{a}{g}') or (
+ not ([substring] among ('{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}'))
+ ([] substring among (
+ '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}'
+ (<- '{a}{g}')
+ )) or
+ ([] substring atlimit among (
+ '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}'
+ '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}'
+ '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}'
+ '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}'
+ '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}'
+ (<- '{a}{g}')
+ ))
+ )
+ )
+ )
+ )
+
+ define step5j as ( // Strip '{i}{s}{e}' / '{i}{s}{o}{u}' / '{i}{s}{a}'; re-add '{i}{s}' for the listed stems.
+ [substring] among (
+ '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1) // remove the matched ending and clear test1
+ )
+ [] substring atlimit among ( // table match that must reach the limit
+ '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}')
+ )
+ )
+
+ define step5k as ( // Strip '{i}{s}{t}{e}'; re-add '{i}{s}{t}' for the listed stems.
+ [substring] among (
+ '{i}{s}{t}{e}' (delete unset test1) // remove the matched ending and clear test1
+ )
+ [] substring atlimit among ( // table match that must reach the limit
+ '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}'
+ (<- '{i}{s}{t}') // shared action for every string in the table above
+ )
+ )
+
+ define step5l as ( // Strip endings in '{o}{u}{n}{e}'; re-add '{o}{u}{n}' for the listed stems.
+ [substring] among (
+ '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1) // remove the matched ending and clear test1
+ )
+ [] substring atlimit among ( // table match that must reach the limit
+ '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}')
+ )
+ )
+
+ define step5m as ( // Strip endings in '{o}{u}{m}{e}'; re-add '{o}{u}{m}' for the listed stems.
+ [substring] among (
+ '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1) // remove the matched ending and clear test1
+ )
+ [] substring atlimit among ( // table match that must reach the limit
+ '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}'
+ (<- '{o}{u}{m}') // shared action for every string in the table above
+ )
+ )
+
+ define step6 as ( // Shorten '{m}{a}{t}...' endings, then strip a general ending if test1 is still set.
+ do ( // optional pass: failure of this group does not abort the routine
+ [substring] among (
+ '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') // replace the matched ending with '{m}{a}'
+ )
+ )
+ test1 // continue only while the test1 flag is still set
+ [substring] among (
+ '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}'
+ '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}'
+ '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}'
+ '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}'
+ '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}'
+ '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}'
+ '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}'
+ '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}'
+ '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}'
+ '{oo}{n}' (delete) // shared action: every ending in this table is simply removed
+ )
+ )
+
+ define step7 as ( // Remove one of the listed '...{t}{e}{r}' / '...{t}{a}{t}' endings.
+ [substring] among (
+ '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete)
+ )
+ )
+)
+
+define stem as ( // Entry point: runs the step routines in order over the word, in backward mode.
+ backwards (
+ do tolower // tolower, has_min_length, step1 and steps1..steps10 are defined outside this excerpt
+ has_min_length // not wrapped in `do`: if it fails, the rest of this sequence is skipped
+ set test1 // flag cleared by steps that delete an ending; step6 checks it before its main table
+ do step1 // every step below is wrapped in `do`, so a failing step does not stop the pipeline
+ do steps1
+ do steps2
+ do steps3
+ do steps4
+ do steps5
+ do steps6
+ do steps7
+ do steps8
+ do steps9
+ do steps10
+ do step2a
+ do step2b
+ do step2c
+ do step2d
+ do step3
+ do step4
+ do step5a
+ do step5b
+ do step5c
+ do step5d
+ do step5e
+ do step5f
+ do step5g
+ do step5h
+ do step5j // note: step5j is invoked before step5i
+ do step5i
+ do step5k
+ do step5l
+ do step5m
+ do step6
+ do step7
+ )
+)
diff --git a/contrib/snowball/algorithms/hindi.sbl b/contrib/snowball/algorithms/hindi.sbl
new file mode 100644
index 000000000..bfdfac0e8
--- /dev/null
+++ b/contrib/snowball/algorithms/hindi.sbl
@@ -0,0 +1,323 @@
+// An implementation of "A Lightweight Stemmer for Hindi":
+// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf
+
+externals ( stem )
+
+stringescapes {}
+
+// The transliteration scheme used for our stringdefs matches that used in the
+// paper, as documented in the appendix. It appears to match the WX notation
+// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently
+// uses 'z' for Anunasika whereas the paper uses Mh.
+//
+// We discriminate dependent vowels by adding a leading "_" to their stringdef
+// names (mnemonic: the _ signifies removing the implicit a from the preceding
+// character).
+
+// Vowels and sonorants:
+stringdef a '{U+0905}'
+stringdef A '{U+0906}'
+stringdef i '{U+0907}'
+stringdef I '{U+0908}'
+stringdef u '{U+0909}'
+stringdef U '{U+090A}'
+stringdef q '{U+090B}'
+stringdef e '{U+090F}'
+stringdef E '{U+0910}'
+stringdef o '{U+0913}'
+stringdef O '{U+0914}'
+
+// Vowel signs:
+stringdef _A '{U+093E}'
+stringdef _i '{U+093F}'
+stringdef _I '{U+0940}'
+stringdef _u '{U+0941}'
+stringdef _U '{U+0942}'
+stringdef _q '{U+0943}'
+stringdef _e '{U+0947}'
+stringdef _E '{U+0948}'
+stringdef _o '{U+094B}'
+stringdef _O '{U+094C}'
+
+// Diacritics:
+stringdef M '{U+0902}'
+stringdef H '{U+0903}'
+stringdef Mh '{U+0901}'
+stringdef Z '{U+093C}' // Nukta
+stringdef virama '{U+094D}'
+
+// Velar consonants:
+stringdef k '{U+0915}'
+stringdef K '{U+0916}'
+stringdef g '{U+0917}'
+stringdef G '{U+0918}'
+stringdef f '{U+0919}'
+
+// Palatal consonants:
+stringdef c '{U+091A}'
+stringdef C '{U+091B}'
+stringdef j '{U+091C}'
+stringdef J '{U+091D}'
+stringdef F '{U+091E}'
+
+// Retroflex consonants:
+stringdef t '{U+091F}'
+stringdef T '{U+0920}'
+stringdef d '{U+0921}'
+stringdef D '{U+0922}'
+stringdef N '{U+0923}'
+
+// Dental consonants:
+stringdef w '{U+0924}'
+stringdef W '{U+0925}'
+stringdef x '{U+0926}'
+stringdef X '{U+0927}'
+stringdef n '{U+0928}'
+
+// Labial consonants:
+stringdef p '{U+092A}'
+stringdef P '{U+092B}'
+stringdef b '{U+092C}'
+stringdef B '{U+092D}'
+stringdef m '{U+092E}'
+
+// Semi-vowels:
+stringdef y '{U+092F}'
+stringdef r '{U+0930}'
+stringdef l '{U+0932}'
+stringdef v '{U+0935}'
+
+// Fricatives:
+stringdef S '{U+0936}'
+stringdef R '{U+0937}'
+stringdef s '{U+0938}'
+stringdef h '{U+0939}'
+
+stringdef lY '{U+0933}'
+
+// Precomposed characters - letters + nukta:
+stringdef nZ '{U+0929}' // ≡ {n}{Z}
+stringdef rZ '{U+0931}' // ≡ {r}{Z}
+stringdef lYZ '{U+0934}' // ≡ {lY}{Z}
+stringdef kZ '{U+0958}' // ≡ {k}{Z}
+stringdef KZ '{U+0959}' // ≡ {K}{Z}
+stringdef gZ '{U+095A}' // ≡ {g}{Z}
+stringdef jZ '{U+095B}' // ≡ {j}{Z}
+stringdef dZ '{U+095C}' // ≡ {d}{Z}
+stringdef DZ '{U+095D}' // ≡ {D}{Z}
+stringdef PZ '{U+095E}' // ≡ {P}{Z}
+stringdef yZ '{U+095F}' // ≡ {y}{Z}
+
+integers ( p )
+
+groupings ( consonant )
+
+routines ( CONSONANT )
+
+define consonant '{k}{K}{g}{G}{f}' + // grouping built by concatenating the rows below (rows follow the stringdef sections above)
+ '{c}{C}{j}{J}{F}' +
+ '{t}{T}{d}{D}{N}' +
+ '{w}{W}{x}{X}{n}' +
+ '{p}{P}{b}{B}{m}' +
+ '{y}{r}{l}{v}' +
+ '{S}{R}{s}{h}' +
+ '{lY}' +
+ '{Z}' + // Nukta
+ // Precomposed characters - letter and nukta:
+ '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}'
+
+backwardmode ( define CONSONANT as ( consonant ) ) // routine form of the grouping; stem uses it as a condition on some among entries
+
+define stem as (
+ test ( next setmark p )
+ backwards (
+ // We assume in this implementation that the whole word doesn't count
+ // as a valid suffix to remove, so we remove the longest suffix from
+ // the list which leaves at least one character. This change affects
+ // 47 words out of the 65,140 in the sample vocabulary from Hindi
+ // wikipedia.
+ setlimit tomark p for ([substring])
+ among (
+ // The list below is derived from figure 3 in the paper.
+ //
+ // We perform the stemming on the Devanagari characters rather than
+ // transliterating to Latin, so we have adapted the list below to
+ // reflect this by converting suffixes back to Devanagari as
+ // follows:
+ //
+ // * within the suffixes, "a" after a consonant is dropped since
+ // consonants have an implicit "a".
+ //
+ // * within the suffixes, a vowel other than "a" after a consonant
+ // is a dependent vowel (vowel sign); a vowel (including "a")
+ // after a non-consonant is an independent vowel.
+ //
+ // * to allow the vowel at the start of each suffix being dependent
+ // or independent, we include each suffix twice. For the
+ // dependent version, a leading "a" is dropped and we check that
+ // the suffix is preceded by a consonant (which will have an
+ // implicit "a").
+ //
+ // * we add '{a}', which is needed for the example given right at
+ // the end of section 5 to work (conflating BarawIya and
+ // BarawIyawA), and which 3.1 a.v strongly suggests should be in
+ // the list:
+ //
+ // Thus, the following suffix deletions (longest possible
+ // match) are required to reduce inflected forms of masculine
+ // nouns to a common stem:
+ // a A i [...]
+ //
+ // Adding '{a}' only affects 2 words out of the 65,140 in the
+ // sample vocabulary.
+ //
+ // * The transliterations of our stems would end with "a" when our
+ // stems end in a consonant, so we also include {virama} in the
+ // list of suffixes to remove (this affects 222 words from the
+ // sample vocabulary).
+ //
+ // We've also assumed that Mh in the suffix list always means {Mh}
+ // and never {M}{h}{virama}. Only one of the 65,140 words in the
+ // sample vocabulary stems differently due to this (and that word
+ // seems to be a typo).
+
+ '{virama}'
+
+ '{a}'
+ '{A}'
+ '{i}'
+ '{I}'
+ '{u}'
+ '{U}'
+ '{e}'
+ '{o}'
+ '{e}{M}'
+ '{o}{M}'
+ '{A}{M}'
+ '{u}{A}{M}'
+ '{u}{e}{M}'
+ '{u}{o}{M}'
+ '{A}{e}{M}'
+ '{A}{o}{M}'
+ '{i}{y}{_A}{M}'
+ '{i}{y}{_o}{M}'
+ '{A}{i}{y}{_A}{M}'
+ '{A}{i}{y}{_o}{M}'
+ '{A}{Mh}'
+ '{i}{y}{_A}{Mh}'
+ '{A}{i}{y}{_A}{Mh}'
+ '{a}{w}{_A}{e}{M}'
+ '{a}{w}{_A}{o}{M}'
+ '{a}{n}{_A}{e}{M}'
+ '{a}{n}{_A}{o}{M}'
+ '{a}{w}{_A}'
+ '{a}{w}{_I}'
+ '{I}{M}'
+ '{a}{w}{_I}{M}'
+ '{a}{w}{_e}'
+ '{A}{w}{_A}'
+ '{A}{w}{_I}'
+ '{A}{w}{_I}{M}'
+ '{A}{w}{_e}'
+ '{a}{n}{_A}'
+ '{a}{n}{_I}'
+ '{a}{n}{_e}'
+ '{A}{n}{_A}'
+ '{A}{n}{_e}'
+ '{U}{M}{g}{_A}'
+ '{U}{M}{g}{_I}'
+ '{A}{U}{M}{g}{_A}'
+ '{A}{U}{M}{g}{_I}'
+ '{e}{M}{g}{_e}'
+ '{e}{M}{g}{_I}'
+ '{A}{e}{M}{g}{_e}'
+ '{A}{e}{M}{g}{_I}'
+ '{o}{g}{_e}'
+ '{o}{g}{_I}'
+ '{A}{o}{g}{_e}'
+ '{A}{o}{g}{_I}'
+ '{e}{g}{_A}'
+ '{e}{g}{_I}'
+ '{A}{e}{g}{_A}'
+ '{A}{e}{g}{_I}'
+ '{A}{y}{_A}'
+ '{A}{e}'
+ '{A}{I}'
+ '{A}{I}{M}'
+ '{i}{e}'
+ '{A}{o}'
+ '{A}{i}{e}'
+ '{a}{k}{r}'
+ '{A}{k}{r}'
+
+ '{_A}'
+ '{_i}'
+ '{_I}'
+ '{_u}'
+ '{_U}'
+ '{_e}'
+ '{_o}'
+ '{_e}{M}'
+ '{_o}{M}'
+ '{_A}{M}'
+ '{_u}{A}{M}'
+ '{_u}{e}{M}'
+ '{_u}{o}{M}'
+ '{_A}{e}{M}'
+ '{_A}{o}{M}'
+ '{_i}{y}{_A}{M}'
+ '{_i}{y}{_o}{M}'
+ '{_A}{i}{y}{_A}{M}'
+ '{_A}{i}{y}{_o}{M}'
+ '{_A}{Mh}'
+ '{_i}{y}{_A}{Mh}'
+ '{_A}{i}{y}{_A}{Mh}'
+ '{_I}{M}'
+ '{_A}{w}{_A}'
+ '{_A}{w}{_I}'
+ '{_A}{w}{_I}{M}'
+ '{_A}{w}{_e}'
+ '{_A}{n}{_A}'
+ '{_A}{n}{_e}'
+ '{_U}{M}{g}{_A}'
+ '{_U}{M}{g}{_I}'
+ '{_A}{U}{M}{g}{_A}'
+ '{_A}{U}{M}{g}{_I}'
+ '{_e}{M}{g}{_e}'
+ '{_e}{M}{g}{_I}'
+ '{_A}{e}{M}{g}{_e}'
+ '{_A}{e}{M}{g}{_I}'
+ '{_o}{g}{_e}'
+ '{_o}{g}{_I}'
+ '{_A}{o}{g}{_e}'
+ '{_A}{o}{g}{_I}'
+ '{_e}{g}{_A}'
+ '{_e}{g}{_I}'
+ '{_A}{e}{g}{_A}'
+ '{_A}{e}{g}{_I}'
+ '{_A}{y}{_A}'
+ '{_A}{e}'
+ '{_A}{I}'
+ '{_A}{I}{M}'
+ '{_i}{e}'
+ '{_A}{o}'
+ '{_A}{i}{e}'
+ '{_A}{k}{r}'
+
+ /* Suffixes with a leading implicit a: */
+ '{w}{_A}{e}{M}' CONSONANT
+ '{w}{_A}{o}{M}' CONSONANT
+ '{n}{_A}{e}{M}' CONSONANT
+ '{n}{_A}{o}{M}' CONSONANT
+ '{w}{_A}' CONSONANT
+ '{w}{_I}' CONSONANT
+ '{w}{_I}{M}' CONSONANT
+ '{w}{_e}' CONSONANT
+ '{n}{_A}' CONSONANT
+ '{n}{_I}' CONSONANT
+ '{n}{_e}' CONSONANT
+ '{k}{r}' CONSONANT
+ )
+ delete
+ )
+)
diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian.sbl
index c812e055c..2d7885cf1 100644
--- a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl
+++ b/contrib/snowball/algorithms/hungarian.sbl
@@ -27,17 +27,17 @@ groupings ( v )
stringescapes {}
-/* special characters (in Unicode) */
-
-stringdef a' hex 'E1' //a-acute
-stringdef e' hex 'E9' //e-acute
-stringdef i' hex 'ED' //i-acute
-stringdef o' hex 'F3' //o-acute
-stringdef o" hex 'F6' //o-umlaut
-stringdef oq hex '151' //o-double acute
-stringdef u' hex 'FA' //u-acute
-stringdef u" hex 'FC' //u-umlaut
-stringdef uq hex '171' //u-double acute
+/* special characters */
+
+stringdef a' '{U+00E1}' //a-acute
+stringdef e' '{U+00E9}' //e-acute
+stringdef i' '{U+00ED}' //i-acute
+stringdef o' '{U+00F3}' //o-acute
+stringdef o" '{U+00F6}' //o-umlaut
+stringdef oq '{U+0151}' //o-double acute
+stringdef u' '{U+00FA}' //u-acute
+stringdef u" '{U+00FC}' //u-umlaut
+stringdef uq '{U+0171}' //u-double acute
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
diff --git a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
deleted file mode 100644
index d1a644931..000000000
--- a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
-Hungarian Stemmer
-Removes noun inflections
-*/
-
-routines (
- mark_regions
- R1
- v_ending
- case
- case_special
- case_other
- plural
- owned
- sing_owner
- plur_owner
- instrum
- factive
- undouble
- double
-)
-
-externals ( stem )
-
-integers ( p1 )
-groupings ( v )
-
-stringescapes {}
-
-/* special characters (in ISO Latin 2) */
-
-stringdef a' hex 'E1' //a-acute
-stringdef e' hex 'E9' //e-acute
-stringdef i' hex 'ED' //i-acute
-stringdef o' hex 'F3' //o-acute
-stringdef o" hex 'F6' //o-umlaut
-stringdef oq hex 'F5' //o-double acute
-stringdef u' hex 'FA' //u-acute
-stringdef u" hex 'FC' //u-umlaut
-stringdef uq hex 'FB' //u-double acute
-
-define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
-
-define mark_regions as (
-
- $p1 = limit
-
- (v goto non-v
- among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
- setmark p1)
- or
-
- (non-v gopast v setmark p1)
-)
-
-backwardmode (
-
- define R1 as $p1 <= cursor
-
- define v_ending as (
- [substring] R1 among(
- '{a'}' (<- 'a')
- '{e'}' (<- 'e')
- )
- )
-
- define double as (
- test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
- 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
- )
-
- define undouble as (
- next [hop 1] delete
- )
-
- define instrum as(
- [substring] R1 among(
- 'al' (double)
- 'el' (double)
- )
- delete
- undouble
- )
-
-
- define case as (
- [substring] R1 among(
- 'ban' 'ben'
- 'ba' 'be'
- 'ra' 're'
- 'nak' 'nek'
- 'val' 'vel'
- 't{o'}l' 't{oq}l'
- 'r{o'}l' 'r{oq}l'
- 'b{o'}l' 'b{oq}l'
- 'hoz' 'hez' 'h{o"}z'
- 'n{a'}l' 'n{e'}l'
- 'ig'
- 'at' 'et' 'ot' '{o"}t'
- '{e'}rt'
- 'k{e'}pp' 'k{e'}ppen'
- 'kor'
- 'ul' '{u"}l'
- 'v{a'}' 'v{e'}'
- 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
- 'k{e'}nt'
- 'en' 'on' 'an' '{o"}n'
- 'n'
- 't'
- )
- delete
- v_ending
- )
-
- define case_special as(
- [substring] R1 among(
- '{e'}n' (<- 'e')
- '{a'}n' (<- 'a')
- '{a'}nk{e'}nt' (<- 'a')
- )
- )
-
- define case_other as(
- [substring] R1 among(
- 'astul' 'est{u"}l' (delete)
- 'stul' 'st{u"}l' (delete)
- '{a'}stul' (<- 'a')
- '{e'}st{u"}l' (<- 'e')
- )
- )
-
- define factive as(
- [substring] R1 among(
- '{a'}' (double)
- '{e'}' (double)
- )
- delete
- undouble
- )
-
- define plural as (
- [substring] R1 among(
- '{a'}k' (<- 'a')
- '{e'}k' (<- 'e')
- '{o"}k' (delete)
- 'ak' (delete)
- 'ok' (delete)
- 'ek' (delete)
- 'k' (delete)
- )
- )
-
- define owned as (
- [substring] R1 among (
- 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
- '{e'}k{e'}' (<- 'e')
- '{a'}k{e'}' (<- 'a')
- 'k{e'}' (delete)
- '{e'}{e'}i' (<- 'e')
- '{a'}{e'}i' (<- 'a')
- '{e'}i' (delete)
- '{e'}{e'}' (<- 'e')
- '{e'}' (delete)
- )
- )
-
- define sing_owner as (
- [substring] R1 among(
- '{u"}nk' 'unk' (delete)
- '{a'}nk' (<- 'a')
- '{e'}nk' (<- 'e')
- 'nk' (delete)
- '{a'}juk' (<- 'a')
- '{e'}j{u"}k' (<- 'e')
- 'juk' 'j{u"}k' (delete)
- 'uk' '{u"}k' (delete)
- 'em' 'om' 'am' (delete)
- '{a'}m' (<- 'a')
- '{e'}m' (<- 'e')
- 'm' (delete)
- 'od' 'ed' 'ad' '{o"}d' (delete)
- '{a'}d' (<- 'a')
- '{e'}d' (<- 'e')
- 'd' (delete)
- 'ja' 'je' (delete)
- 'a' 'e' 'o' (delete)
- '{a'}' (<- 'a')
- '{e'}' (<- 'e')
- )
- )
-
- define plur_owner as (
- [substring] R1 among(
- 'jaim' 'jeim' (delete)
- '{a'}im' (<- 'a')
- '{e'}im' (<- 'e')
- 'aim' 'eim' (delete)
- 'im' (delete)
- 'jaid' 'jeid' (delete)
- '{a'}id' (<- 'a')
- '{e'}id' (<- 'e')
- 'aid' 'eid' (delete)
- 'id' (delete)
- 'jai' 'jei' (delete)
- '{a'}i' (<- 'a')
- '{e'}i' (<- 'e')
- 'ai' 'ei' (delete)
- 'i' (delete)
- 'jaink' 'jeink' (delete)
- 'eink' 'aink' (delete)
- '{a'}ink' (<- 'a')
- '{e'}ink' (<- 'e')
- 'ink'
- 'jaitok' 'jeitek' (delete)
- 'aitok' 'eitek' (delete)
- '{a'}itok' (<- 'a')
- '{e'}itek' (<- 'e')
- 'itek' (delete)
- 'jeik' 'jaik' (delete)
- 'aik' 'eik' (delete)
- '{a'}ik' (<- 'a')
- '{e'}ik' (<- 'e')
- 'ik' (delete)
- )
- )
-)
-
-define stem as (
- do mark_regions
- backwards (
- do instrum
- do case
- do case_special
- do case_other
- do factive
- do owned
- do sing_owner
- do plur_owner
- do plural
- )
-)
diff --git a/contrib/snowball/algorithms/indonesian.sbl b/contrib/snowball/algorithms/indonesian.sbl
new file mode 100644
index 000000000..ac0ee3655
--- /dev/null
+++ b/contrib/snowball/algorithms/indonesian.sbl
@@ -0,0 +1,192 @@
+// An implementation of the "Porter Stemmer for Bahasa Indonesia" from:
+// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf
+
+integers (
+ // The paper defines measure as the number of vowels in the word. We
+ // count this initially, then adjust the count each time we remove a
+ // prefix or suffix.
+ measure
+
+ // Numeric code for the type of prefix removed:
+ //
+ // 0 other/none
+ // 1 'di' or 'meng' or 'ter'
+ // 2 'per'
+ // 3 'ke' or 'peng'
+ // 4 'ber'
+ //
+ // Some of these have variant forms, so e.g. "meng" includes "men", "me",
+ // "meny", "mem".
+ //
+ // Note that the value of prefix is only used in remove_suffix (and
+ // routines it calls) so we don't need to worry about
+ // remove_second_order_prefix overwriting a value of prefix set by
+ // remove_first_order_prefix since remove_suffix gets called between
+ // the two.
+ prefix
+)
+
+groupings ( vowel )
+
+routines (
+ remove_particle
+ remove_possessive_pronoun
+ remove_first_order_prefix
+ remove_second_order_prefix
+ remove_suffix
+ KER
+ SUFFIX_KAN_OK
+ SUFFIX_AN_OK
+ SUFFIX_I_OK
+ VOWEL
+)
+
+externals ( stem )
+
+stringescapes {}
+
+backwardmode (
+
+ define remove_particle as ( // Delete a trailing 'kah', 'lah' or 'pun' and decrement measure.
+ [substring] among (
+ 'kah' 'lah' 'pun' (delete $measure-=1) // shared action for all three strings
+ )
+ )
+
+ define remove_possessive_pronoun as ( // Delete a trailing 'ku', 'mu' or 'nya' and decrement measure.
+ [substring] among (
+ 'ku' 'mu' 'nya' (delete $measure-=1) // shared action for all three strings
+ )
+ )
+
+ // prefix not in {ke, peng, per}
+ define SUFFIX_KAN_OK as (
+ // On page 29, the example "kompas Q.31" says "Both Nazief and Porter
+ // stemmer converted the word peledakan (blast, explotion) to ledak (to
+ // blast, to explode)". However, the algorithm as described doesn't
+ // behave in this way - grammatically the prefix pe- occurs as a
+ // variation of both the first-order derivational prefix peng- and the
+ // second-order derivational prefix per-, but table 2.5 doesn't include
+ // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
+ // as having prefix "per" not "peng", and so we remove derivational
+ // suffix "kan" rather than "an" to give stem leda. (Porter-style
+ // stemmers remove the longest suffix they can amongst those available,
+ // which this paper notes in the last paragraph on page 15).
+ //
+ // We resolve this by amending the condition on suffix "kan" to
+ // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
+ // behaviour match all the examples in the paper except for one:
+ // "perbaikan" is shown in table 3.4 as stemming to "bai", but with
+ // this change it now stems to "baik". The table notes that "baik" is
+ // the actual root so this deviation is an improvement. In a sample
+ // vocabulary derived from the most common words in id.wikipedia.org,
+ // this change only affects 0.12% of words (76 out of 64,587, including
+ // "peledakan" and "perbaikan").
+ $prefix != 3 and $prefix != 2
+ )
+
+ // prefix not in {di, meng, ter}
+ define SUFFIX_AN_OK as ( $prefix != 1 ) // 1 is the prefix code assigned for 'di', 'meng' and 'ter' (and their variants)
+
+ define SUFFIX_I_OK as (
+ // prefix not in {ke, peng, ber}
+ $prefix <= 2
+
+ // The rest of the condition from the paper is:
+ // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
+ //
+ // The meaning of this is unclear in several ways, and none of the
+ // examples given of the stemmer's behaviour in the paper help to
+ // resolve these issues.
+ //
+ // Notice that c₂ isn't actually used - the most obvious explanation
+ // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
+ //
+ // Elsewhere the paper defines V... as meaning "the stem starts with
+ // a vowel" and K... as meaning "the stem starts with a consonant".
+ //
+ // In other places where it says X|Y... it seems the | binds more
+ // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit
+ // odd as the first letter must be either a vowel or a consonant, so
+ // that really just means "ends cᵢcⱼ". However, nowhere in the paper
+ // uses or defines a notation such as ...X, which may explain this
+ // seemingly redundant way of specifying this.
+ //
+ // The conditions elsewhere on prefix removal (e.g. V...) are clearly
+ // on the stem left after the prefix is removed. None of the other
+ // rules for suffix removal have conditions on the stem, but for
+ // consistency with the prefix rules we might expect that the cᵢcⱼ
+ // test is on what's left *after* removing the "i" suffix.
+ //
+ // However, studying Indonesian wordlists and discussion with a native
+ // speaker leads us to conclude that the purpose of this check is to
+ // protect words of foreign origin (e.g. "televisi", "organisasi",
+ // "komunikasi") from stemming, and the common feature of these is
+ // that the word ends "-si", so we conclude that the condition here
+ // should be read as "word does not end -si", and this is what we
+ // have implemented.
+ not 's'
+ )
+
+ define remove_suffix as ( // Delete a trailing 'kan', 'an' or 'i' when its condition routine succeeds.
+ [substring] among (
+ 'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK // each string is followed by the routine that must succeed for it to match
+ (delete $measure-=1) // shared action for all three suffixes
+ )
+ )
+)
+
+define vowel 'aeiou' // grouping used for counting measure and for the prefix conditions
+
+define VOWEL as ( vowel ) // routine form of the grouping, used as an among condition after 'meny' / 'peny'
+
+define KER as ( non-vowel 'er' ) // one non-vowel character followed by 'er'; condition for the 'be' prefix
+
+define remove_first_order_prefix as ( // Strip one first-order prefix, record its code in prefix and decrement measure.
+ [substring] among (
+ 'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1)
+ 'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1)
+ 'meny' VOWEL ($prefix=1 <-'s' $measure-=1) // only when VOWEL succeeds after it; the prefix is replaced by 's'
+ 'peny' VOWEL ($prefix=3 <-'s' $measure-=1) // only when VOWEL succeeds after it; the prefix is replaced by 's'
+ 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) // a vowel next: replace with 'p'; otherwise delete
+ 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) // a vowel next: replace with 'p'; otherwise delete
+ )
+)
+
+define remove_second_order_prefix as ( // Strip one second-order prefix and decrement measure.
+ // The paper has the condition on removal of prefix "bel" and "pel" as
+ // just "ajar" not "ajar..." but it seems that the latter must be what
+ // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar".
+ // This change only affects a very small number of words (11 out of
+ // 64,587) and only for the better.
+ [substring] among (
+ 'per' 'pe' (delete $prefix=2 $measure-=1)
+ 'pelajar' (<-'ajar' $measure-=1) // prefix is left unchanged on this branch
+ 'ber' (delete $prefix=4 $measure-=1)
+ 'belajar' (<-'ajar' $prefix=4 $measure-=1)
+ 'be' KER (delete $prefix=4 $measure-=1) // only when KER succeeds after 'be'
+ )
+)
+
+define stem as ( // Entry point: count vowels, then strip particle, pronoun, prefixes and suffix while measure stays above 2.
+ $measure = 0
+ do ( repeat ( gopast vowel $measure+=1 ) ) // count the vowels in the word
+ $measure > 2 // a failing test here ends the routine; the same guard recurs below
+ $prefix = 0 // 0 means no prefix recorded (see the integers declaration)
+ backwards (
+ do remove_particle
+ $measure > 2
+ do remove_possessive_pronoun
+ )
+ $measure > 2
+ test ( // first alternative: taken only if a first-order prefix is removed
+ remove_first_order_prefix
+ do (
+ test ($measure > 2 backwards remove_suffix) // a failure here skips the second-order prefix step below
+ $measure > 2 remove_second_order_prefix
+ )
+ ) or ( // second alternative: second-order prefix first, then the suffix
+ do remove_second_order_prefix
+ do ($measure > 2 backwards remove_suffix)
+ )
+)
diff --git a/contrib/snowball/algorithms/irish.sbl b/contrib/snowball/algorithms/irish.sbl
new file mode 100644
index 000000000..0b1288a16
--- /dev/null
+++ b/contrib/snowball/algorithms/irish.sbl
@@ -0,0 +1,151 @@
+routines (
+ R1 R2 RV
+ initial_morph
+ mark_regions
+ noun_sfx
+ deriv
+ verb_sfx
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* Accented characters */
+
+stringdef a' '{U+00E1}' // a-acute
+stringdef e' '{U+00E9}' // e-acute
+stringdef i' '{U+00ED}' // i-acute
+stringdef o' '{U+00F3}' // o-acute
+stringdef u' '{U+00FA}' // u-acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}'
+
+define mark_regions as ( // Compute the marks pV, p1 and p2; each stays at limit when its scan fails.
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do (
+ gopast v setmark pV // pV: just past the first vowel
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1: just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same scan repeated, starting from p1
+ )
+)
+
+define initial_morph as ( // Rewrite or remove a word-initial sequence; stem calls this in forward mode.
+ [substring] among ( // strings listed without their own action share the next parenthesised action
+ 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
+ (delete)
+
+ // verbs
+ 'd{'}' // {'} is the string escape for an apostrophe
+ (delete)
+ 'd{'}fh'
+ (<- 'f')
+ // other contractions
+ 'm{'}' 'b{'}'
+ (delete)
+
+ 'sh'
+ (<- 's')
+
+ 'mb' // this group replaces a two- or three-letter initial cluster with a single consonant
+ (<- 'b')
+ 'gc'
+ (<- 'c')
+ 'nd'
+ (<- 'd')
+ 'bhf'
+ (<- 'f')
+ 'ng'
+ (<- 'g')
+ 'bp'
+ (<- 'p')
+ 'ts'
+ (<- 's')
+ 'dt'
+ (<- 't')
+
+ // Lenition
+ 'bh' // this group drops the 'h' after the initial consonant
+ (<- 'b')
+ 'ch'
+ (<- 'c')
+ 'dh'
+ (<- 'd')
+ 'fh'
+ (<- 'f')
+ 'gh'
+ (<- 'g')
+ 'mh'
+ (<- 'm')
+ 'ph'
+ (<- 'p')
+ 'th'
+ (<- 't')
+ )
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or after mark pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after mark p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after mark p2
+
+ define noun_sfx as ( // Delete a listed ending, subject to the R1 or R2 test of its group.
+ [substring] among (
+ 'amh' 'eamh' 'abh' 'eabh'
+ 'aibh' 'ibh' 'aimh' 'imh'
+ 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
+ (R1 delete) // shared by every string listed above: delete only if R1 succeeds
+ 'ire' 'ir{i'}' 'aire' 'air{i'}'
+ (R2 delete) // shared by the four strings above: delete only if R2 succeeds
+ )
+ )
+ define deriv as ( // Delete or rewrite a listed ending; each group of strings shares the action that follows it.
+ [substring] among (
+ 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
+ (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl
+ 'arcacht' 'arcachta{i'}' 'arcachta'
+ (<- 'arc') // monarcacht -> monarc
+ 'gineach' 'gineas' 'ginis'
+ (<- 'gin') // no region test on this or the following replacement groups
+ 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
+ (<- 'graf')
+ 'paite' 'patach' 'pataigh' 'patacha'
+ (<- 'paite')
+ '{o'}ideach' '{o'}ideacha' '{o'}idigh'
+ (<- '{o'}id')
+ )
+ )
+ define verb_sfx as ( // Delete a listed ending, subject to the RV or R1 test of its group.
+ [substring] among (
+ 'imid' 'aimid' '{i'}mid' 'a{i'}mid'
+ 'faidh' 'fidh'
+ (RV delete) // shared by every string listed above: delete only if RV succeeds
+ 'ain'
+ 'eadh' 'adh'
+ '{a'}il'
+ 'tear' 'tar'
+ (R1 delete) // shared by the strings since the previous action: delete only if R1 succeeds
+ )
+ )
+)
+
+define stem as ( // Entry point: normalise the word start, mark regions, then strip suffixes in backward mode.
+ do initial_morph // runs before mark_regions, so the marks are computed on the rewritten word
+ do mark_regions
+ backwards (
+ do noun_sfx // each step is wrapped in `do`, so a failing step does not stop the others
+ do deriv
+ do verb_sfx
+ )
+)
diff --git a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/italian.sbl
index b43295c09..bf0c16176 100644
--- a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl
+++ b/contrib/snowball/algorithms/italian.sbl
@@ -16,18 +16,18 @@ groupings ( v AEIO CG )
stringescapes {}
-/* special characters (in MS-DOS Latin I) */
-
-stringdef a' hex 'A0'
-stringdef a` hex '85'
-stringdef e' hex '82'
-stringdef e` hex '8A'
-stringdef i' hex 'A1'
-stringdef i` hex '8D'
-stringdef o' hex 'A2'
-stringdef o` hex '95'
-stringdef u' hex 'A3'
-stringdef u` hex '97'
+/* special characters */
+
+stringdef a' '{U+00E1}'
+stringdef a` '{U+00E0}'
+stringdef e' '{U+00E9}'
+stringdef e` '{U+00E8}'
+stringdef i' '{U+00ED}'
+stringdef i` '{U+00EC}'
+stringdef o' '{U+00F3}'
+stringdef o` '{U+00F2}'
+stringdef u' '{U+00FA}'
+stringdef u` '{U+00F9}'
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
diff --git a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
deleted file mode 100644
index 8d25cf64f..000000000
--- a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
+++ /dev/null
@@ -1,195 +0,0 @@
-
-routines (
- prelude postlude mark_regions
- RV R1 R2
- attached_pronoun
- standard_suffix
- verb_suffix
- vowel_suffix
-)
-
-externals ( stem )
-
-integers ( pV p1 p2 )
-
-groupings ( v AEIO CG )
-
-stringescapes {}
-
-/* special characters (in ISO Latin I) */
-
-stringdef a' hex 'E1'
-stringdef a` hex 'E0'
-stringdef e' hex 'E9'
-stringdef e` hex 'E8'
-stringdef i' hex 'ED'
-stringdef i` hex 'EC'
-stringdef o' hex 'F3'
-stringdef o` hex 'F2'
-stringdef u' hex 'FA'
-stringdef u` hex 'F9'
-
-define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
-
-define prelude as (
- test repeat (
- [substring] among(
- '{a'}' (<- '{a`}')
- '{e'}' (<- '{e`}')
- '{i'}' (<- '{i`}')
- '{o'}' (<- '{o`}')
- '{u'}' (<- '{u`}')
- 'qu' (<- 'qU')
- '' (next)
- )
- )
- repeat goto (
- v [ ('u' ] v <- 'U') or
- ('i' ] v <- 'I')
- )
-)
-
-define mark_regions as (
-
- $pV = limit
- $p1 = limit
- $p2 = limit // defaults
-
- do (
- ( v (non-v gopast v) or (v gopast non-v) )
- or
- ( non-v (non-v gopast v) or (v next) )
- setmark pV
- )
- do (
- gopast v gopast non-v setmark p1
- gopast v gopast non-v setmark p2
- )
-)
-
-define postlude as repeat (
-
- [substring] among(
- 'I' (<- 'i')
- 'U' (<- 'u')
- '' (next)
- )
-
-)
-
-backwardmode (
-
- define RV as $pV <= cursor
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define attached_pronoun as (
- [substring] among(
- 'ci' 'gli' 'la' 'le' 'li' 'lo'
- 'mi' 'ne' 'si' 'ti' 'vi'
- // the compound forms are:
- 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
- 'mela' 'mele' 'meli' 'melo' 'mene'
- 'tela' 'tele' 'teli' 'telo' 'tene'
- 'cela' 'cele' 'celi' 'celo' 'cene'
- 'vela' 'vele' 'veli' 'velo' 'vene'
- )
- among( (RV)
- 'ando' 'endo' (delete)
- 'ar' 'er' 'ir' (<- 'e')
- )
- )
-
- define standard_suffix as (
- [substring] among(
-
- 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
- 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
- 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
- 'atrice' 'atrici'
- 'ante' 'anti' // Note 1
- ( R2 delete )
- 'azione' 'azioni' 'atore' 'atori'
- ( R2 delete
- try ( ['ic'] R2 delete )
- )
- 'logia' 'logie'
- ( R2 <- 'log' )
- 'uzione' 'uzioni' 'usione' 'usioni'
- ( R2 <- 'u' )
- 'enza' 'enze'
- ( R2 <- 'ente' )
- 'amento' 'amenti' 'imento' 'imenti'
- ( RV delete )
- 'amente' (
- R1 delete
- try (
- [substring] R2 delete among(
- 'iv' ( ['at'] R2 delete )
- 'os' 'ic' 'abil'
- )
- )
- )
- 'it{a`}' (
- R2 delete
- try (
- [substring] among(
- 'abil' 'ic' 'iv' (R2 delete)
- )
- )
- )
- 'ivo' 'ivi' 'iva' 'ive' (
- R2 delete
- try ( ['at'] R2 delete ['ic'] R2 delete )
- )
- )
- )
-
- define verb_suffix as setlimit tomark pV for (
- [substring] among(
- 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
- 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
- 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
- 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
- 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
- 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
- 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
- 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
- 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
- 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
- 'ono' 'uta' 'ute' 'uti' 'uto'
-
- 'ar' 'ir' // but 'er' is problematical
- (delete)
- )
- )
-
- define AEIO 'aeio{a`}{e`}{i`}{o`}'
- define CG 'cg'
-
- define vowel_suffix as (
- try (
- [AEIO] RV delete
- ['i'] RV delete
- )
- try (
- ['h'] CG RV delete
- )
- )
-)
-
-define stem as (
- do prelude
- do mark_regions
- backwards (
- do attached_pronoun
- do (standard_suffix or verb_suffix)
- do vowel_suffix
- )
- do postlude
-)
-
-/*
- Note 1: additions of 15 Jun 2005
-*/
-
diff --git a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/kraaij_pohlmann.sbl
index cd79d1280..409bf0ff7 100644
--- a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/kraaij_pohlmann.sbl
@@ -1,5 +1,5 @@
strings ( ch )
-integers ( x p1 p2 )
+integers ( p1 p2 )
booleans ( Y_found stemmed GE_removed )
routines (
@@ -20,8 +20,6 @@ groupings ( v v_WX AOU AIOU )
stringescapes {}
-stringdef ' hex '27' // yuk
-
define v 'aeiouy'
define v_WX v + 'wx'
define AOU 'aou'
@@ -29,8 +27,8 @@ define AIOU 'aiou'
backwardmode (
- define R1 as (setmark x $x >= p1)
- define R2 as (setmark x $x >= p2)
+ define R1 as ($p1 <= cursor)
+ define R2 as ($p2 <= cursor)
define V as test (v or 'ij')
define VX as test (next v or 'ij')
@@ -46,7 +44,7 @@ backwardmode (
define Step_1 as
(
- [among ( (])
+ [substring] among (
'{'}s' (delete)
's' (R1 not ('t' R1) C delete)
@@ -68,7 +66,7 @@ backwardmode (
define Step_2 as
(
- [among ( (])
+ [substring] among (
'je' (('{'}t' ] delete) or
('et' ] R1 C delete) or
('rnt' ] <-'rn') or
@@ -92,7 +90,7 @@ backwardmode (
define Step_3 as
(
- [among ( (])
+ [substring] among (
'atie' (R1 <-'eer')
'iteit' (R1 delete lengthen_V)
'heid'
@@ -112,7 +110,7 @@ backwardmode (
define Step_4 as
(
- ( [among ( (])
+ ( [substring] among (
'ioneel' (R1 <-'ie')
'atief' (R1 <-'eer')
'baar' (R1 delete)
@@ -132,7 +130,7 @@ backwardmode (
)
)
or
- ( [among ( (])
+ ( [substring] among (
'iger'
'igst'
'ig' (R1 C delete lengthen_V)
@@ -142,7 +140,7 @@ backwardmode (
define Step_7 as
(
- [among ( (])
+ [substring] among (
'kt' (<-'k')
'ft' (<-'f')
'pt' (<-'p')
@@ -151,7 +149,7 @@ backwardmode (
define Step_6 as
(
- [among ( (])
+ [substring] among (
'bb' (<-'b')
'cc' (<-'c')
'dd' (<-'d')
@@ -179,7 +177,7 @@ backwardmode (
define Step_1c as
(
- [among ( (] R1 C)
+ [substring] among ( (R1 C)
'd' (not ('n' R1) delete)
't' (not ('h' R1) delete)
)
@@ -200,11 +198,8 @@ define Lose_infix as (
)
define measure as (
- do (
- tolimit
- setmark p1
- setmark p2
- )
+ $p1 = limit
+ $p2 = limit
do(
repeat non-v atleast 1 ('ij' or v) non-v setmark p1
repeat non-v atleast 1 ('ij' or v) non-v setmark p2
diff --git a/contrib/snowball/algorithms/lithuanian.sbl b/contrib/snowball/algorithms/lithuanian.sbl
new file mode 100644
index 000000000..ff7fdb9da
--- /dev/null
+++ b/contrib/snowball/algorithms/lithuanian.sbl
@@ -0,0 +1,373 @@
+externals ( stem )
+
+// escape symbols for substituting lithuanian characters
+stringescapes { }
+
+/* Special characters in Unicode Latin Extended-A */
+// ' nosine
+stringdef a' '{U+0105}' // ą a + ogonek
+stringdef e' '{U+0119}' // ę e + ogonek
+stringdef i' '{U+012F}' // į i + ogonek
+stringdef u' '{U+0173}' // ų u + ogonek
+
+// . taskas
+stringdef e. '{U+0117}' // ė e + dot
+
+// - ilgoji
+stringdef u- '{U+016B}' // ū u + macron
+
+// * varnele
+stringdef c* '{U+010D}' // č c + caron (haček)
+stringdef s* '{U+0161}' // š s + caron (haček)
+stringdef z* '{U+017E}' // ž z + caron (haček)
+
+// [C](VC)^m[V|C]
+// definitions of variables for
+// p1 - position of m = 0
+integers ( p1 )
+
+// groupings
+// v - lithuanian vowels
+groupings ( v )
+
+// v - all lithuanian vowels
+define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}'
+
+// all lithuanian stemmer routines: 4 steps
+routines (
+ step2 R1 step1 fix_chdz fix_gd fix_conflicts
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor
+ define step1 as (
+ setlimit tomark p1 for ([substring]) R1 among (
+ // Daiktavardžiai (Nouns)
+ // I linksniuotė (declension I)
+ 'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys
+ 'o' 'io' // vyro, kelio
+ 'ui' 'iui' // vyrui, keliui
+ '{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį
+ 'u' 'iu' // vyru, keliu
+ 'e' 'yje' // vyre, kelyje
+ 'y' 'au' 'i' // kely, brolau, broli,
+ 'an' // nusižengiman
+
+ 'ai' 'iai' // vyrai, keliai
+ '{u'}' 'i{u'}' // vyrų, kelių
+ 'ams' 'am' // vyrams, vyram
+ 'iams' 'iam' // broliams, broliam
+ 'us' 'ius' // vyrus, brolius
+ 'ais' 'iais' // vyrais, keliais
+ 'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos
+ 'uosna' 'iuosna' // vyruosna, keliuosna
+ 'ysna' // žutysna
+
+ 'asis' 'aisi' // sukimasis, sukimaisi
+ 'osi' '{u'}si' // sukimosi, sukimųsi
+ 'uisi' // sukimuisi
+ '{a'}si' // sukimąsi
+ 'usi' // sukimusi
+ 'esi' // sukimesi
+
+ 'uo' // mėnuo
+
+
+ // II linksniuote (declension II)
+ 'a' 'ia' // galva, vysnia
+ 'os' 'ios' // galvos, vysnios
+ 'oj' 'oje' 'ioje' // galvoje, vysnioje
+ 'osna' 'iosna' // galvosna, vyšniosna
+ 'om' 'oms' 'ioms' // galvoms, vysnioms
+ 'omis' 'iomis' // galvomis, vysniomis
+ 'ose' 'iose' // galvose, vysniose
+ 'on' 'ion' // galvon, vyšnion
+
+
+ // III linksniuote (declension III)
+ '{e.}' // gervė
+ '{e.}s' // gervės
+ 'ei' // gervei
+ '{e'}' // gervę
+ '{e.}j' '{e.}je' // gervėj, gervėje
+ '{e.}ms' // gervėms
+ 'es' // gerves
+ '{e.}mis' // gervėmis
+ '{e.}se' // gervėse
+ '{e.}sna' // gervėsna
+ '{e.}n' // žydaitėn
+
+
+ // IV linksniuote (declension IV)
+ 'aus' 'iaus' // sūnaus, skaičiaus
+ 'umi' 'iumi' // sūnumi, skaičiumi
+ 'uje' 'iuje' // sūnuje, skaičiuje
+ 'iau' // skaičiau
+
+ '{u-}s' // sūnūs
+ 'ums' // sūnums
+ 'umis' // sūnumis
+ 'un' 'iun' // sūnun, administratoriun
+
+
+ // V linksniuote (declension V)
+ 'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers
+ 'eniui' 'eriai' // vandeniui, eriai
+ 'en{i'}' 'er{i'}' // vandenį, seserį
+ 'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria
+ 'enyje' 'eryje' // vandenyje, seseryje
+ 'ie' 'enie' 'erie' // avie, vandenie, seserie
+
+ 'enys' 'erys' // vandenys, seserys
+ // 'en{u'}' konfliktas su 'žandenų' 'antenų'
+ 'er{u'}' // seserų
+ 'ims' 'enims' 'erims' // avims, vandenims, seserims
+ 'enis' // vandenis
+ 'imis' // žebenkštimis
+ 'enimis' // vandenimis
+ 'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse
+
+
+ // Būdvardžiai (Adjectives)
+ // (i)a linksniuotė
+ 'iem' 'iems' // geriem, geriems
+ 'ame' 'iame' // naujame, mediniame
+
+
+ // Veiksmažodžiai (Verbs)
+ // Tiesioginė nuosaka (indicative mood)
+ // esamasis laikas (present tense)
+ // (i)a asmenuotė (declension (i)a)
+ 'uosi' 'iuosi' // dirbuosi, traukiuosi
+ 'iesi' // dirbiesi
+ 'asi' 'iasi' // dirbasi, traukiasi
+ 'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės
+ 'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate
+ 'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės
+
+ // i asmenuotė (declension i)
+ 'isi' // tikisi
+ 'im' // mylim
+ // 'ime' konfliktas su daiktavardžiu vietininku, pvz. 'gėrime'
+ 'im{e.}s' // tikimės
+ 'it' 'ite' // mylit, mylite, tikitės
+ // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. žydaitės
+
+ // o asmenuotė (declension o)
+ 'ome' // mokome
+ 'ot' 'ote' // mokot, mokote
+
+ // būtasis laikas
+ // o asmenuotė (declension o)
+ '{e.}jo' '{e.}josi' // tikėjo, tikėjosi
+ 'ot{e.}s' // tikėjotės/bijotės
+
+ // ė asmenuotė (declension ė)
+ 'eisi' // mokeisi
+ '{e.}si' // mokėsi
+ '{e.}m' '{e.}me' // mokėm, mokėme
+ '{e.}m{e.}s' // mokėmės
+ '{e.}t' '{e.}te' // mokėt, mokėte
+ '{e.}t{e.}s' // mokėtės
+
+ // būtasis dažninis laikas (frequentative past tense)
+ 'ausi' // mokydavausi
+ 'om{e.}s' // mokydavomės/bijomės
+
+
+ // būsimasis laikas (future tense)
+ 'siu' 'siuosi' // dirbsiu, mokysiuosi
+ 'si' 'siesi' // dirbsi, dirbsiesi
+ 's' 'ysis' // dirbs, mokysis
+ 'sim' 'sime' // dirbsim, dirbsime
+ 'sit' 'site' // gersit, gersite
+
+ // tariamoji nuosaka (subjunctive mood)
+ '{c*}iau' '{c*}iausi' // dirbčiau
+ 'tum' 'tumei' // dirbtum, dirbtumei
+ 'tumeis' 'tumeisi' // mokytumeis, mokytumeisi
+ // 't{u'}' nes blogai batutų -> batų
+ 't{u'}si' // mokytųsi
+ // 'tume' konfliktas su 'šventume'
+ 'tum{e.}m' // dirbtumėm
+ 'tum{e.}me' // dirbtumėme
+ 'tum{e.}m{e.}s' // mokytumėmės
+ 'tute' 'tum{e.}t' // dirbtute, dirbtumėt
+ 'tum{e.}te' // dirbtumėte
+ 'tum{e.}t{e.}s' // mokytumėtės
+
+ // liepiamoji nuosaka (imperative mood)
+ 'k' 'ki' // dirbk, dirbki, mokykis
+ // 'kis' konfliktas viln-išk-is
+ // 'kime' konfliktas, nes pirkime
+ 'kim{e.}s' // mokykimės
+
+ // bendratis (infinitive)
+ 'uoti' 'iuoti' // meluoti, dygsniuoti
+ 'auti' 'iauti' // draugauti, girtuokliauti
+ 'oti' 'ioti' // dovanoti, meškerioti
+ '{e.}ti' // auklėti
+ 'yti' // akyti
+ 'inti' // auginti
+ 'in{e.}ti' // blusinėti
+ 'enti' // gyventi
+ 'tel{e.}ti' // bumbtelėti
+ 'ter{e.}ti' // bumbterėti
+
+ 'ti' // skalbti
+ // 'tis' konfliktas, nes rytme-tis -> rytme
+
+ // dalyviai (participles)
+ '{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs
+ 't{u'}s' // suktųs -> suk
+ 'sim{e.}s' // suksimės
+ 'sit{e.}s' // suksitės
+ 'kite' // supkite
+ )
+
+ delete
+ )
+
+ define step2 as repeat (
+ setlimit tomark p1 for ([substring]) among (
+ // daiktavardziu priesagos (Noun suffixes)
+
+ // budvardziu priesagos (Adjective suffixes)
+ // 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is
+ 'ing' // tvark-ing-as
+ 'i{s*}k' // lenk-išk-as
+ '{e.}t' // dem-ėt-as
+ 'ot' // garban-ot-as
+ 'uot' 'iuot' // lang-uot-as, akin-iuot-as
+ // 'tin', nes augintinis // dirb-tin-is
+ // 'ut', nes batutas, degutas etc. // maž-ut-is
+ 'yt' // maž-yt-is
+ 'iuk' // maž-iuk-as
+ 'iul' // maž-ul-is
+ '{e.}l' // maž-ėl-is
+ 'yl' // maž-yl-is
+ 'u{c*}iuk' // maž-učiuk-as
+ 'uliuk' // maž-uliuk-as
+ 'ut{e.}ait' // maž-utėlait-is
+ 'ok' // did-ok-as
+ 'iok' // višč-iok-as
+ 'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as
+ 'op' 'iop' // dvej-op-as, viener-iop-as
+ 'ain' // apval-ain-as
+ 'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias
+
+ // laisniai
+ 'esn' // did-esn-is
+ 'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias
+
+ // ivardziuotiniai budvardziai (Pronominal adjectives)
+ // vyriska gimine (Male gender)
+ 'ias' // žaliasis
+ 'oj' 'ioj' // gerojo, žaliojo
+ 'aj' 'iaj' // gerajam, žaliajam
+ '{a'}j' 'i{a'}j' // garąjį, žaliąjį
+ 'uoj' 'iuoj' // geruoju, žaliuoju
+ 'iej' // gerieji
+ '{u'}j' 'i{u'}j' // gerųjų, žaliųjų
+ 'ies' // geriesiems
+ 'uos' 'iuos' // geruosius, žaliuosius
+ 'ais' 'iais' // geraisiais, žaliaisiais
+
+ // moteriska gimine (Female gender)
+ 'os' 'ios' // gerosios, žaliosios
+ '{a'}s' 'i{a'}s' // gerąsios, žaliąsias
+
+ // būtasis dažninis laikas (frequentative past tense)
+ 'dav' // ei-dav-o
+
+ // dalyvių priesagos (participle suffixes)
+ 'ant' 'iant'
+ 'int' // tur-int-is
+ '{e.}j' // tur-ėj-o
+ '{e'}' //
+ '{e.}j{e'}'
+ '{e'}s' // dirb-ęs-is
+
+ 'siant' // dirb-siant
+
+ // pusdalyviai (participle)
+ 'dam' // bėg-dam-as
+
+ 'auj' // ūkinink-auj-a
+ 'jam'
+ 'iau'
+ 'am' // baiminim-ams-i
+ )
+
+ delete
+ )
+
+ define fix_conflicts as (
+ [substring] among (
+ // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite'
+ 'aite' (<-'ait{e.}')
+ // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės'
+ 'ait{e.}s' (<-'ait{e.}')
+
+ // 'ūs-uotės' -> 'ūs-uotė', konfliktas su 'mokotės'
+ 'uot{e.}s' (<-'uot{e.}')
+ // 'ūs-uote' -> 'ūs-uotė', konfliktas su 'mokote'
+ 'uote' (<-'uot{e.}')
+
+ // 'žėrėjime' -> 'žėrėjimas', konfliktas su 'žais-ime'
+ '{e.}jime' (<-'{e.}jimas')
+
+ // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu'
+ 'esiu' (<-'esys')
+ // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu'
+ 'asius' (<-'asys')
+
+ // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime'
+ 'avime' (<-'avimas')
+ 'ojime' (<-'ojimas')
+
+ // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės'
+ 'okat{e.}s' (<-'okat{e.}')
+ // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate'
+ 'okate' (<-'okat{e.}')
+ )
+ )
+
+ define fix_chdz as ( // backward-mode fixup: an ending of c-caron is replaced by t, and an ending of d + z-caron by d
+ [substring] among (
+ '{c*}' (<-'t')
+ 'd{z*}' (<-'d')
+ )
+ )
+
+ define fix_gd as ( // backward-mode fixup: an ending of gd is replaced by g (the second rule is disabled)
+ [substring] among (
+ 'gd' (<-'g')
+ // '{e.}k' (<-'{e.}g')
+ )
+ )
+
+)
+
+define stem as ( // the routine declared in externals; computes p1, then runs the suffix steps in backward mode
+
+ $p1 = limit // default: p1 at the end of the word, so R1 is empty unless the mark is set below
+
+ do (
+ // prefix 'a' in words longer than 6 letters, e.g. 'a-liejus': skip it before locating p1
+ try (test 'a' $(len > 6) hop 1)
+
+ gopast v gopast non-v setmark p1 // p1 = position just after the first non-vowel that follows a vowel
+ )
+
+ backwards ( // each step is wrapped in do, so a failing step does not stop the following ones
+ do fix_conflicts
+ do step1
+ do fix_chdz
+ do step2
+ do fix_chdz
+ do fix_gd
+ )
+
+)
diff --git a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/lovins.sbl
index 3f69f1572..3f69f1572 100644
--- a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/lovins.sbl
diff --git a/contrib/snowball/algorithms/nepali.sbl b/contrib/snowball/algorithms/nepali.sbl
new file mode 100644
index 000000000..d3887486b
--- /dev/null
+++ b/contrib/snowball/algorithms/nepali.sbl
@@ -0,0 +1,92 @@
+/*
+ * Authors:
+ * - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group
+ * - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd.
+ * - Shreeya Singh Dhakal, Nepali NLP Group
+ */
+
+routines (
+ remove_category_1
+ check_category_2
+ remove_category_2
+ remove_category_3
+)
+
+stringescapes {}
+
+stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU
+stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA
+stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I
+stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II
+stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E
+stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA
+stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA
+stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA
+stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA
+stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA
+stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA
+stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA
+stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA
+stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA
+stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA
+stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA
+stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA
+stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA
+stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA
+stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA
+stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA
+stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA
+stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA
+stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA
+stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I
+stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II
+stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U
+stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU
+stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E
+stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI
+stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O
+stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU
+stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA
+
+externals ( stem )
+backwardmode (
+ define remove_category_1 as( // deletes the longest listed ending; the endings starting with {dlka} are left in place when preceded by {dle} or {dvse}
+ [substring] among (
+ '{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}'
+ '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}'
+ '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}'
+ (delete)
+ '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete)
+ )
+ )
+
+ define check_category_2 as( // pure test: succeeds when the text before the cursor ends in candrabindu, anusvara or vowel sign AI; modifies nothing
+ [substring] among(
+ '{dsc}' '{dsa}' '{dvsai}'
+ )
+ )
+
+ define remove_category_2 as ( // deletes a final sign only when the text before it matches the context required by the action; fails otherwise
+ [substring] among(
+ '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete)
+ '{dvsai}' ('{dlta}{dsv}{dlr}' delete)
+ )
+ )
+
+ define remove_category_3 as( // deletes the longest ending from the list below; fails when none matches
+ [substring] among(
+ '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}'
+ (delete)
+ )
+ )
+
+)
+
+define stem as ( // the routine declared in externals; all work happens in backward mode
+ backwards (
+ do remove_category_1 // one attempt at a category 1 ending
+ do (
+ repeat (do (check_category_2 and remove_category_2) remove_category_3) // each pass optionally strips a category 2 sign, then must strip a category 3 ending; the loop stops once remove_category_3 fails
+ )
+ )
+)
diff --git a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/norwegian.sbl
index 94a071653..39f4aff0f 100644
--- a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/norwegian.sbl
@@ -13,11 +13,11 @@ groupings ( v s_ending )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef ae hex 'E6'
-stringdef ao hex 'E5'
-stringdef o/ hex 'F8'
+stringdef ae '{U+00E6}'
+stringdef ao '{U+00E5}'
+stringdef o/ '{U+00F8}'
define v 'aeiouy{ae}{ao}{o/}'
diff --git a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index f57483354..000000000
--- a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,80 +0,0 @@
-routines (
- mark_regions
- main_suffix
- consonant_pair
- other_suffix
-)
-
-externals ( stem )
-
-integers ( p1 x )
-
-groupings ( v s_ending )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef ae hex '91'
-stringdef ao hex '86'
-stringdef o/ hex '9B'
-
-define v 'aeiouy{ae}{ao}{o/}'
-
-define s_ending 'bcdfghjlmnoprtvyz'
-
-define mark_regions as (
-
- $p1 = limit
-
- test ( hop 3 setmark x )
- goto v gopast non-v setmark p1
- try ( $p1 < x $p1 = x )
-)
-
-backwardmode (
-
- define main_suffix as (
- setlimit tomark p1 for ([substring])
- among(
-
- 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
- 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
- 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
- (delete)
- 's'
- (s_ending or ('k' non-v) delete)
- 'erte' 'ert'
- (<-'er')
- )
- )
-
- define consonant_pair as (
- test (
- setlimit tomark p1 for ([substring])
- among(
- 'dt' 'vt'
- )
- )
- next] delete
- )
-
- define other_suffix as (
- setlimit tomark p1 for ([substring])
- among(
- 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
- 'hetslov'
- (delete)
- )
- )
-)
-
-define stem as (
-
- do mark_regions
- backwards (
- do main_suffix
- do consonant_pair
- do other_suffix
- )
-)
diff --git a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/porter.sbl
index 9533b7932..9533b7932 100644
--- a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/porter.sbl
diff --git a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/portuguese.sbl
index 3e7da08d4..3fb14f195 100644
--- a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/portuguese.sbl
@@ -15,20 +15,20 @@ groupings ( v )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef a' hex 'E1' // a-acute
-stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico
-stringdef e' hex 'E9' // e-acute
-stringdef e^ hex 'EA' // e-circumflex
-stringdef i' hex 'ED' // i-acute
-stringdef o^ hex 'F4' // o-circumflex
-stringdef o' hex 'F3' // o-acute
-stringdef u' hex 'FA' // u-acute
-stringdef c, hex 'E7' // c-cedilla
+stringdef a' '{U+00E1}' // a-acute
+stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico
+stringdef e' '{U+00E9}' // e-acute
+stringdef e^ '{U+00EA}' // e-circumflex
+stringdef i' '{U+00ED}' // i-acute
+stringdef o^ '{U+00F4}' // o-circumflex
+stringdef o' '{U+00F3}' // o-acute
+stringdef u' '{U+00FA}' // u-acute
+stringdef c, '{U+00E7}' // c-cedilla
-stringdef a~ hex 'E3' // a-tilde
-stringdef o~ hex 'F5' // o-tilde
+stringdef a~ '{U+00E3}' // a-tilde
+stringdef o~ '{U+00F5}' // o-tilde
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
@@ -92,12 +92,12 @@ backwardmode (
(
R2 delete
)
- 'log{i'}a'
- 'log{i'}as'
+ 'logia'
+ 'logias'
(
R2 <- 'log'
)
- 'uci{o'}n' 'uciones'
+ 'u{c,}a~o' 'u{c,}o~es'
(
R2 <- 'u'
)
diff --git a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index 4d6c85214..000000000
--- a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,218 +0,0 @@
-routines (
- prelude postlude mark_regions
- RV R1 R2
- standard_suffix
- verb_suffix
- residual_suffix
- residual_form
-)
-
-externals ( stem )
-
-integers ( pV p1 p2 )
-
-groupings ( v )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef a' hex 'A0' // a-acute
-stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico
-stringdef e' hex '82' // e-acute
-stringdef e^ hex '88' // e-circumflex
-stringdef i' hex 'A1' // i-acute
-stringdef o^ hex '93' // o-circumflex
-stringdef o' hex 'A2' // o-acute
-stringdef u' hex 'A3' // u-acute
-stringdef c, hex '87' // c-cedilla
-
-stringdef a~ hex 'C6' // a-tilde
-stringdef o~ hex 'E4' // o-tilde
-
-
-define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
-
-define prelude as repeat (
- [substring] among(
- '{a~}' (<- 'a~')
- '{o~}' (<- 'o~')
- '' (next)
- ) //or next
-)
-
-define mark_regions as (
-
- $pV = limit
- $p1 = limit
- $p2 = limit // defaults
-
- do (
- ( v (non-v gopast v) or (v gopast non-v) )
- or
- ( non-v (non-v gopast v) or (v next) )
- setmark pV
- )
- do (
- gopast v gopast non-v setmark p1
- gopast v gopast non-v setmark p2
- )
-)
-
-define postlude as repeat (
- [substring] among(
- 'a~' (<- '{a~}')
- 'o~' (<- '{o~}')
- '' (next)
- ) //or next
-)
-
-backwardmode (
-
- define RV as $pV <= cursor
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define standard_suffix as (
- [substring] among(
-
- 'eza' 'ezas'
- 'ico' 'ica' 'icos' 'icas'
- 'ismo' 'ismos'
- '{a'}vel'
- '{i'}vel'
- 'ista' 'istas'
- 'oso' 'osa' 'osos' 'osas'
- 'amento' 'amentos'
- 'imento' 'imentos'
-
- 'adora' 'ador' 'a{c,}a~o'
- 'adoras' 'adores' 'a{c,}o~es' // no -ic test
- 'ante' 'antes' '{a^}ncia' // Note 1
- (
- R2 delete
- )
- 'log{i'}a'
- 'log{i'}as'
- (
- R2 <- 'log'
- )
- 'uci{o'}n' 'uciones'
- (
- R2 <- 'u'
- )
- '{e^}ncia' '{e^}ncias'
- (
- R2 <- 'ente'
- )
- 'amente'
- (
- R1 delete
- try (
- [substring] R2 delete among(
- 'iv' (['at'] R2 delete)
- 'os'
- 'ic'
- 'ad'
- )
- )
- )
- 'mente'
- (
- R2 delete
- try (
- [substring] among(
- 'ante' // Note 1
- 'avel'
- '{i'}vel' (R2 delete)
- )
- )
- )
- 'idade'
- 'idades'
- (
- R2 delete
- try (
- [substring] among(
- 'abil'
- 'ic'
- 'iv' (R2 delete)
- )
- )
- )
- 'iva' 'ivo'
- 'ivas' 'ivos'
- (
- R2 delete
- try (
- ['at'] R2 delete // but not a further ['ic'] R2 delete
- )
- )
- 'ira' 'iras'
- (
- RV 'e' // -eira -eiras usually non-verbal
- <- 'ir'
- )
- )
- )
-
- define verb_suffix as setlimit tomark pV for (
- [substring] among(
- 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
- 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
- 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
- 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
- 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
- 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
- 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
- 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
- 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
- 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
- '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
- '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
- '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
- 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
- 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
- '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
-
- 'ira' 'iras'
- (delete)
- )
- )
-
- define residual_suffix as (
- [substring] among(
- 'os'
- 'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
- ( RV delete )
- )
- )
-
- define residual_form as (
- [substring] among(
- 'e' '{e'}' '{e^}'
- ( RV delete [('u'] test 'g') or
- ('i'] test 'c') RV delete )
- '{c,}' (<-'c')
- )
- )
-)
-
-define stem as (
- do prelude
- do mark_regions
- backwards (
- do (
- ( ( standard_suffix or verb_suffix )
- and do ( ['i'] test 'c' RV delete )
- )
- or residual_suffix
- )
- do residual_form
- )
- do postlude
-)
-
-/*
- Note 1: additions of 15 Jun 2005
-*/
diff --git a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/romanian.sbl
index 48a148321..7db9e0a5a 100644
--- a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl
+++ b/contrib/snowball/algorithms/romanian.sbl
@@ -20,11 +20,11 @@ stringescapes {}
/* special characters */
-stringdef a^ hex 'E2' // a circumflex
-stringdef i^ hex 'EE' // i circumflex
-stringdef a+ hex 'E3' // a breve
-stringdef s, hex 'BA' // s cedilla
-stringdef t, hex 'FE' // t cedilla
+stringdef a^ '{U+00E2}' // a circumflex
+stringdef i^ '{U+00EE}' // i circumflex
+stringdef a+ '{U+0103}' // a breve
+stringdef s, '{U+015F}' // s cedilla
+stringdef t, '{U+0163}' // t cedilla
define v 'aeiou{a^}{i^}{a+}'
diff --git a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl
deleted file mode 100644
index 09aec6429..000000000
--- a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl
+++ /dev/null
@@ -1,236 +0,0 @@
-
-routines (
- prelude postlude mark_regions
- RV R1 R2
- step_0
- standard_suffix combo_suffix
- verb_suffix
- vowel_suffix
-)
-
-externals ( stem )
-
-integers ( pV p1 p2 )
-
-groupings ( v )
-
-booleans ( standard_suffix_removed )
-
-stringescapes {}
-
-/* special characters */
-
-stringdef a^ hex '0E2' // a circumflex
-stringdef i^ hex '0EE' // i circumflex
-stringdef a+ hex '103' // a breve
-stringdef s, hex '15F' // s cedilla
-stringdef t, hex '163' // t cedilla
-
-define v 'aeiou{a^}{i^}{a+}'
-
-define prelude as (
- repeat goto (
- v [ ('u' ] v <- 'U') or
- ('i' ] v <- 'I')
- )
-)
-
-define mark_regions as (
-
- $pV = limit
- $p1 = limit
- $p2 = limit // defaults
-
- do (
- ( v (non-v gopast v) or (v gopast non-v) )
- or
- ( non-v (non-v gopast v) or (v next) )
- setmark pV
- )
- do (
- gopast v gopast non-v setmark p1
- gopast v gopast non-v setmark p2
- )
-)
-
-define postlude as repeat (
-
- [substring] among(
- 'I' (<- 'i')
- 'U' (<- 'u')
- '' (next)
- )
-
-)
-
-backwardmode (
-
- define RV as $pV <= cursor
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define step_0 as (
- [substring] R1 among(
- 'ul' 'ului'
- ( delete )
- 'aua'
- ( <-'a' )
- 'ea' 'ele' 'elor'
- ( <-'e' )
- 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
- ( <-'i')
- 'ile'
- ( not 'ab' <- 'i' )
- 'atei'
- ( <- 'at' )
- 'a{t,}ie' 'a{t,}ia'
- ( <- 'a{t,}i' )
- )
- )
-
- define combo_suffix as test (
- [substring] R1 (
- among(
- /* 'IST'. alternative: include the following
- 'alism' 'alisme'
- 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
- <- 'al'
- )
- */
- 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
- <- 'abil'
- )
- 'ibilitate' (
- <- 'ibil'
- )
- 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
- <- 'iv'
- )
- 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
- 'icator' 'icatori'
- 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
- 'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
- <- 'ic'
- )
- 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
- 'atoare' 'ator' 'atori'
- '{a+}toare' '{a+}tor' '{a+}tori' (
- <- 'at'
- )
- 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
- 'itoare' 'itor' 'itori' (
- <- 'it'
- )
- )
- set standard_suffix_removed
- )
- )
-
- define standard_suffix as (
- unset standard_suffix_removed
- repeat combo_suffix
- [substring] R2 (
- among(
-
- // past participle is treated here, rather than
- // as a verb ending:
- 'at' 'ata' 'at{a+}' 'ati' 'ate'
- 'ut' 'uta' 'ut{a+}' 'uti' 'ute'
- 'it' 'ita' 'it{a+}' 'iti' 'ite'
-
- 'ic' 'ica' 'ice' 'ici' 'ic{a+}'
- 'abil' 'abila' 'abile' 'abili' 'abil{a+}'
- 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
- 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
- 'ant' 'anta' 'ante' 'anti' 'ant{a+}'
- 'ator' 'atori'
- 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
- 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
- delete
- )
- 'iune' 'iuni' (
- '{t,}'] <- 't'
- )
- 'ism' 'isme'
- 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
- <- 'ist'
- /* 'IST'. alternative: remove with <- '' */
- )
- )
- set standard_suffix_removed
- )
- )
-
- define verb_suffix as setlimit tomark pV for (
- [substring] among(
- // 'long' infinitive:
- 'are' 'ere' 'ire' '{a^}re'
-
- // gerund:
- 'ind' '{a^}nd'
- 'indu' '{a^}ndu'
-
- 'eze'
- 'easc{a+}'
- // present:
- 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
- 'e{s,}te'
- '{a+}sc' '{a+}{s,}ti'
- '{a+}{s,}te'
-
- // imperfect:
- 'am' 'ai' 'au'
- 'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
- 'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
-
- // past: // (not 'ii')
- 'ui'
- 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
- 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
- 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
- '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
-
- // pluferfect:
- 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
- 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
- '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
- '{a^}ser{a+}'
- 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
-
- ( non-v or 'u' delete )
-
- // present:
- '{a+}m' 'a{t,}i'
- 'em' 'e{t,}i'
- 'im' 'i{t,}i'
- '{a^}m' '{a^}{t,}i'
-
- // past:
- 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
- 'sei' 'se'
-
- // pluperfect:
- 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
- (delete)
- )
- )
-
- define vowel_suffix as (
- [substring] RV among (
- 'a' 'e' 'i' 'ie' '{a+}' ( delete )
- )
- )
-)
-
-define stem as (
- do prelude
- do mark_regions
- backwards (
- do step_0
- do standard_suffix
- do ( standard_suffix_removed or verb_suffix )
- do vowel_suffix
- )
- do postlude
-)
-
diff --git a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl b/contrib/snowball/algorithms/russian.sbl
index cdacb19c4..20de6395e 100644
--- a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl
+++ b/contrib/snowball/algorithms/russian.sbl
@@ -1,41 +1,41 @@
stringescapes {}
-/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented
- in Latin characters following the conventions of the standard Library
- of Congress transliteration: */
-
-stringdef a hex 'C1'
-stringdef b hex 'C2'
-stringdef v hex 'D7'
-stringdef g hex 'C7'
-stringdef d hex 'C4'
-stringdef e hex 'C5'
-stringdef zh hex 'D6'
-stringdef z hex 'DA'
-stringdef i hex 'C9'
-stringdef i` hex 'CA'
-stringdef k hex 'CB'
-stringdef l hex 'CC'
-stringdef m hex 'CD'
-stringdef n hex 'CE'
-stringdef o hex 'CF'
-stringdef p hex 'D0'
-stringdef r hex 'D2'
-stringdef s hex 'D3'
-stringdef t hex 'D4'
-stringdef u hex 'D5'
-stringdef f hex 'C6'
-stringdef kh hex 'C8'
-stringdef ts hex 'C3'
-stringdef ch hex 'DE'
-stringdef sh hex 'DB'
-stringdef shch hex 'DD'
-stringdef " hex 'DF'
-stringdef y hex 'D9'
-stringdef ' hex 'D8'
-stringdef e` hex 'DC'
-stringdef iu hex 'C0'
-stringdef ia hex 'D1'
+/* the 33 Cyrillic letters represented in ASCII characters following the
+ * conventions of the standard Library of Congress transliteration: */
+
+stringdef a '{U+0430}'
+stringdef b '{U+0431}'
+stringdef v '{U+0432}'
+stringdef g '{U+0433}'
+stringdef d '{U+0434}'
+stringdef e '{U+0435}'
+stringdef e" '{U+0451}'
+stringdef zh '{U+0436}'
+stringdef z '{U+0437}'
+stringdef i '{U+0438}'
+stringdef i` '{U+0439}'
+stringdef k '{U+043A}'
+stringdef l '{U+043B}'
+stringdef m '{U+043C}'
+stringdef n '{U+043D}'
+stringdef o '{U+043E}'
+stringdef p '{U+043F}'
+stringdef r '{U+0440}'
+stringdef s '{U+0441}'
+stringdef t '{U+0442}'
+stringdef u '{U+0443}'
+stringdef f '{U+0444}'
+stringdef kh '{U+0445}'
+stringdef ts '{U+0446}'
+stringdef ch '{U+0447}'
+stringdef sh '{U+0448}'
+stringdef shch '{U+0449}'
+stringdef " '{U+044A}'
+stringdef y '{U+044B}'
+stringdef ' '{U+044C}'
+stringdef e` '{U+044D}'
+stringdef iu '{U+044E}'
+stringdef ia '{U+044F}'
routines ( mark_regions R2
perfective_gerund
@@ -200,6 +200,10 @@ backwardmode (
define stem as (
+ // Normalise {e"} to {e}. The documentation has long suggested the user
+ // should do this before calling the stemmer - we now do it for them.
+ do repeat ( goto (['{e"}']) <- '{e}' )
+
do mark_regions
backwards setlimit tomark pV for (
do (
diff --git a/contrib/snowball/algorithms/russian/stem_Unicode.sbl b/contrib/snowball/algorithms/russian/stem_Unicode.sbl
deleted file mode 100644
index 9e1a93f93..000000000
--- a/contrib/snowball/algorithms/russian/stem_Unicode.sbl
+++ /dev/null
@@ -1,215 +0,0 @@
-stringescapes {}
-
-/* the 32 Cyrillic letters in Unicode */
-
-stringdef a hex '430'
-stringdef b hex '431'
-stringdef v hex '432'
-stringdef g hex '433'
-stringdef d hex '434'
-stringdef e hex '435'
-stringdef zh hex '436'
-stringdef z hex '437'
-stringdef i hex '438'
-stringdef i` hex '439'
-stringdef k hex '43A'
-stringdef l hex '43B'
-stringdef m hex '43C'
-stringdef n hex '43D'
-stringdef o hex '43E'
-stringdef p hex '43F'
-stringdef r hex '440'
-stringdef s hex '441'
-stringdef t hex '442'
-stringdef u hex '443'
-stringdef f hex '444'
-stringdef kh hex '445'
-stringdef ts hex '446'
-stringdef ch hex '447'
-stringdef sh hex '448'
-stringdef shch hex '449'
-stringdef " hex '44A'
-stringdef y hex '44B'
-stringdef ' hex '44C'
-stringdef e` hex '44D'
-stringdef iu hex '44E'
-stringdef ia hex '44F'
-
-routines ( mark_regions R2
- perfective_gerund
- adjective
- adjectival
- reflexive
- verb
- noun
- derivational
- tidy_up
-)
-
-externals ( stem )
-
-integers ( pV p2 )
-
-groupings ( v )
-
-define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
-
-define mark_regions as (
-
- $pV = limit
- $p2 = limit
- do (
- gopast v setmark pV gopast non-v
- gopast v gopast non-v setmark p2
- )
-)
-
-backwardmode (
-
- define R2 as $p2 <= cursor
-
- define perfective_gerund as (
- [substring] among (
- '{v}'
- '{v}{sh}{i}'
- '{v}{sh}{i}{s}{'}'
- ('{a}' or '{ia}' delete)
- '{i}{v}'
- '{i}{v}{sh}{i}'
- '{i}{v}{sh}{i}{s}{'}'
- '{y}{v}'
- '{y}{v}{sh}{i}'
- '{y}{v}{sh}{i}{s}{'}'
- (delete)
- )
- )
-
- define adjective as (
- [substring] among (
- '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
- '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
- '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
- '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
- '{ia}{ia}'
- // and -
- '{o}{iu}' // - which is somewhat archaic
- '{e}{iu}' // - soft form of {o}{iu}
- (delete)
- )
- )
-
- define adjectival as (
- adjective
-
- /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
- nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
- errors. Removing im, uem, enn creates too many errors.
- */
-
- try (
- [substring] among (
- '{e}{m}' // present passive participle
- '{n}{n}' // adjective from past passive participle
- '{v}{sh}' // past active participle
- '{iu}{shch}' '{shch}' // present active participle
- ('{a}' or '{ia}' delete)
-
- //but not '{i}{m}' '{u}{e}{m}' // present passive participle
- //or '{e}{n}{n}' // adjective from past passive participle
-
- '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
- '{u}{iu}{shch}' // present active participle
- (delete)
- )
- )
-
- )
-
- define reflexive as (
- [substring] among (
- '{s}{ia}'
- '{s}{'}'
- (delete)
- )
- )
-
- define verb as (
- [substring] among (
- '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
- '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
- '{n}{y}' '{t}{'}' '{e}{sh}{'}'
-
- '{n}{n}{o}'
- ('{a}' or '{ia}' delete)
-
- '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
- '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
- '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
- '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
- '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
- '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
- (delete)
- /* note the short passive participle tests:
- '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
- '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
- */
- )
- )
-
- define noun as (
- [substring] among (
- '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
- '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
- '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
- '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
- '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
- '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
- (delete)
- /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
- '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}'
- omitted - they only occur on 12 words.
- */
- )
- )
-
- define derivational as (
- [substring] R2 among (
- '{o}{s}{t}'
- '{o}{s}{t}{'}'
- (delete)
- )
- )
-
- define tidy_up as (
- [substring] among (
-
- '{e}{i`}{sh}'
- '{e}{i`}{sh}{e}' // superlative forms
- (delete
- ['{n}'] '{n}' delete
- )
- '{n}'
- ('{n}' delete) // e.g. -nno endings
- '{'}'
- (delete) // with some slight false conflations
- )
- )
-)
-
-define stem as (
-
- do mark_regions
- backwards setlimit tomark pV for (
- do (
- perfective_gerund or
- ( try reflexive
- adjectival or verb or noun
- )
- )
- try([ '{i}' ] delete)
- // because noun ending -i{iu} is being treated as verb ending -{iu}
-
- do derivational
- do tidy_up
- )
-)
diff --git a/contrib/snowball/algorithms/serbian.sbl b/contrib/snowball/algorithms/serbian.sbl
new file mode 100644
index 000000000..bddf76b22
--- /dev/null
+++ b/contrib/snowball/algorithms/serbian.sbl
@@ -0,0 +1,2378 @@
+/* Stemmer for Serbian language, based on:
+ *
+ * Ljubesic, Nikola. Pandzic, Ivan. Stemmer for Croatian
+ * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/
+ *
+ * authors: Stefan Petkovic and Dragan Ivanovic
+ * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs
+ * version: 1.0 (20.04.2019)
+*/
+
+routines (
+ cyr_to_lat
+ prelude
+ mark_regions
+ R1 R2
+ Step_1
+ Step_2
+ Step_3
+)
+
+externals ( stem )
+
+integers ( p1 p2 p3 )
+
+groupings ( v ca sa rg )
+
+stringescapes {}
+
+/* special characters - Unicode codepoints */
+
+/* serbian cyrillic */
+
+stringdef cyrA '{U+0430}'
+stringdef cyrB '{U+0431}'
+stringdef cyrV '{U+0432}'
+stringdef cyrG '{U+0433}'
+stringdef cyrD '{U+0434}'
+stringdef cyrDx '{U+0452}'
+stringdef cyrE '{U+0435}'
+stringdef cyrZh '{U+0436}'
+stringdef cyrZ '{U+0437}'
+stringdef cyrI '{U+0438}'
+stringdef cyrJ '{U+0458}'
+stringdef cyrK '{U+043A}'
+stringdef cyrL '{U+043B}'
+stringdef cyrLJ '{U+0459}'
+stringdef cyrM '{U+043C}'
+stringdef cyrN '{U+043D}'
+stringdef cyrNJ '{U+045A}'
+stringdef cyrO '{U+043E}'
+stringdef cyrP '{U+043F}'
+stringdef cyrR '{U+0440}'
+stringdef cyrS '{U+0441}'
+stringdef cyrT '{U+0442}'
+stringdef cyrCy '{U+045B}'
+stringdef cyrU '{U+0443}'
+stringdef cyrF '{U+0444}'
+stringdef cyrH '{U+0445}'
+stringdef cyrC '{U+0446}'
+stringdef cyrCx '{U+0447}'
+stringdef cyrDzx '{U+045F}'
+stringdef cyrSx '{U+0448}'
+
+/* serbian latin with diacritics */
+
+stringdef cx '{U+010D}' // small c with caron
+stringdef cy '{U+0107}' // small c with acute
+stringdef zx '{U+017E}' // small z with caron
+stringdef sx '{U+0161}' // small s with caron
+stringdef dx '{U+0111}' // small d with stroke
+
+define v 'aeiou' // vowels
+define sa '{cx}{cy}{zx}{sx}{dx}' // the five Latin letters with diacritics declared by the stringdefs above
+define ca 'bvgdzjklmnprstfhc' + sa // the ASCII consonants listed here plus every character of sa
+define rg 'r' // only 'r'; mark_regions uses it in negated form (non-rg)
+
+
+define cyr_to_lat as ( // transliterate each Cyrillic letter declared by the cyr* stringdefs into Latin script
+
+ do repeat goto ( // goto: advance to the next place the among matches; repeat: until it no longer does; do: never signal failure
+ [substring] among ( // each entry replaces the matched Cyrillic letter with the string in its action
+ '{cyrA}' (<- 'a')
+ '{cyrB}' (<- 'b')
+ '{cyrV}' (<- 'v')
+ '{cyrG}' (<- 'g')
+ '{cyrD}' (<- 'd')
+ '{cyrDx}' (<- '{dx}')
+ '{cyrE}' (<- 'e')
+ '{cyrZh}' (<- '{zx}')
+ '{cyrZ}' (<- 'z')
+ '{cyrI}' (<- 'i')
+ '{cyrJ}' (<- 'j')
+ '{cyrK}' (<- 'k')
+ '{cyrL}' (<- 'l')
+ '{cyrLJ}' (<- 'lj') // one Cyrillic letter becomes a two-letter Latin digraph
+ '{cyrM}' (<- 'm')
+ '{cyrN}' (<- 'n')
+ '{cyrNJ}' (<- 'nj') // digraph, as for cyrLJ
+ '{cyrO}' (<- 'o')
+ '{cyrP}' (<- 'p')
+ '{cyrR}' (<- 'r')
+ '{cyrS}' (<- 's')
+ '{cyrT}' (<- 't')
+ '{cyrCy}' (<- '{cy}')
+ '{cyrU}' (<- 'u')
+ '{cyrF}' (<- 'f')
+ '{cyrH}' (<- 'h')
+ '{cyrC}' (<- 'c')
+ '{cyrCx}' (<- '{cx}')
+ '{cyrDzx}' (<- 'd{zx}') // digraph: 'd' followed by z-caron
+ '{cyrSx}' (<- '{sx}')
+ )
+ )
+
+)
+
+define prelude as ( // three rewrite passes; each `do` restores the cursor, so every pass starts from the same position
+
+ do repeat goto (
+ ca ['ije'] ca <- 'e' // 'ije' with a ca character on each side is replaced by 'e'
+ )
+
+ do repeat goto (
+ ca ['je'] ca <- 'e' // likewise for 'je'; runs after the 'ije' pass, so it only sees what that pass left
+ )
+
+ do repeat goto (
+ ['dj'] <- '{dx}' // the ASCII digraph 'dj' is replaced by d-with-stroke, with no context condition
+ )
+
+)
+
+define mark_regions as ( // sets the integers p1, p2 and p3; R1 reads p1 and R2 reads p3
+
+ $p3 = 0 // stays 0 when the word contains no character from sa
+
+ do (
+ gopast sa setmark p3 // p3 = position just after the first sa character found scanning forward from the cursor
+ )
+
+ $p1 = limit // default when no vowel is found: the end of the string
+ $p2 = 0
+
+ do (
+ gopast v setmark p1 // p1 = position just after the first vowel
+ )
+ do (
+ gopast 'r' setmark p2 // p2 = position just after the first 'r'
+ $(p1 - p2 > 1) ($p1 = p2) // when p2 lies more than one position before p1, p1 is moved back to p2
+ )
+ ($p1 < 2) ( // not wrapped in `do`: when p1 >= 2 this test fails and the routine signals failure here
+ ($p1 == p2 gopast 'r' gopast non-rg) or ($p1 != p2 gopast v gopast non-v) // skip the first 'r' (if p1 came from p2) or the first vowel (otherwise), then the next character of the opposite kind
+ setmark p1 // p1 is re-set to the position reached by the line above
+ )
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p3 == 0 // succeeds when mark_regions left p3 at 0, i.e. it found no sa character; independent of the cursor
+
+ define Step_1 as ( // defined inside backwardmode: matches one listed string ending at the cursor and rewrites it
+ [substring] among ( // an entry without its own action shares the next action listed after it
+ 'lozi'
+ 'lozima' (<-'loga')
+ 'pesi'
+ 'pesima' (<-'peh')
+ 'vojci' (<-'vojka')
+ 'bojci' (<-'bojka')
+ 'jaci'
+ 'jacima' (<-'jak')
+ '{cx}ajan' (<-'{cx}ajni')
+ 'cajan' (R2 <-'cajni') // same rule as the line above spelled without the diacritic; the R2 guard applies it only when p3 == 0
+ 'eran' (<-'erni')
+ 'laran' (<-'larni')
+ 'esan' (<-'esni')
+ 'anjac' (<-'anjca')
+ 'ajac'
+ 'ajaca' (<-'ajca')
+ 'ljaca'
+ 'ljac' (<-'ljca')
+ 'ejac'
+ 'ejaca' (<-'ejca')
+ 'ojac'
+ 'ojaca' (<-'ojca')
+ 'ajaka' (<-'ajka')
+ 'ojaka' (<-'ojka')
+ '{sx}aca'
+ '{sx}ac' (<-'{sx}ca')
+ 'inzima'
+ 'inzi' (<-'ing')
+ 'tvenici' (<-'tvenik')
+ 'tetici'
+ 'teticima' (<-'tetika')
+ 'nstava' (<-'nstva')
+ 'nicima' (<-'nik')
+ 'ticima' (<-'tik')
+ 'zicima' (<-'zik')
+ 'snici' (<-'snik')
+ 'kuse' (<-'kusi')
+ 'kusan' (<-'kusni')
+ 'kustava' (<-'kustva')
+ 'du{sx}an' (<-'du{sx}ni')
+ 'dusan' (R2 <-'dusni')
+ 'antan' (<-'antni')
+ 'bilan' (<-'bilni')
+ 'tilan' (<-'tilni')
+ 'avilan' (<-'avilni')
+ 'silan' (<-'silni')
+ 'gilan' (<-'gilni')
+ 'rilan' (<-'rilni')
+ 'nilan' (<-'nilni')
+ 'alan' (<-'alni')
+ 'ozan' (<-'ozni')
+ 'rave' (<-'ravi')
+ 'stavan' (<-'stavni')
+ 'pravan' (<-'pravni')
+ 'tivan' (<-'tivni')
+ 'sivan' (<-'sivni')
+ 'atan' (<-'atni')
+ 'enat' (<-'enta')
+ 'tetan' (<-'tetni')
+ 'pletan' (<-'pletni')
+ '{sx}ave' (<-'{sx}avi')
+ 'save' (R2 <-'savi')
+ 'anata' (<-'anta')
+ 'a{cx}ak'
+ 'a{cx}aka' (<-'a{cx}ka')
+ 'acak'
+ 'acaka' (R2 <-'acka')
+ 'u{sx}ak' (<-'u{sx}ka')
+ 'usak' (R2 <-'uska')
+ 'atak'
+ 'ataka'
+ 'atci'
+ 'atcima' (<-'atka')
+ 'etak'
+ 'etaka' (<-'etka')
+ 'itak'
+ 'itaka'
+ 'itci' (<-'itka')
+ 'otak'
+ 'otaka' (<-'otka')
+ 'utak'
+ 'utaka'
+ 'utci'
+ 'utcima' (<-'utka')
+ 'eskan' (<-'eskna')
+ 'ti{cx}an' (<-'ti{cx}ni')
+ 'tican' (R2 <-'ticni')
+ 'ojsci' (<-'ojska')
+ 'esama' (<-'esma')
+ 'metar'
+ 'metara' (<-'metra')
+ 'centar'
+ 'centara' (<-'centra')
+ 'istar'
+ 'istara' (<-'istra')
+ 'o{sx}{cy}u' (<-'osti')
+ 'oscu' (R2 <-'osti')
+ 'daba' (<-'dba')
+ '{cx}cima'
+ '{cx}ci' (<-'{cx}ka')
+ 'mac'
+ 'maca' (<-'mca')
+ 'naca'
+ 'nac' (<-'nca')
+ 'voljan' (<-'voljni')
+ 'anaka' (<-'anki')
+ 'vac'
+ 'vaca' (<-'vca')
+ 'saca'
+ 'sac' (<-'sca')
+ 'raca'
+ 'rac' (<-'rca')
+ 'aoca'
+ 'alaca'
+ 'alac' (<-'alca')
+ 'elaca'
+ 'elac' (<-'elca')
+ 'olaca'
+ 'olac'
+ 'olce' (<-'olca')
+ 'njac'
+ 'njaca' (<-'njca')
+ 'ekata'
+ 'ekat' (<-'ekta')
+ 'izam'
+ 'izama' (<-'izma')
+ 'jebe' (<-'jebi')
+ 'baci' (<-'baci') // replacement equals the matched text, so the word is unchanged; NOTE(review): presumably a deliberate exception entry - confirm
+ 'a{sx}an' (<-'a{sx}ni')
+ 'asan' (R2 <-'asni')
+ )
+ )
+
+ define Step_2 as (
+ [substring] R1 among (
+ 'skijima'
+ 'skijega'
+ 'skijemu'
+ 'skijem'
+ 'skega'
+ 'skemu'
+ 'skem'
+ 'skijim'
+ 'skijih'
+ 'skijoj'
+ 'skijeg'
+ 'skiji'
+ 'skije'
+ 'skija'
+ 'skoga'
+ 'skome'
+ 'skomu'
+ 'skima'
+ 'skog'
+ 'skom'
+ 'skim'
+ 'skih'
+ 'skoj'
+ 'ski'
+ 'ske'
+ 'sko'
+ 'ska'
+ 'sku' (<-'sk')
+ '{sx}kijima'
+ '{sx}kijega'
+ '{sx}kijemu'
+ '{sx}kijem'
+ '{sx}kega'
+ '{sx}kemu'
+ '{sx}kem'
+ '{sx}kijim'
+ '{sx}kijih'
+ '{sx}kijoj'
+ '{sx}kijeg'
+ '{sx}kiji'
+ '{sx}kije'
+ '{sx}kija'
+ '{sx}koga'
+ '{sx}kome'
+ '{sx}komu'
+ '{sx}kima'
+ '{sx}kog'
+ '{sx}kom'
+ '{sx}kim'
+ '{sx}kih'
+ '{sx}koj'
+ '{sx}ki'
+ '{sx}ke'
+ '{sx}ko'
+ '{sx}ka'
+ '{sx}ku' (<-'{sx}k')
+ 'stvima'
+ 'stvom'
+ 'stvo'
+ 'stva'
+ 'stvu' (<-'stv')
+ '{sx}tvima'
+ '{sx}tvom'
+ '{sx}tvo'
+ '{sx}tva'
+ '{sx}tvu' (<-'{sx}tv')
+ 'tanijama'
+ 'tanijima'
+ 'tanijom'
+ 'tanija'
+ 'taniju'
+ 'tanije'
+ 'taniji' (<-'tanij')
+ 'manijama'
+ 'manijima'
+ 'manijom'
+ 'manija'
+ 'maniju'
+ 'manije'
+ 'maniji' (<-'manij')
+ 'panijama'
+ 'panijima'
+ 'panijom'
+ 'panija'
+ 'paniju'
+ 'panije'
+ 'paniji' (<-'panij')
+ 'ranijama'
+ 'ranijima'
+ 'ranijom'
+ 'ranija'
+ 'raniju'
+ 'ranije'
+ 'raniji' (<-'ranij')
+ 'ganijama'
+ 'ganijima'
+ 'ganijom'
+ 'ganija'
+ 'ganiju'
+ 'ganije'
+ 'ganiji' (<-'ganij')
+ 'aninom'
+ 'anina'
+ 'aninu'
+ 'anine'
+ 'anima'
+ 'anin'
+ 'anom'
+ 'anu'
+ 'ani'
+ 'ana'
+ 'ane' (<-'an')
+ 'inima'
+ 'inama'
+ 'inom'
+ 'ina'
+ 'ine'
+ 'ini'
+ 'inu'
+ 'ino' (<-'in')
+ 'onovima'
+ 'onova'
+ 'onove'
+ 'onovi'
+ 'onima'
+ 'onom'
+ 'ona'
+ 'one'
+ 'oni'
+ 'onu' (<-'on')
+ 'nijima'
+ 'nijega'
+ 'nijemu'
+ 'nijeg'
+ 'nijem'
+ 'nega'
+ 'nemu'
+ 'neg'
+ 'nem'
+ 'nijim'
+ 'nijih'
+ 'nijoj'
+ 'niji'
+ 'nije'
+ 'nija'
+ 'niju'
+ 'nima'
+ 'nome'
+ 'nomu'
+ 'noga'
+ 'noj'
+ 'nom'
+ 'nih'
+ 'nim'
+ 'nog'
+ 'no'
+ 'ne'
+ 'na'
+ 'nu'
+ 'ni' (<-'n')
+ 'a{cy}oga'
+ 'a{cy}ome'
+ 'a{cy}omu'
+ 'a{cy}ega'
+ 'a{cy}emu'
+ 'a{cy}ima'
+ 'a{cy}oj'
+ 'a{cy}ih'
+ 'a{cy}om'
+ 'a{cy}eg'
+ 'a{cy}em'
+ 'a{cy}og'
+ 'a{cy}uh'
+ 'a{cy}im'
+ 'a{cy}e'
+ 'a{cy}a' (<-'a{cy}')
+ 'e{cy}oga'
+ 'e{cy}ome'
+ 'e{cy}omu'
+ 'e{cy}ega'
+ 'e{cy}emu'
+ 'e{cy}ima'
+ 'e{cy}oj'
+ 'e{cy}ih'
+ 'e{cy}om'
+ 'e{cy}eg'
+ 'e{cy}em'
+ 'e{cy}og'
+ 'e{cy}uh'
+ 'e{cy}im'
+ 'e{cy}e'
+ 'e{cy}a' (<-'e{cy}')
+ 'u{cy}oga'
+ 'u{cy}ome'
+ 'u{cy}omu'
+ 'u{cy}ega'
+ 'u{cy}emu'
+ 'u{cy}ima'
+ 'u{cy}oj'
+ 'u{cy}ih'
+ 'u{cy}om'
+ 'u{cy}eg'
+ 'u{cy}em'
+ 'u{cy}og'
+ 'u{cy}uh'
+ 'u{cy}im'
+ 'u{cy}e'
+ 'u{cy}a' (<-'u{cy}')
+ 'ugovima'
+ 'ugovi'
+ 'ugove'
+ 'ugova' (<-'ugov')
+ 'ugama'
+ 'ugom'
+ 'uga'
+ 'uge'
+ 'ugi'
+ 'ugu'
+ 'ugo' (<-'ug')
+ 'logama'
+ 'logom'
+ 'loga'
+ 'logu'
+ 'loge' (<-'log')
+ 'govima'
+ 'gama'
+ 'govi'
+ 'gove'
+ 'gova'
+ 'gom'
+ 'ga'
+ 'ge'
+ 'gi'
+ 'gu'
+ 'go' (<-'g')
+ 'rarijem'
+ 'rarija'
+ 'rariju'
+ 'rario' (<-'rari')
+ 'otijem'
+ 'otija'
+ 'otiju'
+ 'otio' (<-'oti')
+ 'sijem'
+ 'sija'
+ 'siju'
+ 'sio' (<-'si')
+ 'lijem'
+ 'lija'
+ 'liju'
+ 'lio' (<-'li')
+ 'uju{cy}i'
+ 'ujemo'
+ 'ujete'
+ 'ujmo'
+ 'ujem'
+ 'uje{sx}'
+ 'uje'
+ 'uju' (<-'uj')
+ 'cajevima'
+ 'cajevi'
+ 'cajeva'
+ 'cajeve'
+ 'cajama'
+ 'cajima'
+ 'cajem'
+ 'caja'
+ 'caje'
+ 'caji'
+ 'caju' (<-'caj')
+ '{cx}ajevima'
+ '{cx}ajevi'
+ '{cx}ajeva'
+ '{cx}ajeve'
+ '{cx}ajama'
+ '{cx}ajima'
+ '{cx}ajem'
+ '{cx}aja'
+ '{cx}aje'
+ '{cx}aji'
+ '{cx}aju' (<-'{cx}aj')
+ '{cy}ajevima'
+ '{cy}ajevi'
+ '{cy}ajeva'
+ '{cy}ajeve'
+ '{cy}ajama'
+ '{cy}ajima'
+ '{cy}ajem'
+ '{cy}aja'
+ '{cy}aje'
+ '{cy}aji'
+ '{cy}aju' (<-'{cy}aj')
+ '{dx}ajevima'
+ '{dx}ajevi'
+ '{dx}ajeva'
+ '{dx}ajeve'
+ '{dx}ajama'
+ '{dx}ajima'
+ '{dx}ajem'
+ '{dx}aja'
+ '{dx}aje'
+ '{dx}aji'
+ '{dx}aju' (<-'{dx}aj')
+ 'lajevima'
+ 'lajevi'
+ 'lajeva'
+ 'lajeve'
+ 'lajama'
+ 'lajima'
+ 'lajem'
+ 'laja'
+ 'laje'
+ 'laji'
+ 'laju' (<-'laj')
+ 'rajevima'
+ 'rajevi'
+ 'rajeva'
+ 'rajeve'
+ 'rajama'
+ 'rajima'
+ 'rajem'
+ 'raja'
+ 'raje'
+ 'raji'
+ 'raju' (<-'raj')
+ 'bijima'
+ 'bijama'
+ 'bijom'
+ 'bija'
+ 'bije'
+ 'biji'
+ 'biju'
+ 'bijo' (<-'bij')
+ 'cijima'
+ 'cijama'
+ 'cijom'
+ 'cija'
+ 'cije'
+ 'ciji'
+ 'ciju'
+ 'cijo' (<-'cij')
+ 'dijima'
+ 'dijama'
+ 'dijom'
+ 'dija'
+ 'dije'
+ 'diji'
+ 'diju'
+ 'dijo' (<-'dij')
+ 'lijima'
+ 'lijama'
+ 'lijom'
+ 'lije'
+ 'liji'
+ 'lijo' (<-'lij')
+ 'nijama'
+ 'nijom'
+ 'nijo' (<-'nij')
+ 'mijima'
+ 'mijama'
+ 'mijom'
+ 'mija'
+ 'mije'
+ 'miji'
+ 'miju'
+ 'mijo' (<-'mij')
+ '{zx}ijima'
+ '{zx}ijama'
+ '{zx}ijom'
+ '{zx}ija'
+ '{zx}ije'
+ '{zx}iji'
+ '{zx}iju'
+ '{zx}ijo' (<-'{zx}ij')
+ 'gijima'
+ 'gijama'
+ 'gijom'
+ 'gija'
+ 'gije'
+ 'giji'
+ 'giju'
+ 'gijo' (<-'gij')
+ 'fijima'
+ 'fijama'
+ 'fijom'
+ 'fija'
+ 'fije'
+ 'fiji'
+ 'fiju'
+ 'fijo' (<-'fij')
+ 'pijima'
+ 'pijama'
+ 'pijom'
+ 'pija'
+ 'pije'
+ 'piji'
+ 'piju'
+ 'pijo' (<-'pij')
+ 'rijima'
+ 'rijama'
+ 'rijom'
+ 'rija'
+ 'rije'
+ 'riji'
+ 'riju'
+ 'rijo' (<-'rij')
+ 'sijima'
+ 'sijama'
+ 'sijom'
+ 'sije'
+ 'siji'
+ 'sijo' (<-'sij')
+ 'tijima'
+ 'tijama'
+ 'tijom'
+ 'tija'
+ 'tije'
+ 'tiji'
+ 'tiju'
+ 'tijo' (<-'tij')
+ 'zijima'
+ 'zijama'
+ 'zijom'
+ 'zija'
+ 'zije'
+ 'ziji'
+ 'ziju'
+ 'zijo' (<-'zij')
+ 'nalima'
+ 'nalama'
+ 'nalom'
+ 'nala'
+ 'nale'
+ 'nali'
+ 'nalu'
+ 'nalo' (<-'nal')
+ 'ijalima'
+ 'ijalama'
+ 'ijalom'
+ 'ijala'
+ 'ijale'
+ 'ijali'
+ 'ijalu'
+ 'ijalo' (<-'ijal')
+ 'ozilima'
+ 'ozilom'
+ 'ozila'
+ 'ozile'
+ 'ozilu'
+ 'ozili' (<-'ozil')
+ 'olovima'
+ 'olovi'
+ 'olova'
+ 'olove' (<-'olov')
+ 'olima'
+ 'olom'
+ 'ola'
+ 'olu'
+ 'ole'
+ 'oli' (<-'ol')
+ 'lemama'
+ 'lemima'
+ 'lemom'
+ 'lema'
+ 'leme'
+ 'lemi'
+ 'lemu'
+ 'lemo' (<-'lem')
+ 'ramama'
+ 'ramom'
+ 'rama'
+ 'rame'
+ 'rami'
+ 'ramu'
+ 'ramo' (<-'ram')
+ 'arama'
+ 'arima'
+ 'arom'
+ 'aru'
+ 'ara'
+ 'are'
+ 'ari' (<-'ar')
+ 'drama'
+ 'drima'
+ 'drom'
+ 'dru'
+ 'dra'
+ 'dre'
+ 'dri' (<-'dr')
+ 'erama'
+ 'erima'
+ 'erom'
+ 'eru'
+ 'era'
+ 'ere'
+ 'eri' (<-'er')
+ 'orama'
+ 'orima'
+ 'orom'
+ 'oru'
+ 'ora'
+ 'ore'
+ 'ori' (<-'or')
+ 'esima'
+ 'esom'
+ 'ese'
+ 'esa'
+ 'esu' (<-'es')
+ 'isima'
+ 'isom'
+ 'ise'
+ 'isa'
+ 'isu' (<-'is')
+ 'ta{sx}ama'
+ 'ta{sx}ima'
+ 'ta{sx}om'
+ 'ta{sx}em'
+ 'ta{sx}a'
+ 'ta{sx}u'
+ 'ta{sx}i'
+ 'ta{sx}e' (<-'ta{sx}')
+ 'na{sx}ama'
+ 'na{sx}ima'
+ 'na{sx}om'
+ 'na{sx}em'
+ 'na{sx}a'
+ 'na{sx}u'
+ 'na{sx}i'
+ 'na{sx}e' (<-'na{sx}')
+ 'ja{sx}ama'
+ 'ja{sx}ima'
+ 'ja{sx}om'
+ 'ja{sx}em'
+ 'ja{sx}a'
+ 'ja{sx}u'
+ 'ja{sx}i'
+ 'ja{sx}e' (<-'ja{sx}')
+ 'ka{sx}ama'
+ 'ka{sx}ima'
+ 'ka{sx}om'
+ 'ka{sx}em'
+ 'ka{sx}a'
+ 'ka{sx}u'
+ 'ka{sx}i'
+ 'ka{sx}e' (<-'ka{sx}')
+ 'ba{sx}ama'
+ 'ba{sx}ima'
+ 'ba{sx}om'
+ 'ba{sx}em'
+ 'ba{sx}a'
+ 'ba{sx}u'
+ 'ba{sx}i'
+ 'ba{sx}e' (<-'ba{sx}')
+ 'ga{sx}ama'
+ 'ga{sx}ima'
+ 'ga{sx}om'
+ 'ga{sx}em'
+ 'ga{sx}a'
+ 'ga{sx}u'
+ 'ga{sx}i'
+ 'ga{sx}e' (<-'ga{sx}')
+ 'va{sx}ama'
+ 'va{sx}ima'
+ 'va{sx}om'
+ 'va{sx}em'
+ 'va{sx}a'
+ 'va{sx}u'
+ 'va{sx}i'
+ 'va{sx}e' (<-'va{sx}')
+ 'e{sx}ima'
+ 'e{sx}ama'
+ 'e{sx}om'
+ 'e{sx}em'
+ 'e{sx}i'
+ 'e{sx}e'
+ 'e{sx}a'
+ 'e{sx}u' (<-'e{sx}')
+ 'i{sx}ima'
+ 'i{sx}ama'
+ 'i{sx}om'
+ 'i{sx}em'
+ 'i{sx}i'
+ 'i{sx}e'
+ 'i{sx}a'
+ 'i{sx}u' (<-'i{sx}')
+ 'ikatima'
+ 'ikatom'
+ 'ikata'
+ 'ikate'
+ 'ikati'
+ 'ikatu'
+ 'ikato' (<-'ikat')
+ 'latima'
+ 'latom'
+ 'lata'
+ 'late'
+ 'lati'
+ 'latu'
+ 'lato' (<-'lat')
+ 'etama'
+ 'etima'
+ 'etom'
+ 'eta'
+ 'ete'
+ 'eti'
+ 'etu'
+ 'eto' (<-'et')
+ 'estima'
+ 'estama'
+ 'estom'
+ 'esta'
+ 'este'
+ 'esti'
+ 'estu'
+ 'esto' (<-'est')
+ 'istima'
+ 'istama'
+ 'istom'
+ 'ista'
+ 'iste'
+ 'isti'
+ 'istu'
+ 'isto' (<-'ist')
+ 'kstima'
+ 'kstama'
+ 'kstom'
+ 'ksta'
+ 'kste'
+ 'ksti'
+ 'kstu'
+ 'ksto' (<-'kst')
+ 'ostima'
+ 'ostama'
+ 'ostom'
+ 'osta'
+ 'oste'
+ 'osti'
+ 'ostu'
+ 'osto' (<-'ost')
+ 'i{sx}tima'
+ 'i{sx}tem'
+ 'i{sx}ta'
+ 'i{sx}te'
+ 'i{sx}tu' (<-'i{sx}t')
+ 'ovasmo'
+ 'ovaste'
+ 'ovahu'
+ 'ovati'
+ 'ova{sx}e'
+ 'ovali'
+ 'ovala'
+ 'ovale'
+ 'ovalo'
+ 'ovat'
+ 'ovah'
+ 'ovao' (<-'ova')
+ 'avijemu'
+ 'avijima'
+ 'avijega'
+ 'avijeg'
+ 'avijem'
+ 'avemu'
+ 'avega'
+ 'aveg'
+ 'avem'
+ 'avijim'
+ 'avijih'
+ 'avijoj'
+ 'avoga'
+ 'avome'
+ 'avomu'
+ 'avima'
+ 'avama'
+ 'aviji'
+ 'avije'
+ 'avija'
+ 'aviju'
+ 'avim'
+ 'avih'
+ 'avoj'
+ 'avom'
+ 'avog'
+ 'avi'
+ 'ava'
+ 'avu'
+ 'ave'
+ 'avo' (<-'av')
+ 'evijemu'
+ 'evijima'
+ 'evijega'
+ 'evijeg'
+ 'evijem'
+ 'evemu'
+ 'evega'
+ 'eveg'
+ 'evem'
+ 'evijim'
+ 'evijih'
+ 'evijoj'
+ 'evoga'
+ 'evome'
+ 'evomu'
+ 'evima'
+ 'evama'
+ 'eviji'
+ 'evije'
+ 'evija'
+ 'eviju'
+ 'evim'
+ 'evih'
+ 'evoj'
+ 'evom'
+ 'evog'
+ 'evi'
+ 'eva'
+ 'evu'
+ 'eve'
+ 'evo' (<-'ev')
+ 'ivijemu'
+ 'ivijima'
+ 'ivijega'
+ 'ivijeg'
+ 'ivijem'
+ 'ivemu'
+ 'ivega'
+ 'iveg'
+ 'ivem'
+ 'ivijim'
+ 'ivijih'
+ 'ivijoj'
+ 'ivoga'
+ 'ivome'
+ 'ivomu'
+ 'ivima'
+ 'ivama'
+ 'iviji'
+ 'ivije'
+ 'ivija'
+ 'iviju'
+ 'ivim'
+ 'ivih'
+ 'ivoj'
+ 'ivom'
+ 'ivog'
+ 'ivi'
+ 'iva'
+ 'ivu'
+ 'ive'
+ 'ivo' (<-'iv')
+ 'ovijemu'
+ 'ovijima'
+ 'ovijega'
+ 'ovijeg'
+ 'ovijem'
+ 'ovemu'
+ 'ovega'
+ 'oveg'
+ 'ovijim'
+ 'ovijih'
+ 'ovijoj'
+ 'ovoga'
+ 'ovome'
+ 'ovomu'
+ 'ovima'
+ 'oviji'
+ 'ovije'
+ 'ovija'
+ 'oviju'
+ 'ovim'
+ 'ovih'
+ 'ovoj'
+ 'ovom'
+ 'ovog'
+ 'ovi'
+ 'ova'
+ 'ovu'
+ 'ove'
+ 'ovo' (<-'ov')
+ 'movima'
+ 'movom'
+ 'mova'
+ 'movu'
+ 'move'
+ 'movi' (<-'mov')
+ 'lovima'
+ 'lovom'
+ 'lova'
+ 'lovu'
+ 'love'
+ 'lovi' (<-'lov')
+ 'elijemu'
+ 'elijima'
+ 'elijega'
+ 'elijeg'
+ 'elijem'
+ 'elemu'
+ 'elega'
+ 'eleg'
+ 'elem'
+ 'elijim'
+ 'elijih'
+ 'elijoj'
+ 'eloga'
+ 'elome'
+ 'elomu'
+ 'elima'
+ 'eliji'
+ 'elije'
+ 'elija'
+ 'eliju'
+ 'elim'
+ 'elih'
+ 'eloj'
+ 'elom'
+ 'elog'
+ 'eli'
+ 'ela'
+ 'elu'
+ 'ele'
+ 'elo' (<-'el')
+ 'anjijemu'
+ 'anjijima'
+ 'anjijega'
+ 'anjijeg'
+ 'anjijem'
+ 'anjemu'
+ 'anjega'
+ 'anjeg'
+ 'anjem'
+ 'anjijim'
+ 'anjijih'
+ 'anjijoj'
+ 'anjoga'
+ 'anjome'
+ 'anjomu'
+ 'anjima'
+ 'anjiji'
+ 'anjije'
+ 'anjija'
+ 'anjiju'
+ 'anjim'
+ 'anjih'
+ 'anjoj'
+ 'anjom'
+ 'anjog'
+ 'anja'
+ 'anje'
+ 'anji'
+ 'anjo'
+ 'anju' (<-'anj')
+ 'enjijemu'
+ 'enjijima'
+ 'enjijega'
+ 'enjijeg'
+ 'enjijem'
+ 'enjemu'
+ 'enjega'
+ 'enjeg'
+ 'enjem'
+ 'enjijim'
+ 'enjijih'
+ 'enjijoj'
+ 'enjoga'
+ 'enjome'
+ 'enjomu'
+ 'enjima'
+ 'enjiji'
+ 'enjije'
+ 'enjija'
+ 'enjiju'
+ 'enjim'
+ 'enjih'
+ 'enjoj'
+ 'enjom'
+ 'enjog'
+ 'enja'
+ 'enje'
+ 'enji'
+ 'enjo'
+ 'enju' (<-'enj')
+ '{sx}njijemu'
+ '{sx}njijima'
+ '{sx}njijega'
+ '{sx}njijeg'
+ '{sx}njijem'
+ '{sx}njemu'
+ '{sx}njega'
+ '{sx}njeg'
+ '{sx}njem'
+ '{sx}njijim'
+ '{sx}njijih'
+ '{sx}njijoj'
+ '{sx}njoga'
+ '{sx}njome'
+ '{sx}njomu'
+ '{sx}njima'
+ '{sx}njiji'
+ '{sx}njije'
+ '{sx}njija'
+ '{sx}njiju'
+ '{sx}njim'
+ '{sx}njih'
+ '{sx}njoj'
+ '{sx}njom'
+ '{sx}njog'
+ '{sx}nja'
+ '{sx}nje'
+ '{sx}nji'
+ '{sx}njo'
+ '{sx}nju' (<-'{sx}nj')
+ 'anemu'
+ 'anega'
+ 'aneg'
+ 'anem' (<-'an')
+ 'enemu'
+ 'enega'
+ 'eneg'
+ 'enem' (<-'en')
+ '{sx}nemu'
+ '{sx}nega'
+ '{sx}neg'
+ '{sx}nem' (<-'{sx}n')
+ '{cx}inama'
+ '{cx}inome'
+ '{cx}inomu'
+ '{cx}inoga'
+ '{cx}inima'
+ '{cx}inog'
+ '{cx}inom'
+ '{cx}inim'
+ '{cx}inih'
+ '{cx}inoj'
+ '{cx}ina'
+ '{cx}inu'
+ '{cx}ini'
+ '{cx}ino'
+ '{cx}ine' (<-'{cx}in')
+ 'ro{sx}iv{sx}i'
+ 'ro{sx}ismo'
+ 'ro{sx}iste'
+ 'ro{sx}i{sx}e'
+ 'ro{sx}imo'
+ 'ro{sx}ite'
+ 'ro{sx}iti'
+ 'ro{sx}ili'
+ 'ro{sx}ila'
+ 'ro{sx}ilo'
+ 'ro{sx}ile'
+ 'ro{sx}im'
+ 'ro{sx}i{sx}'
+ 'ro{sx}it'
+ 'ro{sx}ih'
+ 'ro{sx}io' (<-'ro{sx}i')
+ 'o{sx}ijemu'
+ 'o{sx}ijima'
+ 'o{sx}ijega'
+ 'o{sx}ijeg'
+ 'o{sx}ijem'
+ 'o{sx}emu'
+ 'o{sx}ega'
+ 'o{sx}eg'
+ 'o{sx}em'
+ 'o{sx}ijim'
+ 'o{sx}ijih'
+ 'o{sx}ijoj'
+ 'o{sx}oga'
+ 'o{sx}ome'
+ 'o{sx}omu'
+ 'o{sx}ima'
+ 'o{sx}iji'
+ 'o{sx}ije'
+ 'o{sx}ija'
+ 'o{sx}iju'
+ 'o{sx}im'
+ 'o{sx}ih'
+ 'o{sx}oj'
+ 'o{sx}om'
+ 'o{sx}og'
+ 'o{sx}i'
+ 'o{sx}a'
+ 'o{sx}u'
+ 'o{sx}e' (<-'o{sx}')
+ 'evitijima'
+ 'evitijega'
+ 'evitijemu'
+ 'evitijem'
+ 'evitega'
+ 'evitemu'
+ 'evitem'
+ 'evitijim'
+ 'evitijih'
+ 'evitijoj'
+ 'evitijeg'
+ 'evitiji'
+ 'evitije'
+ 'evitija'
+ 'evitoga'
+ 'evitome'
+ 'evitomu'
+ 'evitima'
+ 'evitog'
+ 'evitom'
+ 'evitim'
+ 'evitih'
+ 'evitoj'
+ 'eviti'
+ 'evite'
+ 'evito'
+ 'evita'
+ 'evitu' (<-'evit')
+ 'ovitijima'
+ 'ovitijega'
+ 'ovitijemu'
+ 'ovitijem'
+ 'ovitega'
+ 'ovitemu'
+ 'ovitem'
+ 'ovitijim'
+ 'ovitijih'
+ 'ovitijoj'
+ 'ovitijeg'
+ 'ovitiji'
+ 'ovitije'
+ 'ovitija'
+ 'ovitoga'
+ 'ovitome'
+ 'ovitomu'
+ 'ovitima'
+ 'ovitog'
+ 'ovitom'
+ 'ovitim'
+ 'ovitih'
+ 'ovitoj'
+ 'oviti'
+ 'ovite'
+ 'ovito'
+ 'ovita'
+ 'ovitu' (<-'ovit')
+ 'astijima'
+ 'astijega'
+ 'astijemu'
+ 'astijem'
+ 'astega'
+ 'astemu'
+ 'astem'
+ 'astijim'
+ 'astijih'
+ 'astijoj'
+ 'astijeg'
+ 'astiji'
+ 'astije'
+ 'astija'
+ 'astoga'
+ 'astome'
+ 'astomu'
+ 'astima'
+ 'astog'
+ 'astom'
+ 'astim'
+ 'astih'
+ 'astoj'
+ 'asti'
+ 'aste'
+ 'asto'
+ 'asta'
+ 'astu' (<-'ast')
+ 'kijemu'
+ 'kijima'
+ 'kijega'
+ 'kijeg'
+ 'kijem'
+ 'kemu'
+ 'kega'
+ 'keg'
+ 'kem'
+ 'kijim'
+ 'kijih'
+ 'kijoj'
+ 'koga'
+ 'kome'
+ 'komu'
+ 'kima'
+ 'kiji'
+ 'kije'
+ 'kija'
+ 'kiju'
+ 'kim'
+ 'kih'
+ 'koj'
+ 'kom'
+ 'kog'
+ 'kov'
+ 'ki'
+ 'ka'
+ 'ku'
+ 'ke'
+ 'ko' (<-'k')
+ 'evaju{cy}i'
+ 'evasmo'
+ 'evaste'
+ 'evajmo'
+ 'evajte'
+ 'evaju'
+ 'evala'
+ 'evale'
+ 'evali'
+ 'evalo'
+ 'evamo'
+ 'evana'
+ 'evane'
+ 'evani'
+ 'evano'
+ 'evate'
+ 'evati'
+ 'eva{sx}e'
+ 'evahu'
+ 'evah'
+ 'evaj'
+ 'evam'
+ 'evan'
+ 'evao'
+ 'evat'
+ 'evav'
+ 'eva{sx}' (<-'eva')
+ 'avaju{cy}i'
+ 'avasmo'
+ 'avaste'
+ 'avajmo'
+ 'avajte'
+ 'avaju'
+ 'avala'
+ 'avale'
+ 'avali'
+ 'avalo'
+ 'avamo'
+ 'avana'
+ 'avane'
+ 'avani'
+ 'avano'
+ 'avate'
+ 'avati'
+ 'ava{sx}e'
+ 'avahu'
+ 'avah'
+ 'avaj'
+ 'avam'
+ 'avan'
+ 'avao'
+ 'avat'
+ 'avav'
+ 'ava{sx}' (<-'ava')
+ 'ivaju{cy}i'
+ 'ivasmo'
+ 'ivaste'
+ 'ivajmo'
+ 'ivajte'
+ 'ivaju'
+ 'ivala'
+ 'ivale'
+ 'ivali'
+ 'ivalo'
+ 'ivamo'
+ 'ivana'
+ 'ivane'
+ 'ivani'
+ 'ivano'
+ 'ivate'
+ 'ivati'
+ 'iva{sx}e'
+ 'ivahu'
+ 'ivah'
+ 'ivaj'
+ 'ivam'
+ 'ivan'
+ 'ivao'
+ 'ivat'
+ 'ivav'
+ 'iva{sx}' (<-'iva')
+ 'uvaju{cy}i'
+ 'uvasmo'
+ 'uvaste'
+ 'uvajmo'
+ 'uvajte'
+ 'uvaju'
+ 'uvala'
+ 'uvale'
+ 'uvali'
+ 'uvalo'
+ 'uvamo'
+ 'uvana'
+ 'uvane'
+ 'uvani'
+ 'uvano'
+ 'uvate'
+ 'uvati'
+ 'uva{sx}e'
+ 'uvahu'
+ 'uvah'
+ 'uvaj'
+ 'uvam'
+ 'uvan'
+ 'uvao'
+ 'uvat'
+ 'uvav'
+ 'uva{sx}' (<-'uva')
+ 'irujemo'
+ 'irujete'
+ 'iruju{cy}i'
+ 'iraju{cy}i'
+ 'irivat'
+ 'irujem'
+ 'iruje{sx}'
+ 'irujmo'
+ 'irujte'
+ 'irav{sx}i'
+ 'irasmo'
+ 'iraste'
+ 'irati'
+ 'iramo'
+ 'irate'
+ 'iraju'
+ 'ira{sx}e'
+ 'irahu'
+ 'irala'
+ 'iralo'
+ 'irali'
+ 'irale'
+ 'iruje'
+ 'iruju'
+ 'iruj'
+ 'iral'
+ 'iran'
+ 'iram'
+ 'ira{sx}'
+ 'irat'
+ 'irah'
+ 'irao' (<-'ir')
+ 'a{cx}ismo'
+ 'a{cx}iste'
+ 'a{cx}iti'
+ 'a{cx}imo'
+ 'a{cx}ite'
+ 'a{cx}i{sx}e'
+ 'a{cx}e{cy}i'
+ 'a{cx}ila'
+ 'a{cx}ilo'
+ 'a{cx}ili'
+ 'a{cx}ile'
+ 'a{cx}ena'
+ 'a{cx}eno'
+ 'a{cx}eni'
+ 'a{cx}ene'
+ 'a{cx}io'
+ 'a{cx}im'
+ 'a{cx}i{sx}'
+ 'a{cx}it'
+ 'a{cx}ih'
+ 'a{cx}en'
+ 'a{cx}i'
+ 'a{cx}e' (<-'a{cx}')
+ 'a{cx}av{sx}i'
+ 'a{cx}asmo'
+ 'a{cx}aste'
+ 'a{cx}ahu'
+ 'a{cx}ati'
+ 'a{cx}amo'
+ 'a{cx}ate'
+ 'a{cx}a{sx}e'
+ 'a{cx}ala'
+ 'a{cx}alo'
+ 'a{cx}ali'
+ 'a{cx}ale'
+ 'a{cx}aju'
+ 'a{cx}ana'
+ 'a{cx}ano'
+ 'a{cx}ani'
+ 'a{cx}ane'
+ 'a{cx}ao'
+ 'a{cx}am'
+ 'a{cx}a{sx}'
+ 'a{cx}at'
+ 'a{cx}ah'
+ 'a{cx}an' (<-'a{cx}a')
+ 'nuv{sx}i'
+ 'nusmo'
+ 'nuste'
+ 'nu{cy}i'
+ 'nimo'
+ 'nite'
+ 'nemo'
+ 'nete'
+ 'nula'
+ 'nulo'
+ 'nule'
+ 'nuli'
+ 'nuto'
+ 'nuti'
+ 'nuta'
+ 'ne{sx}'
+ 'nuo'
+ 'nut' (<-'n')
+ 'niv{sx}i'
+ 'nismo'
+ 'niste'
+ 'niti'
+ 'nila'
+ 'nilo'
+ 'nile'
+ 'nili'
+ 'ni{sx}'
+ 'nio' (<-'ni')
+ 'aju{cy}i'
+ 'av{sx}i'
+ 'asmo'
+ 'ajmo'
+ 'ajte'
+ 'ajem'
+ 'aloj'
+ 'amo'
+ 'ate'
+ 'aje'
+ 'aju'
+ 'ati'
+ 'a{sx}e'
+ 'ahu'
+ 'ala'
+ 'ali'
+ 'ale'
+ 'alo'
+ 'ano'
+ 'at'
+ 'ah'
+ 'ao'
+ 'aj'
+ 'an'
+ 'am'
+ 'a{sx}' (<-'a')
+ 'uraju{cy}i'
+ 'urasmo'
+ 'uraste'
+ 'urajmo'
+ 'urajte'
+ 'uramo'
+ 'urate'
+ 'uraju'
+ 'urati'
+ 'ura{sx}e'
+ 'urahu'
+ 'urala'
+ 'urali'
+ 'urale'
+ 'uralo'
+ 'urana'
+ 'urano'
+ 'urani'
+ 'urane'
+ 'ural'
+ 'urat'
+ 'urah'
+ 'urao'
+ 'uraj'
+ 'uran'
+ 'uram'
+ 'ura{sx}' (<-'ur')
+ 'astajasmo'
+ 'astajaste'
+ 'astajahu'
+ 'astajati'
+ 'astajemo'
+ 'astajete'
+ 'astaja{sx}e'
+ 'astajali'
+ 'astaju{cy}i'
+ 'astajala'
+ 'astajalo'
+ 'astajale'
+ 'astajmo'
+ 'astajao'
+ 'astajem'
+ 'astaje{sx}'
+ 'astajat'
+ 'astajah'
+ 'astajte'
+ 'astaje'
+ 'astaju' (<-'astaj')
+ 'istajasmo'
+ 'istajaste'
+ 'istajahu'
+ 'istajati'
+ 'istajemo'
+ 'istajete'
+ 'istaja{sx}e'
+ 'istajali'
+ 'istaju{cy}i'
+ 'istajala'
+ 'istajalo'
+ 'istajale'
+ 'istajmo'
+ 'istajao'
+ 'istajem'
+ 'istaje{sx}'
+ 'istajat'
+ 'istajah'
+ 'istajte'
+ 'istaje'
+ 'istaju' (<-'istaj')
+ 'ostajasmo'
+ 'ostajaste'
+ 'ostajahu'
+ 'ostajati'
+ 'ostajemo'
+ 'ostajete'
+ 'ostaja{sx}e'
+ 'ostajali'
+ 'ostaju{cy}i'
+ 'ostajala'
+ 'ostajalo'
+ 'ostajale'
+ 'ostajmo'
+ 'ostajao'
+ 'ostajem'
+ 'ostaje{sx}'
+ 'ostajat'
+ 'ostajah'
+ 'ostajte'
+ 'ostaje'
+ 'ostaju' (<-'ostaj')
+ 'alama'
+ 'alima'
+ 'alom'
+ 'alu'
+ 'al' (<-'a')
+ 'ajevima'
+ 'ajevi'
+ 'ajeva'
+ 'ajeve'
+ 'ajama'
+ 'ajima'
+ 'aja'
+ 'aji' (<-'aj')
+ 'astadosmo'
+ 'astadoste'
+ 'astado{sx}e'
+ 'astanemo'
+ 'astademo'
+ 'astanete'
+ 'astadete'
+ 'astanimo'
+ 'astanite'
+ 'astanila'
+ 'astav{sx}i'
+ 'astanem'
+ 'astadem'
+ 'astane{sx}'
+ 'astade{sx}'
+ 'astadoh'
+ 'astade'
+ 'astati'
+ 'astane'
+ 'astanu'
+ 'astadu'
+ 'astala'
+ 'astali'
+ 'astalo'
+ 'astale'
+ 'astat'
+ 'astao' (<-'asta')
+ 'istadosmo'
+ 'istadoste'
+ 'istado{sx}e'
+ 'istanemo'
+ 'istademo'
+ 'istanete'
+ 'istadete'
+ 'istanimo'
+ 'istanite'
+ 'istanila'
+ 'istav{sx}i'
+ 'istanem'
+ 'istadem'
+ 'istane{sx}'
+ 'istade{sx}'
+ 'istadoh'
+ 'istade'
+ 'istati'
+ 'istane'
+ 'istanu'
+ 'istadu'
+ 'istala'
+ 'istali'
+ 'istalo'
+ 'istale'
+ 'istat'
+ 'istao' (<-'ista')
+ 'ostadosmo'
+ 'ostadoste'
+ 'ostado{sx}e'
+ 'ostanemo'
+ 'ostademo'
+ 'ostanete'
+ 'ostadete'
+ 'ostanimo'
+ 'ostanite'
+ 'ostanila'
+ 'ostav{sx}i'
+ 'ostanem'
+ 'ostadem'
+ 'ostane{sx}'
+ 'ostade{sx}'
+ 'ostadoh'
+ 'ostade'
+ 'ostati'
+ 'ostane'
+ 'ostanu'
+ 'ostadu'
+ 'ostala'
+ 'ostali'
+ 'ostalo'
+ 'ostale'
+ 'ostat'
+ 'ostao' (<-'osta')
+ 'tasmo'
+ 'taste'
+ 'tajmo'
+ 'tajte'
+ 'tav{sx}i'
+ 'tati'
+ 'tamo'
+ 'tate'
+ 'taju'
+ 'tala'
+ 'talo'
+ 'tale'
+ 'tali'
+ 'tana'
+ 'tano'
+ 'tani'
+ 'tane'
+ 'tan'
+ 'taj'
+ 'tao'
+ 'tam'
+ 'ta{sx}'
+ 'tat'
+ 'tah' (<-'ta')
+ 'injasmo'
+ 'injaste'
+ 'injati'
+ 'injemo'
+ 'injete'
+ 'injali'
+ 'injala'
+ 'injalo'
+ 'injale'
+ 'inja{sx}e'
+ 'injahu'
+ 'injem'
+ 'inje{sx}'
+ 'injat'
+ 'injah'
+ 'injao' (<-'inj')
+ 'astemo'
+ 'astete'
+ 'astimo'
+ 'astite'
+ 'astu{cy}i'
+ 'aste{sx}'
+ 'asli'
+ 'asla'
+ 'aslo'
+ 'asle' (<-'as')
+ 'iv{sx}i'
+ 'ie{cy}i'
+ 'ismo'
+ 'imo'
+ 'ite'
+ 'iti'
+ 'ili'
+ 'ila'
+ 'ilo'
+ 'ile'
+ 'im'
+ 'i{sx}'
+ 'it'
+ 'ih'
+ 'io' (<-'i')
+ 'ijemo'
+ 'ijete'
+ 'ijem'
+ 'ije{sx}'
+ 'ijmo'
+ 'ijte'
+ 'iju'
+ 'ije'
+ 'ij'
+ 'ilu' (<-'i')
+ 'lu{cx}ujete'
+ 'lu{cx}uju{cy}i'
+ 'lu{cx}ujemo'
+ 'lu{cx}ujem'
+ 'lu{cx}uje{sx}'
+ 'lu{cx}ismo'
+ 'lu{cx}iste'
+ 'lu{cx}ujmo'
+ 'lu{cx}ujte'
+ 'lu{cx}uje'
+ 'lu{cx}uju'
+ 'lu{cx}i{sx}e'
+ 'lu{cx}iti'
+ 'lu{cx}imo'
+ 'lu{cx}ite'
+ 'lu{cx}ila'
+ 'lu{cx}ilo'
+ 'lu{cx}ili'
+ 'lu{cx}ile'
+ 'lu{cx}ena'
+ 'lu{cx}eno'
+ 'lu{cx}eni'
+ 'lu{cx}ene'
+ 'lu{cx}uj'
+ 'lu{cx}io'
+ 'lu{cx}en'
+ 'lu{cx}im'
+ 'lu{cx}i{sx}'
+ 'lu{cx}it'
+ 'lu{cx}ih'
+ 'lu{cx}e'
+ 'lu{cx}i' (<-'lu{cx}')
+ 'jetismo'
+ 'jetiste'
+ 'jeti{sx}e'
+ 'jetimo'
+ 'jetite'
+ 'jetiti'
+ 'jetili'
+ 'jetila'
+ 'jetilo'
+ 'jetile'
+ 'jetim'
+ 'jeti{sx}'
+ 'jetit'
+ 'jetih'
+ 'jetio' (<-'jeti')
+ 'emo'
+ 'em'
+ 'e{sx}'
+ 'elama'
+ 'el' (<-'e')
+ 'ilama'
+ 'ilima'
+ 'ilom'
+ 'il' (<-'i')
+ 'atijega'
+ 'atijemu'
+ 'atijima'
+ 'atijeg'
+ 'atijem'
+ 'atega'
+ 'atemu'
+ 'ateg'
+ 'atem'
+ 'atijih'
+ 'atijim'
+ 'atima'
+ 'atoga'
+ 'atome'
+ 'atomu'
+ 'atiji'
+ 'atije'
+ 'atija'
+ 'atiju'
+ 'atoj'
+ 'atog'
+ 'atom'
+ 'atim'
+ 'atih'
+ 'ata'
+ 'atu'
+ 'ato' (<-'at')
+ 'etav{sx}i'
+ 'etu{cy}i'
+ 'etemo'
+ 'etimo'
+ 'etem'
+ 'ete{sx}' (<-'et')
+ 'lucujuci'
+ 'lucujemo'
+ 'lucujete'
+ 'lucujem'
+ 'lucujes'
+ 'lucujmo'
+ 'lucujte'
+ 'lucismo'
+ 'luciste'
+ 'luciti'
+ 'lucite'
+ 'lucise'
+ 'lucuje'
+ 'lucuju'
+ 'lucila'
+ 'lucile'
+ 'lucili'
+ 'lucilo'
+ 'lucena'
+ 'luceni'
+ 'lucene'
+ 'luceno'
+ 'lucimo'
+ 'lucim'
+ 'lucis'
+ 'lucih'
+ 'lucit'
+ 'lucio'
+ 'lucuj'
+ 'lucen'
+ 'luce'
+ 'luci' (R2 <-'luc')
+ 'snjijima'
+ 'snjijemu'
+ 'snjijega'
+ 'snjijim'
+ 'snjijih'
+ 'snjijeg'
+ 'snjijoj'
+ 'snjiji'
+ 'snjija'
+ 'snjije'
+ 'snjiju'
+ 'snjima'
+ 'snjemu'
+ 'snjomu'
+ 'snjome'
+ 'snjega'
+ 'snjoga'
+ 'snjih'
+ 'snjim'
+ 'snjem'
+ 'snjom'
+ 'snjeg'
+ 'snjog'
+ 'snjoj'
+ 'snja'
+ 'snje'
+ 'snji'
+ 'snjo'
+ 'snju' (R2 <-'snj')
+ 'osijima'
+ 'osijemu'
+ 'osijega'
+ 'snjijem'
+ 'osijih'
+ 'osijim'
+ 'osijem'
+ 'osijeg'
+ 'osijoj'
+ 'osima'
+ 'osemu'
+ 'osomu'
+ 'osome'
+ 'osega'
+ 'osoga'
+ 'osija'
+ 'osije'
+ 'osiji'
+ 'osiju'
+ 'osih'
+ 'osim'
+ 'osem'
+ 'osom'
+ 'oseg'
+ 'osog'
+ 'osoj'
+ 'osa'
+ 'ose'
+ 'osi'
+ 'osu' (R2 <-'os')
+ 'acismo'
+ 'aciste'
+ 'acima'
+ 'acimo'
+ 'acome'
+ 'acomu'
+ 'acite'
+ 'aciti'
+ 'acise'
+ 'acila'
+ 'acile'
+ 'acili'
+ 'acilo'
+ 'acega'
+ 'acene'
+ 'aceci'
+ 'aceni'
+ 'acemu'
+ 'acena'
+ 'aceno'
+ 'acoga'
+ 'acoj'
+ 'acih'
+ 'acem'
+ 'acom'
+ 'acen'
+ 'acog'
+ 'acit'
+ 'acio'
+ 'aceg'
+ 'acim'
+ 'acuh'
+ 'acis'
+ 'ace'
+ 'aca'
+ 'aci' (R2 <-'ac')
+ 'ecome'
+ 'ecoga'
+ 'ecemu'
+ 'ecima'
+ 'ecega'
+ 'ecomu'
+ 'ecoj'
+ 'ecuh'
+ 'ecom'
+ 'ecog'
+ 'eceg'
+ 'ecih'
+ 'ecem'
+ 'ecim'
+ 'eca'
+ 'ece' (R2 <-'ec')
+ 'ucomu'
+ 'ucome'
+ 'ucima'
+ 'ucoga'
+ 'ucega'
+ 'ucemu'
+ 'ucih'
+ 'ucog'
+ 'uceg'
+ 'ucom'
+ 'ucem'
+ 'ucim'
+ 'ucuh'
+ 'ucoj'
+ 'uca'
+ 'uce' (R2 <-'uc')
+ 'rosismo'
+ 'rosivsi'
+ 'rosiste'
+ 'rositi'
+ 'rosili'
+ 'rosise'
+ 'rosite'
+ 'rosilo'
+ 'rosimo'
+ 'rosile'
+ 'rosila'
+ 'rosit'
+ 'rosis'
+ 'rosio'
+ 'rosim'
+ 'rosih' (R2 <-'rosi')
+ 'acavsi'
+ 'acaste'
+ 'acasmo'
+ 'acaju'
+ 'acane'
+ 'acate'
+ 'acali'
+ 'acani'
+ 'acati'
+ 'acale'
+ 'acahu'
+ 'acase'
+ 'acano'
+ 'acamo'
+ 'acalo'
+ 'acana'
+ 'acala'
+ 'acam'
+ 'acan'
+ 'acao'
+ 'acas'
+ 'acat'
+ 'acah' (R2 <-'aca')
+ 'jasima'
+ 'jasama'
+ 'jasem'
+ 'jasom'
+ 'jase'
+ 'jasi'
+ 'jasa'
+ 'jasu' (R2 <-'jas')
+ 'tasima'
+ 'tasama'
+ 'tasem'
+ 'tasom'
+ 'tase'
+ 'tasa'
+ 'tasu'
+ 'tasi' (R2 <-'tas')
+ 'gasima'
+ 'gasama'
+ 'gasem'
+ 'gasom'
+ 'gasi'
+ 'gasu'
+ 'gase'
+ 'gasa' (R2 <-'gas')
+ 'nasama'
+ 'nasima'
+ 'nasem'
+ 'nasom'
+ 'nasu'
+ 'nasi'
+ 'nase'
+ 'nasa' (R2 <-'nas')
+ 'kasama'
+ 'kasima'
+ 'kasom'
+ 'kasem'
+ 'kasi'
+ 'kasu'
+ 'kase'
+ 'kasa' (R2 <-'kas')
+ 'vasama'
+ 'vasima'
+ 'vasom'
+ 'vasem'
+ 'vasi'
+ 'vase'
+ 'vasa'
+ 'vasu' (R2 <-'vas')
+ 'basama'
+ 'basima'
+ 'basom'
+ 'basem'
+ 'basi'
+ 'base'
+ 'basu'
+ 'basa' (R2 <-'bas')
+ 'astuci'
+ 'astes' (R2 <-'as')
+ 'cinima'
+ 'cinome'
+ 'cinama'
+ 'cinomu'
+ 'cinoga'
+ 'cinom'
+ 'cinih'
+ 'cinim'
+ 'cinog'
+ 'cinoj'
+ 'cino'
+ 'cini'
+ 'cinu'
+ 'cine'
+ 'cina' (R2 <-'cin')
+ 'astajase'
+ 'astajuci'
+ 'astajes' (R2 <-'astaj')
+ 'istajase'
+ 'istajuci'
+ 'istajes' (R2 <-'istaj')
+ 'ostajase'
+ 'ostajuci'
+ 'ostajes' (R2 <-'ostaj')
+ 'astadose'
+ 'astades'
+ 'astanes'
+ 'astavsi' (R2 <-'asta')
+ 'istadose'
+ 'istades'
+ 'istanes'
+ 'istavsi' (R2 <-'ista')
+ 'ostadose'
+ 'ostades'
+ 'ostanes'
+ 'ostavsi' (R2 <-'osta')
+ 'avajuci'
+ 'avase'
+ 'avas' (R2 <-'ava')
+ 'evajuci'
+ 'evase'
+ 'evas' (R2 <-'eva')
+ 'ivajuci'
+ 'ivase'
+ 'ivas' (R2 <-'iva')
+ 'uvajuci'
+ 'uvase'
+ 'uvas' (R2 <-'uva')
+ 'ovase' (R2 <-'ova')
+ 'jetise'
+ 'jetis' (R2 <-'jeti')
+ 'injase'
+ 'injes' (R2 <-'inj')
+ 'istem' (R2 <-'ist')
+ 'esama'
+ 'esem'
+ 'esi' (R2 <-'es')
+ 'etavsi'
+ 'etuci'
+ 'etes' (R2 <-'et')
+ 'isama'
+ 'isem'
+ 'isi' (R2 <-'is')
+ 'irajuci'
+ 'irujuci'
+ 'irujes'
+ 'iravsi'
+ 'irase'
+ 'iras' (R2 <-'ir')
+ 'urajuci'
+ 'urase'
+ 'uras' (R2 <-'ur')
+ 'ujuci'
+ 'ujes' (R2 <-'uj')
+ 'nivsi'
+ 'nis' (R2 <-'ni')
+ 'snega'
+ 'snemu'
+ 'snem'
+ 'sneg' (R2 <-'sn')
+ 'tavsi'
+ 'tas' (R2 <-'ta')
+ 'ajuci'
+ 'avsi'
+ 'ase'
+ 'as' (R2 <-'a')
+ 'ijes'
+ 'ivsi'
+ 'ieci'
+ 'is' (R2 <-'i')
+ 'es' (R2 <-'e')
+ 'nuvsi'
+ 'nuci'
+ 'nes' (R2 <-'n')
+ )
+ )
+
+ // Step_3: finds the longest ending from the list below and marks it as the
+ // slice; the R1 test (defined earlier in this file) must then succeed.
+ // The single (<-'') action at the end applies to every string listed, so
+ // a matching ending is replaced by the empty string.
+ define Step_3 as (
+ [substring] R1 among (
+ 'enom'
+ 'enoj'
+ 'enog'
+ 'enim'
+ 'enih'
+ 'anoj'
+ 'anog'
+ 'anim'
+ 'anih'
+ 'ost'
+ 'eno'
+ 'eni'
+ 'oga'
+ 'ima'
+ 'enu'
+ 'ena'
+ 'ama'
+ 'ano'
+ 'ani'
+ 'om'
+ 'og'
+ 'u'
+ 'o'
+ 'i'
+ 'e'
+ 'a' (<-'')
+ )
+ )
+)
+
+// stem: runs cyr_to_lat, prelude and mark_regions in order, then switches to
+// backward processing for Step_1 followed by Step_2, falling back to Step_3
+// only when Step_2 fails. Every stage is wrapped in `do`, so a stage that
+// fails does not make stem itself fail.
+define stem as (
+ do cyr_to_lat
+ do prelude
+ do mark_regions
+ backwards (
+ do Step_1
+ do (Step_2 or Step_3)
+ )
+)
diff --git a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/spanish.sbl
index 9dee289cc..6638f5f70 100644
--- a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/spanish.sbl
@@ -16,15 +16,15 @@ groupings ( v )
stringescapes {}
-/* special characters (in ISO Latin I) */
-
-stringdef a' hex 'E1' // a-acute
-stringdef e' hex 'E9' // e-acute
-stringdef i' hex 'ED' // i-acute
-stringdef o' hex 'F3' // o-acute
-stringdef u' hex 'FA' // u-acute
-stringdef u" hex 'FC' // u-diaeresis
-stringdef n~ hex 'F1' // n-tilde
+/* special characters */
+
+stringdef a' '{U+00E1}' // a-acute
+stringdef e' '{U+00E9}' // e-acute
+stringdef i' '{U+00ED}' // i-acute
+stringdef o' '{U+00F3}' // o-acute
+stringdef u' '{U+00FA}' // u-acute
+stringdef u" '{U+00FC}' // u-diaeresis
+stringdef n~ '{U+00F1}' // n-tilde
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
diff --git a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index db7a46201..000000000
--- a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,230 +0,0 @@
-routines (
- postlude mark_regions
- RV R1 R2
- attached_pronoun
- standard_suffix
- y_verb_suffix
- verb_suffix
- residual_suffix
-)
-
-externals ( stem )
-
-integers ( pV p1 p2 )
-
-groupings ( v )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef a' hex 'A0' // a-acute
-stringdef e' hex '82' // e-acute
-stringdef i' hex 'A1' // i-acute
-stringdef o' hex 'A2' // o-acute
-stringdef u' hex 'A3' // u-acute
-stringdef u" hex '81' // u-diaeresis
-stringdef n~ hex 'A4' // n-tilde
-
-define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'
-
-define mark_regions as (
-
- $pV = limit
- $p1 = limit
- $p2 = limit // defaults
-
- do (
- ( v (non-v gopast v) or (v gopast non-v) )
- or
- ( non-v (non-v gopast v) or (v next) )
- setmark pV
- )
- do (
- gopast v gopast non-v setmark p1
- gopast v gopast non-v setmark p2
- )
-)
-
-define postlude as repeat (
- [substring] among(
- '{a'}' (<- 'a')
- '{e'}' (<- 'e')
- '{i'}' (<- 'i')
- '{o'}' (<- 'o')
- '{u'}' (<- 'u')
- // and possibly {u"}->u here, or in prelude
- '' (next)
- ) //or next
-)
-
-backwardmode (
-
- define RV as $pV <= cursor
- define R1 as $p1 <= cursor
- define R2 as $p2 <= cursor
-
- define attached_pronoun as (
- [substring] among(
- 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
- 'las' 'les' 'los' 'nos'
- )
- substring RV among(
- 'i{e'}ndo' (] <- 'iendo')
- '{a'}ndo' (] <- 'ando')
- '{a'}r' (] <- 'ar')
- '{e'}r' (] <- 'er')
- '{i'}r' (] <- 'ir')
- 'ando'
- 'iendo'
- 'ar' 'er' 'ir'
- (delete)
- 'yendo' ('u' delete)
- )
- )
-
- define standard_suffix as (
- [substring] among(
-
- 'anza' 'anzas'
- 'ico' 'ica' 'icos' 'icas'
- 'ismo' 'ismos'
- 'able' 'ables'
- 'ible' 'ibles'
- 'ista' 'istas'
- 'oso' 'osa' 'osos' 'osas'
- 'amiento' 'amientos'
- 'imiento' 'imientos'
- (
- R2 delete
- )
- 'adora' 'ador' 'aci{o'}n'
- 'adoras' 'adores' 'aciones'
- 'ante' 'antes' 'ancia' 'ancias'// Note 1
- (
- R2 delete
- try ( ['ic'] R2 delete )
- )
- 'log{i'}a'
- 'log{i'}as'
- (
- R2 <- 'log'
- )
- 'uci{o'}n' 'uciones'
- (
- R2 <- 'u'
- )
- 'encia' 'encias'
- (
- R2 <- 'ente'
- )
- 'amente'
- (
- R1 delete
- try (
- [substring] R2 delete among(
- 'iv' (['at'] R2 delete)
- 'os'
- 'ic'
- 'ad'
- )
- )
- )
- 'mente'
- (
- R2 delete
- try (
- [substring] among(
- 'ante' // Note 1
- 'able'
- 'ible' (R2 delete)
- )
- )
- )
- 'idad'
- 'idades'
- (
- R2 delete
- try (
- [substring] among(
- 'abil'
- 'ic'
- 'iv' (R2 delete)
- )
- )
- )
- 'iva' 'ivo'
- 'ivas' 'ivos'
- (
- R2 delete
- try (
- ['at'] R2 delete // but not a further ['ic'] R2 delete
- )
- )
- )
- )
-
- define y_verb_suffix as (
- setlimit tomark pV for ([substring]) among(
- 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
- 'yas' 'yes' 'yais' 'yamos'
- ('u' delete)
- )
- )
-
- define verb_suffix as (
- setlimit tomark pV for ([substring]) among(
-
- 'en' 'es' '{e'}is' 'emos'
- (try ('u' test 'g') ] delete)
-
- 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
- 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
- 'ar{e'}'
- 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
- 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
- 'er{e'}'
- 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
- 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
- 'ir{e'}'
-
- 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
- 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
- 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
- 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
- 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
- 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
- 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
- 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
- '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
- (delete)
- )
- )
-
- define residual_suffix as (
- [substring] among(
- 'os'
- 'a' 'o' '{a'}' '{i'}' '{o'}'
- ( RV delete )
- 'e' '{e'}'
- ( RV delete try( ['u'] test 'g' RV delete ) )
- )
- )
-)
-
-define stem as (
- do mark_regions
- backwards (
- do attached_pronoun
- do ( standard_suffix or
- y_verb_suffix or
- verb_suffix
- )
- do residual_suffix
- )
- do postlude
-)
-
-/*
- Note 1: additions of 15 Jun 2005
-*/
diff --git a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/swedish.sbl
index 03ce1e22f..2cbb88596 100644
--- a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl
+++ b/contrib/snowball/algorithms/swedish.sbl
@@ -13,11 +13,11 @@ groupings ( v s_ending )
stringescapes {}
-/* special characters (in ISO Latin I) */
+/* special characters */
-stringdef a" hex 'E4'
-stringdef ao hex 'E5'
-stringdef o" hex 'F6'
+stringdef a" '{U+00E4}'
+stringdef ao '{U+00E5}'
+stringdef o" '{U+00F6}'
define v 'aeiouy{a"}{ao}{o"}'
diff --git a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
deleted file mode 100644
index 1631f401a..000000000
--- a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
+++ /dev/null
@@ -1,72 +0,0 @@
-routines (
- mark_regions
- main_suffix
- consonant_pair
- other_suffix
-)
-
-externals ( stem )
-
-integers ( p1 x )
-
-groupings ( v s_ending )
-
-stringescapes {}
-
-/* special characters (in MS-DOS Latin I) */
-
-stringdef a" hex '84'
-stringdef ao hex '86'
-stringdef o" hex '94'
-
-define v 'aeiouy{a"}{ao}{o"}'
-
-define s_ending 'bcdfghjklmnoprtvy'
-
-define mark_regions as (
-
- $p1 = limit
- test ( hop 3 setmark x )
- goto v gopast non-v setmark p1
- try ( $p1 < x $p1 = x )
-)
-
-backwardmode (
-
- define main_suffix as (
- setlimit tomark p1 for ([substring])
- among(
-
- 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
- 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
- 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
- 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
- (delete)
- 's'
- (s_ending delete)
- )
- )
-
- define consonant_pair as setlimit tomark p1 for (
- among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')
- and ([next] delete)
- )
-
- define other_suffix as setlimit tomark p1 for (
- [substring] among(
- 'lig' 'ig' 'els' (delete)
- 'l{o"}st' (<-'l{o"}s')
- 'fullt' (<-'full')
- )
- )
-)
-
-define stem as (
-
- do mark_regions
- backwards (
- do main_suffix
- do consonant_pair
- do other_suffix
- )
-)
diff --git a/contrib/snowball/algorithms/tamil.sbl b/contrib/snowball/algorithms/tamil.sbl
new file mode 100644
index 000000000..9635777ab
--- /dev/null
+++ b/contrib/snowball/algorithms/tamil.sbl
@@ -0,0 +1,405 @@
+/*
+* Affix stripping stemming algorithm for Tamil
+* By Damodharan Rajalingam
+*/
+
+stringescapes {}
+
+/* Aytham */
+stringdef aytham '{U+0B83}'
+
+/* Uyir - independent vowels */
+stringdef a '{U+0B85}'
+stringdef aa '{U+0B86}'
+stringdef i '{U+0B87}'
+stringdef ii '{U+0B88}'
+stringdef u '{U+0B89}'
+stringdef uu '{U+0B8A}'
+stringdef e '{U+0B8E}'
+stringdef ee '{U+0B8F}'
+stringdef ai '{U+0B90}'
+stringdef o '{U+0B92}'
+stringdef oo '{U+0B93}'
+stringdef au '{U+0B94}'
+
+/* Consonants */
+stringdef ka '{U+0B95}'
+stringdef nga '{U+0B99}'
+stringdef ca '{U+0B9A}'
+stringdef ja '{U+0B9C}'
+stringdef nya '{U+0B9E}'
+stringdef tta '{U+0B9F}'
+stringdef nna '{U+0BA3}'
+stringdef ta '{U+0BA4}'
+stringdef tha '{U+0BA4}'
+stringdef na '{U+0BA8}'
+stringdef nnna '{U+0BA9}'
+stringdef pa '{U+0BAA}'
+stringdef ma '{U+0BAE}'
+stringdef ya '{U+0BAF}'
+stringdef ra '{U+0BB0}'
+stringdef rra '{U+0BB1}'
+stringdef la '{U+0BB2}'
+stringdef lla '{U+0BB3}'
+stringdef llla '{U+0BB4}'
+stringdef zha '{U+0BB4}'
+stringdef va '{U+0BB5}'
+
+/* Vatamozi - borrowed */
+stringdef sha '{U+0BB6}'
+stringdef ssa '{U+0BB7}'
+stringdef sa '{U+0BB8}'
+stringdef ha '{U+0BB9}'
+
+
+/* Dependent vowel signs (kombu etc.) */
+stringdef vs_aa '{U+0BBE}'
+stringdef vs_i '{U+0BBF}'
+stringdef vs_ii '{U+0BC0}'
+stringdef vs_u '{U+0BC1}'
+stringdef vs_uu '{U+0BC2}'
+stringdef vs_e '{U+0BC6}'
+stringdef vs_ee '{U+0BC7}'
+stringdef vs_ai '{U+0BC8}'
+stringdef vs_o '{U+0BCA}'
+stringdef vs_oo '{U+0BCB}'
+stringdef vs_au '{U+0BCC}'
+
+/* Pulli */
+stringdef pulli '{U+0BCD}'
+
+/* AU length mark */
+stringdef au_lmark '{U+0BD7}'
+
+
+routines (
+ remove_plural_suffix
+ remove_question_suffixes
+ remove_question_prefixes
+ remove_pronoun_prefixes
+ remove_command_suffixes
+ remove_um
+ remove_vetrumai_urupukal
+ fix_va_start
+ fix_ending
+ fix_endings
+ remove_tense_suffix
+ remove_tense_suffixes
+ remove_common_word_endings
+ has_min_length
+)
+
+externals ( stem )
+
+booleans (
+ found_a_match
+ found_vetrumai_urupu
+)
+
+// Guard used by most routines below: succeeds only when the length of the
+// current string (len) is greater than 4, and fails otherwise.
+define has_min_length as (
+ $(len > 4)
+)
+
+// If the text at the cursor starts with va followed by one of the vowel signs
+// oo, o, u or uu, replaces that pair with the corresponding independent vowel
+// ({oo}, {o}, {u}, {uu}). The four cases are tried in the order written and
+// the first one that applies wins; the routine fails if none applies.
+define fix_va_start as (
+ (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or
+ (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or
+ (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or
+ (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' )
+)
+
+// Applies fix_ending over and over until it fails. The outer `do` means
+// fix_endings itself always succeeds, even when nothing was fixed.
+define fix_endings as (
+ do repeat fix_ending
+)
+
+// Deletes a prefix at the cursor made of the vowel {e}, one of the listed
+// consonants, and a pulli; afterwards fix_va_start is given a chance to
+// adjust the text that follows. If the prefix is not present the routine
+// fails before fix_va_start is reached.
+define remove_question_prefixes as (
+ [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
+ do fix_va_start
+)
+
+// Normalises the end of the word after a suffix has been stripped.
+// Requires len > 3, then works backwards from the end of the string and tries
+// the rewrite rules below as an ordered `or` chain: the first rule whose
+// pattern matches is applied (either deleting the marked slice or replacing
+// it) and the remaining rules are not tried.
+// Gives signal t if an ending was fixed, signal f otherwise.
+define fix_ending as (
+ $(len > 3)
+ backwards (
+ ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
+ or
+ ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
+ or
+ ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
+ or
+ ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
+ or
+// ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
+ ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
+ or
+ ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
+ or
+// This rule is only tried when the found_vetrumai_urupu flag is set.
+// NOTE(review): the trailing `]` after the replacement looks redundant - confirm against upstream.
+ ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ] )
+ or
+ ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
+ or
+ ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
+ or
+// NOTE(review): '{vs_u}{ka}{pulli}' is already handled by an earlier rule in this chain,
+// so this alternative appears unreachable - confirm against upstream.
+ ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' )
+ or
+ ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
+ or
+ ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
+ or
+ ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
+ or
+ ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
+ or
+ ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' )
+ or
+ ( [ '{nga}{pulli}' ] delete )
+ or
+ ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
+ )
+)
+
+// Clears found_a_match, then deletes a prefix at the cursor made of one of the
+// vowels {a}, {i}, {u}, one of the listed consonants, and a pulli. On success
+// found_a_match is set and fix_va_start is given a chance to adjust the text
+// that follows; if the prefix is absent the routine fails with the flag unset.
+define remove_pronoun_prefixes as (
+ unset found_a_match
+ [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
+ (set found_a_match)
+ do fix_va_start
+)
+
+// Clears found_a_match, then works backwards from the end of the word and
+// tries four plural endings in order, all built on '{ka}{lla}{pulli}': the
+// first three replace a longer ending with a shorter one, the last simply
+// deletes '{ka}{lla}{pulli}'. found_a_match is set when one of them applied;
+// if none matches the routine fails with the flag still unset.
+define remove_plural_suffix as (
+ unset found_a_match
+ backwards (
+ ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
+ ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
+ ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
+ ( [ '{ka}{lla}{pulli}' ] delete )
+ (set found_a_match)
+ )
+)
+
+// Requires has_min_length. Clears found_a_match, then, working backwards from
+// the end of the word, replaces a final vowel sign oo, ee or aa with a pulli
+// and sets found_a_match. The replacement is wrapped in `do`, so the routine
+// carries on to fix_endings whether or not a vowel sign was found.
+define remove_question_suffixes as (
+ has_min_length
+ unset found_a_match
+ backwards (
+ do (
+ [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
+ (set found_a_match)
+ )
+ )
+ do fix_endings
+)
+
+// Requires has_min_length. Clears found_a_match, then, working backwards from
+// the end of the word, deletes a final '{pa}{vs_i}' or '{va}{vs_i}' and sets
+// found_a_match. Fails (flag left unset) when neither ending is present.
+define remove_command_suffixes as (
+ has_min_length
+ unset found_a_match
+ backwards (
+ [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
+ (set found_a_match)
+ )
+)
+
+// Clears found_a_match and requires has_min_length. Working backwards from the
+// end of the word, replaces a final '{vs_u}{ma}{pulli}' with a pulli and sets
+// found_a_match; fix_ending is then run once on the result. If the ending is
+// absent the routine fails before fix_ending is reached.
+define remove_um as (
+ unset found_a_match
+ has_min_length
+ backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
+ (set found_a_match)
+ )
+ do fix_ending
+)
+
+// Clears found_a_match and requires has_min_length, then works backwards from
+// the end of the word with two alternatives:
+//   1. an ordered `or` chain of endings that are replaced by a pulli;
+//   2. an among-list of endings that are deleted outright.
+// Whichever applies sets found_a_match; both are wrapped in `test`, so the
+// cursor is restored afterwards while the edit is kept. fix_endings runs only
+// when one of the two alternatives applied; otherwise the routine fails.
+// NOTE(review): '{vs_aa}{ka}{vs_i}' is listed twice in the first chain; the
+// second occurrence appears unreachable - confirm against upstream.
+define remove_common_word_endings as (
+ // These are not suffixes actually but are
+ // some words that are attached to other words
+ // but can be removed for stemming
+ unset found_a_match
+ has_min_length
+ backwards (
+ test ( [ '{vs_u}{tta}{nnna}{pulli}' or
+ '{vs_i}{la}{pulli}{la}{vs_ai}' or
+ '{vs_i}{tta}{ma}{pulli}' or
+ '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
+ '{vs_aa}{ka}{vs_i}' or
+ '{vs_aa}{ka}{vs_i}{ya}' or
+ '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
+ '{vs_u}{lla}{pulli}{lla}' or
+ '{vs_u}{tta}{vs_ai}{ya}' or
+ '{vs_u}{tta}{vs_ai}' or
+ '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
+ ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
+ '{vs_e}{nnna}' or
+ '{vs_aa}{ka}{vs_i}' ] <- '{pulli}'
+ (set found_a_match)
+ )
+ or
+ test ( [ among('{pa}{tta}{vs_u}'
+ '{pa}{tta}{pulli}{tta}'
+ '{pa}{tta}{pulli}{tta}{vs_u}'
+ '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
+ '{pa}{tta}{pulli}{tta}{nna}'
+ '{ka}{vs_u}{ra}{vs_i}{ya}'
+ '{pa}{rra}{pulli}{rra}{vs_i}'
+ '{va}{vs_i}{tta}{vs_u}'
+ '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
+ '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
+ '{pa}{tta}{vs_i}'
+ '{ta}{vs_aa}{nnna}'
+ '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
+ ] delete
+ (set found_a_match)
+ )
+ )
+ do fix_endings
+)
+
+// Clears found_a_match and found_vetrumai_urupu and requires has_min_length.
+// Working backwards from the end of the word, tries the groups of endings
+// below as ordered alternatives: some delete the matched ending, some replace
+// it with a pulli, and the last shortens a final vowel sign ii to i. Each
+// group is wrapped in `test`, so the cursor is restored while the edit is kept.
+// When one group applies, both flags are set and a remaining final
+// '{vs_i}{nnna}{pulli}' is replaced by a pulli if present. fix_endings then
+// runs; if no group applies the routine fails with both flags unset.
+define remove_vetrumai_urupukal as (
+ unset found_a_match
+ unset found_vetrumai_urupu
+ has_min_length
+ backwards (
+ (
+ test ( ['{nnna}{vs_ai}'] delete )
+ or
+ test ([ ( '{vs_i}{nnna}{vs_ai}' or
+ '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
+ ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
+ ] <- '{pulli}'
+ )
+ or
+ test ( [
+ '{vs_o}{tta}{vs_u}' or
+ '{vs_oo}{tta}{vs_u}' or
+ '{vs_i}{la}{pulli}' or
+ '{vs_i}{rra}{pulli}' or
+ ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
+ '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
+ '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
+ '{va}{vs_i}{tta}' or
+ ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
+ '{vs_aa}{la}{pulli}' or
+ '{vs_u}{tta}{vs_ai}' or
+ '{vs_aa}{ma}{la}{pulli}' or
+ ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
+ '{vs_u}{lla}{pulli}'
+ ] <- '{pulli}'
+ )
+ or
+ test ( [
+ '{ka}{nna}{pulli}' or
+ '{ma}{vs_u}{nnna}{pulli}' or
+ '{ma}{vs_ee}{la}{pulli}' or
+ '{ma}{vs_ee}{rra}{pulli}' or
+ '{ka}{vs_ii}{llla}{pulli}' or
+ '{pa}{vs_i}{nnna}{pulli}' or
+ ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
+ ] delete
+ )
+ or
+ test ([ '{vs_ii}' ] <- '{vs_i}')
+ )
+ (set found_a_match)
+ (set found_vetrumai_urupu)
+ do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
+ )
+ do fix_endings
+)
+
+// Runs remove_tense_suffix repeatedly. found_a_match is set up front so the
+// first pass always runs; remove_tense_suffix clears the flag on entry and
+// sets it again only when it removed something, so the loop stops after the
+// first pass that finds nothing.
+define remove_tense_suffixes as (
+ set found_a_match
+ repeat ( found_a_match (do remove_tense_suffix) )
+)
+
+// One pass of tense-suffix removal. Clears found_a_match and requires
+// has_min_length, then works backwards from the end of the word in two stages:
+//   1. under `do`, four ordered alternatives: an among-list that is deleted,
+//      an `or` chain that is deleted, an `or` chain that is replaced by a
+//      pulli, and '{ka}{vs_u}' / '{ta}{vs_u}' deleted when the following test
+//      for a pulli succeeds;
+//   2. under `do`, a second among-list of endings that is deleted.
+// Each successful edit sets found_a_match. Both stages are optional, so the
+// routine then always continues to fix_endings.
+// NOTE(review): '{ka}{vs_u}{ma}{pulli}', '{ta}{vs_u}{ma}{pulli}',
+// '{rra}{vs_u}{ma}{pulli}' and '{nnna}{vs_e}{nnna}{pulli}' appear both in the
+// delete chain and in the later replace-by-pulli chain; the later copies look
+// unreachable - confirm against upstream.
+define remove_tense_suffix as (
+ unset found_a_match
+ has_min_length
+ backwards (
+ do (
+ test ( [among(
+ '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
+ '{pa}{tta}{vs_u}'
+ )] delete
+ (set found_a_match)
+ )
+ or
+ test ( [
+ '{ma}{vs_aa}{ra}{pulli}' or
+ '{ma}{vs_i}{nnna}{pulli}' or
+ '{nnna}{nnna}{pulli}' or
+ '{nnna}{vs_aa}{nnna}{pulli}' or
+ '{nnna}{vs_aa}{lla}{pulli}' or
+ '{nnna}{vs_aa}{ra}{pulli}' or
+ ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
+ '{nnna}{lla}{pulli}' or
+ '{va}{lla}{pulli}' or
+ '{nnna}{ra}{pulli}' or
+ '{va}{ra}{pulli}' or
+ '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
+ '{pa}{nnna}{pulli}' or
+ '{pa}{lla}{pulli}' or
+ '{pa}{ra}{pulli}' or
+ ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
+ '{vs_i}{rra}{pulli}{rra}{vs_u}' or
+ '{pa}{ma}{pulli}' or
+ '{nnna}{ma}{pulli}' or
+ '{ta}{vs_u}{ma}{pulli}' or
+ '{rra}{vs_u}{ma}{pulli}' or
+ '{ka}{vs_u}{ma}{pulli}' or
+ '{nnna}{vs_e}{nnna}{pulli}' or
+ '{nnna}{vs_ai}' or
+ '{va}{vs_ai}'
+ ] delete
+ (set found_a_match)
+ )
+ or
+ test ( [
+ ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
+ '{vs_aa}{lla}{pulli}' or
+ '{vs_aa}{ra}{pulli}' or
+ '{vs_ee}{nnna}{pulli}' or
+ '{vs_aa}' or
+ '{vs_aa}{ma}{pulli}' or
+ '{vs_e}{ma}{pulli}' or
+ '{vs_ee}{ma}{pulli}' or
+ '{vs_oo}{ma}{pulli}' or
+ '{ka}{vs_u}{ma}{pulli}' or
+ '{ta}{vs_u}{ma}{pulli}' or
+ '{tta}{vs_u}{ma}{pulli}' or
+ '{rra}{vs_u}{ma}{pulli}' or
+ '{vs_aa}{ya}{pulli}' or
+ '{nnna}{vs_e}{nnna}{pulli}' or
+ '{nnna}{vs_i}{ra}{pulli}' or
+ '{vs_ii}{ra}{pulli}' or
+ '{vs_ii}{ya}{ra}{pulli}'
+ ] <- '{pulli}'
+ (set found_a_match)
+ )
+ or
+ test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete
+ (set found_a_match)
+ )
+ )
+ do ([among(
+ '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
+ '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
+ '{ka}{vs_i}{nnna}{pulli}{rra}'
+ '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
+ '{ka}{vs_i}{rra}'
+ '{ka}{vs_i}{rra}{pulli}'
+ )] delete
+ (set found_a_match)
+ )
+ )
+ do fix_endings
+)
+
+// stem is the routine exported through `externals ( stem )`.
+// It clears found_vetrumai_urupu, runs fix_ending once, and then requires
+// has_min_length: for shorter input stem fails at that point and none of the
+// removal passes run. Otherwise the passes below run in the order written,
+// each under `do`, so a pass that fails does not stop the ones after it.
+define stem as (
+ unset found_vetrumai_urupu
+ do fix_ending
+ has_min_length
+ do remove_question_prefixes
+ do remove_pronoun_prefixes
+ do remove_question_suffixes
+ do remove_um
+ do remove_common_word_endings
+ do remove_vetrumai_urupukal
+ do remove_plural_suffix
+ do remove_command_suffixes
+ do remove_tense_suffixes
+)
diff --git a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl b/contrib/snowball/algorithms/turkish.sbl
index 16c02a51f..eadd61d02 100644
--- a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl
+++ b/contrib/snowball/algorithms/turkish.sbl
@@ -2,7 +2,7 @@
* author: Evren (Kapusuz) Çilden
* email: evren.kapusuz at gmail.com
* version: 1.0 (15.01.2007)
-
+
* stems nominal verb suffixes
* stems nominal inflections
@@ -10,13 +10,13 @@
* (y,n,s,U) context check
* vowel harmony check
* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
-
+
* The stemming algorithm is based on the paper "An Affix Stripping
* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
* Eşref Adalı (Proceedings of the IAESTED International Conference
* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
* Innsbruck, Austria
-
+
* Turkish is an agglutinative language and has a very rich morphological
* structure. In Turkish, you can form many different words from a single stem
* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
@@ -59,14 +59,14 @@ routines (
mark_yken // nominal verb suffix
mark_ymUs_ // nominal verb suffix
mark_ysA // nominal verb suffix
-
+
mark_suffix_with_optional_y_consonant
mark_suffix_with_optional_U_vowel
mark_suffix_with_optional_n_consonant
mark_suffix_with_optional_s_consonant
-
+
more_than_one_syllable_word
-
+
post_process_last_consonants
postlude
@@ -75,34 +75,32 @@ routines (
stem_suffix_chain_before_ki
)
-/* Special characters in Unicode Latin-1 and Latin Extended-A */
-stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA
-stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE
-stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT
-stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS
-stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA
-stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS
-
-stringescapes { }
+stringescapes { }
-integers ( strlen ) // length of a string
+/* Special characters in Unicode Latin-1 and Latin Extended-A */
+stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
+stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
+stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
+stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
+stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA
+stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS
booleans ( continue_stemming_noun_suffixes )
-groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
+groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
-define vowel 'ae{i'}io{o"}u{u"}'
+define vowel 'ae{i'}io{o"}u{u"}'
define U '{i'}iu{u"}'
// the vowel grouping definitions below are used for checking vowel harmony
-define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a'
-define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
-define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i''
-define vowel4 'ei' // vowels that can end with suffixes containing 'i'
-define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
-define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'
+define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a'
+define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
+define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i''
+define vowel4 'ei' // vowels that can end with suffixes containing 'i'
+define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
+define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'
-externals ( stem )
+externals ( stem )
backwardmode (
// checks vowel harmony for possible suffixes,
@@ -124,165 +122,165 @@ backwardmode (
)
)
)
-
+
// if the last consonant before suffix is vowel and n then advance and delete
// if the last consonant before suffix is non vowel and n do nothing
// if the last consonant before suffix is not n then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_n_consonant as (
- ((test 'n') next (test vowel))
+ ('n' (test vowel))
or
- ((not(test 'n')) test(next (test vowel)))
+ ((not(test 'n')) test(next vowel))
)
-
+
// if the last consonant before suffix is vowel and s then advance and delete
// if the last consonant before suffix is non vowel and s do nothing
// if the last consonant before suffix is not s then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_s_consonant as (
- ((test 's') next (test vowel))
+ ('s' (test vowel))
or
- ((not(test 's')) test(next (test vowel)))
+ ((not(test 's')) test(next vowel))
)
-
+
// if the last consonant before suffix is vowel and y then advance and delete
// if the last consonant before suffix is non vowel and y do nothing
// if the last consonant before suffix is not y then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_y_consonant as (
- ((test 'y') next (test vowel))
+ ('y' (test vowel))
or
- ((not(test 'y')) test(next (test vowel)))
+ ((not(test 'y')) test(next vowel))
)
-
+
define mark_suffix_with_optional_U_vowel as (
- ((test U) next (test non-vowel))
+ (U (test non-vowel))
or
- ((not(test U)) test(next (test non-vowel)))
+ ((not(test U)) test(next non-vowel))
)
-
+
define mark_possessives as (
among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
(mark_suffix_with_optional_U_vowel)
)
-
+
define mark_sU as (
check_vowel_harmony
U
(mark_suffix_with_optional_s_consonant)
)
-
+
define mark_lArI as (
among ('leri' 'lar{i'}')
)
-
+
define mark_yU as (
check_vowel_harmony
U
- (mark_suffix_with_optional_y_consonant)
+ (mark_suffix_with_optional_y_consonant)
)
-
+
define mark_nU as (
check_vowel_harmony
- among ('n{i'}' 'ni' 'nu' 'n{u"}')
+ among ('n{i'}' 'ni' 'nu' 'n{u"}')
)
-
+
define mark_nUn as (
check_vowel_harmony
- among ('{i'}n' 'in' 'un' '{u"}n')
+ among ('{i'}n' 'in' 'un' '{u"}n')
(mark_suffix_with_optional_n_consonant)
)
-
+
define mark_yA as (
check_vowel_harmony
among('a' 'e')
(mark_suffix_with_optional_y_consonant)
)
-
+
define mark_nA as (
check_vowel_harmony
among('na' 'ne')
)
-
+
define mark_DA as (
check_vowel_harmony
among('da' 'de' 'ta' 'te')
)
-
+
define mark_ndA as (
check_vowel_harmony
among('nda' 'nde')
)
-
+
define mark_DAn as (
check_vowel_harmony
among('dan' 'den' 'tan' 'ten')
)
-
+
define mark_ndAn as (
check_vowel_harmony
among('ndan' 'nden')
)
-
+
define mark_ylA as (
check_vowel_harmony
among('la' 'le')
(mark_suffix_with_optional_y_consonant)
)
-
+
define mark_ki as (
'ki'
)
-
+
define mark_ncA as (
check_vowel_harmony
- among('ca' 'ce')
+ among('ca' 'ce')
(mark_suffix_with_optional_n_consonant)
)
-
+
define mark_yUm as (
check_vowel_harmony
among ('{i'}m' 'im' 'um' '{u"}m')
(mark_suffix_with_optional_y_consonant)
)
-
+
define mark_sUn as (
check_vowel_harmony
among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
)
-
+
define mark_yUz as (
check_vowel_harmony
among ('{i'}z' 'iz' 'uz' '{u"}z')
(mark_suffix_with_optional_y_consonant)
)
-
+
define mark_sUnUz as (
among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
)
-
+
define mark_lAr as (
check_vowel_harmony
among ('ler' 'lar')
)
-
+
define mark_nUz as (
check_vowel_harmony
among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
)
-
+
define mark_DUr as (
check_vowel_harmony
among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
)
-
+
define mark_cAsInA as (
among ('cas{i'}na' 'cesine')
)
-
+
define mark_yDU as (
check_vowel_harmony
among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
@@ -292,24 +290,24 @@ backwardmode (
(mark_suffix_with_optional_y_consonant)
)
- // does not fully obey vowel harmony
+ // does not fully obey vowel harmony
define mark_ysA as (
among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
(mark_suffix_with_optional_y_consonant)
)
-
+
define mark_ymUs_ as (
check_vowel_harmony
- among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
+ among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}')
(mark_suffix_with_optional_y_consonant)
)
-
+
define mark_yken as (
'ken' (mark_suffix_with_optional_y_consonant)
)
-
+
define stem_nominal_verb_suffixes as (
- [
+ [
set continue_stemming_noun_suffixes
(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
or
@@ -327,7 +325,7 @@ backwardmode (
(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
]delete
)
-
+
// stems noun suffix chains ending with -ki
define stem_suffix_chain_before_ki as (
[
@@ -337,7 +335,7 @@ backwardmode (
(mark_lAr] delete try(stem_suffix_chain_before_ki))
or
(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
-
+
))
or
(mark_nUn] delete try([
@@ -348,7 +346,7 @@ backwardmode (
(stem_suffix_chain_before_ki)
))
or
- (mark_ndA (
+ (mark_ndA (
(mark_lArI] delete)
or
((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
@@ -357,7 +355,7 @@ backwardmode (
))
)
)
-
+
define stem_noun_suffixes as (
([mark_lAr] delete try(stem_suffix_chain_before_ki))
or
@@ -373,24 +371,24 @@ backwardmode (
or
([(mark_ndA or mark_nA)
(
- (mark_lArI] delete)
- or
- (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
- or
- (stem_suffix_chain_before_ki)
- )
+ (mark_lArI] delete)
+ or
+ (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ )
)
or
([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
or
( [mark_DAn] delete try ([
(
- (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
- or
- (mark_lAr] delete try(stem_suffix_chain_before_ki))
- or
- (stem_suffix_chain_before_ki)
- ))
+ (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
)
or
([mark_nUn or mark_ylA] delete
@@ -404,18 +402,18 @@ backwardmode (
)
or
([mark_lArI] delete)
- or
+ or
(stem_suffix_chain_before_ki)
or
([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
or
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
)
-
- define post_process_last_consonants as (
+
+ define post_process_last_consonants as (
[substring] among (
'b' (<- 'p')
- 'c' (<- '{c.}')
+ 'c' (<- '{c,}')
'd' (<- 't')
'{g~}' (<- 'k')
)
@@ -424,7 +422,7 @@ backwardmode (
// after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
// like in 'kedim' -> 'ked'
// Turkish words don't usually end with 'd' or 'g'
- // some very well known words are ignored (like 'ad' 'soyad'
+ // some very well known words are ignored (like 'ad' 'soyad'
// appends U to stems ending with d or g, decides which vowel to add
// based on the last vowel in the stem
define append_U_to_stems_ending_with_d_or_g as (
@@ -437,7 +435,10 @@ backwardmode (
or
(test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
)
-
+
+ define is_reserved_word as (
+ 'ad' try 'soy' atlimit
+ )
)
// Tests if there are more than one syllables
@@ -446,18 +447,12 @@ define more_than_one_syllable_word as (
test (atleast 2 (gopast vowel))
)
-define is_reserved_word as (
- test(gopast 'ad' ($strlen = 2) ($strlen == limit))
- or
- test(gopast 'soyad' ($strlen = 5) ($strlen == limit))
-)
-
define postlude as (
- not(is_reserved_word)
backwards (
+ not(is_reserved_word)
do append_U_to_stems_ending_with_d_or_g
do post_process_last_consonants
-
+
)
)
@@ -469,9 +464,7 @@ define stem as (
continue_stemming_noun_suffixes
do stem_noun_suffixes
)
-
+
postlude
)
)
-
-
diff --git a/contrib/snowball/charsets/ISO-8859-2.sbl b/contrib/snowball/charsets/ISO-8859-2.sbl
new file mode 100644
index 000000000..5829ea8a9
--- /dev/null
+++ b/contrib/snowball/charsets/ISO-8859-2.sbl
@@ -0,0 +1,98 @@
+// ISO-8859-2 character mappings.
+
+stringdef U+00A0 hex 'A0'
+stringdef U+0104 hex 'A1'
+stringdef U+02D8 hex 'A2'
+stringdef U+0141 hex 'A3'
+stringdef U+00A4 hex 'A4'
+stringdef U+013D hex 'A5'
+stringdef U+015A hex 'A6'
+stringdef U+00A7 hex 'A7'
+stringdef U+00A8 hex 'A8'
+stringdef U+0160 hex 'A9'
+stringdef U+015E hex 'AA'
+stringdef U+0164 hex 'AB'
+stringdef U+0179 hex 'AC'
+stringdef U+00AD hex 'AD'
+stringdef U+017D hex 'AE'
+stringdef U+017B hex 'AF'
+stringdef U+00B0 hex 'B0'
+stringdef U+0105 hex 'B1'
+stringdef U+02DB hex 'B2'
+stringdef U+0142 hex 'B3'
+stringdef U+00B4 hex 'B4'
+stringdef U+013E hex 'B5'
+stringdef U+015B hex 'B6'
+stringdef U+02C7 hex 'B7'
+stringdef U+00B8 hex 'B8'
+stringdef U+0161 hex 'B9'
+stringdef U+015F hex 'BA'
+stringdef U+0165 hex 'BB'
+stringdef U+017A hex 'BC'
+stringdef U+02DD hex 'BD'
+stringdef U+017E hex 'BE'
+stringdef U+017C hex 'BF'
+stringdef U+0154 hex 'C0'
+stringdef U+00C1 hex 'C1'
+stringdef U+00C2 hex 'C2'
+stringdef U+0102 hex 'C3'
+stringdef U+00C4 hex 'C4'
+stringdef U+0139 hex 'C5'
+stringdef U+0106 hex 'C6'
+stringdef U+00C7 hex 'C7'
+stringdef U+010C hex 'C8'
+stringdef U+00C9 hex 'C9'
+stringdef U+0118 hex 'CA'
+stringdef U+00CB hex 'CB'
+stringdef U+011A hex 'CC'
+stringdef U+00CD hex 'CD'
+stringdef U+00CE hex 'CE'
+stringdef U+010E hex 'CF'
+stringdef U+0110 hex 'D0'
+stringdef U+0143 hex 'D1'
+stringdef U+0147 hex 'D2'
+stringdef U+00D3 hex 'D3'
+stringdef U+00D4 hex 'D4'
+stringdef U+0150 hex 'D5'
+stringdef U+00D6 hex 'D6'
+stringdef U+00D7 hex 'D7'
+stringdef U+0158 hex 'D8'
+stringdef U+016E hex 'D9'
+stringdef U+00DA hex 'DA'
+stringdef U+0170 hex 'DB'
+stringdef U+00DC hex 'DC'
+stringdef U+00DD hex 'DD'
+stringdef U+0162 hex 'DE'
+stringdef U+00DF hex 'DF'
+stringdef U+0155 hex 'E0'
+stringdef U+00E1 hex 'E1'
+stringdef U+00E2 hex 'E2'
+stringdef U+0103 hex 'E3'
+stringdef U+00E4 hex 'E4'
+stringdef U+013A hex 'E5'
+stringdef U+0107 hex 'E6'
+stringdef U+00E7 hex 'E7'
+stringdef U+010D hex 'E8'
+stringdef U+00E9 hex 'E9'
+stringdef U+0119 hex 'EA'
+stringdef U+00EB hex 'EB'
+stringdef U+011B hex 'EC'
+stringdef U+00ED hex 'ED'
+stringdef U+00EE hex 'EE'
+stringdef U+010F hex 'EF'
+stringdef U+0111 hex 'F0'
+stringdef U+0144 hex 'F1'
+stringdef U+0148 hex 'F2'
+stringdef U+00F3 hex 'F3'
+stringdef U+00F4 hex 'F4'
+stringdef U+0151 hex 'F5'
+stringdef U+00F6 hex 'F6'
+stringdef U+00F7 hex 'F7'
+stringdef U+0159 hex 'F8'
+stringdef U+016F hex 'F9'
+stringdef U+00FA hex 'FA'
+stringdef U+0171 hex 'FB'
+stringdef U+00FC hex 'FC'
+stringdef U+00FD hex 'FD'
+stringdef U+0163 hex 'FE'
+stringdef U+02D9 hex 'FF'
diff --git a/contrib/snowball/charsets/KOI8-R.sbl b/contrib/snowball/charsets/KOI8-R.sbl
new file mode 100644
index 000000000..46854e825
--- /dev/null
+++ b/contrib/snowball/charsets/KOI8-R.sbl
@@ -0,0 +1,74 @@
+// KOI8-R character mappings.
+
+stringdef U+00A0 hex '9A'
+stringdef U+00A9 hex 'BF'
+stringdef U+00B0 hex '9C'
+stringdef U+00B2 hex '9D'
+stringdef U+00B7 hex '9E'
+stringdef U+00F7 hex '9F'
+stringdef U+0401 hex 'B3'
+stringdef U+0410 hex 'E1'
+stringdef U+0411 hex 'E2'
+stringdef U+0412 hex 'F7'
+stringdef U+0413 hex 'E7'
+stringdef U+0414 hex 'E4'
+stringdef U+0415 hex 'E5'
+stringdef U+0416 hex 'F6'
+stringdef U+0417 hex 'FA'
+stringdef U+0418 hex 'E9'
+stringdef U+0419 hex 'EA'
+stringdef U+041A hex 'EB'
+stringdef U+041B hex 'EC'
+stringdef U+041C hex 'ED'
+stringdef U+041D hex 'EE'
+stringdef U+041E hex 'EF'
+stringdef U+041F hex 'F0'
+stringdef U+0420 hex 'F2'
+stringdef U+0421 hex 'F3'
+stringdef U+0422 hex 'F4'
+stringdef U+0423 hex 'F5'
+stringdef U+0424 hex 'E6'
+stringdef U+0425 hex 'E8'
+stringdef U+0426 hex 'E3'
+stringdef U+0427 hex 'FE'
+stringdef U+0428 hex 'FB'
+stringdef U+0429 hex 'FD'
+stringdef U+042A hex 'FF'
+stringdef U+042B hex 'F9'
+stringdef U+042C hex 'F8'
+stringdef U+042D hex 'FC'
+stringdef U+042E hex 'E0'
+stringdef U+042F hex 'F1'
+stringdef U+0430 hex 'C1'
+stringdef U+0431 hex 'C2'
+stringdef U+0432 hex 'D7'
+stringdef U+0433 hex 'C7'
+stringdef U+0434 hex 'C4'
+stringdef U+0435 hex 'C5'
+stringdef U+0436 hex 'D6'
+stringdef U+0437 hex 'DA'
+stringdef U+0438 hex 'C9'
+stringdef U+0439 hex 'CA'
+stringdef U+043A hex 'CB'
+stringdef U+043B hex 'CC'
+stringdef U+043C hex 'CD'
+stringdef U+043D hex 'CE'
+stringdef U+043E hex 'CF'
+stringdef U+043F hex 'D0'
+stringdef U+0440 hex 'D2'
+stringdef U+0441 hex 'D3'
+stringdef U+0442 hex 'D4'
+stringdef U+0443 hex 'D5'
+stringdef U+0444 hex 'C6'
+stringdef U+0445 hex 'C8'
+stringdef U+0446 hex 'C3'
+stringdef U+0447 hex 'DE'
+stringdef U+0448 hex 'DB'
+stringdef U+0449 hex 'DD'
+stringdef U+044A hex 'DF'
+stringdef U+044B hex 'D9'
+stringdef U+044C hex 'D8'
+stringdef U+044D hex 'DC'
+stringdef U+044E hex 'C0'
+stringdef U+044F hex 'D1'
+stringdef U+0451 hex 'A3'
diff --git a/contrib/snowball/charsets/cp850.sbl b/contrib/snowball/charsets/cp850.sbl
new file mode 100644
index 000000000..b78022037
--- /dev/null
+++ b/contrib/snowball/charsets/cp850.sbl
@@ -0,0 +1,130 @@
+// Code page 850 (MSDOS Latin 1) character mappings.
+
+stringdef U+00A0 hex 'FF'
+stringdef U+00A1 hex 'AD'
+stringdef U+00A2 hex 'BD'
+stringdef U+00A3 hex '9C'
+stringdef U+00A4 hex 'CF'
+stringdef U+00A5 hex 'BE'
+stringdef U+00A6 hex 'DD'
+stringdef U+00A7 hex 'F5'
+stringdef U+00A8 hex 'F9'
+stringdef U+00A9 hex 'B8'
+stringdef U+00AA hex 'A6'
+stringdef U+00AB hex 'AE'
+stringdef U+00AC hex 'AA'
+stringdef U+00AD hex 'F0'
+stringdef U+00AE hex 'A9'
+stringdef U+00AF hex 'EE'
+stringdef U+00B0 hex 'F8'
+stringdef U+00B1 hex 'F1'
+stringdef U+00B2 hex 'FD'
+stringdef U+00B3 hex 'FC'
+stringdef U+00B4 hex 'EF'
+stringdef U+00B5 hex 'E6'
+stringdef U+00B6 hex 'F4'
+stringdef U+00B7 hex 'FA'
+stringdef U+00B8 hex 'F7'
+stringdef U+00B9 hex 'FB'
+stringdef U+00BA hex 'A7'
+stringdef U+00BB hex 'AF'
+stringdef U+00BC hex 'AC'
+stringdef U+00BD hex 'AB'
+stringdef U+00BE hex 'F3'
+stringdef U+00BF hex 'A8'
+stringdef U+00C0 hex 'B7'
+stringdef U+00C1 hex 'B5'
+stringdef U+00C2 hex 'B6'
+stringdef U+00C3 hex 'C7'
+stringdef U+00C4 hex '8E'
+stringdef U+00C5 hex '8F'
+stringdef U+00C6 hex '92'
+stringdef U+00C7 hex '80'
+stringdef U+00C8 hex 'D4'
+stringdef U+00C9 hex '90'
+stringdef U+00CA hex 'D2'
+stringdef U+00CB hex 'D3'
+stringdef U+00CC hex 'DE'
+stringdef U+00CD hex 'D6'
+stringdef U+00CE hex 'D7'
+stringdef U+00CF hex 'D8'
+stringdef U+00D0 hex 'D1'
+stringdef U+00D1 hex 'A5'
+stringdef U+00D2 hex 'E3'
+stringdef U+00D3 hex 'E0'
+stringdef U+00D4 hex 'E2'
+stringdef U+00D5 hex 'E5'
+stringdef U+00D6 hex '99'
+stringdef U+00D7 hex '9E'
+stringdef U+00D8 hex '9D'
+stringdef U+00D9 hex 'EB'
+stringdef U+00DA hex 'E9'
+stringdef U+00DB hex 'EA'
+stringdef U+00DC hex '9A'
+stringdef U+00DD hex 'ED'
+stringdef U+00DE hex 'E8'
+stringdef U+00DF hex 'E1'
+stringdef U+00E0 hex '85'
+stringdef U+00E1 hex 'A0'
+stringdef U+00E2 hex '83'
+stringdef U+00E3 hex 'C6'
+stringdef U+00E4 hex '84'
+stringdef U+00E5 hex '86'
+stringdef U+00E6 hex '91'
+stringdef U+00E7 hex '87'
+stringdef U+00E8 hex '8A'
+stringdef U+00E9 hex '82'
+stringdef U+00EA hex '88'
+stringdef U+00EB hex '89'
+stringdef U+00EC hex '8D'
+stringdef U+00ED hex 'A1'
+stringdef U+00EE hex '8C'
+stringdef U+00EF hex '8B'
+stringdef U+00F0 hex 'D0'
+stringdef U+00F1 hex 'A4'
+stringdef U+00F2 hex '95'
+stringdef U+00F3 hex 'A2'
+stringdef U+00F4 hex '93'
+stringdef U+00F5 hex 'E4'
+stringdef U+00F6 hex '94'
+stringdef U+00F7 hex 'F6'
+stringdef U+00F8 hex '9B'
+stringdef U+00F9 hex '97'
+stringdef U+00FA hex 'A3'
+stringdef U+00FB hex '96'
+stringdef U+00FC hex '81'
+stringdef U+00FD hex 'EC'
+stringdef U+00FE hex 'E7'
+stringdef U+00FF hex '98'
+stringdef U+0131 hex 'D5'
+stringdef U+0192 hex '9F'
+stringdef U+2017 hex 'F2'
+stringdef U+2500 hex 'C4'
+stringdef U+2502 hex 'B3'
+stringdef U+250C hex 'DA'
+stringdef U+2510 hex 'BF'
+stringdef U+2514 hex 'C0'
+stringdef U+2518 hex 'D9'
+stringdef U+251C hex 'C3'
+stringdef U+2524 hex 'B4'
+stringdef U+252C hex 'C2'
+stringdef U+2534 hex 'C1'
+stringdef U+253C hex 'C5'
+stringdef U+2550 hex 'CD'
+stringdef U+2551 hex 'BA'
+stringdef U+2554 hex 'C9'
+stringdef U+2557 hex 'BB'
+stringdef U+255A hex 'C8'
+stringdef U+255D hex 'BC'
+stringdef U+2560 hex 'CC'
+stringdef U+2563 hex 'B9'
+stringdef U+2566 hex 'CB'
+stringdef U+2569 hex 'CA'
+stringdef U+256C hex 'CE'
+stringdef U+2580 hex 'DF'
+stringdef U+2584 hex 'DC'
+stringdef U+2588 hex 'DB'
+stringdef U+2591 hex 'B0'
+stringdef U+2592 hex 'B1'
+stringdef U+2593 hex 'B2'
+stringdef U+25A0 hex 'FE'
diff --git a/contrib/snowball/compiler/analyser.c b/contrib/snowball/compiler/analyser.c
index ed712f64f..dffa5552d 100644
--- a/contrib/snowball/compiler/analyser.c
+++ b/contrib/snowball/compiler/analyser.c
@@ -4,31 +4,53 @@
#include <string.h> /* memmove */
#include "header.h"
+typedef enum {
+ e_token_omitted = 0,
+ e_unexpected_token = 1,
+ e_string_omitted = 2,
+ e_unexpected_token_in_among = 3,
+ /* For codes above here, report "after " t->previous_token after the error. */
+ e_unresolved_substring = 14,
+ e_not_allowed_inside_reverse = 15,
+ e_empty_grouping = 16,
+ e_already_backwards = 17,
+ e_empty_among = 18,
+ e_adjacent_bracketed_in_among = 19,
+ e_substring_preceded_by_substring = 20,
+ /* For codes below here, tokeniser->b is printed before the error. */
+ e_redeclared = 30,
+ e_undeclared = 31,
+ e_declared_as_different_mode = 32,
+ e_not_of_type_x = 33,
+ e_not_of_type_string_or_integer = 34,
+ e_misplaced = 35,
+ e_redefined = 36,
+ e_misused = 37
+} error_code;
+
/* recursive usage: */
static void read_program_(struct analyser * a, int terminator);
static struct node * read_C(struct analyser * a);
-static struct node * C_style(struct analyser * a, char * s, int token);
-
+static struct node * C_style(struct analyser * a, const char * s, int token);
-static void fault(int n) { fprintf(stderr, "fault %d\n", n); exit(1); }
static void print_node_(struct node * p, int n, const char * s) {
int i;
for (i = 0; i < n; i++) fputs(i == n - 1 ? s : " ", stdout);
printf("%s ", name_of_token(p->type));
- unless (p->name == 0) report_b(stdout, p->name->b);
- unless (p->literalstring == 0) {
+ if (p->name) report_b(stdout, p->name->b);
+ if (p->literalstring) {
printf("'");
report_b(stdout, p->literalstring);
printf("'");
}
printf("\n");
- unless (p->AE == 0) print_node_(p->AE, n+1, "# ");
- unless (p->left == 0) print_node_(p->left, n+1, " ");
- unless (p->right == 0) print_node_(p->right, n, " ");
- if (p->aux != 0) print_node_(p->aux, n+1, "@ ");
+ if (p->AE) print_node_(p->AE, n+1, "# ");
+ if (p->left) print_node_(p->left, n+1, " ");
+ if (p->aux) print_node_(p->aux, n+1, "@ ");
+ if (p->right) print_node_(p->right, n, " ");
}
extern void print_program(struct analyser * a) {
@@ -52,22 +74,37 @@ static struct node * new_node(struct analyser * a, int type) {
static const char * name_of_mode(int n) {
switch (n) {
- default: fault(0);
- case m_backward: return "string backward";
- case m_forward: return "string forward";
- /* case m_integer: return "integer"; */
+ case m_backward: return "string backward";
+ case m_forward: return "string forward";
+ /* case m_integer: return "integer"; */
}
+ fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n);
+ exit(1);
}
static const char * name_of_type(int n) {
switch (n) {
- default: fault(1);
- case 's': return "string";
- case 'i': return "integer";
- case 'r': return "routine";
- case 'R': return "routine or grouping";
- case 'g': return "grouping";
+ case 's': return "string";
+ case 'i': return "integer";
+ case 'r': return "routine";
+ case 'R': return "routine or grouping";
+ case 'g': return "grouping";
+ }
+ fprintf(stderr, "Invalid type %d in name_of_type()\n", n);
+ exit(1);
+}
+
+static const char * name_of_name_type(int code) {
+ switch (code) {
+ case t_string: return "string";
+ case t_boolean: return "boolean";
+ case t_integer: return "integer";
+ case t_routine: return "routine";
+ case t_external: return "external";
+ case t_grouping: return "grouping";
}
+ fprintf(stderr, "Invalid type code %d in name_of_name_type()\n", code);
+ exit(1);
}
static void count_error(struct analyser * a) {
@@ -76,86 +113,78 @@ static void count_error(struct analyser * a) {
t->error_count++;
}
-static void error2(struct analyser * a, int n, int x) {
+static void error2(struct analyser * a, error_code n, int x) {
struct tokeniser * t = a->tokeniser;
count_error(a);
fprintf(stderr, "%s:%d: ", t->file, t->line_number);
- if (n >= 30) report_b(stderr, t->b);
+ if ((int)n >= (int)e_redeclared) report_b(stderr, t->b);
switch (n) {
- case 0:
+ case e_token_omitted:
fprintf(stderr, "%s omitted", name_of_token(t->omission)); break;
- case 3:
+ case e_unexpected_token_in_among:
fprintf(stderr, "in among(...), ");
- case 1:
+ /* fall through */
+ case e_unexpected_token:
fprintf(stderr, "unexpected %s", name_of_token(t->token));
if (t->token == c_number) fprintf(stderr, " %d", t->number);
if (t->token == c_name) {
fprintf(stderr, " ");
report_b(stderr, t->b);
} break;
- case 2:
+ case e_string_omitted:
fprintf(stderr, "string omitted"); break;
- case 14:
+ case e_unresolved_substring:
fprintf(stderr, "unresolved substring on line %d", x); break;
- case 15:
+ case e_not_allowed_inside_reverse:
fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break;
- case 16:
+ case e_empty_grouping:
fprintf(stderr, "empty grouping"); break;
- case 17:
+ case e_already_backwards:
fprintf(stderr, "backwards used when already in this mode"); break;
- case 18:
+ case e_empty_among:
fprintf(stderr, "empty among(...)"); break;
- case 19:
+ case e_adjacent_bracketed_in_among:
fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break;
- case 20:
+ case e_substring_preceded_by_substring:
fprintf(stderr, "substring preceded by another substring on line %d", x); break;
- case 30:
+ case e_redeclared:
fprintf(stderr, " re-declared"); break;
- case 31:
+ case e_undeclared:
fprintf(stderr, " undeclared"); break;
- case 32:
+ case e_declared_as_different_mode:
fprintf(stderr, " declared as %s mode; used as %s mode",
name_of_mode(a->mode), name_of_mode(x)); break;
- case 33:
+ case e_not_of_type_x:
fprintf(stderr, " not of type %s", name_of_type(x)); break;
- case 34:
+ case e_not_of_type_string_or_integer:
fprintf(stderr, " not of type string or integer"); break;
- case 35:
+ case e_misplaced:
fprintf(stderr, " misplaced"); break;
- case 36:
+ case e_redefined:
fprintf(stderr, " redefined"); break;
- case 37:
+ case e_misused:
fprintf(stderr, " mis-used as %s mode",
name_of_mode(x)); break;
- default:
- fprintf(stderr, " error %d", n); break;
-
}
- if (n <= 13 && t->previous_token > 0)
+ if ((int)n < (int)e_unresolved_substring && t->previous_token > 0)
fprintf(stderr, " after %s", name_of_token(t->previous_token));
fprintf(stderr, "\n");
}
-static void error(struct analyser * a, int n) { error2(a, n, 0); }
-
-static void error3(struct analyser * a, struct node * p, symbol * b) {
- count_error(a);
- fprintf(stderr, "%s:%d: among(...) has repeated string '", a->tokeniser->file, p->line_number);
- report_b(stderr, b);
- fprintf(stderr, "'\n");
-}
+static void error(struct analyser * a, error_code n) { error2(a, n, 0); }
static void error4(struct analyser * a, struct name * q) {
count_error(a);
+ fprintf(stderr, "%s:%d: ", a->tokeniser->file, q->used->line_number);
report_b(stderr, q->b);
fprintf(stderr, " undefined\n");
}
static void omission_error(struct analyser * a, int n) {
a->tokeniser->omission = n;
- error(a, 0);
+ error(a, e_token_omitted);
}
static int check_token(struct analyser * a, int code) {
@@ -169,71 +198,120 @@ static int get_token(struct analyser * a, int code) {
read_token(t);
{
int x = check_token(a, code);
- unless (x) t->token_held = true;
+ if (!x) t->token_held = true;
return x;
}
}
static struct name * look_for_name(struct analyser * a) {
- struct name * p = a->names;
symbol * q = a->tokeniser->b;
- repeat {
- if (p == 0) return 0;
- { symbol * b = p->b;
- int n = SIZE(b);
- if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) {
- p->referenced = true;
- return p;
- }
+ struct name * p;
+ for (p = a->names; p; p = p->next) {
+ symbol * b = p->b;
+ int n = SIZE(b);
+ if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) {
+ p->referenced = true;
+ return p;
}
- p = p->next;
}
+ return 0;
}
static struct name * find_name(struct analyser * a) {
struct name * p = look_for_name(a);
- if (p == 0) error(a, 31);
+ if (p == 0) error(a, e_undeclared);
return p;
}
static void check_routine_mode(struct analyser * a, struct name * p, int mode) {
if (p->mode < 0) p->mode = mode; else
- unless (p->mode == mode) error2(a, 37, mode);
+ if (p->mode != mode) error2(a, e_misused, mode);
}
static void check_name_type(struct analyser * a, struct name * p, int type) {
switch (type) {
- case 's': if (p->type == t_string) return; break;
- case 'i': if (p->type == t_integer) return; break;
- case 'b': if (p->type == t_boolean) return; break;
- case 'R': if (p->type == t_grouping) return;
- case 'r': if (p->type == t_routine || p->type == t_external) return; break;
- case 'g': if (p->type == t_grouping) return; break;
+ case 's':
+ if (p->type == t_string) return;
+ break;
+ case 'i':
+ if (p->type == t_integer) return;
+ break;
+ case 'b':
+ if (p->type == t_boolean) return;
+ break;
+ case 'R':
+ if (p->type == t_grouping) return;
+ /* FALLTHRU */
+ case 'r':
+ if (p->type == t_routine || p->type == t_external) return;
+ break;
+ case 'g':
+ if (p->type == t_grouping) return;
+ break;
}
- error2(a, 33, type);
+ error2(a, e_not_of_type_x, type);
}
static void read_names(struct analyser * a, int type) {
struct tokeniser * t = a->tokeniser;
- unless (get_token(a, c_bra)) return;
- repeat {
- if (read_token(t) != c_name) break;
- if (look_for_name(a) != 0) error(a, 30); else {
- NEW(name, p);
- p->b = copy_b(t->b);
- p->type = type;
- p->mode = -1; /* routines, externals */
- p->count = a->name_count[type];
- p->referenced = false;
- p->used = false;
- p->grouping = 0;
- p->definition = 0;
- a->name_count[type] ++;
- p->next = a->names;
- a->names = p;
- }
- }
- unless (check_token(a, c_ket)) t->token_held = true;
+ if (!get_token(a, c_bra)) return;
+ while (true) {
+ int token = read_token(t);
+ switch (token) {
+ case c_len: {
+ /* Context-sensitive token - once declared as a name, it loses
+ * its special meaning, for compatibility with older versions
+ * of snowball.
+ */
+ static const symbol c_len_lit[] = {
+ 'l', 'e', 'n'
+ };
+ MOVE_TO_B(t->b, c_len_lit);
+ goto handle_as_name;
+ }
+ case c_lenof: {
+ /* Context-sensitive token - once declared as a name, it loses
+ * its special meaning, for compatibility with older versions
+ * of snowball.
+ */
+ static const symbol c_lenof_lit[] = {
+ 'l', 'e', 'n', 'o', 'f'
+ };
+ MOVE_TO_B(t->b, c_lenof_lit);
+ goto handle_as_name;
+ }
+ case c_name:
+handle_as_name:
+ if (look_for_name(a) != 0) error(a, e_redeclared); else {
+ NEW(name, p);
+ p->b = copy_b(t->b);
+ p->type = type;
+ p->mode = -1; /* routines, externals */
+ /* We defer assigning counts until after we've eliminated
+ * variables whose values are never used. */
+ p->count = -1;
+ p->referenced = false;
+ p->used_in_among = false;
+ p->used = 0;
+ p->value_used = false;
+ p->initialised = false;
+ p->used_in_definition = false;
+ p->local_to = 0;
+ p->grouping = 0;
+ p->definition = 0;
+ p->declaration_line_number = t->line_number;
+ p->next = a->names;
+ a->names = p;
+ if (token != c_name) {
+ disable_token(t, token);
+ }
+ }
+ break;
+ default:
+ if (!check_token(a, c_ket)) t->token_held = true;
+ return;
+ }
+ }
}
static symbol * new_literalstring(struct analyser * a) {
@@ -259,7 +337,7 @@ static int read_AE_test(struct analyser * a) {
case c_ge:
case c_ls:
case c_le: return t->token;
- default: error(a, 1); t->token_held = true; return c_eq;
+ default: error(a, e_unexpected_token); t->token_held = true; return c_eq;
}
}
@@ -271,11 +349,23 @@ static int binding(int t) {
}
}
+static void mark_used_in(struct analyser * a, struct name * q, struct node * p) {
+ if (!q->used) {
+ q->used = p;
+ q->local_to = a->program_end->name;
+ } else if (q->local_to) {
+ if (q->local_to != a->program_end->name) {
+ /* Used in more than one routine/external. */
+ q->local_to = NULL;
+ }
+ }
+}
+
static void name_to_node(struct analyser * a, struct node * p, int type) {
struct name * q = find_name(a);
- unless (q == 0) {
+ if (q) {
check_name_type(a, q, type);
- q->used = true;
+ mark_used_in(a, q, p);
}
p->name = q;
}
@@ -286,8 +376,18 @@ static struct node * read_AE(struct analyser * a, int B) {
struct node * q;
switch (read_token(t)) {
case c_minus: /* monadic */
+ q = read_AE(a, 100);
+ if (q->type == c_neg) {
+ /* Optimise away double negation, which avoids generators
+ * having to worry about generating "--" (decrement operator
+ * in many languages).
+ */
+ p = q->right;
+ /* Don't free q, it's in the linked list a->nodes. */
+ break;
+ }
p = new_node(a, c_neg);
- p->right = read_AE(a, 100);
+ p->right = q;
break;
case c_bra:
p = read_AE(a, 0);
@@ -296,11 +396,15 @@ static struct node * read_AE(struct analyser * a, int B) {
case c_name:
p = new_node(a, c_name);
name_to_node(a, p, 'i');
+ if (p->name) p->name->value_used = true;
break;
case c_maxint:
case c_minint:
+ a->int_limits_used = true;
+ /* fall through */
case c_cursor:
case c_limit:
+ case c_len:
case c_size:
p = new_node(a, t->token);
break;
@@ -308,18 +412,19 @@ static struct node * read_AE(struct analyser * a, int B) {
p = new_node(a, c_number);
p->number = t->number;
break;
+ case c_lenof:
case c_sizeof:
- p = C_style(a, "s", c_sizeof);
+ p = C_style(a, "s", t->token);
break;
default:
- error(a, 1);
+ error(a, e_unexpected_token);
t->token_held = true;
return 0;
}
- repeat {
+ while (true) {
int token = read_token(t);
int b = binding(token);
- unless (binding(token) > B) {
+ if (binding(token) <= B) {
t->token_held = true;
return p;
}
@@ -335,14 +440,11 @@ static struct node * read_C_connection(struct analyser * a, struct node * q, int
struct node * p = new_node(a, op);
struct node * p_end = q;
p->left = q;
- repeat {
+ do {
q = read_C(a);
p_end->right = q; p_end = q;
- if (read_token(t) != op) {
- t->token_held = true;
- break;
- }
- }
+ } while (read_token(t) == op);
+ t->token_held = true;
return p;
}
@@ -350,14 +452,14 @@ static struct node * read_C_list(struct analyser * a) {
struct tokeniser * t = a->tokeniser;
struct node * p = new_node(a, c_bra);
struct node * p_end = 0;
- repeat {
+ while (true) {
int token = read_token(t);
if (token == c_ket) return p;
if (token < 0) { omission_error(a, c_ket); return p; }
t->token_held = true;
{
struct node * q = read_C(a);
- repeat {
+ while (true) {
token = read_token(t);
if (token != c_and && token != c_or) {
t->token_held = true;
@@ -371,10 +473,10 @@ static struct node * read_C_list(struct analyser * a) {
}
}
-static struct node * C_style(struct analyser * a, char * s, int token) {
+static struct node * C_style(struct analyser * a, const char * s, int token) {
int i;
struct node * p = new_node(a, token);
- for (i = 0; s[i] != 0; i++) switch(s[i]) {
+ for (i = 0; s[i] != 0; i++) switch (s[i]) {
case 'C':
p->left = read_C(a); continue;
case 'D':
@@ -388,7 +490,7 @@ static struct node * C_style(struct analyser * a, char * s, int token) {
int str_token = read_token(a->tokeniser);
if (str_token == c_name) name_to_node(a, p, 's'); else
if (str_token == c_literalstring) p->literalstring = new_literalstring(a);
- else error(a, 2);
+ else error(a, e_string_omitted);
}
continue;
case 'b':
@@ -408,7 +510,7 @@ static struct node * read_literalstring(struct analyser * a) {
static void reverse_b(symbol * b) {
int i = 0; int j = SIZE(b) - 1;
- until (i >= j) {
+ while (i < j) {
int ch1 = b[i]; int ch2 = b[j];
b[i++] = ch2; b[j--] = ch1;
}
@@ -423,7 +525,74 @@ static int compare_amongvec(const void *pv, const void *qv) {
int i;
for (i = 0; i < smaller_size; i++)
if (b_p[i] != b_q[i]) return b_p[i] - b_q[i];
- return p_size - q_size;
+ if (p_size - q_size)
+ return p_size - q_size;
+ return p->line_number - q->line_number;
+}
+
+#define PTR_NULL_CHECK(P, Q) do {\
+ if ((Q) == NULL) {\
+ if ((P) != NULL) return 1;\
+ } else {\
+ if ((P) == NULL) return -1;\
+ }\
+ } while (0)
+
+static int compare_node(const struct node *p, const struct node *q) {
+ PTR_NULL_CHECK(p, q);
+ if (q == NULL) {
+ /* p must be NULL too. */
+ return 0;
+ }
+
+ if (p->type != q->type) return p->type > q->type ? 1 : -1;
+ if (p->mode != q->mode) return p->mode > q->mode ? 1 : -1;
+ if (p->type == c_number) {
+ if (p->number != q->number)
+ return p->number > q->number ? 1 : -1;
+ }
+
+ PTR_NULL_CHECK(p->left, q->left);
+ if (p->left) {
+ int r = compare_node(p->left, q->left);
+ if (r != 0) return r;
+ }
+
+ PTR_NULL_CHECK(p->AE, q->AE);
+ if (p->AE) {
+ int r = compare_node(p->AE, q->AE);
+ if (r != 0) return r;
+ }
+
+ PTR_NULL_CHECK(p->aux, q->aux);
+ if (p->aux) {
+ int r = compare_node(p->aux, q->aux);
+ if (r != 0) return r;
+ }
+
+ PTR_NULL_CHECK(p->name, q->name);
+ if (p->name) {
+ int r;
+ if (SIZE(p->name->b) != SIZE(q->name->b)) {
+ return SIZE(p->name->b) - SIZE(q->name->b);
+ }
+ r = memcmp(p->name->b, q->name->b,
+ SIZE(p->name->b) * sizeof(symbol));
+ if (r != 0) return r;
+ }
+
+ PTR_NULL_CHECK(p->literalstring, q->literalstring);
+ if (p->literalstring) {
+ int r;
+ if (SIZE(p->literalstring) != SIZE(q->literalstring)) {
+ return SIZE(p->literalstring) - SIZE(q->literalstring);
+ }
+ r = memcmp(p->literalstring, q->literalstring,
+ SIZE(p->literalstring) * sizeof(symbol));
+ if (r != 0) return r;
+ }
+
+ return compare_node(p->right, q->right);
}
static void make_among(struct analyser * a, struct node * p, struct node * substring) {
@@ -443,38 +612,84 @@ static void make_among(struct analyser * a, struct node * p, struct node * subst
x->next = 0;
x->b = v;
x->number = a->among_count++;
+ x->function_count = 0;
x->starter = 0;
+ x->nocommand_count = 0;
+ x->amongvar_needed = false;
if (q->type == c_bra) { x->starter = q; q = q->right; }
- until (q == 0) {
+ while (q) {
if (q->type == c_literalstring) {
symbol * b = q->literalstring;
w1->b = b; /* pointer to case string */
- w1->p = 0; /* pointer to corresponding case expression */
+ w1->action = NULL; /* action gets filled in below */
+ w1->line_number = q->line_number;
w1->size = SIZE(b); /* number of characters in string */
w1->i = -1; /* index of longest substring */
w1->result = -1; /* number of corresponding case expression */
- w1->function = q->left == 0 ? 0 : q->left->name;
- unless (w1->function == 0)
- check_routine_mode(a, w1->function, direction);
+ if (q->left) {
+ struct name * function = q->left->name;
+ w1->function = function;
+ function->used_in_among = true;
+ check_routine_mode(a, function, direction);
+ x->function_count++;
+ } else {
+ w1->function = 0;
+ }
w1++;
- }
- else
- if (q->left == 0) /* empty command: () */
+ } else if (q->left == 0) {
+ /* empty command: () */
w0 = w1;
- else {
- until (w0 == w1) {
- w0->p = q;
- w0->result = result;
+ } else {
+ /* Check for previous action which is the same as this one and use
+ * the same action code if we find one.
+ */
+ int among_result = -1;
+ struct amongvec * w;
+ for (w = v; w < w0; ++w) {
+ if (w->action && compare_node(w->action->left, q->left) == 0) {
+ if (w->result <= 0) {
+ printf("Among code %d isn't positive\n", w->result);
+ exit(1);
+ }
+ among_result = w->result;
+ break;
+ }
+ }
+ if (among_result < 0) {
+ among_result = result++;
+ }
+
+ while (w0 != w1) {
+ w0->action = q;
+ w0->result = among_result;
w0++;
}
- result++;
}
q = q->right;
}
- unless (w1-v == p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); }
- if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b);
+ if (w1-v != p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); }
+ x->command_count = result - 1;
+ {
+ NEWVEC(node*, commands, x->command_count);
+ memset(commands, 0, x->command_count * sizeof(struct node*));
+ for (w0 = v; w0 < w1; w0++) {
+ if (w0->result > 0) {
+ /* result == -1 when there's no command. */
+ if (w0->result > x->command_count) {
+ fprintf(stderr, "More among codes than expected\n");
+ exit(1);
+ }
+ if (!commands[w0->result - 1])
+ commands[w0->result - 1] = w0->action;
+ } else {
+ ++x->nocommand_count;
+ }
+ if (backward) reverse_b(w0->b);
+ }
+ x->commands = commands;
+ }
qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec);
/* the following loop is O(n squared) */
@@ -494,15 +709,30 @@ static void make_among(struct analyser * a, struct node * p, struct node * subst
for (w0 = v; w0 < w1 - 1; w0++)
if (w0->size == (w0 + 1)->size &&
- memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) error3(a, p, w0->b);
+ memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) {
+ count_error(a);
+ fprintf(stderr, "%s:%d: among(...) has repeated string '",
+ a->tokeniser->file, (w0 + 1)->line_number);
+ report_b(stderr, (w0 + 1)->b);
+ fprintf(stderr, "'\n");
+ count_error(a);
+ fprintf(stderr, "%s:%d: previously seen here\n",
+ a->tokeniser->file, w0->line_number);
+ }
x->literalstring_count = p->number;
- x->command_count = result - 1;
p->among = x;
x->substring = substring;
if (substring != 0) substring->among = x;
- unless (x->command_count == 0 && x->starter == 0) a->amongvar_needed = true;
+ if (x->command_count > 1 ||
+ (x->command_count == 1 && x->nocommand_count > 0) ||
+ x->starter != 0) {
+ /* We need to set among_var rather than just checking if find_among*()
+ * returns zero or not.
+ */
+ x->amongvar_needed = a->amongvar_needed = true;
+ }
}
static struct node * read_among(struct analyser * a) {
@@ -514,8 +744,8 @@ static struct node * read_among(struct analyser * a) {
a->substring = 0;
p->number = 0; /* counts the number of literals */
- unless (get_token(a, c_bra)) return p;
- repeat {
+ if (!get_token(a, c_bra)) return p;
+ while (true) {
struct node * q;
int token = read_token(t);
switch (token) {
@@ -529,12 +759,14 @@ static struct node * read_among(struct analyser * a) {
else t->token_held = true;
p->number++; break;
case c_bra:
- if (previous_token == c_bra) error(a, 19);
+ if (previous_token == c_bra) error(a, e_adjacent_bracketed_in_among);
q = read_C_list(a); break;
default:
- error(a, 3);
+ error(a, e_unexpected_token_in_among);
+ previous_token = token;
+ continue;
case c_ket:
- if (p->number == 0) error(a, 18);
+ if (p->number == 0) error(a, e_empty_among);
if (t->error_count == 0) make_among(a, p, substring);
return p;
}
@@ -547,13 +779,13 @@ static struct node * read_among(struct analyser * a) {
static struct node * read_substring(struct analyser * a) {
struct node * p = new_node(a, c_substring);
- if (a->substring != 0) error2(a, 20, a->substring->line_number);
+ if (a->substring != 0) error2(a, e_substring_preceded_by_substring, a->substring->line_number);
a->substring = p;
return p;
}
static void check_modifyable(struct analyser * a) {
- unless (a->modifyable) error(a, 15);
+ if (!a->modifyable) error(a, e_not_allowed_inside_reverse);
}
static struct node * read_C(struct analyser * a) {
@@ -565,7 +797,7 @@ static struct node * read_C(struct analyser * a) {
case c_backwards:
{
int mode = a->mode;
- if (a->mode == m_backward) error(a, 17); else a->mode = m_backward;
+ if (a->mode == m_backward) error(a, e_already_backwards); else a->mode = m_backward;
{ struct node * p = C_style(a, "C", token);
a->mode = mode;
return p;
@@ -596,14 +828,18 @@ static struct node * read_C(struct analyser * a) {
case c_loop:
case c_atleast:
return C_style(a, "AC", token);
- case c_setmark:
- return C_style(a, "i", token);
+ case c_setmark: {
+ struct node * n = C_style(a, "i", token);
+ if (n->name) n->name->initialised = true;
+ return n;
+ }
case c_tomark:
case c_atmark:
case c_hop:
return C_style(a, "A", token);
case c_delete:
check_modifyable(a);
+ /* fall through */
case c_next:
case c_tolimit:
case c_atlimit:
@@ -612,60 +848,147 @@ static struct node * read_C(struct analyser * a) {
case c_true:
case c_false:
case c_debug:
- return C_style(a, "", token);
+ return new_node(a, token);
case c_assignto:
- case c_sliceto:
+ case c_sliceto: {
+ struct node *n;
check_modifyable(a);
- return C_style(a, "s", token);
+ n = C_style(a, "s", token);
+ if (n->name) n->name->initialised = true;
+ return n;
+ }
case c_assign:
case c_insert:
case c_attach:
- case c_slicefrom:
+ case c_slicefrom: {
+ struct node *n;
check_modifyable(a);
- return C_style(a, "S", token);
+ n = C_style(a, "S", token);
+ if (n->name) n->name->value_used = true;
+ return n;
+ }
case c_setlimit:
return C_style(a, "CfD", token);
case c_set:
- case c_unset:
- return C_style(a, "b", token);
- case c_dollar:
- get_token(a, c_name);
- {
+ case c_unset: {
+ struct node * n = C_style(a, "b", token);
+ if (n->name) n->name->initialised = true;
+ return n;
+ }
+ case c_dollar: {
+ struct tokeniser * t = a->tokeniser;
+ read_token(t);
+ if (t->token == c_bra) {
+ /* Handle newer $(AE REL_OP AE) syntax. */
+ struct node * n = read_AE(a, 0);
+ read_token(t);
+ switch (t->token) {
+ case c_eq:
+ case c_ne:
+ case c_gr:
+ case c_ge:
+ case c_ls:
+ case c_le: {
+ struct node * lhs = n;
+ n = new_node(a, t->token);
+ n->left = lhs;
+ n->AE = read_AE(a, 0);
+ get_token(a, c_ket);
+ break;
+ }
+ default:
+ error(a, e_unexpected_token);
+ t->token_held = true;
+ break;
+ }
+ return n;
+ }
+
+ if (t->token == c_name) {
struct node * p;
struct name * q = find_name(a);
int mode = a->mode;
int modifyable = a->modifyable;
- switch (q ? q->type : t_string)
- /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */
- {
- default: error(a, 34);
- case t_string:
- a->mode = m_forward;
- a->modifyable = true;
- p = new_node(a, c_dollar);
- p->left = read_C(a); break;
- case t_integer:
- /* a->mode = m_integer; */
- p = new_node(a, read_AE_test(a));
- p->AE = read_AE(a, 0); break;
+ if (q && q->type == t_string) {
+ /* Assume for now that $ on string both initialises and
+ * uses the string variable. FIXME: Can we do better?
+ */
+ q->initialised = true;
+ q->value_used = true;
+ a->mode = m_forward;
+ a->modifyable = true;
+ p = new_node(a, c_dollar);
+ p->left = read_C(a);
+ p->name = q;
+ } else {
+ if (q && q->type != t_integer) {
+ /* If $ is used on an unknown name or a name which
+ * isn't a string or an integer then we assume the
+ * unknown name is an integer as $ is used more often
+ * on integers than strings, so hopefully this it less
+ * likely to cause an error avalanche.
+ *
+ * For an unknown name, we'll already have reported an
+ * error.
+ */
+ error(a, e_not_of_type_string_or_integer);
+ q = NULL;
+ }
+ p = new_node(a, read_AE_test(a));
+ p->AE = read_AE(a, 0);
+
+ if (q) {
+ switch (p->type) {
+ case c_mathassign:
+ q->initialised = true;
+ p->name = q;
+ break;
+ default:
+ /* +=, etc don't "initialise" as they only
+ * amend an existing value. Similarly, they
+ * don't count as using the value.
+ */
+ p->name = q;
+ break;
+ case c_eq:
+ case c_ne:
+ case c_gr:
+ case c_ge:
+ case c_ls:
+ case c_le:
+ p->left = new_node(a, c_name);
+ p->left->name = q;
+ q->value_used = true;
+ break;
+ }
+ }
}
- p->name = q;
+ if (q) mark_used_in(a, q, p);
a->mode = mode;
a->modifyable = modifyable;
return p;
}
+
+ error(a, e_unexpected_token);
+ t->token_held = true;
+ return new_node(a, c_dollar);
+ }
case c_name:
{
struct name * q = find_name(a);
struct node * p = new_node(a, c_name);
- unless (q == 0) {
- q->used = true;
+ if (q) {
+ mark_used_in(a, q, p);
switch (q->type) {
case t_boolean:
- p->type = c_booltest; break;
+ p->type = c_booltest;
+ q->value_used = true;
+ break;
case t_integer:
- error(a, 35); /* integer name misplaced */
+ error(a, e_misplaced); /* integer name misplaced */
+ break;
case t_string:
+ q->value_used = true;
break;
case t_routine:
case t_external:
@@ -684,7 +1007,7 @@ static struct node * read_C(struct analyser * a) {
struct node * p = new_node(a, token);
read_token(t);
if (t->token == c_minus) read_token(t);
- unless (check_token(a, c_name)) { omission_error(a, c_name); return p; }
+ if (!check_token(a, c_name)) { omission_error(a, c_name); return p; }
name_to_node(a, p, 'g');
return p;
}
@@ -692,7 +1015,7 @@ static struct node * read_C(struct analyser * a) {
return read_literalstring(a);
case c_among: return read_among(a);
case c_substring: return read_substring(a);
- default: error(a, 1); return 0;
+ default: error(a, e_unexpected_token); return 0;
}
}
@@ -739,26 +1062,27 @@ static void read_define_grouping(struct analyser * a, struct name * q) {
NEW(grouping, p);
if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p;
a->groupings_end = p;
- q->grouping = p;
+ if (q) q->grouping = p;
p->next = 0;
p->name = q;
- p->number = q->count;
+ p->line_number = a->tokeniser->line_number;
p->b = create_b(0);
- repeat {
+ while (true) {
switch (read_token(t)) {
case c_name:
{
struct name * r = find_name(a);
- unless (r == 0) {
+ if (r) {
check_name_type(a, r, 'g');
p->b = alter_grouping(p->b, r->grouping->b, style, false);
+ r->used_in_definition = true;
}
}
break;
case c_literalstring:
- p->b = alter_grouping(p->b, t->b, style, a->utf8);
+ p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8));
break;
- default: error(a, 1); return;
+ default: error(a, e_unexpected_token); return;
}
switch (read_token(t)) {
case c_plus:
@@ -777,7 +1101,7 @@ static void read_define_grouping(struct analyser * a, struct name * q) {
}
p->largest_ch = max;
p->smallest_ch = min;
- if (min == 1<<16) error(a, 16);
+ if (min == 1<<16) error(a, e_empty_grouping);
}
t->token_held = true; return;
}
@@ -786,32 +1110,49 @@ static void read_define_grouping(struct analyser * a, struct name * q) {
static void read_define_routine(struct analyser * a, struct name * q) {
struct node * p = new_node(a, c_define);
a->amongvar_needed = false;
- unless (q == 0) {
+ if (q) {
check_name_type(a, q, 'R');
- if (q->definition != 0) error(a, 36);
+ if (q->definition != 0) error(a, e_redefined);
if (q->mode < 0) q->mode = a->mode; else
- if (q->mode != a->mode) error2(a, 32, q->mode);
+ if (q->mode != a->mode) error2(a, e_declared_as_different_mode, q->mode);
}
p->name = q;
if (a->program == 0) a->program = p; else a->program_end->right = p;
a->program_end = p;
get_token(a, c_as);
p->left = read_C(a);
- unless (q == 0) q->definition = p->left;
+ if (q) q->definition = p->left;
if (a->substring != 0) {
- error2(a, 14, a->substring->line_number);
- a->substring = 0;
+ error2(a, e_unresolved_substring, a->substring->line_number);
+ a->substring = 0;
}
p->amongvar_needed = a->amongvar_needed;
}
static void read_define(struct analyser * a) {
- unless (get_token(a, c_name)) return;
- {
+ if (get_token(a, c_name)) {
struct name * q = find_name(a);
- if (q != 0 && q->type == t_grouping) read_define_grouping(a, q);
- else read_define_routine(a, q);
+ int type;
+ if (q) {
+ type = q->type;
+ } else {
+ /* No declaration, so sniff next token - if it is 'as' then parse
+ * as a routine, otherwise as a grouping.
+ */
+ if (read_token(a->tokeniser) == c_as) {
+ type = t_routine;
+ } else {
+ type = t_grouping;
+ }
+ a->tokeniser->token_held = true;
+ }
+
+ if (type == t_grouping) {
+ read_define_grouping(a, q);
+ } else {
+ read_define_routine(a, q);
+ }
}
}
@@ -827,7 +1168,7 @@ static void read_backwardmode(struct analyser * a) {
static void read_program_(struct analyser * a, int terminator) {
struct tokeniser * t = a->tokeniser;
- repeat {
+ while (true) {
switch (read_token(t)) {
case c_strings: read_names(a, t_string); break;
case c_booleans: read_names(a, t_boolean); break;
@@ -839,25 +1180,60 @@ static void read_program_(struct analyser * a, int terminator) {
case c_backwardmode:read_backwardmode(a); break;
case c_ket:
if (terminator == c_ket) return;
+ /* fall through */
default:
- error(a, 1); break;
+ error(a, e_unexpected_token); break;
case -1:
- unless (terminator < 0) omission_error(a, c_ket);
+ if (terminator >= 0) omission_error(a, c_ket);
return;
}
}
}
+static void remove_dead_assignments(struct node * p, struct name * q) {
+ if (p->name == q) {
+ switch (p->type) {
+ case c_assignto:
+ case c_sliceto:
+ case c_mathassign:
+ case c_plusassign:
+ case c_minusassign:
+ case c_multiplyassign:
+ case c_divideassign:
+ case c_setmark:
+ case c_set:
+ case c_unset:
+ case c_dollar:
+ /* c_true is a no-op. */
+ p->type = c_true;
+ break;
+ default:
+ /* There are no read accesses to this variable, so any
+ * references must be assignments.
+ */
+ fprintf(stderr, "Unhandled type of dead assignment via %s\n",
+ name_of_token(p->type));
+ exit(1);
+ }
+ }
+ if (p->AE) remove_dead_assignments(p->AE, q);
+ if (p->left) remove_dead_assignments(p->left, q);
+ if (p->aux) remove_dead_assignments(p->aux, q);
+ if (p->right) remove_dead_assignments(p->right, q);
+}
+
extern void read_program(struct analyser * a) {
read_program_(a, -1);
{
struct name * q = a->names;
- until (q == 0) {
- switch(q->type) {
+ while (q) {
+ switch (q->type) {
case t_external: case t_routine:
- if (q->used && q->definition == 0) { error4(a, q); } break;
+ if (q->used && q->definition == 0) error4(a, q);
+ break;
case t_grouping:
- if (q->used && q->grouping == 0) { error4(a, q); } break;
+ if (q->used && q->grouping == 0) error4(a, q);
+ break;
}
q = q->next;
}
@@ -865,33 +1241,77 @@ extern void read_program(struct analyser * a) {
if (a->tokeniser->error_count == 0) {
struct name * q = a->names;
- int warned = false;
- until (q == 0) {
- unless (q->referenced) {
- unless (warned) {
- fprintf(stderr, "Declared but not used:");
- warned = true;
+ struct name ** ptr = &(a->names);
+ while (q) {
+ if (!q->referenced) {
+ fprintf(stderr, "%s:%d: warning: %s '",
+ a->tokeniser->file,
+ q->declaration_line_number,
+ name_of_name_type(q->type));
+ report_b(stderr, q->b);
+ if (q->type == t_routine ||
+ q->type == t_external ||
+ q->type == t_grouping) {
+ fprintf(stderr, "' declared but not defined\n");
+ } else {
+ fprintf(stderr, "' defined but not used\n");
+ q = q->next;
+ *ptr = q;
+ continue;
}
- fprintf(stderr, " "); report_b(stderr, q->b);
+ } else if (q->type == t_routine || q->type == t_grouping) {
+ /* It's OK to define a grouping but only use it to define other
+ * groupings.
+ */
+ if (!q->used && !q->used_in_definition) {
+ int line_num;
+ if (q->type == t_routine) {
+ line_num = q->definition->line_number;
+ } else {
+ line_num = q->grouping->line_number;
+ }
+ fprintf(stderr, "%s:%d: warning: %s '",
+ a->tokeniser->file,
+ line_num,
+ name_of_name_type(q->type));
+ report_b(stderr, q->b);
+ fprintf(stderr, "' defined but not used\n");
+ }
+ } else if (q->type == t_external) {
+ /* Unused is OK. */
+ } else if (!q->initialised) {
+ fprintf(stderr, "%s:%d: warning: %s '",
+ a->tokeniser->file,
+ q->declaration_line_number,
+ name_of_name_type(q->type));
+ report_b(stderr, q->b);
+ fprintf(stderr, "' is never initialised\n");
+ } else if (!q->value_used) {
+ fprintf(stderr, "%s:%d: warning: %s '",
+ a->tokeniser->file,
+ q->declaration_line_number,
+ name_of_name_type(q->type));
+ report_b(stderr, q->b);
+ fprintf(stderr, "' is set but never used\n");
+ remove_dead_assignments(a->program, q);
+ q = q->next;
+ *ptr = q;
+ continue;
}
+ ptr = &(q->next);
q = q->next;
}
- if (warned) fprintf(stderr, "\n");
- q = a->names;
- warned = false;
- until (q == 0) {
- if (! q->used && (q->type == t_routine ||
- q->type == t_grouping)) {
- unless (warned) {
- fprintf(stderr, "Declared and defined but not used:");
- warned = true;
- }
- fprintf(stderr, " "); report_b(stderr, q->b);
+ {
+ /* Now we've eliminated variables whose values are never used we
+ * can number the variables, which is used by some generators.
+ */
+ int * name_count = a->name_count;
+ struct name * n;
+ for (n = a->names; n; n = n->next) {
+ n->count = name_count[n->type]++;
}
- q = q->next;
}
- if (warned) fprintf(stderr, "\n");
}
}
@@ -909,13 +1329,14 @@ extern struct analyser * create_analyser(struct tokeniser * t) {
a->modifyable = true;
{ int i; for (i = 0; i < t_size; i++) a->name_count[i] = 0; }
a->substring = 0;
+ a->int_limits_used = false;
return a;
}
extern void close_analyser(struct analyser * a) {
{
struct node * q = a->nodes;
- until (q == 0) {
+ while (q) {
struct node * q_next = q->next;
FREE(q);
q = q_next;
@@ -923,7 +1344,7 @@ extern void close_analyser(struct analyser * a) {
}
{
struct name * q = a->names;
- until (q == 0) {
+ while (q) {
struct name * q_next = q->next;
lose_b(q->b); FREE(q);
q = q_next;
@@ -931,7 +1352,7 @@ extern void close_analyser(struct analyser * a) {
}
{
struct literalstring * q = a->literalstrings;
- until (q == 0) {
+ while (q) {
struct literalstring * q_next = q->next;
lose_b(q->b); FREE(q);
q = q_next;
@@ -939,15 +1360,17 @@ extern void close_analyser(struct analyser * a) {
}
{
struct among * q = a->amongs;
- until (q == 0) {
+ while (q) {
struct among * q_next = q->next;
- FREE(q->b); FREE(q);
+ FREE(q->b);
+ FREE(q->commands);
+ FREE(q);
q = q_next;
}
}
{
struct grouping * q = a->groupings;
- until (q == 0) {
+ while (q) {
struct grouping * q_next = q->next;
lose_b(q->b); FREE(q);
q = q_next;
@@ -955,4 +1378,3 @@ extern void close_analyser(struct analyser * a) {
}
FREE(a);
}
-
diff --git a/contrib/snowball/compiler/driver.c b/contrib/snowball/compiler/driver.c
index fbb1e9cae..9cbe447a5 100644
--- a/contrib/snowball/compiler/driver.c
+++ b/contrib/snowball/compiler/driver.c
@@ -1,48 +1,86 @@
+#include <ctype.h> /* for toupper etc */
#include <stdio.h> /* for fprintf etc */
#include <stdlib.h> /* for free etc */
-#include <string.h> /* for strlen */
+#include <string.h> /* for strcmp */
#include "header.h"
-#define DEFAULT_PACKAGE "org.tartarus.snowball.ext"
-#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
-#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among"
-#define DEFAULT_STRING_CLASS "java.lang.StringBuilder"
+#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext"
+#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
+#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among"
+#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder"
+
+#define DEFAULT_GO_PACKAGE "snowball"
+#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go"
+
+#define DEFAULT_CS_NAMESPACE "Snowball"
+#define DEFAULT_CS_BASE_CLASS "Stemmer"
+#define DEFAULT_CS_AMONG_CLASS "Among"
+#define DEFAULT_CS_STRING_CLASS "StringBuilder"
+
+#define DEFAULT_JS_BASE_CLASS "BaseStemmer"
+
+#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer"
static int eq(const char * s1, const char * s2) {
- int s1_len = strlen(s1);
- int s2_len = strlen(s2);
- return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0;
+ return strcmp(s1, s2) == 0;
}
-static void print_arglist(void) {
- fprintf(stderr, "Usage: snowball <file> [options]\n\n"
- "options are: [-o[utput] file]\n"
- " [-s[yntax]]\n"
+static void print_arglist(int exit_code) {
+ FILE * f = exit_code ? stderr : stdout;
+ fprintf(f, "Usage: snowball SOURCE_FILE... [OPTIONS]\n\n"
+ "Supported options:\n"
+ " -o[utput] file\n"
+ " -s[yntax]\n"
+ " -comments\n"
#ifndef DISABLE_JAVA
- " [-j[ava]]\n"
+ " -j[ava]\n"
#endif
- " [-c++]\n"
- " [-w[idechars]]\n"
- " [-u[tf8]]\n"
- " [-n[ame] class name]\n"
- " [-ep[refix] string]\n"
- " [-vp[refix] string]\n"
- " [-i[nclude] directory]\n"
- " [-r[untime] path to runtime headers]\n"
-#ifndef DISABLE_JAVA
- " [-p[arentclassname] fully qualified parent class name]\n"
- " [-P[ackage] package name for stemmers]\n"
- " [-S[tringclass] StringBuffer-compatible class]\n"
- " [-a[mongclass] fully qualified name of the Among class]\n"
+#ifndef DISABLE_CSHARP
+ " -cs[harp]\n"
+#endif
+ " -c++\n"
+#ifndef DISABLE_PASCAL
+ " -pascal\n"
+#endif
+#ifndef DISABLE_PYTHON
+ " -py[thon]\n"
#endif
+#ifndef DISABLE_JS
+ " -js\n"
+#endif
+#ifndef DISABLE_RUST
+ " -rust\n"
+#endif
+#ifndef DISABLE_GO
+ " -go\n"
+#endif
+ " -w[idechars]\n"
+ " -u[tf8]\n"
+ " -n[ame] class name\n"
+ " -ep[refix] string\n"
+ " -vp[refix] string\n"
+ " -i[nclude] directory\n"
+ " -r[untime] path to runtime headers\n"
+ " -p[arentclassname] fully qualified parent class name\n"
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
+ " -P[ackage] package name for stemmers\n"
+ " -S[tringclass] StringBuffer-compatible class\n"
+ " -a[mongclass] fully qualified name of the Among class\n"
+#endif
+#ifndef DISABLE_GO
+ " -gop[ackage] Go package name for stemmers\n"
+ " -gor[untime] Go snowball runtime package\n"
+#endif
+ " --help display this help and exit\n"
+ " --version output version information and exit\n"
);
- exit(1);
+ exit(exit_code);
}
static void check_lim(int i, int argc) {
if (i >= argc) {
fprintf(stderr, "argument list is one short\n");
- print_arglist();
+ print_arglist(1);
}
}
@@ -57,35 +95,47 @@ static FILE * get_output(symbol * b) {
return output;
}
-static void read_options(struct options * o, int argc, char * argv[]) {
+static int read_options(struct options * o, int argc, char * argv[]) {
char * s;
- int i = 2;
+ int i = 1;
+ int new_argc = 1;
+ /* Note down the last option used to specify an explicit encoding so
+ * we can warn we ignored it for languages with a fixed encoding.
+ */
+ const char * encoding_opt = NULL;
/* set defaults: */
o->output_file = 0;
o->syntax_tree = false;
- o->externals_prefix = "";
+ o->comments = false;
+ o->externals_prefix = NULL;
o->variables_prefix = 0;
o->runtime_path = 0;
- o->parent_class_name = DEFAULT_BASE_CLASS;
- o->string_class = DEFAULT_STRING_CLASS;
- o->among_class = DEFAULT_AMONG_CLASS;
- o->package = DEFAULT_PACKAGE;
- o->name = "";
+ o->parent_class_name = NULL;
+ o->string_class = NULL;
+ o->among_class = NULL;
+ o->package = NULL;
+ o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME;
+ o->name = NULL;
o->make_lang = LANG_C;
- o->widechars = false;
o->includes = 0;
o->includes_end = 0;
- o->utf8 = false;
+ o->encoding = ENC_SINGLEBYTE;
/* read options: */
- repeat {
- if (i >= argc) break;
+ while (i < argc) {
s = argv[i++];
- { if (eq(s, "-o") || eq(s, "-output")) {
- check_lim(i, argc);
+ if (s[0] != '-') {
+ /* Non-option argument - shuffle down. */
+ argv[new_argc++] = s;
+ continue;
+ }
+
+ {
+ if (eq(s, "-o") || eq(s, "-output")) {
+ check_lim(i, argc);
o->output_file = argv[i++];
continue;
}
@@ -94,10 +144,33 @@ static void read_options(struct options * o, int argc, char * argv[]) {
o->name = argv[i++];
continue;
}
+#ifndef DISABLE_JS
+ if (eq(s, "-js")) {
+ o->make_lang = LANG_JAVASCRIPT;
+ continue;
+ }
+#endif
+#ifndef DISABLE_RUST
+ if (eq(s, "-rust")) {
+ o->make_lang = LANG_RUST;
+ continue;
+ }
+#endif
+#ifndef DISABLE_GO
+ if (eq(s, "-go")) {
+ o->make_lang = LANG_GO;
+ continue;
+ }
+#endif
#ifndef DISABLE_JAVA
if (eq(s, "-j") || eq(s, "-java")) {
o->make_lang = LANG_JAVA;
- o->widechars = true;
+ continue;
+ }
+#endif
+#ifndef DISABLE_CSHARP
+ if (eq(s, "-cs") || eq(s, "-csharp")) {
+ o->make_lang = LANG_CSHARP;
continue;
}
#endif
@@ -105,15 +178,31 @@ static void read_options(struct options * o, int argc, char * argv[]) {
o->make_lang = LANG_CPLUSPLUS;
continue;
}
+#ifndef DISABLE_PASCAL
+ if (eq(s, "-pascal")) {
+ o->make_lang = LANG_PASCAL;
+ continue;
+ }
+#endif
+#ifndef DISABLE_PYTHON
+ if (eq(s, "-py") || eq(s, "-python")) {
+ o->make_lang = LANG_PYTHON;
+ continue;
+ }
+#endif
if (eq(s, "-w") || eq(s, "-widechars")) {
- o->widechars = true;
- o->utf8 = false;
+ encoding_opt = s;
+ o->encoding = ENC_WIDECHARS;
continue;
}
if (eq(s, "-s") || eq(s, "-syntax")) {
o->syntax_tree = true;
continue;
}
+ if (eq(s, "-comments")) {
+ o->comments = true;
+ continue;
+ }
if (eq(s, "-ep") || eq(s, "-eprefix")) {
check_lim(i, argc);
o->externals_prefix = argv[i++];
@@ -145,16 +234,16 @@ static void read_options(struct options * o, int argc, char * argv[]) {
continue;
}
if (eq(s, "-u") || eq(s, "-utf8")) {
- o->utf8 = true;
- o->widechars = false;
+ encoding_opt = s;
+ o->encoding = ENC_UTF8;
continue;
}
-#ifndef DISABLE_JAVA
if (eq(s, "-p") || eq(s, "-parentclassname")) {
check_lim(i, argc);
o->parent_class_name = argv[i++];
continue;
}
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
if (eq(s, "-P") || eq(s, "-Package")) {
check_lim(i, argc);
o->package = argv[i++];
@@ -171,44 +260,216 @@ static void read_options(struct options * o, int argc, char * argv[]) {
continue;
}
#endif
+#ifndef DISABLE_GO
+ if (eq(s, "-gop") || eq(s, "-gopackage")) {
+ check_lim(i, argc);
+ o->package = argv[i++];
+ continue;
+ }
+ if (eq(s, "-gor") || eq(s, "-goruntime")) {
+ check_lim(i, argc);
+ o->go_snowball_runtime = argv[i++];
+ continue;
+ }
+#endif
+ if (eq(s, "--help")) {
+ print_arglist(0);
+ }
+
+ if (eq(s, "--version")) {
+ printf("Snowball compiler version " SNOWBALL_VERSION "\n");
+ exit(0);
+ }
+
fprintf(stderr, "'%s' misplaced\n", s);
- print_arglist();
+ print_arglist(1);
}
}
+ if (new_argc == 1) {
+ fprintf(stderr, "no source files specified\n");
+ print_arglist(1);
+ }
+ argv[new_argc] = NULL;
+
+ /* Set language-dependent defaults. */
+ switch (o->make_lang) {
+ case LANG_C:
+ case LANG_CPLUSPLUS:
+ encoding_opt = NULL;
+ break;
+ case LANG_CSHARP:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_CS_BASE_CLASS;
+ if (!o->string_class)
+ o->string_class = DEFAULT_CS_STRING_CLASS;
+ if (!o->among_class)
+ o->among_class = DEFAULT_CS_AMONG_CLASS;
+ if (!o->package)
+ o->package = DEFAULT_CS_NAMESPACE;
+ break;
+ case LANG_GO:
+ o->encoding = ENC_UTF8;
+ if (!o->package)
+ o->package = DEFAULT_GO_PACKAGE;
+ break;
+ case LANG_JAVA:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_JAVA_BASE_CLASS;
+ if (!o->string_class)
+ o->string_class = DEFAULT_JAVA_STRING_CLASS;
+ if (!o->among_class)
+ o->among_class = DEFAULT_JAVA_AMONG_CLASS;
+ if (!o->package)
+ o->package = DEFAULT_JAVA_PACKAGE;
+ break;
+ case LANG_JAVASCRIPT:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_JS_BASE_CLASS;
+ break;
+ case LANG_PYTHON:
+ o->encoding = ENC_WIDECHARS;
+ if (!o->parent_class_name)
+ o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS;
+ break;
+ case LANG_RUST:
+ o->encoding = ENC_UTF8;
+ break;
+ default:
+ break;
+ }
+
+ if (encoding_opt) {
+ fprintf(stderr, "warning: %s only meaningful for C and C++\n",
+ encoding_opt);
+ }
+
+ if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) {
+ if (o->runtime_path) {
+ fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n");
+ }
+ if (o->externals_prefix) {
+ fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n");
+ }
+ }
+ if (!o->externals_prefix) o->externals_prefix = "";
+
+ if (!o->name && o->output_file) {
+ /* Default class name to basename of output_file - this is the standard
+ * convention for at least Java and C#.
+ */
+ const char * slash = strrchr(o->output_file, '/');
+ size_t len;
+ const char * leaf = (slash == NULL) ? o->output_file : slash + 1;
+
+ slash = strrchr(leaf, '\\');
+ if (slash != NULL) leaf = slash + 1;
+
+ {
+ const char * dot = strchr(leaf, '.');
+ len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf);
+ }
+
+ {
+ char * new_name = malloc(len + 1);
+ switch (o->make_lang) {
+ case LANG_CSHARP:
+ case LANG_PASCAL:
+ /* Upper case initial letter. */
+ memcpy(new_name, leaf, len);
+ new_name[0] = toupper(new_name[0]);
+ break;
+ case LANG_JAVASCRIPT:
+ case LANG_PYTHON: {
+ /* Upper case initial letter and change each
+ * underscore+letter or hyphen+letter to an upper case
+ * letter.
+ */
+ size_t i, j = 0;
+ int uc_next = true;
+ for (i = 0; i != len; ++i) {
+ unsigned char ch = leaf[i];
+ if (ch == '_' || ch == '-') {
+ uc_next = true;
+ } else {
+ if (uc_next) {
+ new_name[j] = toupper(ch);
+ uc_next = false;
+ } else {
+ new_name[j] = ch;
+ }
+ ++j;
+ }
+ }
+ len = j;
+ break;
+ }
+ default:
+ /* Just copy. */
+ memcpy(new_name, leaf, len);
+ break;
+ }
+ new_name[len] = '\0';
+ o->name = new_name;
+ }
+ }
+
+ return new_argc;
}
extern int main(int argc, char * argv[]) {
+ int i;
NEW(options, o);
- if (argc == 1) print_arglist();
- read_options(o, argc, argv);
+ argc = read_options(o, argc, argv);
{
- symbol * filename = add_s_to_b(0, argv[1]);
- char * file;
- symbol * u = get_input(filename, &file);
+ char * file = argv[1];
+ symbol * u = get_input(file);
if (u == 0) {
- fprintf(stderr, "Can't open input %s\n", argv[1]);
+ fprintf(stderr, "Can't open input %s\n", file);
exit(1);
}
{
struct tokeniser * t = create_tokeniser(u, file);
struct analyser * a = create_analyser(t);
- t->widechars = o->widechars;
+ struct input ** next_input_ptr = &(t->next);
+ a->encoding = t->encoding = o->encoding;
t->includes = o->includes;
- a->utf8 = t->utf8 = o->utf8;
+ /* If multiple source files are specified, set up the others to be
+ * read after the first in order, using the same mechanism as
+ * 'get' uses. */
+ for (i = 2; i != argc; ++i) {
+ NEW(input, q);
+ file = argv[i];
+ u = get_input(file);
+ if (u == 0) {
+ fprintf(stderr, "Can't open input %s\n", file);
+ exit(1);
+ }
+ q->p = u;
+ q->c = 0;
+ q->file = file;
+ q->file_needs_freeing = false;
+ q->line_number = 1;
+ *next_input_ptr = q;
+ next_input_ptr = &(q->next);
+ }
+ *next_input_ptr = NULL;
read_program(a);
if (t->error_count > 0) exit(1);
if (o->syntax_tree) print_program(a);
close_tokeniser(t);
- unless (o->syntax_tree) {
+ if (!o->syntax_tree) {
struct generator * g;
- char * s = o->output_file;
- unless (s) {
+ const char * s = o->output_file;
+ if (!s) {
fprintf(stderr, "Please include the -o option\n");
- print_arglist();
- exit(1);
+ print_arglist(1);
}
+ g = create_generator(a, o);
if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".h");
@@ -217,41 +478,96 @@ extern int main(int argc, char * argv[]) {
if (o->make_lang == LANG_CPLUSPLUS) {
b = add_s_to_b(b, "c");
}
- o->output_c = get_output(b);
+ o->output_src = get_output(b);
lose_b(b);
- g = create_generator_c(a, o);
generate_program_c(g);
- close_generator_c(g);
- fclose(o->output_c);
+ fclose(o->output_src);
fclose(o->output_h);
}
#ifndef DISABLE_JAVA
if (o->make_lang == LANG_JAVA) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".java");
- o->output_java = get_output(b);
+ o->output_src = get_output(b);
lose_b(b);
- g = create_generator_java(a, o);
generate_program_java(g);
- close_generator_java(g);
- fclose(o->output_java);
+ fclose(o->output_src);
}
#endif
+#ifndef DISABLE_PASCAL
+ if (o->make_lang == LANG_PASCAL) {
+ symbol *b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".pas");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_pascal(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_PYTHON
+ if (o->make_lang == LANG_PYTHON) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".py");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_python(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_JS
+ if (o->make_lang == LANG_JAVASCRIPT) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".js");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_js(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_CSHARP
+ if (o->make_lang == LANG_CSHARP) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".cs");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_csharp(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_RUST
+ if (o->make_lang == LANG_RUST) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".rs");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_rust(g);
+ fclose(o->output_src);
+ }
+#endif
+#ifndef DISABLE_GO
+ if (o->make_lang == LANG_GO) {
+ symbol * b = add_s_to_b(0, s);
+ b = add_s_to_b(b, ".go");
+ o->output_src = get_output(b);
+ lose_b(b);
+ generate_program_go(g);
+ fclose(o->output_src);
+ }
+#endif
+ close_generator(g);
}
close_analyser(a);
}
lose_b(u);
- lose_b(filename);
}
{ struct include * p = o->includes;
- until (p == 0)
- { struct include * q = p->next;
+ while (p) {
+ struct include * q = p->next;
lose_b(p->b); FREE(p); p = q;
}
}
FREE(o);
- unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count);
+ if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count);
return 0;
}
-
diff --git a/contrib/snowball/compiler/generator.c b/contrib/snowball/compiler/generator.c
index 7294b0471..eed86c117 100644
--- a/contrib/snowball/compiler/generator.c
+++ b/contrib/snowball/compiler/generator.c
@@ -12,286 +12,349 @@
static void generate(struct generator * g, struct node * p);
-enum special_labels {
-
- x_return = -1
-
-};
-
static int new_label(struct generator * g) {
return g->next_label++;
}
-/* Output routines */
-static void output_str(FILE * outfile, struct str * str) {
+/* Write routines for simple entities */
- char * s = b_to_s(str_data(str));
- fprintf(outfile, "%s", s);
- free(s);
+/* Write a space if the preceding character was not whitespace */
+static void ws_opt_space(struct generator * g, const char * s) {
+ int ch = str_back(g->outbuf);
+ if (ch != ' ' && ch != '\n' && ch != '\t' && ch != -1)
+ write_char(g, ' ');
+ write_string(g, s);
}
-static void wch(struct generator * g, int ch) {
- str_append_ch(g->outbuf, ch); /* character */
+static void wi3(struct generator * g, int i) {
+ if (i < 100) write_char(g, ' ');
+ if (i < 10) write_char(g, ' ');
+ write_int(g, i); /* integer (width 3) */
}
-static void wnl(struct generator * g) {
- str_append_ch(g->outbuf, '\n'); /* newline */
- g->line_count++;
-}
-static void ws(struct generator * g, const char * s) {
- str_append_string(g->outbuf, s); /* string */
-}
+/* Write routines for items from the syntax tree */
-static void wi(struct generator * g, int i) {
- str_append_int(g->outbuf, i); /* integer */
-}
+static void write_varname(struct generator * g, struct name * p) {
-static void wh_ch(struct generator * g, int i) {
- str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */
+ int ch = "SIIrxg"[p->type];
+ switch (p->type) {
+ case t_external:
+ write_string(g, g->options->externals_prefix); break;
+ case t_string:
+ case t_boolean:
+ case t_integer: {
+ int count = p->count;
+ if (count < 0) {
+ fprintf(stderr, "Reference to optimised out variable ");
+ report_b(stderr, p->b);
+ fprintf(stderr, " attempted\n");
+ exit(1);
+ }
+ if (p->type == t_boolean) {
+ /* We use a single array for booleans and integers, with the
+ * integers first.
+ */
+ count += g->analyser->name_count[t_integer];
+ }
+ write_char(g, ch);
+ write_char(g, '[');
+ write_int(g, count);
+ write_char(g, ']');
+ return;
+ }
+ default:
+ write_char(g, ch); write_char(g, '_');
+ }
+ write_b(g, p->b);
}
-static void wh(struct generator * g, int i) {
- if (i >> 4) wh(g, i >> 4);
- wh_ch(g, i); /* hex integer */
+static void write_varref(struct generator * g, struct name * p) { /* reference to variable */
+ if (p->type < t_routine) write_string(g, "z->");
+ write_varname(g, p);
}
-static void wi3(struct generator * g, int i) {
- if (i < 100) wch(g, ' ');
- if (i < 10) wch(g, ' ');
- wi(g, i); /* integer (width 3) */
+static void write_hexdigit(struct generator * g, int i) {
+ str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */
}
-static void wvn(struct generator * g, struct name * p) { /* variable name */
-
- int ch = "SBIrxg"[p->type];
- switch (p->type) {
- case t_string:
- case t_boolean:
- case t_integer:
- wch(g, ch); wch(g, '['); wi(g, p->count); wch(g, ']'); return;
- case t_external:
- ws(g, g->options->externals_prefix); break;
- default:
- wch(g, ch); wch(g, '_');
- }
- str_append_b(g->outbuf, p->b);
+static void write_hex(struct generator * g, int i) {
+ if (i >> 4) write_hex(g, i >> 4);
+ write_hexdigit(g, i); /* hex integer */
}
-static void wv(struct generator * g, struct name * p) { /* reference to variable */
- if (p->type < t_routine) ws(g, "z->");
- wvn(g, p);
+/* write character literal */
+static void wlitch(struct generator * g, int ch) {
+ if (32 <= ch && ch < 127) {
+ write_char(g, '\'');
+ if (ch == '\'' || ch == '\\') {
+ write_char(g, '\\');
+ }
+ write_char(g, ch);
+ write_char(g, '\'');
+ } else {
+ write_string(g, "0x"); write_hex(g, ch);
+ }
}
static void wlitarray(struct generator * g, symbol * p) { /* write literal array */
- ws(g, "{ ");
+ write_string(g, "{ ");
{
int i;
for (i = 0; i < SIZE(p); i++) {
- int ch = p[i];
- if (32 <= ch && ch < 127) {
- wch(g, '\'');
- switch (ch) {
- case '\'':
- case '\\': wch(g, '\\');
- default: wch(g, ch);
- }
- wch(g, '\'');
- } else {
- wch(g, '0'); wch(g, 'x'); wh(g, ch);
- }
- if (i < SIZE(p) - 1) ws(g, ", ");
+ wlitch(g, p[i]);
+ if (i < SIZE(p) - 1) write_string(g, ", ");
}
}
- ws(g, " }");
+ write_string(g, " }");
}
static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */
- if (SIZE(p) == 0) ws(g, "0"); else {
+ if (SIZE(p) == 0) {
+ write_char(g, '0');
+ } else {
struct str * s = g->outbuf;
g->outbuf = g->declarations;
- ws(g, "static const symbol s_"); wi(g, g->literalstring_count); ws(g, "[] = ");
+ write_string(g, "static const symbol s_"); write_int(g, g->literalstring_count); write_string(g, "[] = ");
wlitarray(g, p);
- ws(g, ";\n");
+ write_string(g, ";\n");
g->outbuf = s;
- ws(g, "s_"); wi(g, g->literalstring_count);
+ write_string(g, "s_"); write_int(g, g->literalstring_count);
g->literalstring_count++;
}
}
-
-static void wm(struct generator * g) { /* margin */
+static void write_margin(struct generator * g) {
int i;
- for (i = 0; i < g->margin; i++) ws(g, " ");
+ for (i = 0; i < g->margin; i++) write_string(g, " ");
}
-static void wc(struct generator * g, struct node * p) { /* comment */
-
- ws(g, " /* ");
+void write_comment_content(struct generator * g, struct node * p) {
switch (p->type) {
case c_mathassign:
case c_plusassign:
case c_minusassign:
case c_multiplyassign:
case c_divideassign:
+ if (p->name) {
+ write_char(g, '$');
+ write_b(g, p->name->b);
+ write_char(g, ' ');
+ }
+ write_string(g, name_of_token(p->type));
+ write_string(g, " <integer expression>");
+ break;
case c_eq:
case c_ne:
case c_gr:
case c_ge:
case c_ls:
case c_le:
- if (p->name) {
- wch(g, '$');
- str_append_b(g->outbuf, p->name->b);
- wch(g, ' ');
- }
- ws(g, name_of_token(p->type));
- ws(g, " <integer expression>");
+ write_string(g, "$(<integer expression> ");
+ write_string(g, name_of_token(p->type));
+ write_string(g, " <integer expression>)");
break;
default:
- ws(g, name_of_token(p->type));
+ write_string(g, name_of_token(p->type));
if (p->name) {
- wch(g, ' ');
- str_append_b(g->outbuf, p->name->b);
+ write_char(g, ' ');
+ write_b(g, p->name->b);
}
}
- ws(g, ", line "); wi(g, p->line_number); ws(g, " */");
- wnl(g);
+ write_string(g, ", line ");
+ write_int(g, p->line_number);
+}
+
+static void write_comment(struct generator * g, struct node * p) {
+ if (g->options->comments) {
+ ws_opt_space(g, "/* ");
+ write_comment_content(g, p);
+ write_string(g, " */");
+ }
+ write_newline(g);
}
static void wms(struct generator * g, const char * s) {
- wm(g); ws(g, s); } /* margin + string */
+ write_margin(g); write_string(g, s); } /* margin + string */
-static void wbs(struct generator * g) { /* block start */
+static void write_block_start(struct generator * g) { /* block start */
wms(g, "{ ");
g->margin++;
}
-static void wbe(struct generator * g) { /* block end */
+static void write_block_end(struct generator * g) { /* block end */
- if (g->line_labelled == g->line_count) { wms(g, ";"); wnl(g); }
+ if (g->line_labelled == g->line_count) { wms(g, ";"); write_newline(g); }
g->margin--;
- wms(g, "}"); wnl(g);
+ wms(g, "}"); write_newline(g);
}
-static void wk(struct generator * g, struct node * p) { /* keep c */
+static void w(struct generator * g, const char * s);
+
+/* keep c */
+static void wk(struct generator * g, struct node * p, int keep_limit) {
++g->keep_count;
if (p->mode == m_forward) {
- ws(g, "int c"); wi(g, g->keep_count); ws(g, " = z->c;");
+ write_string(g, "int c");
+ write_int(g, g->keep_count);
+ write_string(g, " = z->c");
+ if (keep_limit) {
+ write_string(g, ", mlimit");
+ write_int(g, g->keep_count);
+ }
+ write_char(g, ';');
} else {
- ws(g, "int m"); wi(g, g->keep_count); ws(g, " = z->l - z->c; (void)m");
- wi(g, g->keep_count); ws(g, ";");
+ write_string(g, "int m");
+ write_int(g, g->keep_count);
+ write_string(g, " = z->l - z->c");
+ if (keep_limit) {
+ write_string(g, ", mlimit");
+ write_int(g, g->keep_count);
+ }
+ write_string(g, "; (void)m");
+ write_int(g, g->keep_count);
+ write_char(g, ';');
}
}
static void wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */
if (p->mode == m_forward) {
- ws(g, "z->c = c");
+ write_string(g, "z->c = c");
} else {
- ws(g, "z->c = z->l - m");
+ write_string(g, "z->c = z->l - m");
}
- wi(g, keep_token); ws(g, ";");
+ write_int(g, keep_token); write_char(g, ';');
+}
+
+static void wrestorelimit(struct generator * g, struct node * p, int keep_token) { /* restore limit */
+ if (p->mode == m_forward) {
+ w(g, "z->l += mlimit");
+ } else {
+ w(g, "z->lb = mlimit");
+ }
+ write_int(g, keep_token); write_string(g, ";");
}
static void winc(struct generator * g, struct node * p) { /* increment c */
- ws(g, p->mode == m_forward ? "z->c++;" :
+ write_string(g, p->mode == m_forward ? "z->c++;" :
"z->c--;");
}
static void wsetl(struct generator * g, int n) {
g->margin--;
- wms(g, "lab"); wi(g, n); wch(g, ':'); wnl(g);
+ wms(g, "lab"); write_int(g, n); write_char(g, ':'); write_newline(g);
g->line_labelled = g->line_count;
g->margin++;
}
static void wgotol(struct generator * g, int n) {
- wms(g, "goto lab"); wi(g, n); wch(g, ';'); wnl(g);
+ wms(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g);
}
-static void wf(struct generator * g) { /* fail */
- if (g->failure_string != 0) { ws(g, "{ "); ws(g, g->failure_string); wch(g, ' '); }
- switch (g->failure_label)
- {
+static void write_failure(struct generator * g, struct node * p) { /* fail */
+ if (g->failure_keep_count != 0) {
+ write_string(g, "{ ");
+ if (g->failure_keep_count > 0) {
+ wrestore(g, p, g->failure_keep_count);
+ } else {
+ wrestorelimit(g, p, -g->failure_keep_count);
+ }
+ write_char(g, ' ');
+ }
+ switch (g->failure_label) {
case x_return:
- ws(g, "return 0;");
- break;
+ write_string(g, "return 0;");
+ break;
default:
- ws(g, "goto lab");
- wi(g, g->failure_label);
- wch(g, ';');
- g->label_used = 1;
+ write_string(g, "goto lab");
+ write_int(g, g->failure_label);
+ write_char(g, ';');
+ g->label_used = 1;
}
- if (g->failure_string != 0) ws(g, " }");
+ if (g->failure_keep_count != 0) write_string(g, " }");
}
-static void wlim(struct generator * g, struct node * p) { /* if at limit fail */
- ws(g, p->mode == m_forward ? "if (z->c >= z->l) " :
+/* if at limit fail */
+static void write_check_limit(struct generator * g, struct node * p) {
+
+ write_string(g, p->mode == m_forward ? "if (z->c >= z->l) " :
"if (z->c <= z->lb) ");
- wf(g);
+ write_failure(g, p);
}
-static void wp(struct generator * g, const char * s, struct node * p) { /* formatted write */
+static void write_data_address(struct generator * g, struct node * p) {
+ symbol * b = p->literalstring;
+ if (b != 0) {
+ write_int(g, SIZE(b)); w(g, ", ");
+ wlitref(g, b);
+ } else {
+ write_varref(g, p->name);
+ }
+}
+
+/* Formatted write. */
+static void writef(struct generator * g, const char * input, struct node * p) {
int i = 0;
- int l = strlen(s);
- until (i >= l) {
- int ch = s[i++];
- if (ch != '~') wch(g, ch); else
- switch(s[i++]) {
- default: wch(g, s[i - 1]); continue;
- case 'C': wc(g, p); continue;
- case 'k': wk(g, p); continue;
- case 'K': /* keep for c_test */
- ws(g, p->mode == m_forward ? "int c_test = z->c;" :
- "int m_test = z->l - z->c;");
- continue;
- case 'R': /* restore for c_test */
- ws(g, p->mode == m_forward ? "z->c = c_test;" :
- "z->c = z->l - m_test;");
- continue;
+ int l = strlen(input);
+
+ while (i < l) {
+ int ch = input[i++];
+ if (ch != '~') {
+ write_char(g, ch);
+ continue;
+ }
+ switch (input[i++]) {
+ default: write_char(g, input[i - 1]); continue;
+ case 'C': write_comment(g, p); continue;
+ case 'k': wk(g, p, false); continue;
+ case 'K': wk(g, p, true); continue;
case 'i': winc(g, p); continue;
- case 'l': wlim(g, p); continue;
- case 'f': wf(g); continue;
- case 'M': wm(g); continue;
- case 'N': wnl(g); continue;
- case '{': wbs(g); continue;
- case '}': wbe(g); continue;
- case 'S': ws(g, g->S[s[i++] - '0']); continue;
- case 'I': wi(g, g->I[s[i++] - '0']); continue;
- case 'J': wi3(g, g->I[s[i++] - '0']); continue;
- case 'V': wv(g, g->V[s[i++] - '0']); continue;
- case 'W': wvn(g, g->V[s[i++] - '0']); continue;
- case 'L': wlitref(g, g->L[s[i++] - '0']); continue;
- case 'A': wlitarray(g, g->L[s[i++] - '0']); continue;
+ case 'l': write_check_limit(g, p); continue;
+ case 'f': write_failure(g, p); continue;
+ case 'M': write_margin(g); continue;
+ case 'N': write_newline(g); continue;
+ case '{': write_block_start(g); continue;
+ case '}': write_block_end(g); continue;
+ case 'S': write_string(g, g->S[input[i++] - '0']); continue;
+ case 'I': write_int(g, g->I[input[i++] - '0']); continue;
+ case 'J': wi3(g, g->I[input[i++] - '0']); continue;
+ case 'V': write_varref(g, g->V[input[i++] - '0']); continue;
+ case 'W': write_varname(g, g->V[input[i++] - '0']); continue;
+ case 'L': wlitref(g, g->L[input[i++] - '0']); continue;
+ case 'A': wlitarray(g, g->L[input[i++] - '0']); continue;
+ case 'c': wlitch(g, g->I[input[i++] - '0']); continue;
+ case 'a': write_data_address(g, p); continue;
case '+': g->margin++; continue;
case '-': g->margin--; continue;
case '$': /* insert_s, insert_v etc */
- wch(g, p->literalstring == 0 ? 'v' : 's');
+ write_char(g, p->literalstring == 0 ? 'v' : 's');
continue;
- case 'p': ws(g, g->options->externals_prefix); continue;
+ case 'p': write_string(g, g->options->externals_prefix); continue;
}
}
}
-static void w(struct generator * g, const char * s) { wp(g, s, 0); }
+static void w(struct generator * g, const char * s) {
+ writef(g, s, 0);
+}
static void generate_AE(struct generator * g, struct node * p) {
- char * s;
+ const char * s;
switch (p->type) {
case c_name:
- wv(g, p->name); break;
+ write_varref(g, p->name); break;
case c_number:
- wi(g, p->number); break;
+ write_int(g, p->number); break;
case c_maxint:
- ws(g, "MAXINT"); break;
+ write_string(g, "MAXINT"); break;
case c_minint:
- ws(g, "MININT"); break;
+ write_string(g, "MININT"); break;
case c_neg:
- wch(g, '-'); generate_AE(g, p->right); break;
+ write_char(g, '-'); generate_AE(g, p->right); break;
case c_multiply:
s = " * "; goto label0;
case c_plus:
@@ -301,28 +364,45 @@ static void generate_AE(struct generator * g, struct node * p) {
case c_divide:
s = " / ";
label0:
- wch(g, '('); generate_AE(g, p->left);
- ws(g, s); generate_AE(g, p->right); wch(g, ')'); break;
- case c_sizeof:
- g->V[0] = p->name;
- w(g, "SIZE(~V0)"); break;
+ write_char(g, '('); generate_AE(g, p->left);
+ write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break;
case c_cursor:
w(g, "z->c"); break;
case c_limit:
w(g, p->mode == m_forward ? "z->l" : "z->lb"); break;
+ case c_len:
+ if (g->options->encoding == ENC_UTF8) {
+ w(g, "len_utf8(z->p)");
+ break;
+ }
+ /* FALLTHRU */
case c_size:
- w(g, "SIZE(z->p)"); break;
+ w(g, "SIZE(z->p)");
+ break;
+ case c_lenof:
+ if (g->options->encoding == ENC_UTF8) {
+ g->V[0] = p->name;
+ w(g, "len_utf8(~V0)");
+ break;
+ }
+ /* FALLTHRU */
+ case c_sizeof:
+ g->V[0] = p->name;
+ w(g, "SIZE(~V0)");
+ break;
}
}
/* K_needed() tests to see if we really need to keep c. Not true when the
- the command does not touch the cursor. This and repeat_score() could be
+ command does not touch the cursor. This and repeat_score() could be
elaborated almost indefinitely.
*/
-static int K_needed(struct generator * g, struct node * p) {
- until (p == 0) {
+static int K_needed_(struct generator * g, struct node * p, int call_depth) {
+ while (p) {
switch (p->type) {
+ case c_atlimit:
+ case c_do:
case c_dollar:
case c_leftslice:
case c_rightslice:
@@ -338,17 +418,28 @@ static int K_needed(struct generator * g, struct node * p) {
case c_ls:
case c_le:
case c_sliceto:
+ case c_booltest:
+ case c_set:
+ case c_unset:
case c_true:
case c_false:
case c_debug:
break;
case c_call:
- if (K_needed(g, p->name->definition)) return true;
+ /* Recursive functions aren't typical in snowball programs, so
+ * make the pessimistic assumption that keep is needed if we
+ * hit a generous limit on recursion. It's not likely to make
+ * a difference to any real world program, but means we won't
+ * recurse until we run out of stack for pathological cases.
+ */
+ if (call_depth >= 100) return true;
+ if (K_needed_(g, p->name->definition, call_depth + 1))
+ return true;
break;
case c_bra:
- if (K_needed(g, p->left)) return true;
+ if (K_needed_(g, p->left, call_depth)) return true;
break;
default: return true;
@@ -358,10 +449,13 @@ static int K_needed(struct generator * g, struct node * p) {
return false;
}
-static int repeat_score(struct generator * g, struct node * p) {
+extern int K_needed(struct generator * g, struct node * p) {
+ return K_needed_(g, p, 0);
+}
+
+static int repeat_score(struct generator * g, struct node * p, int call_depth) {
int score = 0;
- until (p == 0)
- {
+ while (p) {
switch (p->type) {
case c_dollar:
case c_leftslice:
@@ -382,11 +476,25 @@ static int repeat_score(struct generator * g, struct node * p) {
break;
case c_call:
- score += repeat_score(g, p->name->definition);
+ /* Recursive functions aren't typical in snowball programs, so
+ * make the pessimistic assumption that repeat requires cursor
+ * reinstatement if we hit a generous limit on recursion. It's
+ * not likely to make a difference to any real world program,
+ * but means we won't recurse until we run out of stack for
+ * pathological cases.
+ */
+ if (call_depth >= 100) {
+ return 2;
+ }
+ score += repeat_score(g, p->name->definition, call_depth + 1);
+ if (score >= 2)
+ return score;
break;
case c_bra:
- score += repeat_score(g, p->left);
+ score += repeat_score(g, p->left, call_depth);
+ if (score >= 2)
+ return score;
break;
case c_name:
@@ -395,9 +503,12 @@ static int repeat_score(struct generator * g, struct node * p) {
case c_grouping:
case c_non:
case c_hop:
- score = score + 1; break;
+ if (++score >= 2)
+ return score;
+ break;
- default: score = 2; break;
+ default:
+ return 2;
}
p = p->right;
}
@@ -406,25 +517,28 @@ static int repeat_score(struct generator * g, struct node * p) {
/* tests if an expression requires cursor reinstatement in a repeat */
-static int repeat_restore(struct generator * g, struct node * p) {
- return repeat_score(g, p) >= 2;
+extern int repeat_restore(struct generator * g, struct node * p) {
+ return repeat_score(g, p, 0) >= 2;
}
static void generate_bra(struct generator * g, struct node * p) {
p = p->left;
- until (p == 0) { generate(g, p); p = p->right; }
+ while (p) {
+ generate(g, p);
+ p = p->right;
+ }
}
static void generate_and(struct generator * g, struct node * p) {
int keep_c = 0;
if (K_needed(g, p->left)) {
- wp(g, "~{~k~C", p);
+ writef(g, "~{~k~C", p);
keep_c = g->keep_count;
} else {
- wp(g, "~M~C", p);
+ writef(g, "~M~C", p);
}
p = p->left;
- until (p == 0) {
+ while (p) {
generate(g, p);
if (keep_c && p->right != 0) {
w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
@@ -439,19 +553,19 @@ static void generate_or(struct generator * g, struct node * p) {
int used = g->label_used;
int a0 = g->failure_label;
- const char * a1 = g->failure_string;
+ int a1 = g->failure_keep_count;
int out_lab = new_label(g);
if (K_needed(g, p->left)) {
- wp(g, "~{~k~C", p);
+ writef(g, "~{~k~C", p);
keep_c = g->keep_count;
} else {
- wp(g, "~M~C", p);
+ writef(g, "~M~C", p);
}
p = p->left;
- g->failure_string = 0;
- until (p->right == 0) {
+ g->failure_keep_count = 0;
+ while (p->right) {
g->failure_label = new_label(g);
g->label_used = 0;
generate(g, p);
@@ -465,7 +579,7 @@ static void generate_or(struct generator * g, struct node * p) {
}
g->label_used = used;
g->failure_label = a0;
- g->failure_string = a1;
+ g->failure_keep_count = a1;
generate(g, p);
if (keep_c) w(g, "~}");
@@ -474,7 +588,7 @@ static void generate_or(struct generator * g, struct node * p) {
static void generate_backwards(struct generator * g, struct node * p) {
- wp(g,"~Mz->lb = z->c; z->c = z->l;~C~N", p);
+ writef(g, "~Mz->lb = z->c; z->c = z->l;~C~N", p);
generate(g, p->left);
w(g, "~Mz->c = z->lb;~N");
}
@@ -485,18 +599,18 @@ static void generate_not(struct generator * g, struct node * p) {
int used = g->label_used;
int a0 = g->failure_label;
- const char * a1 = g->failure_string;
+ int a1 = g->failure_keep_count;
if (K_needed(g, p->left)) {
- wp(g, "~{~k~C", p);
+ writef(g, "~{~k~C", p);
keep_c = g->keep_count;
} else {
- wp(g, "~M~C", p);
+ writef(g, "~M~C", p);
}
g->failure_label = new_label(g);
g->label_used = 0;
- g->failure_string = 0;
+ g->failure_keep_count = 0;
generate(g, p->left);
{
@@ -505,9 +619,9 @@ static void generate_not(struct generator * g, struct node * p) {
g->label_used = used;
g->failure_label = a0;
- g->failure_string = a1;
+ g->failure_keep_count = a1;
- w(g, "~M~f~N");
+ writef(g, "~M~f~N", p);
if (u)
wsetl(g, l);
}
@@ -518,20 +632,14 @@ static void generate_not(struct generator * g, struct node * p) {
static void generate_try(struct generator * g, struct node * p) {
- int keep_c = K_needed(g, p->left);
-
- if (keep_c) {
- if (p->mode == m_forward) {
- wp(g, "~{int c_keep = z->c;~C", p);
- g->failure_string = "z->c = c_keep;";
- } else {
- wp(g, "~{int m_keep = z->l - z->c;/* (void) m_keep;*/~C", p);
- g->failure_string = "z->c = z->l - m_keep;";
- }
+ int keep_c = 0;
+ if (K_needed(g, p->left)) {
+ writef(g, "~{~k~C", p);
+ keep_c = g->keep_count;
} else {
- wp(g, "~M~C", p);
- g->failure_string = 0;
+ writef(g, "~M~C", p);
}
+ g->failure_keep_count = keep_c;
g->failure_label = new_label(g);
g->label_used = 0;
@@ -544,47 +652,65 @@ static void generate_try(struct generator * g, struct node * p) {
}
static void generate_set(struct generator * g, struct node * p) {
- g->V[0] = p->name; wp(g, "~M~V0 = 1;~C", p);
+ g->V[0] = p->name; writef(g, "~M~V0 = 1;~C", p);
}
static void generate_unset(struct generator * g, struct node * p) {
- g->V[0] = p->name; wp(g, "~M~V0 = 0;~C", p);
+ g->V[0] = p->name; writef(g, "~M~V0 = 0;~C", p);
}
static void generate_fail(struct generator * g, struct node * p) {
generate(g, p->left);
- wp(g, "~M~f~C", p);
+ writef(g, "~M~f~C", p);
}
/* generate_test() also implements 'reverse' */
static void generate_test(struct generator * g, struct node * p) {
- int keep_c = K_needed(g, p->left);
- if (keep_c) wp(g, "~{~K~C", p);
- else wp(g, "~M~C", p);
+ int keep_c = 0;
+ if (K_needed(g, p->left)) {
+ keep_c = ++g->keep_count;
+ w(g, p->mode == m_forward ? "~{int c_test" :
+ "~{int m_test");
+ write_int(g, keep_c);
+ w(g, p->mode == m_forward ? " = z->c;" :
+ " = z->l - z->c;");
+ writef(g, "~C", p);
+ } else writef(g, "~M~C", p);
generate(g, p->left);
- if (keep_c) wp(g, "~M~R~N"
- "~}", p);
+ if (keep_c) {
+ w(g, p->mode == m_forward ? "~Mz->c = c_test" :
+ "~Mz->c = z->l - m_test");
+ write_int(g, keep_c);
+ writef(g, ";~N~}", p);
+ }
}
static void generate_do(struct generator * g, struct node * p) {
int keep_c = 0;
if (K_needed(g, p->left)) {
- wp(g, "~{~k~C", p);
+ writef(g, "~{~k~C", p);
keep_c = g->keep_count;
} else {
- wp(g, "~M~C", p);
+ writef(g, "~M~C", p);
}
- g->failure_label = new_label(g);
- g->label_used = 0;
- g->failure_string = 0;
- generate(g, p->left);
+ if (p->left->type == c_call) {
+ /* Optimise do <call> */
+ g->V[0] = p->left->name;
+ writef(g, "~{int ret = ~V0(z);~C", p->left);
+ w(g, "~Mif (ret < 0) return ret;~N~}");
+ } else {
+ g->failure_label = new_label(g);
+ g->label_used = 0;
+ g->failure_keep_count = 0;
+ generate(g, p->left);
- if (g->label_used)
- wsetl(g, g->failure_label);
+ if (g->label_used)
+ wsetl(g, g->failure_label);
+ }
if (keep_c) {
w(g, "~M"); wrestore(g, p, keep_c);
w(g, "~N~}");
@@ -592,17 +718,17 @@ static void generate_do(struct generator * g, struct node * p) {
}
static void generate_next(struct generator * g, struct node * p) {
- if (g->options->utf8) {
+ if (g->options->encoding == ENC_UTF8) {
if (p->mode == m_forward)
w(g, "~{int ret = skip_utf8(z->p, z->c, 0, z->l, 1");
else
w(g, "~{int ret = skip_utf8(z->p, z->c, z->lb, 0, -1");
- wp(g, ");~N"
+ writef(g, ");~N"
"~Mif (ret < 0) ~f~N"
"~Mz->c = ret;~C"
"~}", p);
} else
- wp(g, "~M~l~N"
+ writef(g, "~M~l~N"
"~M~i~C", p);
}
@@ -611,21 +737,21 @@ static void generate_GO_grouping(struct generator * g, struct node * p, int is_g
struct grouping * q = p->name->grouping;
g->S[0] = p->mode == m_forward ? "" : "_b";
g->S[1] = complement ? "in" : "out";
- g->S[2] = g->options->utf8 ? "_U" : "";
+ g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : "";
g->V[0] = p->name;
g->I[0] = q->smallest_ch;
g->I[1] = q->largest_ch;
if (is_goto) {
- wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f /* goto */~C", p);
+ writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f~C", p);
} else {
- wp(g, "~{ /* gopast */~C"
- "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N"
- "~Mif (ret < 0) ~f~N", p);
- if (p->mode == m_forward)
- w(g, "~Mz->c += ret;~N");
- else
- w(g, "~Mz->c -= ret;~N");
- w(g, "~}");
+ writef(g, "~{~C"
+ "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N"
+ "~Mif (ret < 0) ~f~N", p);
+ if (p->mode == m_forward)
+ w(g, "~Mz->c += ret;~N");
+ else
+ w(g, "~Mz->c -= ret;~N");
+ w(g, "~}");
}
}
@@ -634,23 +760,26 @@ static void generate_GO(struct generator * g, struct node * p, int style) {
int used = g->label_used;
int a0 = g->failure_label;
- const char * a1 = g->failure_string;
+ int a1 = g->failure_keep_count;
if (p->left->type == c_grouping || p->left->type == c_non) {
- /* Special case for "goto" or "gopast" when used on a grouping or an
- * inverted grouping - the movement of c by the matching action is
- * exactly what we want! */
+ /* Special case for "goto" or "gopast" when used on a grouping or an
+ * inverted grouping - the movement of c by the matching action is
+ * exactly what we want! */
#ifdef OPTIMISATION_WARNINGS
- printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping");
+ printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping");
#endif
- generate_GO_grouping(g, p->left, style, p->left->type == c_non);
- return;
+ if (g->options->comments) {
+ writef(g, "~M~C", p);
+ }
+ generate_GO_grouping(g, p->left, style, p->left->type == c_non);
+ return;
}
- w(g, "~Mwhile(1) {"); wp(g, "~C~+", p);
+ w(g, "~Mwhile(1) {"); writef(g, "~C~+", p);
if (style == 1 || repeat_restore(g, p->left)) {
- wp(g, "~M~k~N", p);
+ writef(g, "~M~k~N", p);
keep_c = g->keep_count;
}
@@ -671,16 +800,16 @@ static void generate_GO(struct generator * g, struct node * p, int style) {
g->label_used = used;
g->failure_label = a0;
- g->failure_string = a1;
+ g->failure_keep_count = a1;
-/* wp(g, "~M~l~N"
+/* writef(g, "~M~l~N"
"~M~i~N", p); */
generate_next(g, p);
w(g, "~}");
}
static void generate_loop(struct generator * g, struct node * p) {
- w(g, "~{int i; for (i = "); generate_AE(g, p->AE); wp(g, "; i > 0; i--)~C"
+ w(g, "~{int i; for (i = "); generate_AE(g, p->AE); writef(g, "; i > 0; i--)~C"
"~{", p);
generate(g, p->left);
@@ -689,18 +818,22 @@ static void generate_loop(struct generator * g, struct node * p) {
"~}");
}
-static void generate_repeat(struct generator * g, struct node * p, int atleast_case) {
+static void generate_repeat_or_atleast(struct generator * g, struct node * p, int atleast_case) {
int keep_c = 0;
- wp(g, "~Mwhile(1) {~C~+", p);
+ if (atleast_case) {
+ writef(g, "~Mwhile(1) {~+~N", p);
+ } else {
+ writef(g, "~Mwhile(1) {~+~C", p);
+ }
if (repeat_restore(g, p->left)) {
- wp(g, "~M~k~N", p);
+ writef(g, "~M~k~N", p);
keep_c = g->keep_count;
}
g->failure_label = new_label(g);
g->label_used = 0;
- g->failure_string = 0;
+ g->failure_keep_count = 0;
generate(g, p->left);
if (atleast_case) w(g, "~Mi--;~N");
@@ -717,130 +850,120 @@ static void generate_repeat(struct generator * g, struct node * p, int atleast_c
"~}");
}
+static void generate_repeat(struct generator * g, struct node * p) {
+ generate_repeat_or_atleast(g, p, false);
+}
+
static void generate_atleast(struct generator * g, struct node * p) {
- w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~N");
+ w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~C");
{
int used = g->label_used;
int a0 = g->failure_label;
- const char * a1 = g->failure_string;
+ int a1 = g->failure_keep_count;
- generate_repeat(g, p, true);
+ generate_repeat_or_atleast(g, p, true);
g->label_used = used;
g->failure_label = a0;
- g->failure_string = a1;
+ g->failure_keep_count = a1;
}
- w(g, "~Mif (i > 0) ~f~N"
- "~}");
+ writef(g, "~Mif (i > 0) ~f~N"
+ "~}", p);
}
static void generate_setmark(struct generator * g, struct node * p) {
g->V[0] = p->name;
- wp(g, "~M~V0 = z->c;~C", p);
+ writef(g, "~M~V0 = z->c;~C", p);
}
static void generate_tomark(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? ">" : "<";
- w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); w(g, ") ~f~N");
- w(g, "~Mz->c = "); generate_AE(g, p->AE); wp(g, ";~C", p);
+ w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); writef(g, ") ~f~N", p);
+ w(g, "~Mz->c = "); generate_AE(g, p->AE); writef(g, ";~C", p);
}
static void generate_atmark(struct generator * g, struct node * p) {
- w(g, "~Mif (z->c != "); generate_AE(g, p->AE); wp(g, ") ~f~C", p);
+ w(g, "~Mif (z->c != "); generate_AE(g, p->AE); writef(g, ") ~f~C", p);
}
static void generate_hop(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? "+" : "-";
g->S[1] = p->mode == m_forward ? "0" : "z->lb";
- if (g->options->utf8) {
+ if (g->options->encoding == ENC_UTF8) {
w(g, "~{int ret = skip_utf8(z->p, z->c, ~S1, z->l, ~S0 ");
- generate_AE(g, p->AE); wp(g, ");~C", p);
- w(g, "~Mif (ret < 0) ~f~N");
+ generate_AE(g, p->AE); writef(g, ");~C", p);
+ writef(g, "~Mif (ret < 0) ~f~N", p);
} else {
w(g, "~{int ret = z->c ~S0 ");
- generate_AE(g, p->AE); wp(g, ";~C", p);
- w(g, "~Mif (~S1 > ret || ret > z->l) ~f~N");
+ generate_AE(g, p->AE); writef(g, ";~C", p);
+ writef(g, "~Mif (~S1 > ret || ret > z->l) ~f~N", p);
}
- wp(g, "~Mz->c = ret;~C"
+ writef(g, "~Mz->c = ret;~N"
"~}", p);
}
static void generate_delete(struct generator * g, struct node * p) {
- wp(g, "~{int ret = slice_del(z);~C", p);
- wp(g, "~Mif (ret < 0) return ret;~N"
+ writef(g, "~{int ret = slice_del(z);~C", p);
+ writef(g, "~Mif (ret < 0) return ret;~N"
"~}", p);
}
static void generate_tolimit(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? "" : "b";
- wp(g, "~Mz->c = z->l~S0;~C", p);
+ writef(g, "~Mz->c = z->l~S0;~C", p);
}
static void generate_atlimit(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? "" : "b";
g->S[1] = p->mode == m_forward ? "<" : ">";
- wp(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p);
+ writef(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p);
}
static void generate_leftslice(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? "bra" : "ket";
- wp(g, "~Mz->~S0 = z->c;~C", p);
+ writef(g, "~Mz->~S0 = z->c;~C", p);
}
static void generate_rightslice(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? "ket" : "bra";
- wp(g, "~Mz->~S0 = z->c;~C", p);
+ writef(g, "~Mz->~S0 = z->c;~C", p);
}
static void generate_assignto(struct generator * g, struct node * p) {
g->V[0] = p->name;
- wp(g, "~M~V0 = assign_to(z, ~V0);~C"
+ writef(g, "~M~V0 = assign_to(z, ~V0);~C"
"~Mif (~V0 == 0) return -1;~C", p);
}
static void generate_sliceto(struct generator * g, struct node * p) {
g->V[0] = p->name;
- wp(g, "~M~V0 = slice_to(z, ~V0);~C"
- "~Mif (~V0 == 0) return -1;~C", p);
-}
-
-static void generate_data_address(struct generator * g, struct node * p) {
-
- symbol * b = p->literalstring;
- if (b != 0) {
- wi(g, SIZE(b)); w(g, ", ");
- wlitref(g, b);
- } else
- wv(g, p->name);
+ writef(g, "~M~V0 = slice_to(z, ~V0);~C"
+ "~Mif (~V0 == 0) return -1;~N", p);
}
static void generate_insert(struct generator * g, struct node * p, int style) {
int keep_c = style == c_attach;
if (p->mode == m_backward) keep_c = !keep_c;
- wp(g, "~{", p);
- if (keep_c) w(g, "int c_keep = z->c;~N~M");
- wp(g, "int ret = insert_~$(z, z->c, z->c, ", p);
- generate_data_address(g, p);
- wp(g, ");~C", p);
- if (keep_c) w(g, "~Mz->c = c_keep;~N");
- wp(g, "~Mif (ret < 0) return ret;~N"
+ writef(g, "~{int ret;~N", p);
+ if (keep_c) w(g, "~{int saved_c = z->c;~N");
+ writef(g, "~Mret = insert_~$(z, z->c, z->c, ~a);~C", p);
+ if (keep_c) w(g, "~Mz->c = saved_c;~N~}");
+ writef(g, "~Mif (ret < 0) return ret;~N"
"~}", p);
}
static void generate_assignfrom(struct generator * g, struct node * p) {
int keep_c = p->mode == m_forward; /* like 'attach' */
- wp(g, "~{", p);
- if (keep_c) wp(g, "int c_keep = z->c;~N"
- "~Mret = insert_~$(z, z->c, z->l, ", p);
- else wp(g, "ret = insert_~$(z, z->lb, z->c, ", p);
- generate_data_address(g, p);
- wp(g, ");~C", p);
- if (keep_c) w(g, "~Mz->c = c_keep;~N");
- wp(g, "~Mif (ret < 0) return ret;~N"
+ writef(g, "~{int ret;~N", p);
+ if (keep_c) writef(g, "~{int saved_c = z->c;~N", p);
+ w(g, "~Mret = ");
+ writef(g, keep_c ? "insert_~$(z, z->c, z->l, ~a);~C" : "insert_~$(z, z->lb, z->c, ~a);~C", p);
+ if (keep_c) w(g, "~Mz->c = saved_c;~N~}");
+ writef(g, "~Mif (ret < 0) return ret;~N"
"~}", p);
}
@@ -849,81 +972,132 @@ static void generate_assignfrom(struct generator * g, struct node * p) {
static void generate_slicefrom(struct generator * g, struct node * p) {
/* w(g, "~Mslice_from_s(z, "); <============= bug! should be: */
- wp(g, "~{int ret = slice_from_~$(z, ", p);
- generate_data_address(g, p);
- wp(g, ");~C", p);
- wp(g, "~Mif (ret < 0) return ret;~N"
+ writef(g, "~{int ret = slice_from_~$(z, ~a);~C", p);
+ writef(g, "~Mif (ret < 0) return ret;~N"
"~}", p);
}
static void generate_setlimit(struct generator * g, struct node * p) {
int keep_c;
- wp(g, "~{int mlimit;~C"
- "~M~k~N"
- , p);
- keep_c = g->keep_count;
- generate(g, p->left);
- if (p->mode == m_forward) w(g, "~Mmlimit = z->l - z->c; z->l = z->c;~N");
- else w(g, "~Mmlimit = z->lb; z->lb = z->c;~N");
- w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
- g->failure_string = p->mode == m_forward ? "z->l += mlimit;" :
- "z->lb = mlimit;";
+ if (p->left && p->left->type == c_tomark) {
+ /* Special case for:
+ *
+ * setlimit tomark AE for C
+ *
+ * All uses of setlimit in the current stemmers we ship follow this
+ * pattern, and by special-casing we can avoid having to save and
+ * restore c.
+ */
+ struct node * q = p->left;
+
+ ++g->keep_count;
+ writef(g, "~N~{int mlimit", p);
+ write_int(g, g->keep_count);
+ writef(g, ";~C", p);
+ keep_c = g->keep_count;
+
+ g->S[0] = q->mode == m_forward ? ">" : "<";
+
+ w(g, "~Mif (z->c ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q);
+ w(g, "~Mmlimit");
+ write_int(g, keep_c);
+ if (p->mode == m_forward) {
+ w(g, " = z->l - z->c; z->l = ");
+ } else {
+ w(g, " = z->lb; z->lb = ");
+ }
+ generate_AE(g, q->AE);
+ w(g, ";~N");
+ } else {
+ writef(g, "~{~K~C", p);
+ keep_c = g->keep_count;
+ generate(g, p->left);
+
+ w(g, "~Mmlimit");
+ write_int(g, keep_c);
+ if (p->mode == m_forward)
+ w(g, " = z->l - z->c; z->l = z->c;~N");
+ else
+ w(g, " = z->lb; z->lb = z->c;~N");
+ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
+ }
+
+ g->failure_keep_count = -keep_c;
generate(g, p->aux);
- wms(g, g->failure_string);
+ w(g, "~M");
+ wrestorelimit(g, p, -g->failure_keep_count);
w(g, "~N"
"~}");
}
+/* dollar sets snowball up to operate on a string variable as if it were the
+ * current string */
static void generate_dollar(struct generator * g, struct node * p) {
int used = g->label_used;
int a0 = g->failure_label;
- const char * a1 = g->failure_string;
+ int a1 = g->failure_keep_count;
+ int keep_token;
g->failure_label = new_label(g);
g->label_used = 0;
- g->failure_string = 0;
+ g->failure_keep_count = 0;
+ keep_token = ++g->keep_count;
+ g->I[0] = keep_token;
+ writef(g, "~{struct SN_env env~I0 = * z;~C", p);
g->V[0] = p->name;
- wp(g, "~{struct SN_env env = * z;~C"
- "~Mint failure = 1; /* assume failure */~N"
- "~Mz->p = ~V0;~N"
- "~Mz->lb = z->c = 0;~N"
- "~Mz->l = SIZE(z->p);~N", p);
+ /* Assume failure. */
+ writef(g, "~Mint failure = 1;~N"
+ "~Mz->p = ~V0;~N"
+ "~Mz->lb = z->c = 0;~N"
+ "~Mz->l = SIZE(z->p);~N", p);
generate(g, p->left);
- w(g, "~Mfailure = 0; /* mark success */~N");
+ /* Mark success. */
+ w(g, "~Mfailure = 0;~N");
if (g->label_used)
wsetl(g, g->failure_label);
g->V[0] = p->name; /* necessary */
g->label_used = used;
g->failure_label = a0;
- g->failure_string = a1;
+ g->failure_keep_count = a1;
- w(g, "~M~V0 = z->p;~N"
- "~M* z = env;~N"
- "~Mif (failure) ~f~N~}");
+ g->I[0] = keep_token;
+ writef(g, "~M~V0 = z->p;~N"
+ "~M* z = env~I0;~N"
+ "~Mif (failure) ~f~N~}", p);
}
static void generate_integer_assign(struct generator * g, struct node * p, char * s) {
g->V[0] = p->name;
g->S[0] = s;
- w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); wp(g, ";~C", p);
+ w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); writef(g, ";~C", p);
}
static void generate_integer_test(struct generator * g, struct node * p, char * s) {
- g->V[0] = p->name;
- g->S[0] = s;
- w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); wp(g, ")) ~f~C", p);
+ w(g, "~Mif (!(");
+ generate_AE(g, p->left);
+ write_char(g, ' ');
+ write_string(g, s);
+ write_char(g, ' ');
+ generate_AE(g, p->AE);
+ writef(g, ")) ~f~C", p);
}
static void generate_call(struct generator * g, struct node * p) {
g->V[0] = p->name;
- wp(g, "~{int ret = ~V0(z);~C"
- "~Mif (ret == 0) ~f~N"
- "~Mif (ret < 0) return ret;~N~}", p);
+ writef(g, "~{int ret = ~V0(z);~C", p);
+ if (g->failure_keep_count == 0 && g->failure_label == x_return) {
+ /* Combine the two tests in this special case for better optimisation
+ * and clearer generated code. */
+ writef(g, "~Mif (ret <= 0) return ret;~N~}", p);
+ } else {
+ writef(g, "~Mif (ret == 0) ~f~N"
+ "~Mif (ret < 0) return ret;~N~}", p);
+ }
}
static void generate_grouping(struct generator * g, struct node * p, int complement) {
@@ -931,27 +1105,47 @@ static void generate_grouping(struct generator * g, struct node * p, int complem
struct grouping * q = p->name->grouping;
g->S[0] = p->mode == m_forward ? "" : "_b";
g->S[1] = complement ? "out" : "in";
- g->S[2] = g->options->utf8 ? "_U" : "";
+ g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : "";
g->V[0] = p->name;
g->I[0] = q->smallest_ch;
g->I[1] = q->largest_ch;
- wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p);
+ writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p);
}
static void generate_namedstring(struct generator * g, struct node * p) {
g->S[0] = p->mode == m_forward ? "" : "_b";
g->V[0] = p->name;
- wp(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p);
+ writef(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p);
}
static void generate_literalstring(struct generator * g, struct node * p) {
symbol * b = p->literalstring;
- g->S[0] = p->mode == m_forward ? "" : "_b";
- g->I[0] = SIZE(b);
- g->L[0] = b;
+ if (SIZE(b) == 1) {
+ /* It's quite common to compare with a single character literal string,
+ * so just inline the simpler code for this case rather than making a
+ * function call. In UTF-8 mode, only do this for the ASCII subset,
+ * since multi-byte characters are more complex to test against.
+ */
+ if (g->options->encoding == ENC_UTF8 && *b >= 128) {
+ printf("single byte %d\n", *b);
+ exit(1);
+ }
+ g->I[0] = *b;
+ if (p->mode == m_forward) {
+ writef(g, "~Mif (z->c == z->l || z->p[z->c] != ~c0) ~f~C"
+ "~Mz->c++;~N", p);
+ } else {
+ writef(g, "~Mif (z->c <= z->lb || z->p[z->c - 1] != ~c0) ~f~C"
+ "~Mz->c--;~N", p);
+ }
+ } else {
+ g->S[0] = p->mode == m_forward ? "" : "_b";
+ g->I[0] = SIZE(b);
+ g->L[0] = b;
- wp(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p);
+ writef(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p);
+ }
}
static void generate_define(struct generator * g, struct node * p) {
@@ -961,9 +1155,13 @@ static void generate_define(struct generator * g, struct node * p) {
g->S[0] = q->type == t_routine ? "static" : "extern";
g->V[0] = q;
- w(g, "~N~S0 int ~V0(struct SN_env * z) {~N~+");
+ w(g, "~N~S0 int ~V0(struct SN_env * z) {");
+ if (g->options->comments) {
+ write_string(g, p->mode == m_forward ? " /* forwardmode */" : " /* backwardmode */");
+ }
+ w(g, "~N~+");
if (p->amongvar_needed) w(g, "~Mint among_var;~N");
- g->failure_string = 0;
+ g->failure_keep_count = 0;
g->failure_label = x_return;
g->label_used = 0;
g->keep_count = 0;
@@ -982,6 +1180,7 @@ static void generate_substring(struct generator * g, struct node * p) {
int n_cases = 0;
symbol cases[2];
int shortest_size = INT_MAX;
+ int shown_comment = 0;
g->S[0] = p->mode == m_forward ? "" : "_b";
g->I[0] = x->number;
@@ -1019,19 +1218,19 @@ static void generate_substring(struct generator * g, struct node * p) {
if (n_cases > 2) break;
}
if (block == -1) {
- if (ch == cases[0]) continue;
+ if (n_cases > 0 && ch == cases[0]) continue;
if (n_cases < 2) {
- cases[n_cases++] = ch;
+ cases[n_cases++] = ch;
} else if (ch != cases[1]) {
- ++n_cases;
- break;
+ ++n_cases;
+ break;
}
} else {
if ((bitmap & (1u << (ch & 0x1f))) == 0) {
- bitmap |= 1u << (ch & 0x1f);
- if (n_cases < 2)
- cases[n_cases] = ch;
- ++n_cases;
+ bitmap |= 1u << (ch & 0x1f);
+ if (n_cases < 2)
+ cases[n_cases] = ch;
+ ++n_cases;
}
}
}
@@ -1045,16 +1244,16 @@ static void generate_substring(struct generator * g, struct node * p) {
sprintf(buf, "z->p[z->c + %d]", shortest_size - 1);
g->S[1] = buf;
if (shortest_size == 1) {
- wp(g, "~Mif (z->c >= z->l || ", p);
+ writef(g, "~Mif (z->c >= z->l", p);
} else {
- wp(g, "~Mif (z->c + ~I4 >= z->l || ", p);
+ writef(g, "~Mif (z->c + ~I4 >= z->l", p);
}
} else {
g->S[1] = "z->p[z->c - 1]";
if (shortest_size == 1) {
- wp(g, "~Mif (z->c <= z->lb || ", p);
+ writef(g, "~Mif (z->c <= z->lb", p);
} else {
- wp(g, "~Mif (z->c - ~I4 <= z->lb || ", p);
+ writef(g, "~Mif (z->c - ~I4 <= z->lb", p);
}
}
if (n_cases == 0) {
@@ -1062,81 +1261,83 @@ static void generate_substring(struct generator * g, struct node * p) {
* This doesn't seem to be a useful construct, but it is
* syntactically valid.
*/
- wp(g, "0", p);
} else if (n_cases == 1) {
g->I[4] = cases[0];
- wp(g, "~S1 != ~I4", p);
+ writef(g, " || ~S1 != ~I4", p);
} else if (n_cases == 2) {
g->I[4] = cases[0];
g->I[5] = cases[1];
- wp(g, "(~S1 != ~I4 && ~S1 != ~I5)", p);
+ writef(g, " || (~S1 != ~I4 && ~S1 != ~I5)", p);
} else {
- wp(g, "~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p);
+ writef(g, " || ~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p);
}
- ws(g, ") ");
+ write_string(g, ") ");
if (empty_case != -1) {
/* If the among includes the empty string, it can never fail
* so not matching the bitmap means we match the empty string.
*/
g->I[4] = among_cases[empty_case].result;
- wp(g, "among_var = ~I4; else~C", p);
+ writef(g, "among_var = ~I4; else~C", p);
} else {
- wp(g, "~f~C", p);
+ writef(g, "~f~C", p);
}
+ shown_comment = 1;
} else {
#ifdef OPTIMISATION_WARNINGS
printf("Couldn't shortcut among %d\n", x->number);
#endif
}
- if (x->command_count == 0 && x->starter == 0)
- wp(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f~C", p);
- else
- wp(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);~C"
- "~Mif (!(among_var)) ~f~N", p);
+ if (!x->amongvar_needed) {
+ writef(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f", p);
+ writef(g, shown_comment ? "~N" : "~C", p);
+ } else {
+ writef(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);", p);
+ writef(g, shown_comment ? "~N" : "~C", p);
+ writef(g, "~Mif (!(among_var)) ~f~N", p);
+ }
}
static void generate_among(struct generator * g, struct node * p) {
struct among * x = p->among;
- int case_number = 1;
if (x->substring == 0) generate_substring(g, p);
- if (x->command_count == 0 && x->starter == 0) return;
- unless (x->starter == 0) generate(g, x->starter);
+ if (x->starter != 0) generate(g, x->starter);
- p = p->left;
- if (p != 0 && p->type != c_literalstring) p = p->right;
- w(g, "~Mswitch(among_var) {~N~+"
- "~Mcase 0: ~f~N");
-
- until (p == 0) {
- if (p->type == c_bra && p->left != 0) {
- g->I[0] = case_number++;
- w(g, "~Mcase ~I0:~N~+"); generate(g, p); w(g, "~Mbreak;~N~-");
- }
- p = p->right;
+ if (x->command_count == 1 && x->nocommand_count == 0) {
+ /* Only one outcome ("no match" already handled). */
+ generate(g, x->commands[0]);
+ } else if (x->command_count > 0) {
+ int i;
+ writef(g, "~Mswitch (among_var) {~C~+", p);
+ for (i = 1; i <= x->command_count; i++) {
+ g->I[0] = i;
+ w(g, "~Mcase ~I0:~N~+");
+ generate(g, x->commands[i - 1]);
+ w(g, "~Mbreak;~N~-");
+ }
+ w(g, "~}");
}
- w(g, "~}");
}
static void generate_booltest(struct generator * g, struct node * p) {
g->V[0] = p->name;
- wp(g, "~Mif (!(~V0)) ~f~C", p);
+ writef(g, "~Mif (!(~V0)) ~f~C", p);
}
static void generate_false(struct generator * g, struct node * p) {
- wp(g, "~M~f~C", p);
+ writef(g, "~M~f~C", p);
}
static void generate_debug(struct generator * g, struct node * p) {
g->I[0] = g->debug_count++;
g->I[1] = p->line_number;
- wp(g, "~Mdebug(z, ~I0, ~I1);~C", p);
+ writef(g, "~Mdebug(z, ~I0, ~I1);~C", p);
}
@@ -1144,10 +1345,9 @@ static void generate(struct generator * g, struct node * p) {
int used = g->label_used;
int a0 = g->failure_label;
- const char * a1 = g->failure_string;
+ int a1 = g->failure_keep_count;
- switch (p->type)
- {
+ switch (p->type) {
case c_define: generate_define(g, p); break;
case c_bra: generate_bra(g, p); break;
case c_and: generate_and(g, p); break;
@@ -1163,7 +1363,7 @@ static void generate(struct generator * g, struct node * p) {
case c_do: generate_do(g, p); break;
case c_goto: generate_GO(g, p, 1); break;
case c_gopast: generate_GO(g, p, 0); break;
- case c_repeat: generate_repeat(g, p, false); break;
+ case c_repeat: generate_repeat(g, p); break;
case c_loop: generate_loop(g, p); break;
case c_atleast: generate_atleast(g, p); break;
case c_setmark: generate_setmark(g, p); break;
@@ -1213,30 +1413,40 @@ static void generate(struct generator * g, struct node * p) {
if (g->failure_label != a0)
g->label_used = used;
g->failure_label = a0;
- g->failure_string = a1;
+ g->failure_keep_count = a1;
}
-static void generate_start_comment(struct generator * g) {
+void write_generated_comment_content(struct generator * g) {
+ w(g, "Generated by Snowball " SNOWBALL_VERSION
+ " - https://snowballstem.org/");
+}
- w(g, "~N/* This file was generated automatically by the Snowball to ANSI C compiler */~N");
+void write_start_comment(struct generator * g,
+ const char * comment_start,
+ const char * comment_end) {
+ write_margin(g);
+ w(g, comment_start);
+ write_generated_comment_content(g);
+ if (comment_end) {
+ w(g, comment_end);
+ }
+ w(g, "~N~N");
}
static void generate_head(struct generator * g) {
- if (g->options->runtime_path == 0) {
- w(g, "~N#include \"header.h\"~N~N");
- } else {
- w(g, "~N#include \"");
- ws(g, g->options->runtime_path);
+ w(g, "#include \"");
+ if (g->options->runtime_path) {
+ write_string(g, g->options->runtime_path);
if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/')
- wch(g, '/');
- w(g, "header.h\"~N~N");
+ write_char(g, '/');
}
+ w(g, "header.h\"~N~N");
}
static void generate_routine_headers(struct generator * g) {
- struct name * q = g->analyser->names;
- until (q == 0) {
+ struct name * q;
+ for (q = g->analyser->names; q; q = q->next) {
g->V[0] = q;
switch (q->type) {
case t_routine:
@@ -1254,7 +1464,6 @@ static void generate_routine_headers(struct generator * g) {
);
break;
}
- q = q->next;
}
}
@@ -1265,12 +1474,11 @@ static void generate_among_table(struct generator * g, struct among * x) {
g->I[0] = x->number;
{
int i;
- for (i = 0; i < x->literalstring_count; i++)
- {
+ for (i = 0; i < x->literalstring_count; i++) {
g->I[1] = i;
g->I[2] = v->size;
g->L[0] = v->b;
- unless (v->size == 0)
+ if (v->size)
w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N");
v++;
}
@@ -1289,12 +1497,21 @@ static void generate_among_table(struct generator * g, struct among * x) {
g->I[4] = v->result;
g->S[0] = i < x->literalstring_count - 1 ? "," : "";
- w(g, "/*~J1 */ { ~I2, ");
- if (v->size == 0) w(g, "0,");
- else w(g, "s_~I0_~I1,");
+ if (g->options->comments) {
+ w(g, "/*~J1 */ ");
+ }
+ w(g, "{ ~I2, ");
+ if (v->size == 0) {
+ w(g, "0,");
+ } else {
+ w(g, "s_~I0_~I1,");
+ }
w(g, " ~I3, ~I4, ");
- if (v->function == 0) w(g, "0"); else
- wvn(g, v->function);
+ if (v->function == 0) {
+ write_char(g, '0');
+ } else {
+ write_varname(g, v->function);
+ }
w(g, "}~S0~N");
v++;
}
@@ -1303,10 +1520,9 @@ static void generate_among_table(struct generator * g, struct among * x) {
}
static void generate_amongs(struct generator * g) {
- struct among * x = g->analyser->amongs;
- until (x == 0) {
+ struct among * x;
+ for (x = g->analyser->amongs; x; x = x->next) {
generate_among_table(g, x);
- x = x->next;
}
}
@@ -1323,24 +1539,22 @@ static void generate_grouping_table(struct generator * g, struct grouping * q) {
for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);
- {
- g->V[0] = q->name;
+ g->V[0] = q->name;
- w(g, "static const unsigned char ~V0[] = { ");
- for (i = 0; i < size; i++) {
- wi(g, map[i]);
- if (i < size - 1) w(g, ", ");
- }
- w(g, " };~N~N");
+ w(g, "static const unsigned char ~V0[] = { ");
+ for (i = 0; i < size; i++) {
+ write_int(g, map[i]);
+ if (i < size - 1) w(g, ", ");
}
+ w(g, " };~N~N");
lose_b(map);
}
static void generate_groupings(struct generator * g) {
- struct grouping * q = g->analyser->groupings;
- until (q == 0) {
- generate_grouping_table(g, q);
- q = q->next;
+ struct grouping * q;
+ for (q = g->analyser->groupings; q; q = q->next) {
+ if (q->name->used)
+ generate_grouping_table(g, q);
}
}
@@ -1348,10 +1562,9 @@ static void generate_create(struct generator * g) {
int * p = g->analyser->name_count;
g->I[0] = p[t_string];
- g->I[1] = p[t_integer];
- g->I[2] = p[t_boolean];
+ g->I[1] = p[t_integer] + p[t_boolean];
w(g, "~N"
- "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1, ~I2); }"
+ "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1); }"
"~N");
}
@@ -1371,36 +1584,47 @@ static void generate_create_and_close_templates(struct generator * g) {
static void generate_header_file(struct generator * g) {
- struct name * q = g->analyser->names;
- char * vp = g->options->variables_prefix;
+ struct name * q;
+ const char * vp = g->options->variables_prefix;
g->S[0] = vp;
- w(g, "~N"
- "#ifdef __cplusplus~N"
+ w(g, "#ifdef __cplusplus~N"
"extern \"C\" {~N"
"#endif~N"); /* for C++ */
generate_create_and_close_templates(g);
- until (q == 0) {
+ for (q = g->analyser->names; q; q = q->next) {
g->V[0] = q;
- switch (q->type)
- {
+ switch (q->type) {
case t_external:
w(g, "extern int ~W0(struct SN_env * z);~N");
break;
- case t_string: g->S[1] = "S"; goto label0;
- case t_integer: g->S[1] = "I"; goto label0;
- case t_boolean: g->S[1] = "B";
- label0:
+ case t_string:
+ case t_integer:
+ case t_boolean:
if (vp) {
- g->I[0] = q->count;
+ int count = q->count;
+ if (count < 0) {
+ /* Unused variables should get removed from `names`. */
+ fprintf(stderr, "Optimised out variable ");
+ report_b(stderr, q->b);
+ fprintf(stderr, " still in names list\n");
+ exit(1);
+ }
+ if (q->type == t_boolean) {
+ /* We use a single array for booleans and integers,
+ * with the integers first.
+ */
+ count += g->analyser->name_count[t_integer];
+ }
+ g->I[0] = count;
+ g->I[1] = "SIIrxg"[q->type];
w(g, "#define ~S0");
- str_append_b(g->outbuf, q->b);
- w(g, " (~S1[~I0])~N");
+ write_b(g, q->b);
+ w(g, " (~c1[~I0])~N");
}
break;
}
- q = q->next;
}
w(g, "~N"
@@ -1414,7 +1638,7 @@ static void generate_header_file(struct generator * g) {
extern void generate_program_c(struct generator * g) {
g->outbuf = str_new();
- generate_start_comment(g);
+ write_start_comment(g, "/* ", " */");
generate_head(g);
generate_routine_headers(g);
w(g, "#ifdef __cplusplus~N"
@@ -1433,33 +1657,69 @@ extern void generate_program_c(struct generator * g) {
g->literalstring_count = 0;
{
struct node * p = g->analyser->program;
- until (p == 0) { generate(g, p); p = p->right; }
+ while (p) { generate(g, p); p = p->right; }
}
generate_create(g);
generate_close(g);
- output_str(g->options->output_c, g->declarations);
+ output_str(g->options->output_src, g->declarations);
str_delete(g->declarations);
- output_str(g->options->output_c, g->outbuf);
+ output_str(g->options->output_src, g->outbuf);
str_clear(g->outbuf);
- generate_start_comment(g);
+ write_start_comment(g, "/* ", " */");
generate_header_file(g);
output_str(g->options->output_h, g->outbuf);
str_delete(g->outbuf);
}
-extern struct generator * create_generator_c(struct analyser * a, struct options * o) {
+/* Generator functions common to multiple languages. */
+
+extern struct generator * create_generator(struct analyser * a, struct options * o) {
NEW(generator, g);
g->analyser = a;
g->options = o;
g->margin = 0;
g->debug_count = 0;
+ g->copy_from_count = 0;
g->line_count = 0;
+ g->line_labelled = 0;
+ g->failure_label = -1;
+ g->unreachable = false;
+#ifndef DISABLE_PYTHON
+ g->max_label = 0;
+#endif
return g;
}
-extern void close_generator_c(struct generator * g) {
-
+extern void close_generator(struct generator * g) {
FREE(g);
}
+/* Write routines for simple entities */
+
+extern void write_char(struct generator * g, int ch) {
+ str_append_ch(g->outbuf, ch); /* character */
+}
+
+extern void write_newline(struct generator * g) {
+ str_append_ch(g->outbuf, '\n'); /* newline */
+ g->line_count++;
+}
+
+extern void write_string(struct generator * g, const char * s) {
+ str_append_string(g->outbuf, s);
+}
+
+extern void write_int(struct generator * g, int i) {
+ str_append_int(g->outbuf, i);
+}
+
+extern void write_b(struct generator * g, symbol * b) {
+
+ str_append_b(g->outbuf, b);
+}
+
+extern void write_str(struct generator * g, struct str * str) {
+
+ str_append(g->outbuf, str);
+}
diff --git a/contrib/snowball/compiler/generator_java.c b/contrib/snowball/compiler/generator_java.c
deleted file mode 100644
index 07a4b8e2e..000000000
--- a/contrib/snowball/compiler/generator_java.c
+++ /dev/null
@@ -1,1452 +0,0 @@
-
-#include <stdlib.h> /* for exit */
-#include <string.h> /* for strlen */
-#include <stdio.h> /* for fprintf etc */
-#include "header.h"
-
-/* prototypes */
-
-static void generate(struct generator * g, struct node * p);
-static void w(struct generator * g, const char * s);
-static void writef(struct generator * g, const char * s, struct node * p);
-
-
-enum special_labels {
- x_return = -1
-};
-
-static int new_label(struct generator * g) {
-
- return g->next_label++;
-}
-
-static struct str * vars_newname(struct generator * g) {
-
- struct str * output;
- g->var_number ++;
- output = str_new();
- str_append_string(output, "v_");
- str_append_int(output, g->var_number);
- return output;
-}
-
-/* Output routines */
-static void output_str(FILE * outfile, struct str * str) {
-
- char * s = b_to_s(str_data(str));
- fprintf(outfile, "%s", s);
- free(s);
-}
-
-/* Write routines for simple entities */
-
-static void write_char(struct generator * g, int ch) {
-
- str_append_ch(g->outbuf, ch);
-}
-
-static void write_newline(struct generator * g) {
-
- str_append_string(g->outbuf, "\n");
-}
-
-static void write_string(struct generator * g, const char * s) {
-
- str_append_string(g->outbuf, s);
-}
-
-static void write_b(struct generator * g, symbol * b) {
-
- str_append_b(g->outbuf, b);
-}
-
-static void write_str(struct generator * g, struct str * str) {
-
- str_append(g->outbuf, str);
-}
-
-static void write_int(struct generator * g, int i) {
-
- str_append_int(g->outbuf, i);
-}
-
-
-/* Write routines for items from the syntax tree */
-
-static void write_varname(struct generator * g, struct name * p) {
-
- int ch = "SBIrxg"[p->type];
- if (p->type != t_external)
- {
- write_char(g, ch);
- write_char(g, '_');
- }
- str_append_b(g->outbuf, p->b);
-}
-
-static void write_varref(struct generator * g, struct name * p) {
-
- /* In java, references look just the same */
- write_varname(g, p);
-}
-
-static void write_hexdigit(struct generator * g, int n) {
-
- write_char(g, n < 10 ? n + '0' : n - 10 + 'A');
-}
-
-static void write_hex(struct generator * g, int ch) {
-
- write_string(g, "\\u");
- {
- int i;
- for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf);
- }
-}
-
-static void write_literal_string(struct generator * g, symbol * p) {
-
- int i;
- write_string(g, "\"");
- for (i = 0; i < SIZE(p); i++) {
- int ch = p[i];
- if (32 <= ch && ch <= 127) {
- if (ch == '\"' || ch == '\\') write_string(g, "\\");
- write_char(g, ch);
- } else {
- write_hex(g, ch);
- }
- }
- write_string(g, "\"");
-}
-
-static void write_margin(struct generator * g) {
-
- int i;
- for (i = 0; i < g->margin; i++) write_string(g, " ");
-}
-
-/* Write a variable declaration. */
-static void write_declare(struct generator * g,
- char * declaration,
- struct node * p) {
-
- struct str * temp = g->outbuf;
- g->outbuf = g->declarations;
- write_string(g, " ");
- writef(g, declaration, p);
- write_string(g, ";");
- write_newline(g);
- g->outbuf = temp;
-}
-
-static void write_comment(struct generator * g, struct node * p) {
-
- write_margin(g);
- write_string(g, "// ");
- write_string(g, name_of_token(p->type));
- if (p->name != 0) {
- write_string(g, " ");
- str_append_b(g->outbuf, p->name->b);
- }
- write_string(g, ", line ");
- write_int(g, p->line_number);
- write_newline(g);
-}
-
-static void write_block_start(struct generator * g) {
-
- w(g, "~M{~+~N");
-}
-
-static void write_block_end(struct generator * g) /* block end */ {
-
- w(g, "~-~M}~N");
-}
-
-static void write_savecursor(struct generator * g, struct node * p,
- struct str * savevar) {
-
- g->B[0] = str_data(savevar);
- g->S[1] = "";
- if (p->mode != m_forward) g->S[1] = "limit - ";
- write_declare(g, "int ~B0", p);
- writef(g, "~M~B0 = ~S1cursor;~N" , p);
-}
-
-static void restore_string(struct node * p, struct str * out, struct str * savevar) {
-
- str_clear(out);
- str_append_string(out, "cursor = ");
- if (p->mode != m_forward) str_append_string(out, "limit - ");
- str_append(out, savevar);
- str_append_string(out, ";");
-}
-
-static void write_restorecursor(struct generator * g, struct node * p,
- struct str * savevar) {
-
- struct str * temp = str_new();
- write_margin(g);
- restore_string(p, temp, savevar);
- write_str(g, temp);
- write_newline(g);
- str_delete(temp);
-}
-
-static void write_inc_cursor(struct generator * g, struct node * p) {
-
- write_margin(g);
- write_string(g, p->mode == m_forward ? "cursor++;" : "cursor--;");
- write_newline(g);
-}
-
-static void wsetlab_begin(struct generator * g, int n) {
-
- w(g, "~Mlab");
- write_int(g, n);
- w(g, ": do {~+~N");
-}
-
-static void wsetlab_end(struct generator * g) {
-
- w(g, "~-~M} while (false);~N");
-}
-
-static void wgotol(struct generator * g, int n) {
-
- write_margin(g);
- write_string(g, "break lab");
- write_int(g, n);
- write_string(g, ";");
- write_newline(g);
-}
-
-static void write_failure(struct generator * g) {
-
- if (str_len(g->failure_str) != 0) {
- write_margin(g);
- write_str(g, g->failure_str);
- write_newline(g);
- }
- write_margin(g);
- switch (g->failure_label)
- {
- case x_return:
- write_string(g, "return false;");
- break;
- default:
- write_string(g, "break lab");
- write_int(g, g->failure_label);
- write_string(g, ";");
- g->unreachable = true;
- }
- write_newline(g);
-}
-
-static void write_failure_if(struct generator * g, char * s, struct node * p) {
-
- writef(g, "~Mif (", p);
- writef(g, s, p);
- writef(g, ")~N", p);
- write_block_start(g);
- write_failure(g);
- write_block_end(g);
- g->unreachable = false;
-}
-
-/* if at limit fail */
-static void write_check_limit(struct generator * g, struct node * p) {
-
- if (p->mode == m_forward) {
- write_failure_if(g, "cursor >= limit", p);
- } else {
- write_failure_if(g, "cursor <= limit_backward", p);
- }
-}
-
-/* Formatted write. */
-static void writef(struct generator * g, const char * input, struct node * p) {
-
- int i = 0;
- int l = strlen(input);
-
- while (i < l) {
- int ch = input[i++];
- if (ch == '~') {
- switch(input[i++]) {
- default: write_char(g, input[i - 1]); continue;
- case 'C': write_comment(g, p); continue;
- case 'f': write_block_start(g);
- write_failure(g);
- g->unreachable = false;
- write_block_end(g);
- continue;
- case 'M': write_margin(g); continue;
- case 'N': write_newline(g); continue;
- case '{': write_block_start(g); continue;
- case '}': write_block_end(g); continue;
- case 'S': write_string(g, g->S[input[i++] - '0']); continue;
- case 'B': write_b(g, g->B[input[i++] - '0']); continue;
- case 'I': write_int(g, g->I[input[i++] - '0']); continue;
- case 'V': write_varref(g, g->V[input[i++] - '0']); continue;
- case 'W': write_varname(g, g->V[input[i++] - '0']); continue;
- case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue;
- case '+': g->margin++; continue;
- case '-': g->margin--; continue;
- case 'n': write_string(g, g->options->name); continue;
- }
- } else {
- write_char(g, ch);
- }
- }
-}
-
-static void w(struct generator * g, const char * s) {
- writef(g, s, 0);
-}
-
-static void generate_AE(struct generator * g, struct node * p) {
- char * s;
- switch (p->type) {
- case c_name:
- write_varref(g, p->name); break;
- case c_number:
- write_int(g, p->number); break;
- case c_maxint:
- write_string(g, "MAXINT"); break;
- case c_minint:
- write_string(g, "MININT"); break;
- case c_neg:
- write_string(g, "-"); generate_AE(g, p->right); break;
- case c_multiply:
- s = " * "; goto label0;
- case c_plus:
- s = " + "; goto label0;
- case c_minus:
- s = " - "; goto label0;
- case c_divide:
- s = " / ";
- label0:
- write_string(g, "("); generate_AE(g, p->left);
- write_string(g, s); generate_AE(g, p->right); write_string(g, ")"); break;
- case c_sizeof:
- g->V[0] = p->name;
- w(g, "(~V0.length())"); break;
- case c_cursor:
- w(g, "cursor"); break;
- case c_limit:
- w(g, p->mode == m_forward ? "limit" : "limit_backward"); break;
- case c_size:
- w(g, "(current.length())"); break;
- }
-}
-
-/* K_needed() tests to see if we really need to keep c. Not true when the
- the command does not touch the cursor. This and repeat_score() could be
- elaborated almost indefinitely.
-*/
-
-static int K_needed(struct generator * g, struct node * p) {
-
- while (p != 0) {
- switch (p->type) {
- case c_dollar:
- case c_leftslice:
- case c_rightslice:
- case c_mathassign:
- case c_plusassign:
- case c_minusassign:
- case c_multiplyassign:
- case c_divideassign:
- case c_eq:
- case c_ne:
- case c_gr:
- case c_ge:
- case c_ls:
- case c_le:
- case c_sliceto:
- case c_booltest:
- case c_true:
- case c_false:
- case c_debug:
- break;
-
- case c_call:
- if (K_needed(g, p->name->definition)) return true;
- break;
-
- case c_bra:
- if (K_needed(g, p->left)) return true;
- break;
-
- default: return true;
- }
- p = p->right;
- }
- return false;
-}
-
-static int repeat_score(struct generator * g, struct node * p) {
-
- int score = 0;
- while (p != 0) {
- switch (p->type) {
- case c_dollar:
- case c_leftslice:
- case c_rightslice:
- case c_mathassign:
- case c_plusassign:
- case c_minusassign:
- case c_multiplyassign:
- case c_divideassign:
- case c_eq:
- case c_ne:
- case c_gr:
- case c_ge:
- case c_ls:
- case c_le:
- case c_sliceto: /* case c_not: must not be included here! */
- case c_debug:
- break;
-
- case c_call:
- score += repeat_score(g, p->name->definition);
- break;
-
- case c_bra:
- score += repeat_score(g, p->left);
- break;
-
- case c_name:
- case c_literalstring:
- case c_next:
- case c_grouping:
- case c_non:
- case c_hop:
- score = score + 1;
- break;
-
- default:
- score = 2;
- break;
- }
- p = p->right;
- }
- return score;
-}
-
-/* tests if an expression requires cursor reinstatement in a repeat */
-
-static int repeat_restore(struct generator * g, struct node * p) {
-
- return repeat_score(g, p) >= 2;
-}
-
-static void generate_bra(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- p = p->left;
- while (p != 0) {
- generate(g, p);
- p = p->right;
- }
-}
-
-static void generate_and(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = K_needed(g, p->left);
-
- write_comment(g, p);
-
- if (keep_c) write_savecursor(g, p, savevar);
-
- p = p->left;
- while (p != 0) {
- generate(g, p);
- if (g->unreachable) break;
- if (keep_c && p->right != 0) write_restorecursor(g, p, savevar);
- p = p->right;
- }
- str_delete(savevar);
-}
-
-static void generate_or(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = K_needed(g, p->left);
-
- int a0 = g->failure_label;
- struct str * a1 = str_copy(g->failure_str);
-
- int out_lab = new_label(g);
- write_comment(g, p);
- wsetlab_begin(g, out_lab);
-
- if (keep_c) write_savecursor(g, p, savevar);
-
- p = p->left;
- str_clear(g->failure_str);
-
- if (p == 0) {
- /* p should never be 0 after an or: there should be at least two
- * sub nodes. */
- fprintf(stderr, "Error: \"or\" node without children nodes.");
- exit (1);
- }
- while (p->right != 0) {
- g->failure_label = new_label(g);
- wsetlab_begin(g, g->failure_label);
- generate(g, p);
- if (!g->unreachable) wgotol(g, out_lab);
- wsetlab_end(g);
- g->unreachable = false;
- if (keep_c) write_restorecursor(g, p, savevar);
- p = p->right;
- }
-
- g->failure_label = a0;
- str_delete(g->failure_str);
- g->failure_str = a1;
-
- generate(g, p);
- wsetlab_end(g);
- str_delete(savevar);
-}
-
-static void generate_backwards(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- writef(g,"~Mlimit_backward = cursor; cursor = limit;~N", p);
- generate(g, p->left);
- w(g, "~Mcursor = limit_backward;");
-}
-
-
-static void generate_not(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = K_needed(g, p->left);
-
- int a0 = g->failure_label;
- struct str * a1 = str_copy(g->failure_str);
-
- write_comment(g, p);
- if (keep_c) {
- write_block_start(g);
- write_savecursor(g, p, savevar);
- }
-
- g->failure_label = new_label(g);
- str_clear(g->failure_str);
-
- wsetlab_begin(g, g->failure_label);
-
- generate(g, p->left);
-
- g->failure_label = a0;
- str_delete(g->failure_str);
- g->failure_str = a1;
-
- if (!g->unreachable) write_failure(g);
-
- wsetlab_end(g);
- g->unreachable = false;
-
- if (keep_c) write_restorecursor(g, p, savevar);
- if (keep_c) write_block_end(g);
- str_delete(savevar);
-}
-
-
-static void generate_try(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = K_needed(g, p->left);
-
- write_comment(g, p);
- if (keep_c) write_savecursor(g, p, savevar);
-
- g->failure_label = new_label(g);
- if (keep_c) restore_string(p, g->failure_str, savevar);
-
- wsetlab_begin(g, g->failure_label);
- generate(g, p->left);
- wsetlab_end(g);
- g->unreachable = false;
-
- str_delete(savevar);
-}
-
-static void generate_set(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- writef(g, "~M~V0 = true;~N", p);
-}
-
-static void generate_unset(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- writef(g, "~M~V0 = false;~N", p);
-}
-
-static void generate_fail(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- generate(g, p->left);
- if (!g->unreachable) write_failure(g);
-}
-
-/* generate_test() also implements 'reverse' */
-
-static void generate_test(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = K_needed(g, p->left);
-
- write_comment(g, p);
-
- if (keep_c) {
- write_savecursor(g, p, savevar);
- }
-
- generate(g, p->left);
-
- if (!g->unreachable) {
- if (keep_c) {
- write_restorecursor(g, p, savevar);
- }
- }
- str_delete(savevar);
-}
-
-static void generate_do(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = K_needed(g, p->left);
- write_comment(g, p);
- if (keep_c) write_savecursor(g, p, savevar);
-
- g->failure_label = new_label(g);
- str_clear(g->failure_str);
-
- wsetlab_begin(g, g->failure_label);
- generate(g, p->left);
- wsetlab_end(g);
- g->unreachable = false;
-
- if (keep_c) write_restorecursor(g, p, savevar);
- str_delete(savevar);
-}
-
-static void generate_GO(struct generator * g, struct node * p, int style) {
-
- int end_unreachable = false;
- struct str * savevar = vars_newname(g);
- int keep_c = style == 1 || repeat_restore(g, p->left);
-
- int a0 = g->failure_label;
- struct str * a1 = str_copy(g->failure_str);
-
- int golab = new_label(g);
- g->I[0] = golab;
- write_comment(g, p);
- w(g, "~Mgolab~I0: while(true)~N");
- w(g, "~{");
-
- if (keep_c) write_savecursor(g, p, savevar);
-
- g->failure_label = new_label(g);
- wsetlab_begin(g, g->failure_label);
- generate(g, p->left);
-
- if (g->unreachable) {
- /* Cannot break out of this loop: therefore the code after the
- * end of the loop is unreachable.*/
- end_unreachable = true;
- } else {
- /* include for goto; omit for gopast */
- if (style == 1) write_restorecursor(g, p, savevar);
- g->I[0] = golab;
- w(g, "~Mbreak golab~I0;~N");
- }
- g->unreachable = false;
- wsetlab_end(g);
- if (keep_c) write_restorecursor(g, p, savevar);
-
- g->failure_label = a0;
- str_delete(g->failure_str);
- g->failure_str = a1;
-
- write_check_limit(g, p);
- write_inc_cursor(g, p);
- write_block_end(g);
- str_delete(savevar);
- g->unreachable = end_unreachable;
-}
-
-static void generate_loop(struct generator * g, struct node * p) {
-
- struct str * loopvar = vars_newname(g);
- write_comment(g, p);
- g->B[0] = str_data(loopvar);
- write_declare(g, "int ~B0", p);
- w(g, "~Mfor (~B0 = ");
- generate_AE(g, p->AE);
- g->B[0] = str_data(loopvar);
- writef(g, "; ~B0 > 0; ~B0--)~N", p);
- writef(g, "~{", p);
-
- generate(g, p->left);
-
- w(g, "~}");
- str_delete(loopvar);
- g->unreachable = false;
-}
-
-static void generate_repeat(struct generator * g, struct node * p, struct str * loopvar) {
-
- struct str * savevar = vars_newname(g);
- int keep_c = repeat_restore(g, p->left);
- int replab = new_label(g);
- g->I[0] = replab;
- write_comment(g, p);
- writef(g, "~Mreplab~I0: while(true)~N~{", p);
-
- if (keep_c) write_savecursor(g, p, savevar);
-
- g->failure_label = new_label(g);
- str_clear(g->failure_str);
- wsetlab_begin(g, g->failure_label);
- generate(g, p->left);
-
- if (!g->unreachable) {
- if (loopvar != 0) {
- g->B[0] = str_data(loopvar);
- w(g, "~M~B0--;~N");
- }
-
- g->I[0] = replab;
- w(g, "~Mcontinue replab~I0;~N");
- }
-
- wsetlab_end(g);
- g->unreachable = false;
-
- if (keep_c) write_restorecursor(g, p, savevar);
-
- g->I[0] = replab;
- w(g, "~Mbreak replab~I0;~N~}");
- str_delete(savevar);
-}
-
-static void generate_atleast(struct generator * g, struct node * p) {
-
- struct str * loopvar = vars_newname(g);
- write_comment(g, p);
- w(g, "~{");
- g->B[0] = str_data(loopvar);
- w(g, "~Mint ~B0 = ");
- generate_AE(g, p->AE);
- w(g, ";~N");
- {
- int a0 = g->failure_label;
- struct str * a1 = str_copy(g->failure_str);
-
- generate_repeat(g, p, loopvar);
-
- g->failure_label = a0;
- str_delete(g->failure_str);
- g->failure_str = a1;
- }
- g->B[0] = str_data(loopvar);
- write_failure_if(g, "~B0 > 0", p);
- w(g, "~}");
- str_delete(loopvar);
-}
-
-static void generate_setmark(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- writef(g, "~M~V0 = cursor;~N", p);
-}
-
-static void generate_tomark(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? ">" : "<";
-
- w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N");
- write_block_start(g);
- write_failure(g);
- write_block_end(g);
- g->unreachable = false;
- w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p);
-}
-
-static void generate_atmark(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p);
- write_block_start(g);
- write_failure(g);
- write_block_end(g);
- g->unreachable = false;
-}
-
-
-static void generate_hop(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "+" : "-";
-
- w(g, "~{~Mint c = cursor ~S0 ");
- generate_AE(g, p->AE);
- w(g, ";~N");
-
- g->S[0] = p->mode == m_forward ? "0" : "limit_backward";
-
- write_failure_if(g, "~S0 > c || c > limit", p);
- writef(g, "~Mcursor = c;~N", p);
- writef(g, "~}", p);
-}
-
-static void generate_delete(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- writef(g, "~Mslice_del();~N", p);
-}
-
-
-static void generate_next(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- write_check_limit(g, p);
- write_inc_cursor(g, p);
-}
-
-static void generate_tolimit(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "limit" : "limit_backward";
- writef(g, "~Mcursor = ~S0;~N", p);
-}
-
-static void generate_atlimit(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "limit" : "limit_backward";
- g->S[1] = p->mode == m_forward ? "<" : ">";
- write_failure_if(g, "cursor ~S1 ~S0", p);
-}
-
-static void generate_leftslice(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "bra" : "ket";
- writef(g, "~M~S0 = cursor;~N", p);
-}
-
-static void generate_rightslice(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "ket" : "bra";
- writef(g, "~M~S0 = cursor;~N", p);
-}
-
-static void generate_assignto(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- writef(g, "~M~V0 = assign_to(~V0);~N", p);
-}
-
-static void generate_sliceto(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- writef(g, "~M~V0 = slice_to(~V0);~N", p);
-}
-
-static void generate_address(struct generator * g, struct node * p) {
-
- symbol * b = p->literalstring;
- if (b != 0) {
- write_literal_string(g, b);
- } else {
- write_varref(g, p->name);
- }
-}
-
-static void generate_insert(struct generator * g, struct node * p, int style) {
-
- int keep_c = style == c_attach;
- write_comment(g, p);
- if (p->mode == m_backward) keep_c = !keep_c;
- if (keep_c) w(g, "~{~Mint c = cursor;~N");
- writef(g, "~Minsert(cursor, cursor, ", p);
- generate_address(g, p);
- writef(g, ");~N", p);
- if (keep_c) w(g, "~Mcursor = c;~N~}");
-}
-
-static void generate_assignfrom(struct generator * g, struct node * p) {
-
- int keep_c = p->mode == m_forward; /* like 'attach' */
-
- write_comment(g, p);
- if (keep_c) writef(g, "~{~Mint c = cursor;~N", p);
- if (p->mode == m_forward) {
- writef(g, "~Minsert(cursor, limit, ", p);
- } else {
- writef(g, "~Minsert(limit_backward, cursor, ", p);
- }
- generate_address(g, p);
- writef(g, ");~N", p);
- if (keep_c) w(g, "~Mcursor = c;~N~}");
-}
-
-
-static void generate_slicefrom(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- w(g, "~Mslice_from(");
- generate_address(g, p);
- writef(g, ");~N", p);
-}
-
-static void generate_setlimit(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- struct str * varname = vars_newname(g);
- write_comment(g, p);
- write_savecursor(g, p, savevar);
- generate(g, p->left);
-
- if (!g->unreachable) {
- g->B[0] = str_data(varname);
- write_declare(g, "int ~B0", p);
- if (p->mode == m_forward) {
- w(g, "~M~B0 = limit - cursor;~N");
- w(g, "~Mlimit = cursor;~N");
- } else {
- w(g, "~M~B0 = limit_backward;~N");
- w(g, "~Mlimit_backward = cursor;~N");
- }
- write_restorecursor(g, p, savevar);
-
- if (p->mode == m_forward) {
- str_assign(g->failure_str, "limit += ");
- str_append(g->failure_str, varname);
- str_append_ch(g->failure_str, ';');
- } else {
- str_assign(g->failure_str, "limit_backward = ");
- str_append(g->failure_str, varname);
- str_append_ch(g->failure_str, ';');
- }
- generate(g, p->aux);
-
- if (!g->unreachable) {
- write_margin(g);
- write_str(g, g->failure_str);
- write_newline(g);
- }
- }
- str_delete(varname);
- str_delete(savevar);
-}
-
-/* dollar sets snowball up to operate on a string variable as if it were the
- * current string */
-static void generate_dollar(struct generator * g, struct node * p) {
-
- struct str * savevar = vars_newname(g);
- write_comment(g, p);
- g->V[0] = p->name;
-
- str_assign(g->failure_str, "copy_from(");
- str_append(g->failure_str, savevar);
- str_append_string(g->failure_str, ");");
- g->B[0] = str_data(savevar);
- writef(g, "~{~M~n ~B0 = this;~N"
- "~Mcurrent = new StringBuffer(~V0.toString());~N"
- "~Mcursor = 0;~N"
- "~Mlimit = (current.length());~N", p);
- generate(g, p->left);
- if (!g->unreachable) {
- write_margin(g);
- write_str(g, g->failure_str);
- write_newline(g);
- }
- w(g, "~}");
- str_delete(savevar);
-}
-
-static void generate_integer_assign(struct generator * g, struct node * p, char * s) {
-
- g->V[0] = p->name;
- g->S[0] = s;
- w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N");
-}
-
-static void generate_integer_test(struct generator * g, struct node * p, char * s) {
-
- g->V[0] = p->name;
- g->S[0] = s;
- w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); w(g, "))~N");
- write_block_start(g);
- write_failure(g);
- write_block_end(g);
- g->unreachable = false;
-}
-
-static void generate_call(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- write_failure_if(g, "!~V0()", p);
-}
-
-static void generate_grouping(struct generator * g, struct node * p, int complement) {
-
- struct grouping * q = p->name->grouping;
- g->S[0] = p->mode == m_forward ? "" : "_b";
- g->S[1] = complement ? "out" : "in";
- g->V[0] = p->name;
- g->I[0] = q->smallest_ch;
- g->I[1] = q->largest_ch;
- if (q->no_gaps)
- write_failure_if(g, "!(~S1_range~S0(~I0, ~I1))", p);
- else
- write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p);
-}
-
-static void generate_namedstring(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "" : "_b";
- g->V[0] = p->name;
- write_failure_if(g, "!(eq_v~S0(~V0))", p);
-}
-
-static void generate_literalstring(struct generator * g, struct node * p) {
-
- symbol * b = p->literalstring;
- write_comment(g, p);
- g->S[0] = p->mode == m_forward ? "" : "_b";
- g->I[0] = SIZE(b);
- g->L[0] = b;
- write_failure_if(g, "!(eq_s~S0(~I0, ~L0))", p);
-}
-
-static void generate_define(struct generator * g, struct node * p) {
-
- struct name * q = p->name;
-
- struct str * saved_output = g->outbuf;
- struct str * saved_declarations = g->declarations;
-
- g->S[0] = q->type == t_routine ? "private" : "public";
- g->V[0] = q;
- w(g, "~+~+~N~M~S0 boolean ~V0() {~+~N");
-
- g->outbuf = str_new();
- g->declarations = str_new();
-
- g->next_label = 0;
- g->var_number = 0;
-
- if (p->amongvar_needed) write_declare(g, "int among_var", p);
- str_clear(g->failure_str);
- g->failure_label = x_return;
- g->unreachable = false;
- generate(g, p->left);
- if (!g->unreachable) w(g, "~Mreturn true;~N");
- w(g, "~}~-~-");
-
- str_append(saved_output, g->declarations);
- str_append(saved_output, g->outbuf);
- str_delete(g->declarations);
- str_delete(g->outbuf);
- g->declarations = saved_declarations;
- g->outbuf = saved_output;
-}
-
-static void generate_substring(struct generator * g, struct node * p) {
-
- struct among * x = p->among;
-
- write_comment(g, p);
-
- g->S[0] = p->mode == m_forward ? "" : "_b";
- g->I[0] = x->number;
- g->I[1] = x->literalstring_count;
-
- if (x->command_count == 0 && x->starter == 0) {
- write_failure_if(g, "find_among~S0(a_~I0, ~I1) == 0", p);
- } else {
- writef(g, "~Mamong_var = find_among~S0(a_~I0, ~I1);~N", p);
- write_failure_if(g, "among_var == 0", p);
- }
-}
-
-static void generate_among(struct generator * g, struct node * p) {
-
- struct among * x = p->among;
- int case_number = 1;
-
- if (x->substring == 0) generate_substring(g, p);
- if (x->command_count == 0 && x->starter == 0) return;
-
- if (x->starter != 0) generate(g, x->starter);
-
- p = p->left;
- if (p != 0 && p->type != c_literalstring) p = p->right;
- w(g, "~Mswitch(among_var) {~N~+");
- w(g, "~Mcase 0:~N~+");
- write_failure(g);
- g->unreachable = false;
- w(g, "~-");
-
- while (p != 0) {
- if (p->type == c_bra && p->left != 0) {
- g->I[0] = case_number++;
- w(g, "~Mcase ~I0:~N~+");
- generate(g, p);
- if (!g->unreachable) w(g, "~Mbreak;~N");
- w(g, "~-");
- g->unreachable = false;
- }
- p = p->right;
- }
- write_block_end(g);
-}
-
-static void generate_booltest(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->V[0] = p->name;
- write_failure_if(g, "!(~V0)", p);
-}
-
-static void generate_false(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- write_failure(g);
-}
-
-static void generate_debug(struct generator * g, struct node * p) {
-
- write_comment(g, p);
- g->I[0] = g->debug_count++;
- g->I[1] = p->line_number;
- writef(g, "~Mdebug(~I0, ~I1);~N", p);
-}
-
-static void generate(struct generator * g, struct node * p) {
-
- int a0;
- struct str * a1;
-
- if (g->unreachable) return;
-
- a0 = g->failure_label;
- a1 = str_copy(g->failure_str);
-
- switch (p->type)
- {
- case c_define: generate_define(g, p); break;
- case c_bra: generate_bra(g, p); break;
- case c_and: generate_and(g, p); break;
- case c_or: generate_or(g, p); break;
- case c_backwards: generate_backwards(g, p); break;
- case c_not: generate_not(g, p); break;
- case c_set: generate_set(g, p); break;
- case c_unset: generate_unset(g, p); break;
- case c_try: generate_try(g, p); break;
- case c_fail: generate_fail(g, p); break;
- case c_reverse:
- case c_test: generate_test(g, p); break;
- case c_do: generate_do(g, p); break;
- case c_goto: generate_GO(g, p, 1); break;
- case c_gopast: generate_GO(g, p, 0); break;
- case c_repeat: generate_repeat(g, p, 0); break;
- case c_loop: generate_loop(g, p); break;
- case c_atleast: generate_atleast(g, p); break;
- case c_setmark: generate_setmark(g, p); break;
- case c_tomark: generate_tomark(g, p); break;
- case c_atmark: generate_atmark(g, p); break;
- case c_hop: generate_hop(g, p); break;
- case c_delete: generate_delete(g, p); break;
- case c_next: generate_next(g, p); break;
- case c_tolimit: generate_tolimit(g, p); break;
- case c_atlimit: generate_atlimit(g, p); break;
- case c_leftslice: generate_leftslice(g, p); break;
- case c_rightslice: generate_rightslice(g, p); break;
- case c_assignto: generate_assignto(g, p); break;
- case c_sliceto: generate_sliceto(g, p); break;
- case c_assign: generate_assignfrom(g, p); break;
- case c_insert:
- case c_attach: generate_insert(g, p, p->type); break;
- case c_slicefrom: generate_slicefrom(g, p); break;
- case c_setlimit: generate_setlimit(g, p); break;
- case c_dollar: generate_dollar(g, p); break;
- case c_mathassign: generate_integer_assign(g, p, "="); break;
- case c_plusassign: generate_integer_assign(g, p, "+="); break;
- case c_minusassign: generate_integer_assign(g, p, "-="); break;
- case c_multiplyassign:generate_integer_assign(g, p, "*="); break;
- case c_divideassign: generate_integer_assign(g, p, "/="); break;
- case c_eq: generate_integer_test(g, p, "=="); break;
- case c_ne: generate_integer_test(g, p, "!="); break;
- case c_gr: generate_integer_test(g, p, ">"); break;
- case c_ge: generate_integer_test(g, p, ">="); break;
- case c_ls: generate_integer_test(g, p, "<"); break;
- case c_le: generate_integer_test(g, p, "<="); break;
- case c_call: generate_call(g, p); break;
- case c_grouping: generate_grouping(g, p, false); break;
- case c_non: generate_grouping(g, p, true); break;
- case c_name: generate_namedstring(g, p); break;
- case c_literalstring: generate_literalstring(g, p); break;
- case c_among: generate_among(g, p); break;
- case c_substring: generate_substring(g, p); break;
- case c_booltest: generate_booltest(g, p); break;
- case c_false: generate_false(g, p); break;
- case c_true: break;
- case c_debug: generate_debug(g, p); break;
- default: fprintf(stderr, "%d encountered\n", p->type);
- exit(1);
- }
-
- g->failure_label = a0;
- str_delete(g->failure_str);
- g->failure_str = a1;
-}
-
-static void generate_start_comment(struct generator * g) {
-
- w(g, "// This file was generated automatically by the Snowball to Java compiler~N");
- w(g, "~N");
-}
-
-static void generate_class_begin(struct generator * g) {
-
- w(g, "package " );
- w(g, g->options->package);
- w(g, ";~N~N" );
-
- w(g, "import ");
- w(g, g->options->among_class );
- w(g, ";~N"
- "~N"
- " /**~N"
- " * This class was automatically generated by a Snowball to Java compiler ~N"
- " * It implements the stemming algorithm defined by a snowball script.~N"
- " */~N"
- "~N"
- "public class ~n extends ");
-
- w(g, g->options->parent_class_name);
- w(g, " {~N"
- "~N"
- "private static final long serialVersionUID = 1L;~N"
- "~N"
- "~+~+~Mprivate final static ~n methodObject = new ~n ();~N"
- "~N");
-}
-
-static void generate_class_end(struct generator * g) {
-
- w(g, "~N}");
- w(g, "~N~N");
-}
-
-static void generate_equals(struct generator * g) {
-
- w(g, "~N"
- "~Mpublic boolean equals( Object o ) {~N"
- "~+~Mreturn o instanceof ");
- w(g, g->options->name);
- w(g, ";~N~-~M}~N"
- "~N"
- "~Mpublic int hashCode() {~N"
- "~+~Mreturn ");
- w(g, g->options->name);
- w(g, ".class.getName().hashCode();~N"
- "~-~M}~N");
- w(g, "~N~N");
-}
-
-static void generate_among_table(struct generator * g, struct among * x) {
-
- struct amongvec * v = x->b;
-
- g->I[0] = x->number;
- g->I[1] = x->literalstring_count;
-
- w(g, "~+~+~Mprivate final static Among a_~I0[] = {~N~+");
- {
- int i;
- for (i = 0; i < x->literalstring_count; i++) {
- g->I[0] = i;
- g->I[1] = v->i;
- g->I[2] = v->result;
- g->L[0] = v->b;
- g->S[0] = i < x->literalstring_count - 1 ? "," : "";
-
- w(g, "~Mnew Among ( ~L0, ~I1, ~I2, \"");
- if (v->function != 0) {
- write_varname(g, v->function);
- }
- w(g, "\", methodObject )~S0~N");
- v++;
- }
- }
- w(g, "~-~M};~-~-~N~N");
-}
-
-static void generate_amongs(struct generator * g) {
-
- struct among * x = g->analyser->amongs;
- while (x != 0) {
- generate_among_table(g, x);
- x = x->next;
- }
-}
-
-static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; }
-
-static int bit_is_set(symbol * b, int i) { return b[i/8] & 1 << i%8; }
-
-static void generate_grouping_table(struct generator * g, struct grouping * q) {
-
- int range = q->largest_ch - q->smallest_ch + 1;
- int size = (range + 7)/ 8; /* assume 8 bits per symbol */
- symbol * b = q->b;
- symbol * map = create_b(size);
- int i;
- for (i = 0; i < size; i++) map[i] = 0;
-
- /* Using unicode would require revision here */
-
- for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);
-
- q->no_gaps = true;
- for (i = 0; i < range; i++) unless (bit_is_set(map, i)) q->no_gaps = false;
-
- unless (q->no_gaps) {
- g->V[0] = q->name;
-
- w(g, "~+~+~Mprivate static final char ~V0[] = {");
- for (i = 0; i < size; i++) {
- write_int(g, map[i]);
- if (i < size - 1) w(g, ", ");
- }
- w(g, " };~N~-~-~N");
- }
- lose_b(map);
-}
-
-static void generate_groupings(struct generator * g) {
- struct grouping * q = g->analyser->groupings;
- until (q == 0) {
- generate_grouping_table(g, q);
- q = q->next;
- }
-}
-
-static void generate_members(struct generator * g) {
-
- struct name * q = g->analyser->names;
- until (q == 0) {
- g->V[0] = q;
- switch (q->type) {
- case t_string:
- w(g, " private ");
- w(g, g->options->string_class );
- w(g, " ~W0 = new ");
- w(g, g->options->string_class);
- w(g, "();~N");
- break;
- case t_integer:
- w(g, " private int ~W0;~N");
- break;
- case t_boolean:
- w(g, " private boolean ~W0;~N");
- break;
- }
- q = q->next;
- }
- w(g, "~N");
-}
-
-static void generate_copyfrom(struct generator * g) {
-
- struct name * q;
- w(g, "~+~+~Mprivate void copy_from(~n other) {~+~N");
- for (q = g->analyser->names; q != 0; q = q->next) {
- g->V[0] = q;
- switch (q->type) {
- case t_string:
- case t_integer:
- case t_boolean:
- w(g, "~M~W0 = other.~W0;~N");
- break;
- }
- }
- w(g, "~Msuper.copy_from(other);~N");
- w(g, "~-~M}~-~-~N");
-}
-
-static void generate_methods(struct generator * g) {
-
- struct node * p = g->analyser->program;
- while (p != 0) {
- generate(g, p);
- g->unreachable = false;
- p = p->right;
- }
-}
-
-extern void generate_program_java(struct generator * g) {
-
- g->outbuf = str_new();
- g->failure_str = str_new();
-
- generate_start_comment(g);
- generate_class_begin(g);
-
- generate_amongs(g);
- generate_groupings(g);
-
- generate_members(g);
- generate_copyfrom(g);
- generate_methods(g);
- generate_equals(g);
-
- generate_class_end(g);
-
- output_str(g->options->output_java, g->outbuf);
- str_delete(g->failure_str);
- str_delete(g->outbuf);
-}
-
-extern struct generator * create_generator_java(struct analyser * a, struct options * o) {
-
- NEW(generator, g);
- g->analyser = a;
- g->options = o;
- g->margin = 0;
- g->debug_count = 0;
- g->unreachable = false;
- return g;
-}
-
-extern void close_generator_java(struct generator * g) {
-
- FREE(g);
-}
-
diff --git a/contrib/snowball/compiler/header.h b/contrib/snowball/compiler/header.h
index 9baf1d917..dadbee36c 100644
--- a/contrib/snowball/compiler/header.h
+++ b/contrib/snowball/compiler/header.h
@@ -1,49 +1,56 @@
+#include <stdio.h>
+
+#define SNOWBALL_VERSION "2.0.0"
typedef unsigned char byte;
typedef unsigned short symbol;
#define true 1
#define false 0
-#define repeat while(true)
-#define unless(C) if(!(C))
-#define until(C) while(!(C))
#define MALLOC check_malloc
#define FREE check_free
#define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type))
-#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * n)
+#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n))
-#define STARTSIZE 10
#define SIZE(p) ((int *)(p))[-1]
#define CAPACITY(p) ((int *)(p))[-2]
extern symbol * create_b(int n);
-extern void report_b(FILE * out, symbol * p);
+extern void report_b(FILE * out, const symbol * p);
extern void lose_b(symbol * p);
extern symbol * increase_capacity(symbol * p, int n);
-extern symbol * move_to_b(symbol * p, int n, symbol * q);
-extern symbol * add_to_b(symbol * p, int n, symbol * q);
-extern symbol * copy_b(symbol * p);
-extern char * b_to_s(symbol * p);
+extern symbol * move_to_b(symbol * p, int n, const symbol * q);
+extern symbol * add_to_b(symbol * p, int n, const symbol * q);
+extern symbol * copy_b(const symbol * p);
+extern char * b_to_s(const symbol * p);
extern symbol * add_s_to_b(symbol * p, const char * s);
+#define MOVE_TO_B(B, LIT) \
+ move_to_b(B, sizeof(LIT) / sizeof(LIT[0]), LIT)
+
struct str; /* defined in space.c */
extern struct str * str_new(void);
extern void str_delete(struct str * str);
-extern void str_append(struct str * str, struct str * add);
+extern void str_append(struct str * str, const struct str * add);
extern void str_append_ch(struct str * str, char add);
-extern void str_append_b(struct str * str, symbol * q);
+extern void str_append_b(struct str * str, const symbol * q);
+extern void str_append_b_tail(struct str * str, const symbol * q, int skip);
extern void str_append_string(struct str * str, const char * s);
extern void str_append_int(struct str * str, int i);
extern void str_clear(struct str * str);
-extern void str_assign(struct str * str, char * s);
-extern struct str * str_copy(struct str * old);
-extern symbol * str_data(struct str * str);
-extern int str_len(struct str * str);
+extern void str_assign(struct str * str, const char * s);
+extern struct str * str_copy(const struct str * old);
+extern symbol * str_data(const struct str * str);
+extern int str_len(const struct str * str);
+extern int str_back(const struct str *str);
extern int get_utf8(const symbol * p, int * slot);
extern int put_utf8(int ch, symbol * p);
+extern void output_str(FILE * outfile, struct str * str);
+
+typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc;
struct m_pair {
@@ -60,6 +67,7 @@ struct input {
symbol * p;
int c;
char * file;
+ int file_needs_freeing;
int line_number;
};
@@ -71,6 +79,28 @@ struct include {
};
+enum token_codes {
+
+#include "syswords2.h"
+
+ c_mathassign,
+ c_name,
+ c_number,
+ c_literalstring,
+ c_neg,
+ c_call,
+ c_grouping,
+ c_booltest,
+
+ NUM_TOKEN_CODES
+};
+
+enum uplus_modes {
+ UPLUS_NONE,
+ UPLUS_DEFINED,
+ UPLUS_UNICODE
+};
+
/* struct input must be a prefix of struct tokeniser. */
struct tokeniser {
@@ -78,6 +108,7 @@ struct tokeniser {
symbol * p;
int c;
char * file;
+ int file_needs_freeing;
int line_number;
symbol * b;
symbol * b2;
@@ -90,34 +121,28 @@ struct tokeniser {
int token;
int previous_token;
byte token_held;
- byte widechars;
- byte utf8;
+ enc encoding;
int omission;
struct include * includes;
+ /* Mode in which U+ has been used:
+ * UPLUS_NONE - not used yet
+ * UPLUS_DEFINED - stringdef U+xxxx ....
+ * UPLUS_UNICODE - {U+xxxx} used with implicit meaning
+ */
+ int uplusmode;
+
+ char token_disabled[NUM_TOKEN_CODES];
};
-extern symbol * get_input(symbol * p, char ** p_file);
+extern symbol * get_input(const char * filename);
extern struct tokeniser * create_tokeniser(symbol * b, char * file);
extern int read_token(struct tokeniser * t);
extern const char * name_of_token(int code);
+extern void disable_token(struct tokeniser * t, int code);
extern void close_tokeniser(struct tokeniser * t);
-enum token_codes {
-
-#include "syswords2.h"
-
- c_mathassign,
- c_name,
- c_number,
- c_literalstring,
- c_neg,
- c_call,
- c_grouping,
- c_booltest
-};
-
extern int space_count;
extern void * check_malloc(int n);
extern void check_free(void * p);
@@ -134,7 +159,13 @@ struct name {
int count; /* 0, 1, 2 for each type */
struct grouping * grouping; /* for grouping names */
byte referenced;
- byte used;
+ byte used_in_among; /* Function used in among? */
+ byte value_used; /* (For variables) is its value ever used? */
+ byte initialised; /* (For variables) is it ever initialised? */
+ byte used_in_definition; /* (grouping) used in grouping definition? */
+ struct node * used; /* First use, or NULL if not used */
+ struct name * local_to; /* Local to one routine/external */
+ int declaration_line_number;/* Line number of declaration */
};
@@ -149,9 +180,10 @@ struct amongvec {
symbol * b; /* the string giving the case */
int size; /* - and its size */
- struct node * p; /* the corresponding command */
+ struct node * action; /* the corresponding action */
int i; /* the amongvec index of the longest substring of b */
int result; /* the numeric result for the case */
+ int line_number; /* for diagnostics and stable sorting */
struct name * function;
};
@@ -163,19 +195,22 @@ struct among {
int number; /* amongs are numbered 0, 1, 2 ... */
int literalstring_count; /* in this among */
int command_count; /* in this among */
+ int nocommand_count; /* number of "no command" entries in this among */
+ int function_count; /* in this among */
+ int amongvar_needed; /* do we need to set among_var? */
struct node * starter; /* i.e. among( (starter) 'string' ... ) */
struct node * substring; /* i.e. substring ... among ( ... ) */
+ struct node ** commands; /* array with command_count entries */
};
struct grouping {
struct grouping * next;
- int number; /* groupings are numbered 0, 1, 2 ... */
symbol * b; /* the characters of this group */
int largest_ch; /* character with max code */
int smallest_ch; /* character with min code */
- byte no_gaps; /* not used in generator.c after 11/5/05 */
struct name * name; /* so g->name->grouping == g */
+ int line_number;
};
struct node {
@@ -234,7 +269,8 @@ struct analyser {
struct grouping * groupings;
struct grouping * groupings_end;
struct node * substring; /* pending 'substring' in current routine definition */
- byte utf8;
+ enc encoding;
+ byte int_limits_used; /* are maxint or minint used? */
};
enum analyser_modes {
@@ -259,16 +295,23 @@ struct generator {
struct str * outbuf; /* temporary str to store output */
struct str * declarations; /* str storing variable declarations */
int next_label;
+#ifndef DISABLE_PYTHON
+ int max_label;
+#endif
int margin;
- const char * failure_string; /* String to output in case of a failure. */
-#ifndef DISABLE_JAVA
- struct str * failure_str; /* This is used by the java generator instead of failure_string */
+ /* if > 0, keep_count to restore in case of a failure;
+ * if < 0, the negated keep_count for the limit to restore in case of
+ * failure. */
+ int failure_keep_count;
+#if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP)
+ struct str * failure_str; /* This is used by some generators instead of failure_keep_count */
#endif
int label_used; /* Keep track of whether the failure label is used. */
int failure_label;
int debug_count;
+ int copy_from_count; /* count of calls to copy_from() */
const char * S[10]; /* strings */
symbol * B[10]; /* blocks */
@@ -277,48 +320,92 @@ struct generator {
symbol * L[5]; /* literals, used in formatted write */
int line_count; /* counts number of lines output */
- int line_labelled; /* in ANSI C, will need extra ';' if it is a block end */
+ int line_labelled; /* in ISO C, will need extra ';' if it is a block end */
int literalstring_count;
int keep_count; /* used to number keep/restore pairs to avoid compiler warnings
about shadowed variables */
};
+/* Special values for failure_label in struct generator. */
+enum special_labels {
+ x_return = -1
+};
+
struct options {
/* for the command line: */
- char * output_file;
- char * name;
- FILE * output_c;
+ const char * output_file;
+ const char * name;
+ FILE * output_src;
FILE * output_h;
-#ifndef DISABLE_JAVA
- FILE * output_java;
-#endif
byte syntax_tree;
- byte widechars;
- enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang;
- char * externals_prefix;
- char * variables_prefix;
- char * runtime_path;
- char * parent_class_name;
- char * package;
- char * string_class;
- char * among_class;
+ byte comments;
+ enc encoding;
+ enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang;
+ const char * externals_prefix;
+ const char * variables_prefix;
+ const char * runtime_path;
+ const char * parent_class_name;
+ const char * package;
+ const char * go_snowball_runtime;
+ const char * string_class;
+ const char * among_class;
struct include * includes;
struct include * includes_end;
- byte utf8;
};
-/* Generator for C code. */
-extern struct generator * create_generator_c(struct analyser * a, struct options * o);
-extern void close_generator_c(struct generator * g);
+/* Generator functions common to several backends. */
+
+extern struct generator * create_generator(struct analyser * a, struct options * o);
+extern void close_generator(struct generator * g);
+
+extern void write_char(struct generator * g, int ch);
+extern void write_newline(struct generator * g);
+extern void write_string(struct generator * g, const char * s);
+extern void write_int(struct generator * g, int i);
+extern void write_b(struct generator * g, symbol * b);
+extern void write_str(struct generator * g, struct str * str);
+
+extern void write_comment_content(struct generator * g, struct node * p);
+extern void write_generated_comment_content(struct generator * g);
+extern void write_start_comment(struct generator * g,
+ const char * comment_start,
+ const char * comment_end);
+
+extern int K_needed(struct generator * g, struct node * p);
+extern int repeat_restore(struct generator * g, struct node * p);
+/* Generator for C code. */
extern void generate_program_c(struct generator * g);
#ifndef DISABLE_JAVA
/* Generator for Java code. */
-extern struct generator * create_generator_java(struct analyser * a, struct options * o);
-extern void close_generator_java(struct generator * g);
-
extern void generate_program_java(struct generator * g);
#endif
+
+#ifndef DISABLE_CSHARP
+/* Generator for C# code. */
+extern void generate_program_csharp(struct generator * g);
+#endif
+
+#ifndef DISABLE_PASCAL
+extern void generate_program_pascal(struct generator * g);
+#endif
+
+#ifndef DISABLE_PYTHON
+/* Generator for Python code. */
+extern void generate_program_python(struct generator * g);
+#endif
+
+#ifndef DISABLE_JS
+extern void generate_program_js(struct generator * g);
+#endif
+
+#ifndef DISABLE_RUST
+extern void generate_program_rust(struct generator * g);
+#endif
+
+#ifndef DISABLE_GO
+extern void generate_program_go(struct generator * g);
+#endif
diff --git a/contrib/snowball/compiler/space.c b/contrib/snowball/compiler/space.c
index 310024e76..5b058763a 100644
--- a/contrib/snowball/compiler/space.c
+++ b/contrib/snowball/compiler/space.c
@@ -57,9 +57,19 @@ extern symbol * create_b(int n) {
return p;
}
/* Write the symbols of block p to `out`, one char per symbol.
 * A symbol above 255 cannot be written as a single char, so it is treated
 * as fatal: a diagnostic is printed and the program exits with status 1.
 * The diagnostic goes to stderr (like the tokeniser's error reports) so it
 * cannot get mixed into whatever is being written to stdout. */
-extern void report_b(FILE * out, symbol * p) {
+extern void report_b(FILE * out, const symbol * p) {
int i;
- for (i = 0; i < SIZE(p); i++) fprintf(out, "%c", p[i]);
+ for (i = 0; i < SIZE(p); i++) {
+ if (p[i] > 255) {
+ fprintf(stderr, "In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
+ exit(1);
+ }
+ putc(p[i], out);
+ }
+}
+
/* Write the contents of str to outfile by passing its data block to
 * report_b(). */
+extern void output_str(FILE * outfile, struct str * str) {
+ report_b(outfile, str_data(str));
}
extern void lose_b(symbol * p) {
@@ -74,19 +84,19 @@ extern symbol * increase_capacity(symbol * p, int n) {
lose_b(p); return q;
}
/* Replace the contents of block p with the n symbols starting at q and set
 * its size to n. If p's capacity is smaller than n the block is first grown
 * with increase_capacity(), so the caller must use the returned pointer in
 * place of p. memmove is used for the copy. */
-extern symbol * move_to_b(symbol * p, int n, symbol * q) {
+extern symbol * move_to_b(symbol * p, int n, const symbol * q) {
int x = n - CAPACITY(p);
if (x > 0) p = increase_capacity(p, x);
memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p;
}
/* Append the n symbols starting at q to the end of block p and add n to its
 * size. If the result would not fit in p's capacity the block is first
 * grown with increase_capacity(), so the caller must use the returned
 * pointer in place of p. */
-extern symbol * add_to_b(symbol * p, int n, symbol * q) {
+extern symbol * add_to_b(symbol * p, int n, const symbol * q) {
int x = SIZE(p) + n - CAPACITY(p);
if (x > 0) p = increase_capacity(p, x);
memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p;
}
-extern symbol * copy_b(symbol * p) {
+extern symbol * copy_b(const symbol * p) {
int n = SIZE(p);
symbol * q = create_b(n);
move_to_b(q, n, p);
@@ -97,7 +107,7 @@ int space_count = 0;
extern void * check_malloc(int n) {
space_count++;
- return calloc(1, n);
+ return malloc(n);
}
extern void check_free(void * p) {
@@ -107,18 +117,18 @@ extern void check_free(void * p) {
/* To convert a block to a zero terminated string: */
-extern char * b_to_s(symbol * p) {
+extern char * b_to_s(const symbol * p) {
int n = SIZE(p);
- char * s = (char *)calloc(1, n + 1);
+ char * s = (char *)malloc(n + 1);
{
int i;
for (i = 0; i < n; i++) {
- if (p[i] > 255) {
- printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
- exit(1);
- }
- s[i] = (char)p[i];
- }
+ if (p[i] > 255) {
+ printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
+ exit(1);
+ }
+ s[i] = (char)p[i];
+ }
}
s[n] = 0;
return s;
@@ -153,9 +163,9 @@ struct str {
};
/* Create a new string. */
/* Allocate a new str holding an empty block (create_b(0)) and return it.
 * The struct comes from malloc, so it is not zeroed; only the data field is
 * set here. Allocation failure is fatal (message on stderr, exit status 1)
 * rather than a NULL dereference on the following line. */
-extern struct str * str_new() {
+extern struct str * str_new(void) {
- struct str * output = (struct str *) calloc(1, sizeof(struct str));
+ struct str * output = (struct str *) malloc(sizeof(struct str));
+ if (output == NULL) {
+     fprintf(stderr, "out of memory creating str\n");
+     exit(1);
+ }
output->data = create_b(0);
return output;
}
@@ -168,7 +178,7 @@ extern void str_delete(struct str * str) {
}
/* Append a str to this str. */
-extern void str_append(struct str * str, struct str * add) {
+extern void str_append(struct str * str, const struct str * add) {
symbol * q = add->data;
str->data = add_to_b(str->data, SIZE(q), q);
@@ -183,12 +193,19 @@ extern void str_append_ch(struct str * str, char add) {
}
/* Append a low level block to a str. */
-extern void str_append_b(struct str * str, symbol * q) {
+extern void str_append_b(struct str * str, const symbol * q) {
str->data = add_to_b(str->data, SIZE(q), q);
}
-/* Append a (char *, null teminated) string to a str. */
+/* Append the tail of a low level block to a str.
+ *
+ * The first `skip` symbols of q are left out and the remaining
+ * SIZE(q) - skip symbols are appended. Nothing is appended when skip is
+ * negative or is not less than the size of q.
+ */
+extern void str_append_b_tail(struct str * str, const symbol * q, int skip) {
+ if (skip < 0 || skip >= SIZE(q)) return;
+
+ str->data = add_to_b(str->data, SIZE(q) - skip, q + skip);
+}
+
+/* Append a (char *, null terminated) string to a str. */
extern void str_append_string(struct str * str, const char * s) {
str->data = add_s_to_b(str->data, s);
@@ -209,14 +226,14 @@ extern void str_clear(struct str * str) {
}
/* Set a string */
-extern void str_assign(struct str * str, char * s) {
+extern void str_assign(struct str * str, const char * s) {
str_clear(str);
str_append_string(str, s);
}
/* Copy a string. */
-extern struct str * str_copy(struct str * old) {
+extern struct str * str_copy(const struct str * old) {
struct str * newstr = str_new();
str_append(newstr, old);
@@ -224,17 +241,25 @@ extern struct str * str_copy(struct str * old) {
}
/* Get the data stored in this str. */
-extern symbol * str_data(struct str * str) {
+extern symbol * str_data(const struct str * str) {
return str->data;
}
/* Get the length of the str. */
-extern int str_len(struct str * str) {
+extern int str_len(const struct str * str) {
return SIZE(str->data);
}
+/* Get the last character of the str.
+ *
+ * Or -1 if the string is empty.
+ */
+extern int str_back(const struct str *str) {
+ return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1;
+}
+
extern int get_utf8(const symbol * p, int * slot) {
int b0, b1;
b0 = *p++;
@@ -260,4 +285,3 @@ extern int put_utf8(int ch, symbol * p) {
p[1] = ((ch >> 6) & 0x3F) | 0x80;
p[2] = (ch & 0x3F) | 0x80; return 3;
}
-
diff --git a/contrib/snowball/compiler/syswords.h b/contrib/snowball/compiler/syswords.h
index 917dd5751..c401d3e4f 100644
--- a/contrib/snowball/compiler/syswords.h
+++ b/contrib/snowball/compiler/syswords.h
@@ -1,5 +1,5 @@
-static const struct system_word vocab[80+1] = {
- { 0, (const byte *)"", 80+1},
+static const struct system_word vocab[82+1] = {
+ { 0, (const byte *)"", 82+1},
{ 1, (const byte *)"$", c_dollar },
{ 1, (const byte *)"(", c_bra },
@@ -36,6 +36,7 @@ static const struct system_word vocab[80+1] = {
{ 3, (const byte *)"get", c_get },
{ 3, (const byte *)"hex", c_hex },
{ 3, (const byte *)"hop", c_hop },
+ { 3, (const byte *)"len", c_len },
{ 3, (const byte *)"non", c_non },
{ 3, (const byte *)"not", c_not },
{ 3, (const byte *)"set", c_set },
@@ -49,6 +50,7 @@ static const struct system_word vocab[80+1] = {
{ 4, (const byte *)"true", c_true },
{ 5, (const byte *)"among", c_among },
{ 5, (const byte *)"false", c_false },
+ { 5, (const byte *)"lenof", c_lenof },
{ 5, (const byte *)"limit", c_limit },
{ 5, (const byte *)"unset", c_unset },
{ 6, (const byte *)"atmark", c_atmark },
diff --git a/contrib/snowball/compiler/syswords2.h b/contrib/snowball/compiler/syswords2.h
index eb8a91241..e853e32dc 100644
--- a/contrib/snowball/compiler/syswords2.h
+++ b/contrib/snowball/compiler/syswords2.h
@@ -4,8 +4,8 @@
c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get,
c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert,
- c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls,
- c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
+ c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop,
+ c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus,
c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,
diff --git a/contrib/snowball/compiler/tokeniser.c b/contrib/snowball/compiler/tokeniser.c
index 3dae5f744..e6c63861a 100644
--- a/contrib/snowball/compiler/tokeniser.c
+++ b/contrib/snowball/compiler/tokeniser.c
@@ -16,57 +16,57 @@ struct system_word {
#include "syswords.h"
-static int smaller(int a, int b) { return a < b ? a : b; }
+#define INITIAL_INPUT_BUFFER_SIZE 8192
+
+static int hex_to_num(int ch);
-extern symbol * get_input(symbol * p, char ** p_file) {
+static int smaller(int a, int b) { return a < b ? a : b; }
- char * s = b_to_s(p);
/* Read the whole of the file named `filename` into a newly created block,
 * storing one symbol per character returned by getc().
 * Returns 0 if the file cannot be opened. The block starts with room for
 * INITIAL_INPUT_BUFFER_SIZE symbols and is enlarged with
 * increase_capacity() each time it fills up; its size is set to the number
 * of characters read before it is returned. */
+extern symbol * get_input(const char * filename) {
+ FILE * input = fopen(filename, "r");
+ if (input == 0) { return 0; }
{
- FILE * input = fopen(s, "r");
- if (input == 0) { free(s); return 0; }
- *p_file = s;
- {
- symbol * u = create_b(STARTSIZE);
- int size = 0;
- repeat
- { int ch = getc(input);
- if (ch == EOF) break;
- if (size >= CAPACITY(u)) u = increase_capacity(u, size/2);
- u[size++] = ch;
- }
- fclose(input);
- SIZE(u) = size; return u;
+ symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE);
+ int size = 0;
+ while (true) {
+ int ch = getc(input);
+ if (ch == EOF) break;
+ if (size >= CAPACITY(u)) u = increase_capacity(u, size);
+ u[size++] = ch;
}
+ fclose(input);
+ SIZE(u) = size;
+ return u;
}
}
/* Report a tokeniser error on stderr as "file:line: " followed by s1, the
 * first n symbols of p, and s2. Any of s1, p and s2 may be null, in which
 * case that part is left out. Each call increments t->error_count; once 20
 * errors have been reported the next call prints "... etc" and exits with
 * status 1 instead. */
-static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
+static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) {
if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
fprintf(stderr, "%s:%d: ", t->file, t->line_number);
- unless (s1 == 0) fprintf(stderr, "%s", s1);
- unless (p == 0) {
+ if (s1) fprintf(stderr, "%s", s1);
+ if (p) {
int i;
for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);
}
- unless (s2 == 0) fprintf(stderr, "%s", s2);
+ if (s2) fprintf(stderr, "%s", s2);
fprintf(stderr, "\n");
t->error_count++;
}
-static void error1(struct tokeniser * t, char * s) {
+static void error1(struct tokeniser * t, const char * s) {
error(t, s, 0,0, 0);
}
-static void error2(struct tokeniser * t, char * s) {
+static void error2(struct tokeniser * t, const char * s) {
error(t, "unexpected end of text after ", 0,0, s);
}
static int compare_words(int m, symbol * p, int n, const byte * q) {
- unless (m == n) return m - n;
+ if (m != n) return m - n;
{
int i; for (i = 0; i < n; i++) {
int diff = p[i] - q[i];
- unless (diff == 0) return diff;
+ if (diff) return diff;
}
}
return 0;
@@ -74,14 +74,13 @@ static int compare_words(int m, symbol * p, int n, const byte * q) {
/* Look up the n-symbol word at p in the vocab[] table by binary search.
 * Returns the word's token code, or -1 if it is not in the table.
 * vocab[0] is a dummy entry whose code field holds the number of entries in
 * the table (see syswords.h); that value is the initial upper bound j.
 * Ordering is whatever compare_words() defines. */
static int find_word(int n, symbol * p) {
int i = 0; int j = vocab->code;
- repeat {
+ do {
int k = i + (j - i)/2;
const struct system_word * w = vocab + k;
int diff = compare_words(n, p, w->s_size, w->s);
if (diff == 0) return w->code;
if (diff < 0) j = k; else i = k;
- if (j - i == 1) break;
- }
+ } while (j - i != 1);
return -1;
}
@@ -91,7 +90,7 @@ static int get_number(int n, symbol * p) {
return x;
}
-static int eq_s(struct tokeniser * t, char * s) {
+static int eq_s(struct tokeniser * t, const char * s) {
int l = strlen(s);
if (SIZE(t->p) - t->c < l) return false;
{
@@ -103,64 +102,141 @@ static int eq_s(struct tokeniser * t, char * s) {
/* Return true if ch is a newline, carriage return, tab or space, and false
 * otherwise. Side effect: a newline also increments t->line_number before
 * falling through to the shared `return true`. */
static int white_space(struct tokeniser * t, int ch) {
switch (ch) {
- case '\n': t->line_number++;
+ case '\n':
+ t->line_number++;
+ /* fall through */
case '\r':
case '\t':
- case ' ': return true;
+ case ' ':
+ return true;
}
return false;
}
/* Search the tokeniser's linked list of name/value pairs (t->m_pairs) for
 * an entry whose name is exactly the n symbols at p.
 * Returns the value block of the first matching entry, or 0 if no entry
 * matches. The list is walked from its head, so when a name occurs more
 * than once the entry nearest the head is the one returned. */
static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
- struct m_pair * q = t->m_pairs;
- repeat {
- if (q == 0) return 0;
- {
- symbol * name = q->name;
- if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
- }
- q = q->next;
+ struct m_pair * q;
+ for (q = t->m_pairs; q; q = q->next) {
+ symbol * name = q->name;
+ if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
}
+ return 0;
}
static int read_literal_string(struct tokeniser * t, int c) {
symbol * p = t->p;
int ch;
SIZE(t->b) = 0;
- repeat {
+ while (true) {
if (c >= SIZE(p)) { error2(t, "'"); return c; }
ch = p[c];
if (ch == '\n') { error1(t, "string not terminated"); return c; }
c++;
if (ch == t->m_start) {
+ /* Inside insert characters. */
int c0 = c;
int newlines = false; /* no newlines as yet */
int black_found = false; /* no printing chars as yet */
- repeat {
+ while (true) {
if (c >= SIZE(p)) { error2(t, "'"); return c; }
ch = p[c]; c++;
if (ch == t->m_end) break;
- unless (white_space(t, ch)) black_found = true;
+ if (!white_space(t, ch)) black_found = true;
if (ch == '\n') newlines = true;
if (newlines && black_found) {
error1(t, "string not terminated");
return c;
}
}
- unless (newlines) {
+ if (!newlines) {
int n = c - c0 - 1; /* macro size */
int firstch = p[c0];
symbol * q = find_in_m(t, n, p + c0);
if (q == 0) {
if (n == 1 && (firstch == '\'' || firstch == t->m_start))
t->b = add_to_b(t->b, 1, p + c0);
- else
+ else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
+ int codepoint = 0;
+ int x;
+ if (t->uplusmode == UPLUS_DEFINED) {
+ /* See if found with xxxx upper-cased. */
+ symbol * uc = create_b(n);
+ int i;
+ for (i = 0; i != n; ++i) {
+ uc[i] = toupper(p[c0 + i]);
+ }
+ q = find_in_m(t, n, uc);
+ lose_b(uc);
+ if (q != 0) {
+ t->b = add_to_b(t->b, SIZE(q), q);
+ continue;
+ }
+ error1(t, "Some U+xxxx stringdefs seen but not this one");
+ } else {
+ t->uplusmode = UPLUS_UNICODE;
+ }
+ for (x = c0 + 2; x != c - 1; ++x) {
+ int hex = hex_to_num(p[x]);
+ if (hex < 0) {
+ error1(t, "Bad hex digit following U+");
+ break;
+ }
+ codepoint = (codepoint << 4) | hex;
+ }
+ if (t->encoding == ENC_UTF8) {
+ if (codepoint < 0 || codepoint > 0x01ffff) {
+ error1(t, "character values exceed 0x01ffff");
+ }
+ /* Ensure there's enough space for a max length
+ * UTF-8 sequence. */
+ if (CAPACITY(t->b) < SIZE(t->b) + 3) {
+ t->b = increase_capacity(t->b, 3);
+ }
+ SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
+ } else {
+ symbol sym;
+ if (t->encoding == ENC_SINGLEBYTE) {
+ /* Only ISO-8859-1 is handled this way - for
+ * other single-byte character sets you need
+ * to stringdef all the U+xxxx codes you use,
+ * e.g.:
+ *
+ * stringdef U+0171 hex 'FB'
+ */
+ if (codepoint < 0 || codepoint > 0xff) {
+ error1(t, "character values exceed 256");
+ }
+ } else {
+ if (codepoint < 0 || codepoint > 0xffff) {
+ error1(t, "character values exceed 64K");
+ }
+ }
+ sym = codepoint;
+ t->b = add_to_b(t->b, 1, &sym);
+ }
+ } else
error(t, "string macro '", n, p + c0, "' undeclared");
} else
t->b = add_to_b(t->b, SIZE(q), q);
}
} else {
if (ch == '\'') return c;
+ if (ch < 0 || ch >= 0x80) {
+ if (t->encoding != ENC_WIDECHARS) {
+ /* We don't really want people using non-ASCII literal
+ * strings, but historically it's worked for single-byte
+ * and UTF-8 if the source encoding matches what the
+ * generated stemmer works in and it seems unfair to just
+ * suddenly make this a hard error.
+ */
+ fprintf(stderr,
+ "%s:%d: warning: Non-ASCII literal strings aren't "
+ "portable - use stringdef instead\n",
+ t->file, t->line_number);
+ } else {
+ error1(t, "Non-ASCII literal strings aren't "
+ "portable - use stringdef instead");
+ }
+ }
t->b = add_to_b(t->b, 1, p + c - 1);
}
}
@@ -171,7 +247,7 @@ static int next_token(struct tokeniser * t) {
int c = t->c;
int ch;
int code = -1;
- repeat {
+ while (true) {
if (c >= SIZE(p)) { t->c = c; return -1; }
ch = p[c];
if (white_space(t, ch)) { c++; continue; }
@@ -179,7 +255,7 @@ static int next_token(struct tokeniser * t) {
int c0 = c;
while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
code = find_word(c - c0, p + c0);
- if (code < 0) {
+ if (code < 0 || t->token_disabled[code]) {
t->b = move_to_b(t->b, c - c0, p + c0);
code = c_name;
}
@@ -218,10 +294,9 @@ static int next_char(struct tokeniser * t) {
}
static int next_real_char(struct tokeniser * t) {
- repeat {
+ while (true) {
int ch = next_char(t);
- if (white_space(t, ch)) continue;
- return ch;
+ if (!white_space(t, ch)) return ch;
}
}
@@ -230,7 +305,7 @@ static void read_chars(struct tokeniser * t) {
if (ch < 0) { error2(t, "stringdef"); return; }
{
int c0 = t->c-1;
- repeat {
+ while (true) {
ch = next_char(t);
if (white_space(t, ch) || ch < 0) break;
}
@@ -246,19 +321,20 @@ static int decimal_to_num(int ch) {
/* Convert one hexadecimal digit character to its value (0-15).
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'; returns -1 for any other
 * character. */
static int hex_to_num(int ch) {
if ('0' <= ch && ch <= '9') return ch - '0';
if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
+ if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
return -1;
}
static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
int c = 0; int d = 0;
- repeat {
+ while (true) {
while (c < SIZE(p) && p[c] == ' ') c++;
if (c == SIZE(p)) break;
{
int number = 0;
- repeat {
+ while (c != SIZE(p)) {
int ch = p[c];
- if (c == SIZE(p) || ch == ' ') break;
+ if (ch == ' ') break;
if (base == 10) {
ch = decimal_to_num(ch);
if (ch < 0) {
@@ -266,7 +342,7 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
return;
}
} else {
- ch = hex_to_num(tolower(ch));
+ ch = hex_to_num(ch);
if (ch < 0) {
error1(t, "hex string contains non-hex characters");
return;
@@ -275,18 +351,18 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
number = base * number + ch;
c++;
}
- if (t->widechars || t->utf8) {
- unless (0 <= number && number <= 0xffff) {
- error1(t, "character values exceed 64K");
+ if (t->encoding == ENC_SINGLEBYTE) {
+ if (number < 0 || number > 0xff) {
+ error1(t, "character values exceed 256");
return;
}
} else {
- unless (0 <= number && number <= 0xff) {
- error1(t, "character values exceed 256");
+ if (number < 0 || number > 0xffff) {
+ error1(t, "character values exceed 64K");
return;
}
}
- if (t->utf8)
+ if (t->encoding == ENC_UTF8)
d += put_utf8(number, p + d);
else
p[d++] = number;
@@ -300,104 +376,118 @@ extern int read_token(struct tokeniser * t) {
int held = t->token_held;
t->token_held = false;
if (held) return t->token;
- repeat {
+ while (true) {
int code = next_token(t);
switch (code) {
case c_comment1: /* slash-slash comment */
- while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
- continue;
+ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
+ continue;
case c_comment2: /* slash-star comment */
- repeat {
- if (t->c >= SIZE(p)) {
- error1(t, "/* comment not terminated");
- t->token = -1;
- return -1;
- }
- if (p[t->c] == '\n') t->line_number++;
- if (eq_s(t, "*/")) break;
- t->c++;
- }
- continue;
- case c_stringescapes:
- {
- int ch1 = next_real_char(t);
- int ch2 = next_real_char(t);
- if (ch2 < 0)
- { error2(t, "stringescapes"); continue; }
- if (ch1 == '\'')
- { error1(t, "first stringescape cannot be '"); continue; }
- t->m_start = ch1;
- t->m_end = ch2;
- }
- continue;
- case c_stringdef:
- {
- int base = 0;
- read_chars(t);
- code = read_token(t);
- if (code == c_hex) { base = 16; code = read_token(t); } else
- if (code == c_decimal) { base = 10; code = read_token(t); }
- unless (code == c_literalstring)
- { error1(t, "string omitted after stringdef"); continue; }
- if (base > 0) convert_numeric_string(t, t->b, base);
- { NEW(m_pair, q);
- q->next = t->m_pairs;
- q->name = copy_b(t->b2);
- q->value = copy_b(t->b);
- t->m_pairs = q;
- }
- }
- continue;
+ while (true) {
+ if (t->c >= SIZE(p)) {
+ error1(t, "/* comment not terminated");
+ t->token = -1;
+ return -1;
+ }
+ if (p[t->c] == '\n') t->line_number++;
+ if (eq_s(t, "*/")) break;
+ t->c++;
+ }
+ continue;
+ case c_stringescapes: {
+ int ch1 = next_real_char(t);
+ int ch2 = next_real_char(t);
+ if (ch2 < 0) {
+ error2(t, "stringescapes");
+ continue;
+ }
+ if (ch1 == '\'') {
+ error1(t, "first stringescape cannot be '");
+ continue;
+ }
+ t->m_start = ch1;
+ t->m_end = ch2;
+ continue;
+ }
+ case c_stringdef: {
+ int base = 0;
+ read_chars(t);
+ code = read_token(t);
+ if (code == c_hex) { base = 16; code = read_token(t); } else
+ if (code == c_decimal) { base = 10; code = read_token(t); }
+ if (code != c_literalstring) {
+ error1(t, "string omitted after stringdef");
+ continue;
+ }
+ if (base > 0) convert_numeric_string(t, t->b, base);
+ { NEW(m_pair, q);
+ q->next = t->m_pairs;
+ q->name = copy_b(t->b2);
+ q->value = copy_b(t->b);
+ t->m_pairs = q;
+ if (t->uplusmode != UPLUS_DEFINED &&
+ (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) {
+ if (t->uplusmode == UPLUS_UNICODE) {
+ error1(t, "U+xxxx already used with implicit meaning");
+ } else {
+ t->uplusmode = UPLUS_DEFINED;
+ }
+ }
+ }
+ continue;
+ }
case c_get:
- code = read_token(t);
- unless (code == c_literalstring) {
- error1(t, "string omitted after get"); continue;
- }
- t->get_depth++;
- if (t->get_depth > 10) {
- fprintf(stderr, "get directives go 10 deep. Looping?\n");
- exit(1);
- }
- {
- char * file;
- NEW(input, q);
- symbol * u = get_input(t->b, &file);
- if (u == 0) {
- struct include * r = t->includes;
- until (r == 0) {
- symbol * b = copy_b(r->b);
- b = add_to_b(b, SIZE(t->b), t->b);
- u = get_input(b, &file);
- lose_b(b);
- unless (u == 0) break;
- r = r->next;
- }
- }
- if (u == 0) {
- error(t, "Can't get '", SIZE(t->b), t->b, "'");
- exit(1);
- }
- memmove(q, t, sizeof(struct input));
- t->next = q;
- t->p = u;
- t->c = 0;
- t->file = file;
- t->line_number = 1;
- }
- p = t->p;
- continue;
+ code = read_token(t);
+ if (code != c_literalstring) {
+ error1(t, "string omitted after get"); continue;
+ }
+ t->get_depth++;
+ if (t->get_depth > 10) {
+ fprintf(stderr, "get directives go 10 deep. Looping?\n");
+ exit(1);
+ }
+ {
+ NEW(input, q);
+ char * file = b_to_s(t->b);
+ symbol * u = get_input(file);
+ if (u == 0) {
+ struct include * r;
+ for (r = t->includes; r; r = r->next) {
+ symbol * b = copy_b(r->b);
+ b = add_to_b(b, SIZE(t->b), t->b);
+ free(file);
+ file = b_to_s(b);
+ u = get_input(file);
+ lose_b(b);
+ if (u != 0) break;
+ }
+ }
+ if (u == 0) {
+ error(t, "Can't get '", SIZE(t->b), t->b, "'");
+ exit(1);
+ }
+ memmove(q, t, sizeof(struct input));
+ t->next = q;
+ t->p = u;
+ t->c = 0;
+ t->file = file;
+ t->file_needs_freeing = true;
+ t->line_number = 1;
+ }
+ p = t->p;
+ continue;
case -1:
- unless (t->next == 0) {
- lose_b(p);
- {
- struct input * q = t->next;
- memmove(t, q, sizeof(struct input)); p = t->p;
- FREE(q);
- }
- t->get_depth--;
- continue;
- }
- /* drop through */
+ if (t->next) {
+ lose_b(p);
+ {
+ struct input * q = t->next;
+ memmove(t, q, sizeof(struct input)); p = t->p;
+ FREE(q);
+ }
+ t->get_depth--;
+ continue;
+ }
+ /* fall through */
default:
t->previous_token = t->token;
t->token = code;
@@ -425,12 +515,17 @@ extern const char * name_of_token(int code) {
}
}
/* Mark token `code` as disabled in tokeniser t. next_token() checks this
 * flag after a keyword lookup and, when it is set, returns the word as an
 * ordinary name (c_name) instead of the keyword.
 * A code outside the token_disabled[] table is ignored so that a bad
 * argument cannot write out of bounds. */
+extern void disable_token(struct tokeniser * t, int code) {
+ if (code < 0 || code >= NUM_TOKEN_CODES) return;
+ t->token_disabled[code] = 1;
+}
+
extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
NEW(tokeniser, t);
t->next = 0;
t->p = p;
t->c = 0;
t->file = file;
+ t->file_needs_freeing = false;
t->line_number = 1;
t->b = create_b(0);
t->b2 = create_b(0);
@@ -441,6 +536,8 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
t->token_held = false;
t->token = -2;
t->previous_token = -2;
+ t->uplusmode = UPLUS_NONE;
+ memset(t->token_disabled, 0, sizeof(t->token_disabled));
return t;
}
@@ -449,7 +546,7 @@ extern void close_tokeniser(struct tokeniser * t) {
lose_b(t->b2);
{
struct m_pair * q = t->m_pairs;
- until (q == 0) {
+ while (q) {
struct m_pair * q_next = q->next;
lose_b(q->name);
lose_b(q->value);
@@ -459,12 +556,12 @@ extern void close_tokeniser(struct tokeniser * t) {
}
{
struct input * q = t->next;
- until (q == 0) {
+ while (q) {
struct input * q_next = q->next;
FREE(q);
q = q_next;
}
}
- free(t->file);
+ if (t->file_needs_freeing) free(t->file);
FREE(t);
}
diff --git a/contrib/snowball/examples/stemwords.c b/contrib/snowball/examples/stemwords.c
deleted file mode 100644
index 995bf40dc..000000000
--- a/contrib/snowball/examples/stemwords.c
+++ /dev/null
@@ -1,209 +0,0 @@
-/* This is a simple program which uses libstemmer to provide a command
- * line interface for stemming using any of the algorithms provided.
- */
-
-#include <stdio.h>
-#include <stdlib.h> /* for malloc, free */
-#include <string.h> /* for memmove */
-#include <ctype.h> /* for isupper, tolower */
-
-#include "libstemmer.h"
-
-const char * progname;
-static int pretty = 1;
-
-static void
-stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
-{
-#define INC 10
- int lim = INC;
- sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
-
- while(1) {
- int ch = getc(f_in);
- if (ch == EOF) {
- free(b); return;
- }
- {
- int i = 0;
- int inlen = 0;
- while(1) {
- if (ch == '\n' || ch == EOF) break;
- if (i == lim) {
- sb_symbol * newb;
- newb = (sb_symbol *)
- realloc(b, (lim + INC) * sizeof(sb_symbol));
- if (newb == 0) goto error;
- b = newb;
- lim = lim + INC;
- }
- /* Update count of utf-8 characters. */
- if (ch < 0x80 || ch > 0xBF) inlen += 1;
- /* force lower case: */
- if (isupper(ch)) ch = tolower(ch);
-
- b[i] = ch;
- i++;
- ch = getc(f_in);
- }
-
- {
- const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
- if (stemmed == NULL)
- {
- fprintf(stderr, "Out of memory");
- exit(1);
- }
- else
- {
- if (pretty == 1) {
- fwrite(b, i, 1, f_out);
- fputs(" -> ", f_out);
- } else if (pretty == 2) {
- fwrite(b, i, 1, f_out);
- if (sb_stemmer_length(stemmer) > 0) {
- int j;
- if (inlen < 30) {
- for (j = 30 - inlen; j > 0; j--)
- fputs(" ", f_out);
- } else {
- fputs("\n", f_out);
- for (j = 30; j > 0; j--)
- fputs(" ", f_out);
- }
- }
- }
-
- fputs((const char *)stemmed, f_out);
- putc('\n', f_out);
- }
- }
- }
- }
-error:
- if (b != 0) free(b);
- return;
-}
-
-/** Display the command line syntax, and then exit.
- * @param n The value to exit with.
- */
-static void
-usage(int n)
-{
- printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
- "\n"
- "The input file consists of a list of words to be stemmed, one per\n"
- "line. Words should be in lower case, but (for English) A-Z letters\n"
- "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
- "used.\n"
- "\n"
- "If -c is given, the argument is the character encoding of the input\n"
- "and output files. If it is omitted, the UTF-8 encoding is used.\n"
- "\n"
- "If -p is given the output file consists of each word of the input\n"
- "file followed by \"->\" followed by its stemmed equivalent.\n"
- "If -p2 is given the output file is a two column layout containing\n"
- "the input words in the first column and the stemmed equivalents in\n"
- "the second column.\n"
- "Otherwise, the output file consists of the stemmed words, one per\n"
- "line.\n"
- "\n"
- "-h displays this help\n",
- progname);
- exit(n);
-}
-
-int
-main(int argc, char * argv[])
-{
- char * in = 0;
- char * out = 0;
- FILE * f_in;
- FILE * f_out;
- struct sb_stemmer * stemmer;
-
- char * language = "english";
- char * charenc = NULL;
-
- char * s;
- int i = 1;
- pretty = 0;
-
- progname = argv[0];
-
- while(i < argc) {
- s = argv[i++];
- if (s[0] == '-') {
- if (strcmp(s, "-o") == 0) {
- if (i >= argc) {
- fprintf(stderr, "%s requires an argument\n", s);
- exit(1);
- }
- out = argv[i++];
- } else if (strcmp(s, "-i") == 0) {
- if (i >= argc) {
- fprintf(stderr, "%s requires an argument\n", s);
- exit(1);
- }
- in = argv[i++];
- } else if (strcmp(s, "-l") == 0) {
- if (i >= argc) {
- fprintf(stderr, "%s requires an argument\n", s);
- exit(1);
- }
- language = argv[i++];
- } else if (strcmp(s, "-c") == 0) {
- if (i >= argc) {
- fprintf(stderr, "%s requires an argument\n", s);
- exit(1);
- }
- charenc = argv[i++];
- } else if (strcmp(s, "-p2") == 0) {
- pretty = 2;
- } else if (strcmp(s, "-p") == 0) {
- pretty = 1;
- } else if (strcmp(s, "-h") == 0) {
- usage(0);
- } else {
- fprintf(stderr, "option %s unknown\n", s);
- usage(1);
- }
- } else {
- fprintf(stderr, "unexpected parameter %s\n", s);
- usage(1);
- }
- }
-
- /* prepare the files */
- f_in = (in == 0) ? stdin : fopen(in, "r");
- if (f_in == 0) {
- fprintf(stderr, "file %s not found\n", in);
- exit(1);
- }
- f_out = (out == 0) ? stdout : fopen(out, "w");
- if (f_out == 0) {
- fprintf(stderr, "file %s cannot be opened\n", out);
- exit(1);
- }
-
- /* do the stemming process: */
- stemmer = sb_stemmer_new(language, charenc);
- if (stemmer == 0) {
- if (charenc == NULL) {
- fprintf(stderr, "language `%s' not available for stemming\n", language);
- exit(1);
- } else {
- fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
- exit(1);
- }
- }
- stem_file(stemmer, f_in, f_out);
- sb_stemmer_delete(stemmer);
-
- if (in != 0) (void) fclose(f_in);
- if (out != 0) (void) fclose(f_out);
-
- return 0;
-}
-
diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h
index 9d86b8581..98051e108 100644
--- a/contrib/snowball/include/libstemmer.h
+++ b/contrib/snowball/include/libstemmer.h
@@ -32,9 +32,9 @@ const char ** sb_stemmer_list(void);
*
* @param charenc The character encoding. NULL may be passed as
* this value, in which case UTF-8 encoding will be assumed. Otherwise,
- * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
- * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that
- * case is significant in this parameter.
+ * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1),
+ * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is
+ * significant in this parameter.
*
* @return NULL if the specified algorithm is not recognised, or the
* algorithm is not available for the requested encoding. Otherwise,
@@ -66,7 +66,7 @@ void sb_stemmer_delete(struct sb_stemmer * stemmer);
* If an out-of-memory error occurs, this will return NULL.
*/
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
- const sb_symbol * word, int size);
+ const sb_symbol * word, int size);
/** Get the length of the result of the last stemmed word.
* This should not be called before sb_stemmer_stem() has been called.
@@ -76,4 +76,3 @@ int sb_stemmer_length(struct sb_stemmer * stemmer);
#ifdef __cplusplus
}
#endif
-
diff --git a/contrib/snowball/java/org/tartarus/snowball/Among.java b/contrib/snowball/java/org/tartarus/snowball/Among.java
deleted file mode 100644
index 5ed37b503..000000000
--- a/contrib/snowball/java/org/tartarus/snowball/Among.java
+++ /dev/null
@@ -1,31 +0,0 @@
-package org.tartarus.snowball;
-
-import java.lang.reflect.Method;
-
-public class Among {
- public Among (String s, int substring_i, int result,
- String methodname, SnowballProgram methodobject) {
- this.s_size = s.length();
- this.s = s.toCharArray();
- this.substring_i = substring_i;
- this.result = result;
- this.methodobject = methodobject;
- if (methodname.length() == 0) {
- this.method = null;
- } else {
- try {
- this.method = methodobject.getClass().
- getDeclaredMethod(methodname, new Class[0]);
- } catch (NoSuchMethodException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- public final int s_size; /* search string */
- public final char[] s; /* search string */
- public final int substring_i; /* index to longest matching substring */
- public final int result; /* result of the lookup */
- public final Method method; /* method to use if substring matches */
- public final SnowballProgram methodobject; /* object to invoke method on */
-};
diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java
deleted file mode 100644
index 52d6baa78..000000000
--- a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java
+++ /dev/null
@@ -1,432 +0,0 @@
-
-package org.tartarus.snowball;
-import java.lang.reflect.InvocationTargetException;
-
-public class SnowballProgram {
- protected SnowballProgram()
- {
- current = new StringBuffer();
- setCurrent("");
- }
-
- /**
- * Set the current string.
- */
- public void setCurrent(String value)
- {
- current.replace(0, current.length(), value);
- cursor = 0;
- limit = current.length();
- limit_backward = 0;
- bra = cursor;
- ket = limit;
- }
-
- /**
- * Get the current string.
- */
- public String getCurrent()
- {
- String result = current.toString();
- // Make a new StringBuffer. If we reuse the old one, and a user of
- // the library keeps a reference to the buffer returned (for example,
- // by converting it to a String in a way which doesn't force a copy),
- // the buffer size will not decrease, and we will risk wasting a large
- // amount of memory.
- // Thanks to Wolfram Esser for spotting this problem.
- current = new StringBuffer();
- return result;
- }
-
- // current string
- protected StringBuffer current;
-
- protected int cursor;
- protected int limit;
- protected int limit_backward;
- protected int bra;
- protected int ket;
-
- protected void copy_from(SnowballProgram other)
- {
- current = other.current;
- cursor = other.cursor;
- limit = other.limit;
- limit_backward = other.limit_backward;
- bra = other.bra;
- ket = other.ket;
- }
-
- protected boolean in_grouping(char [] s, int min, int max)
- {
- if (cursor >= limit) return false;
- char ch = current.charAt(cursor);
- if (ch > max || ch < min) return false;
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
- cursor++;
- return true;
- }
-
- protected boolean in_grouping_b(char [] s, int min, int max)
- {
- if (cursor <= limit_backward) return false;
- char ch = current.charAt(cursor - 1);
- if (ch > max || ch < min) return false;
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
- cursor--;
- return true;
- }
-
- protected boolean out_grouping(char [] s, int min, int max)
- {
- if (cursor >= limit) return false;
- char ch = current.charAt(cursor);
- if (ch > max || ch < min) {
- cursor++;
- return true;
- }
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
- cursor ++;
- return true;
- }
- return false;
- }
-
- protected boolean out_grouping_b(char [] s, int min, int max)
- {
- if (cursor <= limit_backward) return false;
- char ch = current.charAt(cursor - 1);
- if (ch > max || ch < min) {
- cursor--;
- return true;
- }
- ch -= min;
- if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
- cursor--;
- return true;
- }
- return false;
- }
-
- protected boolean in_range(int min, int max)
- {
- if (cursor >= limit) return false;
- char ch = current.charAt(cursor);
- if (ch > max || ch < min) return false;
- cursor++;
- return true;
- }
-
- protected boolean in_range_b(int min, int max)
- {
- if (cursor <= limit_backward) return false;
- char ch = current.charAt(cursor - 1);
- if (ch > max || ch < min) return false;
- cursor--;
- return true;
- }
-
- protected boolean out_range(int min, int max)
- {
- if (cursor >= limit) return false;
- char ch = current.charAt(cursor);
- if (!(ch > max || ch < min)) return false;
- cursor++;
- return true;
- }
-
- protected boolean out_range_b(int min, int max)
- {
- if (cursor <= limit_backward) return false;
- char ch = current.charAt(cursor - 1);
- if(!(ch > max || ch < min)) return false;
- cursor--;
- return true;
- }
-
- protected boolean eq_s(int s_size, String s)
- {
- if (limit - cursor < s_size) return false;
- int i;
- for (i = 0; i != s_size; i++) {
- if (current.charAt(cursor + i) != s.charAt(i)) return false;
- }
- cursor += s_size;
- return true;
- }
-
- protected boolean eq_s_b(int s_size, String s)
- {
- if (cursor - limit_backward < s_size) return false;
- int i;
- for (i = 0; i != s_size; i++) {
- if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false;
- }
- cursor -= s_size;
- return true;
- }
-
- protected boolean eq_v(CharSequence s)
- {
- return eq_s(s.length(), s.toString());
- }
-
- protected boolean eq_v_b(CharSequence s)
- { return eq_s_b(s.length(), s.toString());
- }
-
- protected int find_among(Among v[], int v_size)
- {
- int i = 0;
- int j = v_size;
-
- int c = cursor;
- int l = limit;
-
- int common_i = 0;
- int common_j = 0;
-
- boolean first_key_inspected = false;
-
- while(true) {
- int k = i + ((j - i) >> 1);
- int diff = 0;
- int common = common_i < common_j ? common_i : common_j; // smaller
- Among w = v[k];
- int i2;
- for (i2 = common; i2 < w.s_size; i2++) {
- if (c + common == l) {
- diff = -1;
- break;
- }
- diff = current.charAt(c + common) - w.s[i2];
- if (diff != 0) break;
- common++;
- }
- if (diff < 0) {
- j = k;
- common_j = common;
- } else {
- i = k;
- common_i = common;
- }
- if (j - i <= 1) {
- if (i > 0) break; // v->s has been inspected
- if (j == i) break; // only one item in v
-
- // - but now we need to go round once more to get
- // v->s inspected. This looks messy, but is actually
- // the optimal approach.
-
- if (first_key_inspected) break;
- first_key_inspected = true;
- }
- }
- while(true) {
- Among w = v[i];
- if (common_i >= w.s_size) {
- cursor = c + w.s_size;
- if (w.method == null) return w.result;
- boolean res;
- try {
- Object resobj = w.method.invoke(w.methodobject,
- new Object[0]);
- res = resobj.toString().equals("true");
- } catch (InvocationTargetException e) {
- res = false;
- // FIXME - debug message
- } catch (IllegalAccessException e) {
- res = false;
- // FIXME - debug message
- }
- cursor = c + w.s_size;
- if (res) return w.result;
- }
- i = w.substring_i;
- if (i < 0) return 0;
- }
- }
-
- // find_among_b is for backwards processing. Same comments apply
- protected int find_among_b(Among v[], int v_size)
- {
- int i = 0;
- int j = v_size;
-
- int c = cursor;
- int lb = limit_backward;
-
- int common_i = 0;
- int common_j = 0;
-
- boolean first_key_inspected = false;
-
- while(true) {
- int k = i + ((j - i) >> 1);
- int diff = 0;
- int common = common_i < common_j ? common_i : common_j;
- Among w = v[k];
- int i2;
- for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
- if (c - common == lb) {
- diff = -1;
- break;
- }
- diff = current.charAt(c - 1 - common) - w.s[i2];
- if (diff != 0) break;
- common++;
- }
- if (diff < 0) {
- j = k;
- common_j = common;
- } else {
- i = k;
- common_i = common;
- }
- if (j - i <= 1) {
- if (i > 0) break;
- if (j == i) break;
- if (first_key_inspected) break;
- first_key_inspected = true;
- }
- }
- while(true) {
- Among w = v[i];
- if (common_i >= w.s_size) {
- cursor = c - w.s_size;
- if (w.method == null) return w.result;
-
- boolean res;
- try {
- Object resobj = w.method.invoke(w.methodobject,
- new Object[0]);
- res = resobj.toString().equals("true");
- } catch (InvocationTargetException e) {
- res = false;
- // FIXME - debug message
- } catch (IllegalAccessException e) {
- res = false;
- // FIXME - debug message
- }
- cursor = c - w.s_size;
- if (res) return w.result;
- }
- i = w.substring_i;
- if (i < 0) return 0;
- }
- }
-
- /* to replace chars between c_bra and c_ket in current by the
- * chars in s.
- */
- protected int replace_s(int c_bra, int c_ket, String s)
- {
- int adjustment = s.length() - (c_ket - c_bra);
- current.replace(c_bra, c_ket, s);
- limit += adjustment;
- if (cursor >= c_ket) cursor += adjustment;
- else if (cursor > c_bra) cursor = c_bra;
- return adjustment;
- }
-
- protected void slice_check()
- {
- if (bra < 0 ||
- bra > ket ||
- ket > limit ||
- limit > current.length()) // this line could be removed
- {
- System.err.println("faulty slice operation");
- // FIXME: report error somehow.
- /*
- fprintf(stderr, "faulty slice operation:\n");
- debug(z, -1, 0);
- exit(1);
- */
- }
- }
-
- protected void slice_from(String s)
- {
- slice_check();
- replace_s(bra, ket, s);
- }
-
- protected void slice_from(CharSequence s)
- {
- slice_from(s.toString());
- }
-
- protected void slice_del()
- {
- slice_from("");
- }
-
- protected void insert(int c_bra, int c_ket, String s)
- {
- int adjustment = replace_s(c_bra, c_ket, s);
- if (c_bra <= bra) bra += adjustment;
- if (c_bra <= ket) ket += adjustment;
- }
-
- protected void insert(int c_bra, int c_ket, CharSequence s)
- {
- insert(c_bra, c_ket, s.toString());
- }
-
- /* Copy the slice into the supplied StringBuffer */
- protected StringBuffer slice_to(StringBuffer s)
- {
- slice_check();
- int len = ket - bra;
- s.replace(0, s.length(), current.substring(bra, ket));
- return s;
- }
-
- /* Copy the slice into the supplied StringBuilder */
- protected StringBuilder slice_to(StringBuilder s)
- {
- slice_check();
- int len = ket - bra;
- s.replace(0, s.length(), current.substring(bra, ket));
- return s;
- }
-
- protected StringBuffer assign_to(StringBuffer s)
- {
- s.replace(0, s.length(), current.substring(0, limit));
- return s;
- }
-
- protected StringBuilder assign_to(StringBuilder s)
- {
- s.replace(0, s.length(), current.substring(0, limit));
- return s;
- }
-
-/*
-extern void debug(struct SN_env * z, int number, int line_count)
-{ int i;
- int limit = SIZE(z->p);
- //if (number >= 0) printf("%3d (line %4d): '", number, line_count);
- if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
- for (i = 0; i <= limit; i++)
- { if (z->lb == i) printf("{");
- if (z->bra == i) printf("[");
- if (z->c == i) printf("|");
- if (z->ket == i) printf("]");
- if (z->l == i) printf("}");
- if (i < limit)
- { int ch = z->p[i];
- if (ch == 0) ch = '#';
- printf("%c", ch);
- }
- }
- printf("'\n");
-}
-*/
-
-};
diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java
deleted file mode 100644
index 960bd55f6..000000000
--- a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java
+++ /dev/null
@@ -1,7 +0,0 @@
-
-package org.tartarus.snowball;
-import java.lang.reflect.InvocationTargetException;
-
-public abstract class SnowballStemmer extends SnowballProgram {
- public abstract boolean stem();
-};
diff --git a/contrib/snowball/java/org/tartarus/snowball/TestApp.java b/contrib/snowball/java/org/tartarus/snowball/TestApp.java
deleted file mode 100644
index 38803f673..000000000
--- a/contrib/snowball/java/org/tartarus/snowball/TestApp.java
+++ /dev/null
@@ -1,77 +0,0 @@
-
-package org.tartarus.snowball;
-
-import java.lang.reflect.Method;
-import java.io.Reader;
-import java.io.Writer;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileInputStream;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.OutputStream;
-import java.io.FileOutputStream;
-
-public class TestApp {
- private static void usage()
- {
- System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]");
- }
-
- public static void main(String [] args) throws Throwable {
- if (args.length < 2) {
- usage();
- return;
- }
-
- Class stemClass = Class.forName("org.tartarus.snowball.ext." +
- args[0] + "Stemmer");
- SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();
-
- Reader reader;
- reader = new InputStreamReader(new FileInputStream(args[1]));
- reader = new BufferedReader(reader);
-
- StringBuffer input = new StringBuffer();
-
- OutputStream outstream;
-
- if (args.length > 2) {
- if (args.length >= 4 && args[2].equals("-o")) {
- outstream = new FileOutputStream(args[3]);
- } else {
- usage();
- return;
- }
- } else {
- outstream = System.out;
- }
- Writer output = new OutputStreamWriter(outstream);
- output = new BufferedWriter(output);
-
- int repeat = 1;
- if (args.length > 4) {
- repeat = Integer.parseInt(args[4]);
- }
-
- Object [] emptyArgs = new Object[0];
- int character;
- while ((character = reader.read()) != -1) {
- char ch = (char) character;
- if (Character.isWhitespace((char) ch)) {
- if (input.length() > 0) {
- stemmer.setCurrent(input.toString());
- for (int i = repeat; i != 0; i--) {
- stemmer.stem();
- }
- output.write(stemmer.getCurrent());
- output.write('\n');
- input.delete(0, input.length());
- }
- } else {
- input.append(Character.toLowerCase(ch));
- }
- }
- output.flush();
- }
-}
diff --git a/contrib/snowball/libstemmer/libstemmer_c.in b/contrib/snowball/libstemmer/libstemmer_c.in
index 4de579874..2aa918d61 100644
--- a/contrib/snowball/libstemmer/libstemmer_c.in
+++ b/contrib/snowball/libstemmer/libstemmer_c.in
@@ -22,10 +22,10 @@ sb_stemmer_list(void)
static stemmer_encoding_t
sb_getenc(const char * charenc)
{
- struct stemmer_encoding * encoding;
+ const struct stemmer_encoding * encoding;
if (charenc == NULL) return ENC_UTF_8;
for (encoding = encodings; encoding->name != 0; encoding++) {
- if (strcmp(encoding->name, charenc) == 0) break;
+ if (strcmp(encoding->name, charenc) == 0) break;
}
if (encoding->name == NULL) return ENC_UNKNOWN;
return encoding->enc;
@@ -35,14 +35,14 @@ extern struct sb_stemmer *
sb_stemmer_new(const char * algorithm, const char * charenc)
{
stemmer_encoding_t enc;
- struct stemmer_modules * module;
+ const struct stemmer_modules * module;
struct sb_stemmer * stemmer;
enc = sb_getenc(charenc);
if (enc == ENC_UNKNOWN) return NULL;
for (module = modules; module->name != 0; module++) {
- if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
+ if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
}
if (module->name == NULL) return NULL;
@@ -67,9 +67,10 @@ void
sb_stemmer_delete(struct sb_stemmer * stemmer)
{
if (stemmer == 0) return;
- if (stemmer->close == 0) return;
- stemmer->close(stemmer->env);
- stemmer->close = 0;
+ if (stemmer->close) { /* a null close hook now only skips the env teardown, not the free() below */
+ stemmer->close(stemmer->env); /* release the algorithm environment through its own close hook */
+ stemmer->close = 0;
+ }
free(stemmer);
}
diff --git a/contrib/snowball/libstemmer/mkmodules.pl b/contrib/snowball/libstemmer/mkmodules.pl
index ff8c19e7c..dd6678759 100755
--- a/contrib/snowball/libstemmer/mkmodules.pl
+++ b/contrib/snowball/libstemmer/mkmodules.pl
@@ -1,10 +1,12 @@
-#!/usr/bin/perl -w
+#!/usr/bin/env perl
use strict;
+use 5.006;
+use warnings;
my $progname = $0;
if (scalar @ARGV < 4 || scalar @ARGV > 5) {
- print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<extn>]\n";
+ print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n";
exit 1;
}
@@ -12,9 +14,11 @@ my $outname = shift(@ARGV);
my $c_src_dir = shift(@ARGV);
my $descfile = shift(@ARGV);
my $srclistfile = shift(@ARGV);
+my $enc_only;
my $extn = '';
if (@ARGV) {
- $extn = '_'.shift(@ARGV);
+ $enc_only = shift(@ARGV);
+ $extn = '_'.$enc_only;
}
my %aliases = ();
@@ -27,6 +31,14 @@ sub addalgenc($$) {
my $alg = shift();
my $enc = shift();
+ if (defined $enc_only) {
+ my $norm_enc = lc $enc;
+ $norm_enc =~ s/_//g;
+ if ($norm_enc ne $enc_only) {
+ return;
+ }
+ }
+
if (defined $algorithm_encs{$alg}) {
my $hashref = $algorithm_encs{$alg};
$$hashref{$enc}=1;
@@ -42,7 +54,7 @@ sub readinput()
{
open DESCFILE, $descfile;
my $line;
- while($line = <DESCFILE>)
+ while ($line = <DESCFILE>)
{
next if $line =~ m/^\s*#/;
next if $line =~ m/^\s*$/;
@@ -123,7 +135,7 @@ struct stemmer_encoding {
const char * name;
stemmer_encoding_t enc;
};
-static struct stemmer_encoding encodings[] = {
+static const struct stemmer_encoding encodings[] = {
EOS
for $enc (sort keys %encs) {
print OUT " {\"${enc}\", ENC_${enc}},\n";
@@ -139,7 +151,7 @@ struct stemmer_modules {
void (*close)(struct SN_env *);
int (*stem)(struct SN_env *);
};
-static struct stemmer_modules modules[] = {
+static const struct stemmer_modules modules[] = {
EOS
for $lang (sort keys %aliases) {
@@ -162,7 +174,6 @@ static const char * algorithm_names[] = {
EOS
for $lang (@algorithms) {
- my $l = $aliases{$lang};
print OUT " \"$lang\", \n";
}
diff --git a/contrib/snowball/libstemmer/modules.txt b/contrib/snowball/libstemmer/modules.txt
index d237c1d8d..f6dcc7e92 100644
--- a/contrib/snowball/libstemmer/modules.txt
+++ b/contrib/snowball/libstemmer/modules.txt
@@ -9,27 +9,35 @@
# List all the main algorithms for each language, in UTF-8, and also with
# the most commonly used encoding.
-danish UTF_8,ISO_8859_1 danish,da,dan
-dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
-english UTF_8,ISO_8859_1 english,en,eng
-finnish UTF_8,ISO_8859_1 finnish,fi,fin
-french UTF_8,ISO_8859_1 french,fr,fre,fra
-german UTF_8,ISO_8859_1 german,de,ger,deu
-hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
-italian UTF_8,ISO_8859_1 italian,it,ita
-norwegian UTF_8,ISO_8859_1 norwegian,no,nor
-portuguese UTF_8,ISO_8859_1 portuguese,pt,por
-romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
-russian UTF_8,KOI8_R russian,ru,rus
-spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
-swedish UTF_8,ISO_8859_1 swedish,sv,swe
+arabic UTF_8 arabic,ar,ara
+danish UTF_8 danish,da,dan
+dutch UTF_8 dutch,nl,dut,nld
+english UTF_8 english,en,eng
+finnish UTF_8 finnish,fi,fin
+french UTF_8 french,fr,fre,fra
+german UTF_8 german,de,ger,deu
+greek UTF_8 greek,el,gre,ell
+hindi UTF_8 hindi,hi,hin
+hungarian UTF_8 hungarian,hu,hun
+indonesian UTF_8 indonesian,id,ind
+italian UTF_8 italian,it,ita
+lithuanian UTF_8 lithuanian,lt,lit
+nepali UTF_8 nepali,ne,nep
+norwegian UTF_8 norwegian,no,nor
+portuguese UTF_8 portuguese,pt,por
+romanian UTF_8 romanian,ro,rum,ron
+russian UTF_8 russian,ru,rus
+serbian UTF_8 serbian,sr,srp
+spanish UTF_8 spanish,es,esl,spa
+swedish UTF_8 swedish,sv,swe
+tamil UTF_8 tamil,ta,tam
turkish UTF_8 turkish,tr,tur
# Also include the traditional porter algorithm for english.
# The porter algorithm is included in the libstemmer distribution to assist
# with backwards compatibility, but for new systems the english algorithm
# should be used in preference.
-porter UTF_8,ISO_8859_1 porter
+porter UTF_8 porter english
# Some other stemmers in the snowball project are not included in the standard
# distribution. To compile a libstemmer with them in, add them to this list,
@@ -39,12 +47,12 @@ porter UTF_8,ISO_8859_1 porter
# algorithms are:
#
# german2 - This is a slight modification of the german stemmer.
-#german2 UTF_8,ISO_8859_1 german2
+#german2 UTF_8,ISO_8859_1 german2 german
#
# kraaij_pohlmann - This is a different dutch stemmer.
-#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
+#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
#
# lovins - This is an english stemmer, but fairly outdated, and
# only really applicable to a restricted type of input text
# (keywords in academic publications).
-#lovins UTF_8,ISO_8859_1 lovins
+#lovins UTF_8,ISO_8859_1 lovins english
diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c
index 40039ef4a..21ea5f246 100644
--- a/contrib/snowball/runtime/api.c
+++ b/contrib/snowball/runtime/api.c
@@ -2,7 +2,7 @@
#include <stdlib.h> /* for calloc, free */
#include "header.h"
-extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
+extern struct SN_env * SN_create_env(int S_size, int I_size)
{
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
if (z == NULL) return NULL;
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
if (z->I == NULL) goto error;
}
- if (B_size)
- {
- z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
- if (z->B == NULL) goto error;
- }
-
return z;
error:
SN_close_env(z, S_size);
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
free(z->S);
}
free(z->I);
- free(z->B);
if (z->p) lose_s(z->p);
free(z);
}
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
z->c = 0;
return err;
}
-
diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h
index 8b997f0c2..ba9d1c14b 100644
--- a/contrib/snowball/runtime/api.h
+++ b/contrib/snowball/runtime/api.h
@@ -16,11 +16,17 @@ struct SN_env {
int c; int l; int lb; int bra; int ket;
symbol * * S;
int * I;
- unsigned char * B;
};
-extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern struct SN_env * SN_create_env(int S_size, int I_size);
extern void SN_close_env(struct SN_env * z, int S_size);
extern int SN_set_current(struct SN_env * z, int size, const symbol * s);
+#ifdef __cplusplus
+}
+#endif
diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h
index 4d3078f50..85a42fdb8 100644
--- a/contrib/snowball/runtime/header.h
+++ b/contrib/snowball/runtime/header.h
@@ -54,5 +54,6 @@ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
extern symbol * slice_to(struct SN_env * z, symbol * p);
extern symbol * assign_to(struct SN_env * z, symbol * p);
-extern void debug(struct SN_env * z, int number, int line_count);
+extern int len_utf8(const symbol * p);
+extern void debug(struct SN_env * z, int number, int line_count);
diff --git a/contrib/snowball/runtime/utilities.c b/contrib/snowball/runtime/utilities.c
index 1840f0280..1cfd1a17d 100644
--- a/contrib/snowball/runtime/utilities.c
+++ b/contrib/snowball/runtime/utilities.c
@@ -5,8 +5,6 @@
#include "header.h"
-#define unless(C) if(!(C))
-
#define CREATE_SIZE 1
extern symbol * create_s(void) {
@@ -15,7 +13,7 @@ extern symbol * create_s(void) {
if (mem == NULL) return NULL;
p = (symbol *) (HEAD + (char *) mem);
CAPACITY(p) = CREATE_SIZE;
- SET_SIZE(p, CREATE_SIZE);
+ SET_SIZE(p, 0);
return p;
}
@@ -27,7 +25,7 @@ extern void lose_s(symbol * p) {
/*
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
- position, or 0 on failure.
+ position, or -1 on failure.
-- used to implement hop and next in the utf8 case.
*/
@@ -66,77 +64,95 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
/* Code for character groupings: utf8 cases */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
- int b0, b1;
+ int b0, b1, b2; /* b0: lead byte; b1, b2: continuation bytes already masked to their low 6 bits */
if (c >= l) return 0;
b0 = p[c++];
if (b0 < 0xC0 || c == l) { /* 1100 0000 */
- * slot = b0; return 1;
+ *slot = b0;
+ return 1; /* one byte consumed (also taken when the limit cuts the sequence short) */
}
- b1 = p[c++];
+ b1 = p[c++] & 0x3F;
if (b0 < 0xE0 || c == l) { /* 1110 0000 */
- * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
+ *slot = (b0 & 0x1F) << 6 | b1;
+ return 2;
+ }
+ b2 = p[c++] & 0x3F;
+ if (b0 < 0xF0 || c == l) { /* 1111 0000 */
+ *slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
+ return 3;
}
- * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
+ *slot = (b0 & 0x7) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F); /* 4-byte lead is 11110xxx: payload is its low three bits, so mask with 0x7 (0xE dropped bit 0) */
+ return 4;
}
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
- int b0, b1;
+ int a, b; /* a: payload bits gathered so far (low end first); b: byte most recently read while walking backwards */
if (c <= lb) return 0;
- b0 = p[--c];
- if (b0 < 0x80 || c == lb) { /* 1000 0000 */
- * slot = b0; return 1;
+ b = p[--c];
+ if (b < 0x80 || c == lb) { /* 1000 0000 */
+ *slot = b;
+ return 1; /* one byte consumed (also taken when the backward limit is reached) */
}
- b1 = p[--c];
- if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
- * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
+ a = b & 0x3F;
+ b = p[--c];
+ if (b >= 0xC0 || c == lb) { /* 1100 0000 */
+ *slot = (b & 0x1F) << 6 | a;
+ return 2;
}
- * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
+ a |= (b & 0x3F) << 6;
+ b = p[--c];
+ if (b >= 0xE0 || c == lb) { /* 1110 0000 */
+ *slot = (b & 0xF) << 12 | a;
+ return 3;
+ }
+ *slot = (p[--c] & 0x7) << 18 | (b & 0x3F) << 12 | a; /* 4-byte lead is 11110xxx: payload is its low three bits, so mask with 0x7 (0xE dropped bit 0) */
+ return 4;
}
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_utf8(z->p, z->c, z->l, & ch);
- unless (w) return -1;
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c += w;
+ int ch; /* character decoded at the cursor */
+ int w = get_utf8(z->p, z->c, z->l, & ch); /* w: bytes that character occupies, 0 if the cursor is at the limit */
+ if (!w) return -1; /* nothing left to read */
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) /* outside [min, max], or its bit is clear in bitmap s */
+ return w; /* not in the grouping: return its width, cursor left unchanged */
+ z->c += w; /* in the grouping: step forward over it */
} while (repeat);
return 0;
}
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_b_utf8(z->p, z->c, z->lb, & ch);
- unless (w) return -1;
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c -= w;
+ int ch; /* character decoded just before the cursor */
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch); /* w: bytes that character occupies, 0 if the cursor is at the backward limit */
+ if (!w) return -1; /* nothing left to read */
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) /* outside [min, max], or its bit is clear in bitmap s */
+ return w; /* not in the grouping: return its width, cursor left unchanged */
+ z->c -= w; /* in the grouping: step backward over it */
} while (repeat);
return 0;
}
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_utf8(z->p, z->c, z->l, & ch);
- unless (w) return -1;
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c += w;
+ int ch; /* character decoded at the cursor */
+ int w = get_utf8(z->p, z->c, z->l, & ch); /* w: bytes that character occupies, 0 if the cursor is at the limit */
+ if (!w) return -1; /* nothing left to read */
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) /* true when the character IS in the grouping */
+ return w; /* in the grouping: return its width, cursor left unchanged */
+ z->c += w; /* not in the grouping: step forward over it */
} while (repeat);
return 0;
}
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- int w = get_b_utf8(z->p, z->c, z->lb, & ch);
- unless (w) return -1;
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return w;
- z->c -= w;
+ int ch; /* character decoded just before the cursor */
+ int w = get_b_utf8(z->p, z->c, z->lb, & ch); /* w: bytes that character occupies, 0 if the cursor is at the backward limit */
+ if (!w) return -1; /* nothing left to read */
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) /* true when the character IS in the grouping */
+ return w; /* in the grouping: return its width, cursor left unchanged */
+ z->c -= w; /* not in the grouping: step backward over it */
} while (repeat);
return 0;
}
@@ -145,48 +161,48 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min,
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c >= z->l) return -1;
- ch = z->p[z->c];
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c++;
+ int ch;
+ if (z->c >= z->l) return -1;
+ ch = z->p[z->c];
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c++;
} while (repeat);
return 0;
}
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c <= z->lb) return -1;
- ch = z->p[z->c - 1];
- if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c--;
+ int ch;
+ if (z->c <= z->lb) return -1;
+ ch = z->p[z->c - 1];
+ if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
+ return 1;
+ z->c--;
} while (repeat);
return 0;
}
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c >= z->l) return -1;
- ch = z->p[z->c];
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c++;
+ int ch;
+ if (z->c >= z->l) return -1;
+ ch = z->p[z->c];
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
+ return 1;
+ z->c++;
} while (repeat);
return 0;
}
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
- int ch;
- if (z->c <= z->lb) return -1;
- ch = z->p[z->c - 1];
- unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
- return 1;
- z->c--;
+ int ch;
+ if (z->c <= z->lb) return -1;
+ ch = z->p[z->c - 1];
+ if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
+ return 1;
+ z->c--;
} while (repeat);
return 0;
}
@@ -215,7 +231,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;
int c = z->c; int l = z->l;
- symbol * q = z->p + c;
+ const symbol * q = z->p + c;
const struct among * w;
@@ -224,7 +240,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int first_key_inspected = 0;
- while(1) {
+ while (1) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; /* smaller */
@@ -237,8 +253,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
common++;
}
}
- if (diff < 0) { j = k; common_j = common; }
- else { i = k; common_i = common; }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
if (j - i <= 1) {
if (i > 0) break; /* v->s has been inspected */
if (j == i) break; /* only one item in v */
@@ -251,7 +272,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
first_key_inspected = 1;
}
}
- while(1) {
+ while (1) {
w = v + i;
if (common_i >= w->s_size) {
z->c = c + w->s_size;
@@ -275,7 +296,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;
int c = z->c; int lb = z->lb;
- symbol * q = z->p + c - 1;
+ const symbol * q = z->p + c - 1;
const struct among * w;
@@ -284,7 +305,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int first_key_inspected = 0;
- while(1) {
+ while (1) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
@@ -306,7 +327,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
first_key_inspected = 1;
}
}
- while(1) {
+ while (1) {
w = v + i;
if (common_i >= w->s_size) {
z->c = c - w->s_size;
@@ -367,11 +388,10 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const
z->l += adjustment;
if (z->c >= c_ket)
z->c += adjustment;
- else
- if (z->c > c_bra)
- z->c = c_bra;
+ else if (z->c > c_bra)
+ z->c = c_bra;
}
- unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
+ if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (adjptr != NULL)
*adjptr = adjustment;
return 0;
@@ -417,12 +437,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo
}
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
- int adjustment;
- if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
- return -1;
- if (bra <= z->bra) z->bra += adjustment;
- if (bra <= z->ket) z->ket += adjustment;
- return 0;
+ return insert_s(z, bra, ket, SIZE(p), p);
}
extern symbol * slice_to(struct SN_env * z, symbol * p) {
@@ -455,6 +470,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) {
return p;
}
+extern int len_utf8(const symbol * p) {
+ int size = SIZE(p);
+ int len = 0;
+ while (size--) {
+ symbol b = *p++;
+ if (b >= 0xC0 || b < 0x80) ++len;
+ }
+ return len;
+}
+
#if 0
extern void debug(struct SN_env * z, int number, int line_count) {
int i;