diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-25 09:55:31 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-25 09:55:31 +0000 |
commit | b87995255fa2ef0de97d509b8cd27860f014e90f (patch) | |
tree | ff7fcc84aa85fcd4cd129d94f6fb23ac5f91d4cb /contrib/snowball | |
parent | 52154a6c1dd7e46c174d4aab782494b92f955df5 (diff) | |
download | rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.tar.gz rspamd-b87995255fa2ef0de97d509b8cd27860f014e90f.zip |
[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8
Diffstat (limited to 'contrib/snowball')
70 files changed, 8903 insertions, 6045 deletions
diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt index 7910c7b3a..015f75d1b 100644 --- a/contrib/snowball/CMakeLists.txt +++ b/contrib/snowball/CMakeLists.txt @@ -1,20 +1,14 @@ # End of configuration -SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian - italian norwegian porter portuguese romanian - russian spanish swedish turkish) -SET(KOI8_ALGORITHMS russian) -SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian - norwegian porter portuguese spanish swedish) -SET(ISO_8859_2_ALGORITHMS hungarian romanian) -SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins) -SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS}) +SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian + indonesian italian lithuanian nepali norwegian porter portuguese romanian + russian serbian spanish swedish tamil turkish) +SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS}) SET(COMPILER_SOURCES compiler/space.c compiler/tokeniser.c compiler/analyser.c compiler/generator.c - compiler/driver.c - compiler/generator_java.c) + compiler/driver.c) SET(SNOWBALL_RUNTIME runtime/api.c runtime/utilities.c) @@ -24,9 +18,15 @@ SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c) #LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h #LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in -SET(STEMWORDS_SOURCES examples/stemwords.c) SET(MODULES_H "modules.h") CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY) +ADD_DEFINITIONS("-DDISABLE_JS") +ADD_DEFINITIONS("-DDISABLE_GO") +ADD_DEFINITIONS("-DDISABLE_JAVA") +ADD_DEFINITIONS("-DDISABLE_PYTHON") +ADD_DEFINITIONS("-DDISABLE_CSHARP") +ADD_DEFINITIONS("-DDISABLE_PASCAL") +ADD_DEFINITIONS("-DDISABLE_RUST") MACRO(gen_stem IN ENCODING) FOREACH(_it ${IN}) @@ -34,7 +34,7 @@ MACRO(gen_stem IN 
ENCODING) SET(_header "${_base}.h") SET(_source "${_base}.c") STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}") - SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl") + SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl") IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input}) ADD_CUSTOM_COMMAND(OUTPUT ${_source} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u @@ -57,7 +57,7 @@ INCLUDE_DIRECTORIES("include") ADD_EXECUTABLE(snowball ${COMPILER_SOURCES}) ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h - COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt libstemmer/mkinc.mak) + COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak) ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h") SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c") @@ -65,13 +65,6 @@ ADD_CUSTOM_TARGET(stemmer_deps ALL) ADD_DEPENDENCIES(stemmer_deps modules) gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8") -gen_stem("${KOI8_ALGORITHMS}" "KOI8_R") -gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1") -gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2") - ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES}) ADD_DEPENDENCIES(stemmer stemmer_deps) - -ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES}) -TARGET_LINK_LIBRARIES(stemwords stemmer) diff --git a/contrib/snowball/GNUmakefile b/contrib/snowball/GNUmakefile deleted file mode 100644 index a30cafd89..000000000 --- a/contrib/snowball/GNUmakefile +++ /dev/null @@ -1,300 +0,0 @@ -# -*- makefile -*- - 
-c_src_dir = src_c -java_src_main_dir = java/org/tartarus/snowball -java_src_dir = $(java_src_main_dir)/ext - -libstemmer_algorithms = danish dutch english finnish french german hungarian \ - italian \ - norwegian porter portuguese romanian \ - russian spanish swedish turkish - -KOI8_R_algorithms = russian -ISO_8859_1_algorithms = danish dutch english finnish french german italian \ - norwegian porter portuguese spanish swedish -ISO_8859_2_algorithms = hungarian romanian - -other_algorithms = german2 kraaij_pohlmann lovins - -all_algorithms = $(libstemmer_algorithms) $(other_algorithms) - -COMPILER_SOURCES = compiler/space.c \ - compiler/tokeniser.c \ - compiler/analyser.c \ - compiler/generator.c \ - compiler/driver.c \ - compiler/generator_java.c -COMPILER_HEADERS = compiler/header.h \ - compiler/syswords.h \ - compiler/syswords2.h - -RUNTIME_SOURCES = runtime/api.c \ - runtime/utilities.c -RUNTIME_HEADERS = runtime/api.h \ - runtime/header.h - -JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ - java/org/tartarus/snowball/SnowballProgram.java \ - java/org/tartarus/snowball/SnowballStemmer.java \ - java/org/tartarus/snowball/TestApp.java - -LIBSTEMMER_SOURCES = libstemmer/libstemmer.c -LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c -LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h -LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in - -STEMWORDS_SOURCES = examples/stemwords.c - -ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl) -C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ - $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ - $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \ - $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c) -C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ - $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \ - 
$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \ - $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h) -C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) -C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) -JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) - -COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) -RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) -LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o) -LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o) -STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o) -C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o) -C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o) -JAVA_CLASSES = $(JAVA_SOURCES:.java=.class) -JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class) - -CFLAGS=-Iinclude -O2 -CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations - -all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) - -clean: - rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \ - $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \ - libstemmer.o stemwords \ - libstemmer/modules.h \ - libstemmer/modules_utf8.h \ - snowball.splint \ - $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \ - $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \ - $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \ - libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \ - libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c - rm -rf dist - rmdir $(c_src_dir) || true - -snowball: $(COMPILER_OBJECTS) - $(CC) -o $@ $^ - -$(COMPILER_OBJECTS): $(COMPILER_HEADERS) - -libstemmer/libstemmer.c: libstemmer/libstemmer_c.in - sed 's/@MODULES_H@/modules.h/' $^ >$@ - -libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in - sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@ - -libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt - libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak - 
-libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt - libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8 - -libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS) - -libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS) - $(AR) -cru $@ $^ - -stemwords: $(STEMWORDS_OBJECTS) libstemmer.o - $(CC) -o $@ $^ - -algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl - cp $^ $@ - -$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball - @mkdir -p $(c_src_dir) - @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ - o="$(c_src_dir)/stem_UTF_8_$${l}"; \ - echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \ - ./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u - -$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball - @mkdir -p $(c_src_dir) - @l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \ - o="$(c_src_dir)/stem_KOI8_R_$${l}"; \ - echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \ - ./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime - -$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball - @mkdir -p $(c_src_dir) - @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \ - o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \ - echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \ - ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime - -$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball - @mkdir -p $(c_src_dir) - @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \ - o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \ - echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \ - ./snowball $< -o $${o} 
-eprefix $${l}_ISO_8859_2_ -r ../runtime - -$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< - -$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball - @mkdir -p $(java_src_dir) - @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ - o="$(java_src_dir)/$${l}Stemmer"; \ - echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \ - ./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer - -splint: snowball.splint -snowball.splint: $(COMPILER_SOURCES) - splint $^ >$@ -weak - -# Make a full source distribution -dist: dist_snowball dist_libstemmer_c dist_libstemmer_java - -# Make a distribution of all the sources involved in snowball -dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \ - $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ - $(LIBSTEMMER_SOURCES) \ - $(LIBSTEMMER_UTF8_SOURCES) \ - $(LIBSTEMMER_HEADERS) \ - $(LIBSTEMMER_EXTRA) \ - $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \ - GNUmakefile README doc/TODO libstemmer/mkmodules.pl - destname=snowball_code; \ - dest=dist/$${destname}; \ - rm -rf $${dest} && \ - rm -f $${dest}.tgz && \ - for file in $^; do \ - dir=`dirname $$file` && \ - mkdir -p $${dest}/$${dir} && \ - cp -a $${file} $${dest}/$${dir} || exit 1 ; \ - done && \ - (cd dist && tar zcf $${destname}.tgz $${destname}) && \ - rm -rf $${dest} - -# Make a distribution of all the sources required to compile the C library. 
-dist_libstemmer_c: \ - $(RUNTIME_SOURCES) \ - $(RUNTIME_HEADERS) \ - $(LIBSTEMMER_SOURCES) \ - $(LIBSTEMMER_UTF8_SOURCES) \ - $(LIBSTEMMER_HEADERS) \ - $(LIBSTEMMER_EXTRA) \ - $(C_LIB_SOURCES) \ - $(C_LIB_HEADERS) \ - libstemmer/mkinc.mak \ - libstemmer/mkinc_utf8.mak - destname=libstemmer_c; \ - dest=dist/$${destname}; \ - rm -rf $${dest} && \ - rm -f $${dest}.tgz && \ - mkdir -p $${dest} && \ - cp -a doc/libstemmer_c_README $${dest}/README && \ - mkdir -p $${dest}/examples && \ - cp -a examples/stemwords.c $${dest}/examples && \ - mkdir -p $${dest}/$(c_src_dir) && \ - cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \ - mkdir -p $${dest}/runtime && \ - cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \ - mkdir -p $${dest}/libstemmer && \ - cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \ - mkdir -p $${dest}/include && \ - mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \ - (cd $${dest} && \ - echo "README" >> MANIFEST && \ - ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \ - ls runtime/*.c runtime/*.h >> MANIFEST && \ - ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \ - ls include/*.h >> MANIFEST) && \ - cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \ - echo 'include mkinc.mak' >> $${dest}/Makefile && \ - echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \ - echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \ - echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \ - echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \ - echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \ - echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \ - echo 'clean:' >> $${dest}/Makefile && \ - echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \ - (cd dist && tar zcf $${destname}.tgz $${destname}) && \ - rm -rf $${dest} - -# Make a distribution of all the 
sources required to compile the Java library. -dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ - $(LIBSTEMMER_EXTRA) \ - $(JAVA_SOURCES) - destname=libstemmer_java; \ - dest=dist/$${destname}; \ - rm -rf $${dest} && \ - rm -f $${dest}.tgz && \ - mkdir -p $${dest} && \ - cp -a doc/libstemmer_java_README $${dest}/README && \ - mkdir -p $${dest}/$(java_src_dir) && \ - cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \ - mkdir -p $${dest}/$(java_src_main_dir) && \ - cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \ - (cd $${dest} && \ - echo "README" >> MANIFEST && \ - ls $(java_src_dir)/*.java >> MANIFEST && \ - ls $(java_src_main_dir)/*.java >> MANIFEST) && \ - (cd dist && tar zcf $${destname}.tgz $${destname}) && \ - rm -rf $${dest} - -check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r - -check_utf8: $(libstemmer_algorithms:%=check_utf8_%) - -check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%) - -check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%) - -check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%) - -# Where the data files are located - assumed their repo is checked out as -# a sibling to this one. 
-STEMMING_DATA = ../snowball-data - -check_utf8_%: $(STEMMING_DATA)/% stemwords - @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8" - @./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt - @diff -u $</output.txt tmp.txt - @if [ -e $</diffs.txt ] ; \ - then \ - ./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \ - diff -u $</diffs.txt tmp.txt; \ - fi - @rm tmp.txt - -check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords - @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1" - @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \ - ./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt - @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \ - diff -u - tmp.txt - @rm tmp.txt - -check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords - @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2" - @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \ - ./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt - @python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \ - diff -u - tmp.txt - @rm tmp.txt - -check_koi8r_%: $(STEMMING_DATA)/% stemwords - @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R" - @python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \ - ./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt - @python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \ - diff -u - tmp.txt - @rm tmp.txt diff --git a/contrib/snowball/NEWS b/contrib/snowball/NEWS new file mode 100644 index 000000000..c71c12dd3 --- /dev/null +++ b/contrib/snowball/NEWS @@ -0,0 +1,407 @@ +Snowball 2.0.0 (2019-10-02) +=========================== + +C/C++ +----- + +* Fully handle 4-byte UTF-8 sequences. 
Previously `hop` and `next` handled + sequences of any length, but commands which look at the character value only + handled sequences up to length 3. Fixes #89. + +* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`. + +Java +---- + +* TestApp.java: + + - Always use UTF-8 for I/O. Patch from David Corbett (#80). + + - Allow reading input from stdin. + + - Remove rather pointless "stem n times" feature. + + - Only lower case ASCII to match stemwords.c. + + - Stem empty lines too to match stemwords.c. + +Code Quality Improvements +------------------------- + +* Fix various warnings from newer compilers. + +* Improve use of `const`. + +* Share common functions between compiler backends rather than having multiple + copies of the same code. + +* Assorted code clean-up. + +* Initialise line_labelled member of struct generator to 0. Previously we were + invoking undefined behaviour, though in practice it'll be zero initialised on + most platforms. + +New Code Generators +------------------- + +* Add Python generator (#24). Originally written by Yoshiki Shibukawa, with + additional updates by Dmitry Shachnev. + +* Add Javascript generator. Based on JSX generator (#26) written by Yoshiki + Shibukawa. + +* Add Rust generator from Jakob Demler (#51). + +* Add Go generator from Marty Schoch (#57). + +* Add C# generator. Based on patch from Cesar Souza (#16, #17). + +* Add Pascal generator. Based on Delphi backend from stemming.zip file on old + website (#75). + +New Language Features +--------------------- + +* Add `len` and `lenof` to measure Unicode length. These are similar to `size` + and `sizeof` (respectively), but `size` and `sizeof` return the length in + bytes under `-utf8`, whereas these new commands give the same result whether + using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in + the length of the string). 
For compatibility with existing code which might + use these as variable or function names, they stop being treated as tokens if + declared to be a variable or function. + +* New `{U+1234}` stringdef notation for Unicode codepoints. + +* More versatile integer tests. Now you can compare any two arithmetic + expressions with a relational operator in parentheses after the `$`, so for + example `$(len > 3)` can now be used when previously a temporary variable was + required: `$tmp = len $tmp > 3` + +Code generation improvements +---------------------------- + +* General: + + + Avoid unnecessarily saving and restoring of the cursor for more commands - + `atlimit`, `do`, `set` and `unset` all leave the cursor alone or always + restore its value, and for C `booltest` (which other languages already + handled). + + + Special case handling for `setlimit tomark AE`. All uses of setlimit in + the current stemmers we ship follow this pattern, and by special-casing we + can avoid having to save and restore the cursor (#74). + + + Merge duplicate actions in the same `among`. This reduces the size of the + switch/if-chain in the generated code which dispatch the among for many of + the stemmers. + + + Generate simpler code for `among`. We always check for a zero return value + when we call the among, so there's no point also checking for that in the + switch/if-chain. We can also avoid the switch/if-chain entirely when + there's only one possible outcome (besides the zero return). + + + Optimise code generated for `do <function call>`. This speeds up "make + check_python" by about 2%, and should speed up other interpreted languages + too (#110). + + + Generate more and better comments referencing snowball source. + + + Add homepage URL and compiler version as comments in generated files. + +* C/C++: + + + Fix `size` and `sizeof` to not report one too high (reported by Assem + Chelli in #32). 
+ + + If signal `f` from a function call would lead to return from the current + function then handle this and bailing out on an error together with a + simple `if (ret <= 0) return ret;` + + + Inline testing for a single character literals. + + + Avoiding generating `|| 0` in corner case - this can result in a compiler + warning when building the generated code. + + + Implement `insert_v()` in terms of `insert_s()`. + + + Add conditional `extern "C"` so `runtime/api.h` can be included from C++ + code. Closes #90, reported by vvarma. + +* Java: + + + Fix functions in `among` to work in Java. We seem to need to make the + methods called from among `public` instead of `private`, and to call them + on `this` instead of the `methodObject` (which is cleaner anyway). No + revision in version control seems to generate working code for this case, + but Richard says it definitely used to work - possibly older JVMs failed to + correctly enforce the access controls when methods were invoked by + reflection. + + + Code after handling `f` by returning from the current function is + unreachable too. + + + Previously we incorrectly decided that code after an `or` was + unreachable in certain cases. None of the current stemmers in the + distribution triggered this, but Martin Porter's snowball version + of the Schinke Latin stemmer does. Fixes #58, reported by Alexander + Myltsev. + + + The reachability logic was failing to consider reachability from + the final command in an `or`. Fixes #82, reported by David Corbett. + + + Fix `maxint` and `minint`. Patch from David Corbett in #31. + + + Fix `$` on strings. The previous generated code was just wrong. This + doesn't affect any of the included algorithms, but for example breaks + Martin Porter's snowball implementation of Schinke's Latin Stemmer. + Issue noted by Jakob Demler while working on the Rust backend in #51, + and reported in the Schinke's Latin Stemmer by Alexander Myltsev + in #58. 
+ + + Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43. + + + Eliminate range-check implementation for groupings. This was removed from + the C generator 10 years earlier, isn't used for any of the existing + algorithms, and it doesn't seem likely it would be - the grouping would + have to consist entirely of a contiguous block of Unicode code-points. + + + Simplify code generated for `repeat` and `atleast`. + + + Eliminate unused return values and variables from runtime functions. + + + Only import the `among` and `SnowballProgram` classes if they're actually + used. + + + Only generate `copy_from()` method if it's used. + + + Merge runtime functions `eq_s` and `eq_v`. + + + Java arrays know their own length so stop storing it separately. + + + Escape char 127 (DEL) in generated Java code. It's unlikely that this + character would actually be used in a real stemmer, so this was more of a + theoretical bug. + + + Drop unused import of InvocationTargetException from SnowballStemmer. + Reported by GerritDeMeulder in #72. + + + Fix lint check issues in generated Java code. The stemmer classes are only + referenced in the example app via reflection, so add + @SuppressWarnings("unused") for them. The stemmer classes override + equals() and hashCode() methods from the standard java Object class, so + mark these with @Override. Both suggested by GerritDeMeulder in #72. + + + Declare Java variables at point of use in generated code. Putting all + declarations at the top of the function was adding unnecessary complexity + to the Java generator code for no benefit. + + + Improve formatting of generated code. + +New stemming algorithms +----------------------- + +* Add Tamil stemmer from Damodharan Rajalingam (#2, #3). + +* Add Arabic stemmer from Assem Chelli (#32, #50). + +* Add Irish stemmer from Jim O'Regan (#48). + +* Add Nepali stemmer from Arthur Zakirov (#70). + +* Add Indonesian stemmer from Olly Betts (#71). 
+ +* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review. + +* Add Lithuanian stemmer from Dainius Jocas (#22, #76). + +* Add Greek stemmer from Oleg Smirnov (#44). + +* Add Catalan and Basque stemmers from Israel Olalla (#104). + +Behavioural changes to existing algorithms +------------------------------------------ + +* Portuguese: + + + Replace incorrect Spanish suffixes by Portuguese suffixes (#1). + +* French: + + + The MSDOS CP850 version of the French algorithm was missing changes present + in the ISO8859-1 and Unicode versions. There's now a single version of + each algorithm which was based on the Unicode version. + + + Recognize French suffixes even when they begin with diaereses. Patch from + David Corbett in #78. + +* Russian: + + + We now normalise 'ё' to 'е' before stemming. The documentation has long + said "we assume ['ё'] is mapped into ['е']" but it's more convenient for + the stemmer to actually perform this normalisation. This change has no + effect if the caller is already normalising as we recommend. It's a change + in behaviour if they aren't, but 'ё' occurs rarely (there are currently no + instances in our test vocabulary) and this improves behaviour when it does + occur. Patch from Eugene Mirotin (#65, #68). + +* Finnish: + + + Adjust the Finnish algorithm not to mangle numbers. This change also + means it tends to leave foreign words alone. Fixes #66. + +* Danish: + + + Adjust Danish algorithm not to mangle alphanumeric codes. In particular + alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000, + space1999) are no longer mangled. See #81. + +Optimisations to existing algorithms +------------------------------------ + +* Turkish: + + + Simplify uses of `test` in stemmer code. + + + Check for 'ad' or 'soyad' more efficiently, and without needing the + strlen variable. This speeds up "make check_utf8_turkish" by 11% + on x86 Linux. 
+ +* Kraaij-Pohlmann: + + + Eliminate variable x - `$p1 <= cursor` is simpler and a little more efficient + than `setmark x $x >= p1`. + +Code clarity improvements to existing algorithms +------------------------------------------------ + +* Turkish: + + + Use , for cedilla to match the conventions used in other stemmers. + +* Kraaij-Pohlmann: + + + Avoid cryptic `[among ( (])` ... `)` construct - instead use the same + `[substring] among (` ... `)` construct we do in other stemmers. + +Compiler +-------- + +* Support conventional --help and --version options. + +* Warn if -r or -ep used with backend other than C/C++. + +* Warn if encoding command line options are specified when generating code in a + language with a fixed encoding. + +* The default classname is now set based on the output filename, so `-n` is now + often no longer needed. Fixes #64. + +* Avoid potential one byte buffer over-read when parsing snowball code. + +* Avoid comparing with uninitialised array element during compilation. + +* Improve `-syntax` output for `setlimit L for C`. + +* Optimise away double negation so generators don't have to worry about + generating `--` (decrement operator in many languages). Fixes #52, reported + by David Corbett. + +* Improved compiler error and warning messages: + + - We now report FILE:LINE: before each diagnostic message. + + - Improve warnings for unused declarations/definitions. + + - Warn for variables which are used, but either never initialised + or never read. + + - Flag non-ASCII literal strings. This is an error for wide Unicode, but + only a warning for single-byte and UTF-8 which work so long as the source + encoding matches the encoding used in the generated stemmer code. + + - Improve error recovery after an undeclared `define`. We now sniff the + token after the identifier and if it is `as` we parse as a routine, + otherwise we parse as a grouping. 
Previously we always just assumed it was + a routine, which gave a confusing second error if it was a grouping. + + - Improve error recovery after an unexpected token in `among`. Previously + we acted as if the unexpected token closed the `among` (this probably + wasn't intended but just a missing `break;` in a switch statement). Now we + issue an error and try the next token. + +* Report error instead of silently truncating character values (e.g. `hex 123` + previously silently became byte 0x23 which is `#` rather than a + g-with-cedilla). + +* Enlarge the initial input buffer size to 8192 bytes and double each time we + hit the end. Snowball programs are typically a few KB in size (with the + current largest we ship being the Greek stemmer at 27KB) so the previous + approach of starting with a 10 byte input buffer and increasing its size by + 50% plus 40 bytes each time it filled was inefficient, needing up to 15 + reallocations to load greek.sbl. + +* Identify variables only used by one `routine`/`external`. This information + isn't yet used, but such variables which are also always written to before + being read can be emitted as local variables in most target languages. + +* We now allow multiple source files on command line, and allow them to be + after (or even interspersed) with options to better match modern Unix + conventions. Support for multiple source files allows specifying a single + byte character set mapping via a source file of `stringdef`. + +* Avoid infinite recursion in compiler when optimising a recursive snowball + function. Recursive functions aren't typical in snowball programs, but + the compiler shouldn't crash for any input, especially not a valid one. + We now simply limit on how deep the compiler will recurse and make the + pessimistic assumption in the unlikely event we hit this limit. + +Build system: + +* `make clean` in C libstemmer_c distribution now removes `examples/*.o`. 
+ (#59) + +* Fix all the places which previously had to have a list of stemmers to work + dynamically or be generated, so now only modules.txt needs updating to add + a new stemmer. + +* Add check_java make target which runs tests for java. + +* Support gzipped test data (the uncompressed arabic test data is too big for + github). + +* GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball + invocations for Java - these are only meaningful when generating C code. + +* Pass CFLAGS when linking which matches convention (e.g. automake does it) and + facilitates use of tools such as ASan. Fixes #84, reported by Thomas + Pointhuber. + +* Add CI builds with -std=c90 to check compiler and generated code are C90 + (#54) + +libstemmer stuff: + +* Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords. + +* Add -O2 to CFLAGS. + +* Make generated tables of encodings and modules const. + +* Fix clang static analyzer memory leak warning (in practice this code path + can never actually be taken). Patch from Patrick O. Perry (#56) + +documentation + +* Added copyright and licensing details (#10). + +* Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian + and romanian are available in ISO_8859_2. + +* Remove documentation falsely claiming that libstemmer supports CP850 + encoding. + +* CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and + new language backends. + +* Overhaul libstemmer_python_README. Most notably, replace the benchmark data + which was very out of date. 
diff --git a/contrib/snowball/algorithms/arabic.sbl b/contrib/snowball/algorithms/arabic.sbl new file mode 100644 index 000000000..d827ee7d8 --- /dev/null +++ b/contrib/snowball/algorithms/arabic.sbl @@ -0,0 +1,561 @@ +/* + * Authors: + * - Assem Chelli, < assem [dot] ch [at] gmail > + * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz> + * +*/ + +stringescapes { } + +/* the Arabic letters in Unicode */ +// Hamza +stringdef o '{U+0621}' // Hamza +stringdef ao '{U+0623}' // Hamza above Alef +stringdef ao_ '{U+0625}' // Hamza below Alef +stringdef a~ '{U+0622}' // Alef madda +stringdef wo '{U+0624}' // Hamza above waw +stringdef yo '{U+0626}' // Hamza above yeh + +// Letters +stringdef a '{U+0627}' // Alef +stringdef a_ '{U+0649}' // Alef Maksura +stringdef b '{U+0628}' // Beh +stringdef t_ '{U+0629}' // Teh_Marbuta +stringdef t '{U+062A}' // Teh +stringdef th '{U+062B}' // Theh +stringdef j '{U+062C}' // Jeem +stringdef h '{U+062D}' // Hah +stringdef x '{U+062E}' // Khah +stringdef d '{U+062F}' // Dal +stringdef dz '{U+0630}' // Thal +stringdef r '{U+0631}' // Reh +stringdef z '{U+0632}' // Zain +stringdef s '{U+0633}' // Seen +stringdef sh '{U+0634}' // Sheen +stringdef c '{U+0635}' // Sad +stringdef dh '{U+0636}' // Dad +stringdef tt '{U+0637}' // Tah +stringdef zh '{U+0638}' // Zah +stringdef i '{U+0639}' // Ain +stringdef gh '{U+063A}' // Ghain +stringdef f '{U+0641}' // Feh +stringdef q '{U+0642}' // Qaf +stringdef k '{U+0643}' // Kaf +stringdef l '{U+0644}' // Lam +stringdef m '{U+0645}' // Meem +stringdef n '{U+0646}' // Noon +stringdef e '{U+0647}' // Heh +stringdef w '{U+0648}' // Waw +stringdef y '{U+064A}' // Yeh + +// Diacritics +stringdef aan '{U+064B}' // FatHatan +stringdef uun '{U+064C}' // Dammatan +stringdef iin '{U+064D}' // Kasratan +stringdef aa '{U+064E}' // FatHa +stringdef uu '{U+064F}' // Damma +stringdef ii '{U+0650}' // Kasra +stringdef oo '{U+0652}' // Sukun +stringdef ~ '{U+0651}' // Shadda + +// Hindu–Arabic numerals 
+stringdef 0 '{U+0660}' +stringdef 1 '{U+0661}' +stringdef 2 '{U+0662}' +stringdef 3 '{U+0663}' +stringdef 4 '{U+0664}' +stringdef 5 '{U+0665}' +stringdef 6 '{U+0666}' +stringdef 7 '{U+0667}' +stringdef 8 '{U+0668}' +stringdef 9 '{U+0669}' + + +// Kasheeda +stringdef _ '{U+0640}' // Kasheeda, Tatweel + +// Shaped forms +stringdef o1 '{U+FE80}' // HAMZA +stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE +stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE +stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW +stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW +stringdef yo1 '{U+FE8B}' // YEH_HAMZA +stringdef yo2 '{U+FE8C}' // YEH_HAMZA +stringdef yo3 '{U+FE89}' // YEH_HAMZA +stringdef yo4 '{U+FE8A}' // YEH_HAMZA +stringdef a~1 '{U+FE81}' // ALEF_MADDA +stringdef a~2 '{U+FE82}' // ALEF_MADDA +stringdef wo1 '{U+FE85}' // WAW_HAMZA +stringdef wo2 '{U+FE86}' // WAW_HAMZA +stringdef a1 '{U+FE8D}' // ALEF +stringdef a2 '{U+FE8E}' // ALEF +stringdef b1 '{U+FE8F}' // BEH +stringdef b2 '{U+FE90}' // BEH +stringdef b3 '{U+FE91}' // BEH +stringdef b4 '{U+FE92}' // BEH +stringdef t_1 '{U+FE93}' // TEH_MARBUTA +stringdef t_2 '{U+FE94}' // TEH_MARBUTA +stringdef t1 '{U+FE97}' // TEH +stringdef t2 '{U+FE98}' // TEH +stringdef t3 '{U+FE95}' // TEH +stringdef t4 '{U+FE96}' // TEH +stringdef th1 '{U+FE9B}' // THEH +stringdef th2 '{U+FE9C}' // THEH +stringdef th3 '{U+FE9A}' // THEH +stringdef th4 '{U+FE99}' // THEH +stringdef j1 '{U+FE9F}' // JEEM +stringdef j2 '{U+FEA0}' // JEEM +stringdef j3 '{U+FE9D}' // JEEM +stringdef j4 '{U+FE9E}' // JEEM +stringdef h1 '{U+FEA3}' // HAH +stringdef h2 '{U+FEA4}' // HAH +stringdef h3 '{U+FEA1}' // HAH +stringdef h4 '{U+FEA2}' // HAH +stringdef x1 '{U+FEA7}' // KHAH +stringdef x2 '{U+FEA8}' // KHAH +stringdef x3 '{U+FEA5}' // KHAH +stringdef x4 '{U+FEA6}' // KHAH +stringdef d1 '{U+FEA9}' // DAL +stringdef d2 '{U+FEAA}' // DAL +stringdef dz1 '{U+FEAB}' // THAL +stringdef dz2 '{U+FEAC}' // THAL +stringdef r1 '{U+FEAD}' // REH +stringdef r2 '{U+FEAE}' // REH +stringdef z1 
'{U+FEAF}' // ZAIN +stringdef z2 '{U+FEB0}' // ZAIN +stringdef s1 '{U+FEB3}' // SEEN +stringdef s2 '{U+FEB4}' // SEEN +stringdef s3 '{U+FEB1}' // SEEN +stringdef s4 '{U+FEB2}' // SEEN +stringdef sh1 '{U+FEB7}' // SHEEN +stringdef sh2 '{U+FEB8}' // SHEEN +stringdef sh3 '{U+FEB5}' // SHEEN +stringdef sh4 '{U+FEB6}' // SHEEN +stringdef c1 '{U+FEBB}' // SAD +stringdef c2 '{U+FEBC}' // SAD +stringdef c3 '{U+FEB9}' // SAD +stringdef c4 '{U+FEBA}' // SAD +stringdef dh1 '{U+FEBF}' // DAD +stringdef dh2 '{U+FEC0}' // DAD +stringdef dh3 '{U+FEBD}' // DAD +stringdef dh4 '{U+FEBE}' // DAD +stringdef tt1 '{U+FEC3}' // TAH +stringdef tt2 '{U+FEC4}' // TAH +stringdef tt3 '{U+FEC1}' // TAH +stringdef tt4 '{U+FEC2}' // TAH +stringdef zh1 '{U+FEC7}' // ZAH +stringdef zh2 '{U+FEC8}' // ZAH +stringdef zh3 '{U+FEC5}' // ZAH +stringdef zh4 '{U+FEC6}' // ZAH +stringdef i1 '{U+FECB}' // AIN +stringdef i2 '{U+FECC}' // AIN +stringdef i3 '{U+FEC9}' // AIN +stringdef i4 '{U+FECA}' // AIN +stringdef gh1 '{U+FECF}' // GHAIN +stringdef gh2 '{U+FED0}' // GHAIN +stringdef gh3 '{U+FECD}' // GHAIN +stringdef gh4 '{U+FECE}' // GHAIN +stringdef f1 '{U+FED3}' // FEH +stringdef f2 '{U+FED4}' // FEH +stringdef f3 '{U+FED1}' // FEH +stringdef f4 '{U+FED2}' // FEH +stringdef q1 '{U+FED7}' // QAF +stringdef q2 '{U+FED8}' // QAF +stringdef q3 '{U+FED5}' // QAF +stringdef q4 '{U+FED6}' // QAF +stringdef k1 '{U+FEDB}' // KAF +stringdef k2 '{U+FEDC}' // KAF +stringdef k3 '{U+FED9}' // KAF +stringdef k4 '{U+FEDA}' // KAF +stringdef l1 '{U+FEDF}' // LAM +stringdef l2 '{U+FEE0}' // LAM +stringdef l3 '{U+FEDD}' // LAM +stringdef l4 '{U+FEDE}' // LAM +stringdef m1 '{U+FEE3}' // MEEM +stringdef m2 '{U+FEE4}' // MEEM +stringdef m3 '{U+FEE1}' // MEEM +stringdef m4 '{U+FEE2}' // MEEM +stringdef n1 '{U+FEE7}' // NOON +stringdef n2 '{U+FEE8}' // NOON +stringdef n3 '{U+FEE5}' // NOON +stringdef n4 '{U+FEE6}' // NOON +stringdef e1 '{U+FEEB}' // HEH +stringdef e2 '{U+FEEC}' // HEH +stringdef e3 '{U+FEE9}' // HEH +stringdef 
e4 '{U+FEEA}' // HEH +stringdef w1 '{U+FEED}' // WAW +stringdef w2 '{U+FEEE}' // WAW +stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA +stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA +stringdef y1 '{U+FEF3}' // YEH +stringdef y2 '{U+FEF4}' // YEH +stringdef y3 '{U+FEF1}' // YEH +stringdef y4 '{U+FEF2}' // YEH + +// Ligatures Lam-Alef +stringdef la '{U+FEFB}' // LAM_ALEF +stringdef la2 '{U+FEFC}' // LAM_ALEF +stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE +stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE +stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW +stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW +stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE +stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE + + +booleans ( + is_noun + is_verb + is_defined + ) + +routines ( + Prefix_Step1 + Prefix_Step2 + Prefix_Step3a_Noun + Prefix_Step3b_Noun + Prefix_Step3_Verb + Prefix_Step4_Verb + + Suffix_All_alef_maqsura + Suffix_Noun_Step1a + Suffix_Noun_Step1b + Suffix_Noun_Step2a + Suffix_Noun_Step2b + Suffix_Noun_Step2c1 + Suffix_Noun_Step2c2 + Suffix_Noun_Step3 + Suffix_Verb_Step1 + Suffix_Verb_Step2a + Suffix_Verb_Step2b + Suffix_Verb_Step2c + + Normalize_post + Normalize_pre + + Checks1 +) + +externals ( stem ) + +groupings ( ) + + +// Normalizations +define Normalize_pre as ( + do repeat ( + ( + [substring] among ( + '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization + '{_}' ( delete ) // strip kasheeda + + // Hindu–Arabic numerals + '{0}' ( <- '0') + '{1}' ( <- '1') + '{2}' ( <- '2') + '{3}' ( <- '3') + '{4}' ( <- '4') + '{5}' ( <- '5') + '{6}' ( <- '6') + '{7}' ( <- '7') + '{8}' ( <- '8') + '{9}' ( <- '9') + + // Shaped forms + '{o1}' ( <- '{o}' ) // HAMZA + '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE + '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW + '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA + '{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA + '{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA + '{a1}' '{a2}' ( <- '{a}' ) // 
ALEF + '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH + '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA + '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH + '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH + '{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM + '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH + '{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH + '{d1}' '{d2}' ( <- '{d}' ) // DAL + '{dz1}''{dz2}' ( <- '{dz}' ) // THAL + '{r1}' '{r2}'( <- '{r}' ) // REH + '{z1}' '{z2}' ( <- '{z}' ) // ZAIN + '{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN + '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN + '{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD + '{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD + '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH + '{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH + '{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN + '{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN + '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH + '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF + '{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF + '{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM + '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM + '{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON + '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH + '{w1}' '{w2}' ( <- '{w}' ) // WAW + '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA + '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH + + // Ligatures Lam-Alef + '{la}' '{la2}' (<- '{l}{a}') + '{lao}' '{lao2}' (<- '{l}{ao}') + '{lao_}' '{lao_2}' (<- '{l}{ao_}') + '{la~}' '{la~2}' (<- '{l}{a~}') + + ) + ) + or + next + ) +) + +define Normalize_post as ( + + do ( + // normalize last hamza + backwards ( + [substring] among ( + '{ao}''{ao_}' '{a~}' ( <- '{o}') + '{wo}' ( <- '{o}') + '{yo}' ( <- '{o}') + ) + ) + ) + + do repeat ( + ( + // normalize other hamza's + [substring] among ( + '{ao}''{ao_}' '{a~}' ( <- '{a}') + '{wo}' ( <- '{w}') + '{yo}' ( <- '{y}') + ) + ) + or + next + ) +) + +// Checks +define Checks1 as 
( + [substring] among ( + '{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) + '{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) + ) +) + + +//prefixes +define Prefix_Step1 as ( + [substring] among ( + '{ao}{ao}' ($(len > 3) <- '{ao}' ) + '{ao}{a~}' ($(len > 3) <- '{a~}' ) + '{ao}{wo}' ($(len > 3) <- '{ao}' ) + '{ao}{a}' ($(len > 3) <- '{a}' ) + '{ao}{ao_}' ($(len > 3) <- '{ao_}' ) + // '{ao}' ($(len > 3) delete) //rare case + ) +) + +define Prefix_Step2 as ( + not '{f}{a}' + not '{w}{a}' + [substring] among ( + '{f}' ($(len > 3) delete) + '{w}' ($(len > 3) delete) + ) +) + +define Prefix_Step3a_Noun as ( // it is noun and defined + [substring] among ( + '{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) + '{l}{l}' '{a}{l}' ($(len > 4) delete) + ) +) + +define Prefix_Step3b_Noun as ( // probably noun and defined + not '{b}{a}' // exception + [substring] among ( + '{b}' ($(len > 3) delete) + // '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion + '{b}{b}' ($(len > 3) <- '{b}' ) + '{k}{k}' ($(len > 3) <- '{k}' ) + ) + +) + +define Prefix_Step3_Verb as ( + [substring] among ( + //'{s}' ($(len > 4) delete)// BUG: cause confusion + '{s}{y}' ($(len > 4) <- '{y}' ) + '{s}{t}' ($(len > 4) <- '{t}') + '{s}{n}' ($(len > 4) <- '{n}') + '{s}{ao}' ($(len > 4) <- '{ao}') + ) +) + +define Prefix_Step4_Verb as ( + [substring] among ( + '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) + ) +) + +// suffixes +backwardmode ( + + define Suffix_Noun_Step1a as ( + [substring] among ( + '{y}' '{k}' '{e}' ($(len >= 4) delete) + '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) + '{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete) + ) + ) + define Suffix_Noun_Step1b as ( + [substring] among ( + '{n}' ($(len > 5) delete) + ) + ) + + define Suffix_Noun_Step2a as ( + [substring] among ( + '{a}' '{y}' '{w}' ($(len > 4) delete) + ) + ) + + define Suffix_Noun_Step2b as ( + [substring] among ( + 
'{a}{t}' ($(len >= 5) delete) + ) + ) + + define Suffix_Noun_Step2c1 as ( + [substring] among ( + '{t}' ($(len >= 4) delete) + ) + ) + define Suffix_Noun_Step2c2 as ( // feminine t_ + [substring] among ( + '{t_}' ($(len >= 4) delete) + ) + ) + define Suffix_Noun_Step3 as ( // ya' nisbiya + [substring] among ( + '{y}' ($(len >= 3) delete) + ) + ) + + define Suffix_Verb_Step1 as ( + [substring] among ( + '{e}' '{k}' ($(len >= 4) delete) + '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) + '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) + ) + ) + define Suffix_Verb_Step2a as ( + [substring] among ( + '{t}' ($(len >= 4) delete) + '{a}' '{n}' '{y}' ($(len >= 4) delete) + '{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past + '{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present + '{t}{m}{a}' ($(len >= 6) delete) + ) + ) + + define Suffix_Verb_Step2b as ( + [substring] among ( + '{w}{a}' '{t}{m}' ($(len >= 5) delete) + ) + ) + + + define Suffix_Verb_Step2c as ( + [substring] among ( + '{w}' ($(len >= 4) delete) + '{t}{m}{w}' ($(len >= 6) delete) + ) + ) + + define Suffix_All_alef_maqsura as ( + [substring] among ( + '{a_}' ( <- '{y}' ) // spell error + // '{a_}' ( delete ) // if noun > 3 + // '{a_}' ( <- '{a}') // if verb + ) + ) +) + +define stem as ( + // set initial values + set is_noun + set is_verb + unset is_defined + + // guess type and properties + do Checks1 + + // normalization pre-stemming + do Normalize_pre + + + backwards ( + + do ( + //Suffixes for verbs + ( + is_verb + ( + ( + (atleast 1 Suffix_Verb_Step1) + ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) + ) + or Suffix_Verb_Step2b + or Suffix_Verb_Step2a + ) + ) + //Suffixes for nouns + or ( + is_noun + ( + + try ( + Suffix_Noun_Step2c2 + or (not is_defined Suffix_Noun_Step1a ( + Suffix_Noun_Step2a + or Suffix_Noun_Step2b + or Suffix_Noun_Step2c1 + or next)) + or (Suffix_Noun_Step1b ( + Suffix_Noun_Step2a + or Suffix_Noun_Step2b + or 
Suffix_Noun_Step2c1)) + or (not is_defined Suffix_Noun_Step2a) + or (Suffix_Noun_Step2b) + ) + Suffix_Noun_Step3 + ) + + ) + + // Suffixes for alef maqsura + or Suffix_All_alef_maqsura + ) + ) + + //Prefixes + do ( + try Prefix_Step1 + try Prefix_Step2 + ( Prefix_Step3a_Noun + or (is_noun Prefix_Step3b_Noun) + or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) + ) + ) + + // normalization post-stemming + do Normalize_post + +) diff --git a/contrib/snowball/algorithms/basque.sbl b/contrib/snowball/algorithms/basque.sbl new file mode 100644 index 000000000..267abc7e1 --- /dev/null +++ b/contrib/snowball/algorithms/basque.sbl @@ -0,0 +1,149 @@ +routines ( + aditzak + izenak + adjetiboak + mark_regions + RV R2 R1 +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef n~ '{U+00F1}' + +define v 'aeiou' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R2 as $p2 <= cursor + define R1 as $p1 <= cursor + + define aditzak as ( + [substring] among( + 'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea' + 'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza' + 'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza' + 'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez' + 'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea' + 'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena' + 'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea' + 'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari' + 'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 
'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu' + 'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako' + ( RV delete ) + 'garri' 'garria' 'tza' + (R2 delete) + 'atseden' + (<- 'atseden') + 'arabera' + (<- 'arabera') + 'baditu' + (<- 'baditu') + + ) + ) + + define izenak as ( + [substring] among( + 'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina' + 'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea' + 'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua' + 'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di' + 'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa' + 'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia' + 'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia' + 'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua' + 'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara' + 'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge' + 'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua' + 'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia' + 'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde' + 'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea' + 'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea' + 'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia' + 'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa' + 'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa' + 'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila' + 'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa' + 'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia' + 'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena' + 'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan' + 'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 
'dua' 'ek' + 'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara' + 'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket' + 'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko' + 'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera' + 'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko' + ( RV delete ) + 'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza' + ( R2 delete ) + 'joka' + (<- 'jok') + 'tzen' 'ten' 'en' 'tatu' + (R1 delete) + 'trako' + (<- 'tra') + 'minutuko' + (<- 'minutu') + 'zehar' + (<- 'zehar') + 'geldi' + (<- 'geldi') + 'igaro' + (<- 'igaro') + 'aurka' + (<- 'aurka') + ) + ) + + define adjetiboak as ( + [substring] among( + 'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria' + 'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik' + ( RV delete ) + 'zlea' + (<- 'z') + ) + ) + +) + +define stem as ( + do mark_regions + backwards ( + repeat aditzak + repeat izenak + do adjetiboak + ) + +) + +/* + Note 1: additions of 21 Jul 2010 +*/ diff --git a/contrib/snowball/algorithms/catalan.sbl b/contrib/snowball/algorithms/catalan.sbl new file mode 100644 index 000000000..0a1e3a536 --- /dev/null +++ b/contrib/snowball/algorithms/catalan.sbl @@ -0,0 +1,202 @@ +routines ( + cleaning mark_regions + R1 R2 + attached_pronoun + standard_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + +integers ( p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef a` '{U+00E0}' // a-grave +stringdef c, '{U+00E7}' // c-cedilla +stringdef e' '{U+00E9}' // e-acute +stringdef e` '{U+00E8}' // e-grave +stringdef i' '{U+00ED}' // i-acute +stringdef i` '{U+00EC}' // i-grave +stringdef i" '{U+00EF}' // i-diaeresis +stringdef o' '{U+00F3}' // o-acute +stringdef o` '{U+00F2}' // o-grave +stringdef u' '{U+00FA}' // u-acute +stringdef u" '{U+00FC}' // u-diaeresis +stringdef . 
'{U+00B7}' // - per l aggeminades + +define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' + +define mark_regions as ( + + $p1 = limit + $p2 = limit // defaults + + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define cleaning as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{a`}' (<- 'a') + '{e'}' (<- 'e') + '{e`}' (<- 'e') + '{i'}' (<- 'i') + '{i`}' (<- 'i') + '{o'}' (<- 'o') + '{o`}' (<- 'o') + '{u'}' (<- 'u') + '{u"}' (<- 'u') + '{i"}' (<- 'i') + '{.}' (<- '.') + '' (next) + ) +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among ( + '{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' + '-ls' '-la' '-les' '-li' + 'vos' 'se' 'nos' '-nos' '-us' 'us' + '{'}n' '{'}ns' '-n' '-ns' + '{'}m' '-me' '-m' + '-te' '{'}t' + 'li' 'lo' 'los' + 'me' 'sela' 'selo' 'selas' 'selos' 'le' + 'la' 'las' 'les' 'ens' 'ho' 'hi' + (R1 delete) + ) + ) + + define standard_suffix as ( + [substring] among( + 'ar' 'atge' 'formes' 'icte' 'ictes' + 'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta' + 'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls' + 'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius' + 'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste' + 'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis' + '{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all' + 'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu' + '{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar' + 'itar' 'ables' 'adors' 'idores' 'idors' + 'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es' + 'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris' + 'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament' + 'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 
'ionista' 'ionistes' + 'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies' + '{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles' + 'assa' 'asses' 'assos' + 'ent' 'ents' + '{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin' + 'ims' 'ima' 'imes' + 'isme' 'ista' 'ismes' 'istes' + 'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' + 'oses' 'osos' 'ient' 'otes' 'ots' + (R1 delete) + 'acions' 'ada' 'ades' + (R2 delete) + 'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques' + (R2 <- 'log') + 'ic' 'ica' 'ics' 'iques' + (R2 <- 'ic') + 'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima' + (R1 <- 'c') + ) + ) + + define verb_suffix as ( + [substring] among( + 'ador' 'adora' 'adors' 'adores' 're' 'ie' + 'ent' 'ents' 'udes' 'ar{a`}' 'eren' + 'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'aria' 'arian' 'arien' 'aries' 'ar{a`}s' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara' + 'ar{e'}' 'ar{e'}s' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' 'er' 'erau' 'erass' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu' + 'ia' 'ies' '{i'}em' '{i`}eu' 'ien' + 'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats' + 'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu' + 'essen' 'esses' 'assen' 'asses' 'assim' 'assiu' + '{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem' + '{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren' + 'ar{i'}em' 'ar{i'}eu' + 'areu' 'aren' 'ant' '{i"}m' '{i"}u' + '{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es' + 'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da' + 'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its' + 
'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as' + 'ieu' 'ii' 'io' 'i{a`}' + 'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + 'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien' + 'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu' + 'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis' + 'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin' + 'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen' + 'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim' + '{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu' + '{i"}ra' '{i"}ren' '{i"}res' + '{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x' + 'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis' + 'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s' + (R1 delete) + 'ando' + (R2 delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu' + 'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it' + (R1 delete) + 'iqu' + (R1 <- 'ic') + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + verb_suffix + ) + do residual_suffix + ) + do cleaning +) + +/* + First works 2010/07/19 + First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0 + Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos + Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0 +*/ diff --git 
a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/danish.sbl index 0a8190a08..761270f7e 100644 --- a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/danish.sbl @@ -12,15 +12,17 @@ strings ( ch ) integers ( p1 x ) -groupings ( v s_ending ) +groupings ( c v s_ending ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef ae hex 'E6' -stringdef ao hex 'E5' -stringdef o/ hex 'F8' +stringdef ae '{U+00E6}' +stringdef ao '{U+00E5}' +stringdef o/ '{U+00F8}' + +define c 'bcdfghjklmnpqrstvwxz' define v 'aeiouy{ae}{ao}{o/}' @@ -73,7 +75,7 @@ backwardmode ( ) ) define undouble as ( - setlimit tomark p1 for ([non-v] ->ch) + setlimit tomark p1 for ([c] ->ch) ch delete ) diff --git a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index 1131a1cb7..000000000 --- a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,91 +0,0 @@ -routines ( - mark_regions - main_suffix - consonant_pair - other_suffix - undouble -) - -externals ( stem ) - -strings ( ch ) - -integers ( p1 x ) - -groupings ( v s_ending ) - -stringescapes {} - -/* special characters (in MS-DOS Latin I) */ - -stringdef ae hex '91' -stringdef ao hex '86' -stringdef o/ hex '9B' - -define v 'aeiouy{ae}{ao}{o/}' - -define s_ending 'abcdfghjklmnoprtvyz{ao}' - -define mark_regions as ( - - $p1 = limit - - test ( hop 3 setmark x ) - goto v gopast non-v setmark p1 - try ( $p1 < x $p1 = x ) -) - -backwardmode ( - - define main_suffix as ( - setlimit tomark p1 for ([substring]) - among( - - 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' - 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' - 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' - 'erets' 'et' 'eret' - (delete) - 's' - (s_ending delete) - ) - ) - - define consonant_pair as ( - test ( 
- setlimit tomark p1 for ([substring]) - among( - 'gd' // significant in the call from other_suffix - 'dt' 'gt' 'kt' - ) - ) - next] delete - ) - - define other_suffix as ( - do ( ['st'] 'ig' delete ) - setlimit tomark p1 for ([substring]) - among( - 'ig' 'lig' 'elig' 'els' - (delete do consonant_pair) - 'l{o/}st' - (<-'l{o/}s') - ) - ) - define undouble as ( - setlimit tomark p1 for ([non-v] ->ch) - ch - delete - ) -) - -define stem as ( - - do mark_regions - backwards ( - do main_suffix - do consonant_pair - do other_suffix - do undouble - ) -) diff --git a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/dutch.sbl index 15b8718d1..f24c82de4 100644 --- a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl +++ b/contrib/snowball/algorithms/dutch.sbl @@ -18,21 +18,21 @@ groupings ( v v_I v_j ) stringescapes {} -/* special characters (in MS-DOS Latin I) */ +/* special characters */ -stringdef a" hex '84' -stringdef e" hex '89' -stringdef i" hex '8B' -stringdef o" hex '94' -stringdef u" hex '81' +stringdef a" '{U+00E4}' +stringdef e" '{U+00EB}' +stringdef i" '{U+00EF}' +stringdef o" '{U+00F6}' +stringdef u" '{U+00FC}' -stringdef a' hex 'A0' -stringdef e' hex '82' -stringdef i' hex 'A1' -stringdef o' hex 'A2' -stringdef u' hex 'A3' +stringdef a' '{U+00E1}' +stringdef e' '{U+00E9}' +stringdef i' '{U+00ED}' +stringdef o' '{U+00F3}' +stringdef u' '{U+00FA}' -stringdef e` hex '8A' +stringdef e` '{U+00E8}' define v 'aeiouy{e`}' define v_I v + 'I' diff --git a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl deleted file mode 100644 index f7609f766..000000000 --- a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl +++ /dev/null @@ -1,164 +0,0 @@ -routines ( - prelude postlude - e_ending - en_ending - mark_regions - R1 R2 - undouble - standard_suffix -) - -externals ( stem ) - -booleans ( e_found ) - -integers ( p1 p2 ) - -groupings ( v v_I v_j ) - -stringescapes {} 
- -/* special characters (in ISO Latin I) */ - -stringdef a" hex 'E4' -stringdef e" hex 'EB' -stringdef i" hex 'EF' -stringdef o" hex 'F6' -stringdef u" hex 'FC' - -stringdef a' hex 'E1' -stringdef e' hex 'E9' -stringdef i' hex 'ED' -stringdef o' hex 'F3' -stringdef u' hex 'FA' - -stringdef e` hex 'E8' - -define v 'aeiouy{e`}' -define v_I v + 'I' -define v_j v + 'j' - -define prelude as ( - test repeat ( - [substring] among( - '{a"}' '{a'}' - (<- 'a') - '{e"}' '{e'}' - (<- 'e') - '{i"}' '{i'}' - (<- 'i') - '{o"}' '{o'}' - (<- 'o') - '{u"}' '{u'}' - (<- 'u') - '' (next) - ) //or next - ) - try(['y'] <- 'Y') - repeat goto ( - v [('i'] v <- 'I') or - ('y'] <- 'Y') - ) -) - -define mark_regions as ( - - $p1 = limit - $p2 = limit - - gopast v gopast non-v setmark p1 - try($p1 < 3 $p1 = 3) // at least 3 - gopast v gopast non-v setmark p2 - -) - -define postlude as repeat ( - - [substring] among( - 'Y' (<- 'y') - 'I' (<- 'i') - '' (next) - ) //or next - -) - -backwardmode ( - - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define undouble as ( - test among('kk' 'dd' 'tt') [next] delete - ) - - define e_ending as ( - unset e_found - ['e'] R1 test non-v delete - set e_found - undouble - ) - - define en_ending as ( - R1 non-v and not 'gem' delete - undouble - ) - - define standard_suffix as ( - do ( - [substring] among( - 'heden' - ( R1 <- 'heid' - ) - 'en' 'ene' - ( en_ending - ) - 's' 'se' - ( R1 non-v_j delete - ) - ) - ) - do e_ending - - do ( ['heid'] R2 not 'c' delete - ['en'] en_ending - ) - - do ( - [substring] among( - 'end' 'ing' - ( R2 delete - (['ig'] R2 not 'e' delete) or undouble - ) - 'ig' - ( R2 not 'e' delete - ) - 'lijk' - ( R2 delete e_ending - ) - 'baar' - ( R2 delete - ) - 'bar' - ( R2 e_found delete - ) - ) - ) - do ( - non-v_I - test ( - among ('aa' 'ee' 'oo' 'uu') - non-v - ) - [next] delete - ) - ) -) - -define stem as ( - - do prelude - do mark_regions - backwards - do standard_suffix - do postlude -) diff --git 
a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/english.sbl index fe18d7a91..fe18d7a91 100644 --- a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/english.sbl diff --git a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/finnish.sbl index 9ac74f292..3891d2295 100644 --- a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/finnish.sbl @@ -24,16 +24,17 @@ externals ( stem ) integers ( p1 p2 ) strings ( x ) booleans ( ending_removed ) -groupings ( AEI V1 V2 particle_end ) +groupings ( AEI C V1 V2 particle_end ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef a" hex 'E4' -stringdef o" hex 'F6' +stringdef a" '{U+00E4}' +stringdef o" '{U+00F6}' define AEI 'a{a"}ei' +define C 'bcdfghjklmnpqrstvwxz' define V1 'aeiouy{a"}{o"}' define V2 'aeiou{a"}{o"}' define particle_end V1 + 'nt' @@ -116,7 +117,7 @@ backwardmode ( ) 'a' '{a"}' //-. 
- (V1 non-V1) // | + (V1 C) // | 'tta' 'tt{a"}' // Partitive [32] ('e') // | 'ta' 't{a"}' //-' @@ -172,11 +173,11 @@ backwardmode ( define tidy as ( setlimit tomark p1 for ( do ( LONG and ([next] delete ) ) // undouble vowel - do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i + do ( [AEI] C delete ) // remove trailing a, a", e, i do ( ['j'] 'o' or 'u' delete ) do ( ['o'] 'j' delete ) ) - goto non-V1 [next] -> x x delete // undouble consonant + goto non-V1 [C] -> x x delete // undouble consonant ) ) diff --git a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/french.sbl index e972f227f..5c4f32d05 100644 --- a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/french.sbl @@ -17,21 +17,21 @@ groupings ( v keep_with_s ) stringescapes {} -/* special characters (in ISO Latin I) */ - -stringdef a^ hex 'E2' // a-circumflex -stringdef a` hex 'E0' // a-grave -stringdef c, hex 'E7' // c-cedilla - -stringdef e" hex 'EB' // e-diaeresis (rare) -stringdef e' hex 'E9' // e-acute -stringdef e^ hex 'EA' // e-circumflex -stringdef e` hex 'E8' // e-grave -stringdef i" hex 'EF' // i-diaeresis -stringdef i^ hex 'EE' // i-circumflex -stringdef o^ hex 'F4' // o-circumflex -stringdef u^ hex 'FB' // u-circumflex -stringdef u` hex 'F9' // u-grave +/* special characters */ + +stringdef a^ '{U+00E2}' // a-circumflex +stringdef a` '{U+00E0}' // a-grave +stringdef c, '{U+00E7}' // c-cedilla + +stringdef e" '{U+00EB}' // e-diaeresis (rare) +stringdef e' '{U+00E9}' // e-acute +stringdef e^ '{U+00EA}' // e-circumflex +stringdef e` '{U+00E8}' // e-grave +stringdef i" '{U+00EF}' // i-diaeresis +stringdef i^ '{U+00EE}' // i-circumflex +stringdef o^ '{U+00F4}' // o-circumflex +stringdef u^ '{U+00FB}' // u-circumflex +stringdef u` '{U+00F9}' // u-grave define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' @@ -42,6 +42,10 @@ define prelude as repeat goto ( ('y' ] <- 'Y') ) or + ( [ '{e"}' ] <- 'He' ) + or + ( [ 
'{i"}' ] <- 'Hi' ) + or ( ['y'] v <- 'Y' ) or ( 'q' ['u'] <- 'U' ) @@ -78,6 +82,9 @@ define postlude as repeat ( 'I' (<- 'i') 'U' (<- 'u') 'Y' (<- 'y') + 'He' (<- '{e"}') + 'Hi' (<- '{i"}') + 'H' (delete) '' (next) ) ) @@ -167,7 +174,7 @@ backwardmode ( 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' 'issez' 'issiez' 'issions' 'issons' 'it' - (non-v delete) + (not 'H' non-v delete) ) ) @@ -196,14 +203,13 @@ backwardmode ( define keep_with_s 'aiou{e`}s' define residual_suffix as ( - try(['s'] test non-keep_with_s delete) + try(['s'] test ('Hi' or non-keep_with_s) delete) setlimit tomark pV for ( [substring] among( 'ion' (R2 's' or 't' delete) 'ier' 'i{e`}re' 'Ier' 'I{e`}re' (<-'i') 'e' (delete) - '{e"}' ('gu' delete) ) ) ) diff --git a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index 996eba1ab..000000000 --- a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,239 +0,0 @@ -routines ( - prelude postlude mark_regions - RV R1 R2 - standard_suffix - i_verb_suffix - verb_suffix - residual_suffix - un_double - un_accent -) - -externals ( stem ) - -integers ( pV p1 p2 ) - -groupings ( v keep_with_s ) - -stringescapes {} - -/* special characters (in MS-DOS Latin I) */ - -stringdef a^ hex '83' // a-circumflex -stringdef a` hex '85' // a-grave -stringdef c, hex '87' // c-cedilla - -stringdef e" hex '89' // e-diaeresis (rare) -stringdef e' hex '82' // e-acute -stringdef e^ hex '88' // e-circumflex -stringdef e` hex '8A' // e-grave -stringdef i" hex '8B' // i-diaeresis -stringdef i^ hex '8C' // i-circumflex -stringdef o^ hex '93' // o-circumflex -stringdef u^ hex '96' // u-circumflex -stringdef u` hex '97' // u-grave - -define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' - -define prelude as repeat goto ( - - ( v [ ('u' ] v <- 'U') or - ('i' ] v <- 'I') or - ('y' ] 
<- 'Y') - ) - or - ( ['y'] v <- 'Y' ) - or - ( 'q' ['u'] <- 'U' ) -) - -define mark_regions as ( - - $pV = limit - $p1 = limit - $p2 = limit // defaults - - do ( - ( v v next ) or ( next gopast v ) - setmark pV - ) - do ( - gopast v gopast non-v setmark p1 - gopast v gopast non-v setmark p2 - ) -) - -define postlude as repeat ( - - [substring] among( - 'I' (<- 'i') - 'U' (<- 'u') - 'Y' (<- 'y') - '' (next) - ) -) - -backwardmode ( - - define RV as $pV <= cursor - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define standard_suffix as ( - [substring] among( - - 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' - 'ances' 'iqUes' 'ismes' 'ables' 'istes' - ( R2 delete ) - 'atrice' 'ateur' 'ation' - 'atrices' 'ateurs' 'ations' - ( R2 delete - try ( ['ic'] (R2 delete) or <-'iqU' ) - ) - 'logie' - 'logies' - ( R2 <- 'log' ) - 'usion' 'ution' - 'usions' 'utions' - ( R2 <- 'u' ) - 'ence' - 'ences' - ( R2 <- 'ent' ) - 'ement' - 'ements' - ( - RV delete - try ( - [substring] among( - 'iv' (R2 delete ['at'] R2 delete) - 'eus' ((R2 delete) or (R1<-'eux')) - 'abl' 'iqU' - (R2 delete) - 'i{e`}r' 'I{e`}r' //) - (RV <-'i') //)--new 2 Sept 02 - ) - ) - ) - 'it{e'}' - 'it{e'}s' - ( - R2 delete - try ( - [substring] among( - 'abil' ((R2 delete) or <-'abl') - 'ic' ((R2 delete) or <-'iqU') - 'iv' (R2 delete) - ) - ) - ) - 'if' 'ive' - 'ifs' 'ives' - ( - R2 delete - try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) - ) - 'eaux' (<- 'eau') - 'aux' (R1 <- 'al') - 'euse' - 'euses'((R2 delete) or (R1<-'eux')) - - 'issement' - 'issements'(R1 non-v delete) // verbal - - // fail(...) below forces entry to verb_suffix. -ment typically - // follows the p.p., e.g 'confus{e'}ment'. 
- - 'amment' (RV fail(<- 'ant')) - 'emment' (RV fail(<- 'ent')) - 'ment' - 'ments' (test(v RV) fail(delete)) - // v is e,i,u,{e'},I or U - ) - ) - - define i_verb_suffix as setlimit tomark pV for ( - [substring] among ( - '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' - 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' - 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' - 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' - 'issez' 'issiez' 'issions' 'issons' 'it' - (non-v delete) - ) - ) - - define verb_suffix as setlimit tomark pV for ( - [substring] among ( - 'ions' - (R2 delete) - - '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' - 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' - 'erons' 'eront' 'ez' 'iez' - - // 'ons' //-best omitted - - (delete) - - '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' - 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' - 'assions' - (delete - try(['e'] delete) - ) - ) - ) - - define keep_with_s 'aiou{e`}s' - - define residual_suffix as ( - try(['s'] test non-keep_with_s delete) - setlimit tomark pV for ( - [substring] among( - 'ion' (R2 's' or 't' delete) - 'ier' 'i{e`}re' - 'Ier' 'I{e`}re' (<-'i') - 'e' (delete) - '{e"}' ('gu' delete) - ) - ) - ) - - define un_double as ( - test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete - ) - - define un_accent as ( - atleast 1 non-v - [ '{e'}' or '{e`}' ] <-'e' - ) -) - -define stem as ( - - do prelude - do mark_regions - backwards ( - - do ( - ( - ( standard_suffix or - i_verb_suffix or - verb_suffix - ) - and - try( [ ('Y' ] <- 'i' ) or - ('{c,}'] <- 'c' ) - ) - ) or - residual_suffix - ) - - // try(['ent'] RV delete) // is best omitted - - do un_double - do un_accent - ) - do postlude -) - diff --git a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german.sbl index 7069daf0d..61f24ef92 100644 --- a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl +++ 
b/contrib/snowball/algorithms/german.sbl @@ -18,12 +18,12 @@ groupings ( v s_ending st_ending ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef a" hex 'E4' -stringdef o" hex 'F6' -stringdef u" hex 'FC' -stringdef ss hex 'DF' +stringdef a" '{U+00E4}' +stringdef o" '{U+00F6}' +stringdef u" '{U+00FC}' +stringdef ss '{U+00DF}' define v 'aeiouy{a"}{o"}{u"}' diff --git a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index 3effb3257..000000000 --- a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,139 +0,0 @@ - -/* - Extra rule for -nisse ending added 11 Dec 2009 -*/ - -routines ( - prelude postlude - mark_regions - R1 R2 - standard_suffix -) - -externals ( stem ) - -integers ( p1 p2 x ) - -groupings ( v s_ending st_ending ) - -stringescapes {} - -/* special characters (in MS-DOS Latin I) */ - -stringdef a" hex '84' -stringdef o" hex '94' -stringdef u" hex '81' -stringdef ss hex 'E1' - -define v 'aeiouy{a"}{o"}{u"}' - -define s_ending 'bdfghklmnrt' -define st_ending s_ending - 'r' - -define prelude as ( - - test repeat ( - ( - ['{ss}'] <- 'ss' - ) or next - ) - - repeat goto ( - v [('u'] v <- 'U') or - ('y'] v <- 'Y') - ) -) - -define mark_regions as ( - - $p1 = limit - $p2 = limit - - test(hop 3 setmark x) - - gopast v gopast non-v setmark p1 - try($p1 < x $p1 = x) // at least 3 - gopast v gopast non-v setmark p2 - -) - -define postlude as repeat ( - - [substring] among( - 'Y' (<- 'y') - 'U' (<- 'u') - '{a"}' (<- 'a') - '{o"}' (<- 'o') - '{u"}' (<- 'u') - '' (next) - ) - -) - -backwardmode ( - - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define standard_suffix as ( - do ( - [substring] R1 among( - 'em' 'ern' 'er' - ( delete - ) - 'e' 'en' 'es' - ( delete - try (['s'] 'nis' delete) - ) - 's' - ( s_ending delete - ) - ) - ) - do ( - [substring] R1 among( - 'en' 'er' 'est' - ( 
delete - ) - 'st' - ( st_ending hop 3 delete - ) - ) - ) - do ( - [substring] R2 among( - 'end' 'ung' - ( delete - try (['ig'] not 'e' R2 delete) - ) - 'ig' 'ik' 'isch' - ( not 'e' delete - ) - 'lich' 'heit' - ( delete - try ( - ['er' or 'en'] R1 delete - ) - ) - 'keit' - ( delete - try ( - [substring] R2 among( - 'lich' 'ig' - ( delete - ) - ) - ) - ) - ) - ) - ) -) - -define stem as ( - do prelude - do mark_regions - backwards - do standard_suffix - do postlude -) diff --git a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german2.sbl index ce6026a86..47ff61e38 100644 --- a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/german2.sbl @@ -18,12 +18,12 @@ groupings ( v s_ending st_ending ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef a" hex 'E4' -stringdef o" hex 'F6' -stringdef u" hex 'FC' -stringdef ss hex 'DF' +stringdef a" '{U+00E4}' +stringdef o" '{U+00F6}' +stringdef u" '{U+00FC}' +stringdef ss '{U+00DF}' define v 'aeiouy{a"}{o"}{u"}' diff --git a/contrib/snowball/algorithms/greek.sbl b/contrib/snowball/algorithms/greek.sbl new file mode 100644 index 000000000..02df6c33f --- /dev/null +++ b/contrib/snowball/algorithms/greek.sbl @@ -0,0 +1,706 @@ +// A stemmer for Modern Greek language, based on: +// +// Ntais, Georgios. Development of a Stemmer for the Greek +// Language. Diss. Royal Institute of Technology, 2006. +// https://sais.se/mthprize/2007/ntais2007.pdf +// +// Saroukos, Spyridon. Enhancing a Greek language stemmer. +// University of Tampere, 2008. 
+// https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf + +stringescapes {} + +stringdef a '{U+03B1}' // alpha +stringdef v '{U+03B2}' // beta +stringdef g '{U+03B3}' // gamma +stringdef d '{U+03B4}' // delta +stringdef e '{U+03B5}' // epsilon +stringdef z '{U+03B6}' // zeta +stringdef i '{U+03B7}' // eta +stringdef th '{U+03B8}' // theta +stringdef y '{U+03B9}' // iota +stringdef k '{U+03BA}' // kappa +stringdef l '{U+03BB}' // lamda +stringdef m '{U+03BC}' // mu +stringdef n '{U+03BD}' // nu +stringdef x '{U+03BE}' // xi +stringdef o '{U+03BF}' // omicron +stringdef p '{U+03C0}' // pi +stringdef r '{U+03C1}' // rho +stringdef ss '{U+03C2}' // sigma final +stringdef s '{U+03C3}' // sigma +stringdef t '{U+03C4}' // tau +stringdef u '{U+03C5}' // upsilon +stringdef f '{U+03C6}' // phi +stringdef ch '{U+03C7}' // chi +stringdef ps '{U+03C8}' // psi +stringdef oo '{U+03C9}' // omega + +stringdef A '{U+0391}' // Alpha +stringdef V '{U+0392}' // Beta +stringdef G '{U+0393}' // Gamma +stringdef D '{U+0394}' // Delta +stringdef E '{U+0395}' // Epsilon +stringdef Z '{U+0396}' // Zeta +stringdef I '{U+0397}' // Eta +stringdef Th '{U+0398}' // Theta +stringdef Y '{U+0399}' // Iota +stringdef K '{U+039A}' // Kappa +stringdef L '{U+039B}' // Lamda +stringdef M '{U+039C}' // Mu +stringdef N '{U+039D}' // Nu +stringdef X '{U+039E}' // Xi +stringdef O '{U+039F}' // Omicron +stringdef P '{U+03A0}' // Pi +stringdef R '{U+03A1}' // Rho +stringdef S '{U+03A3}' // Sigma +stringdef T '{U+03A4}' // Tau +stringdef U '{U+03A5}' // Upsilon +stringdef F '{U+03A6}' // Phi +stringdef Ch '{U+03A7}' // Chi +stringdef Ps '{U+03A8}' // Psi +stringdef Oo '{U+03A9}' // Omega + +stringdef Y: '{U+03AA}' // Iota with dialytika +stringdef U: '{U+03AB}' // Upsilon with dialytika + +stringdef a' '{U+03AC}' // alpha with tonos +stringdef e' '{U+03AD}' // epsilon with tonos +stringdef i' '{U+03AE}' // eta with tonos +stringdef y' '{U+03AF}' // iota with tonos +stringdef o' '{U+03CC}' // 
omicron with tonos +stringdef u' '{U+03CD}' // upsilon with tonos +stringdef oo' '{U+03CE}' // omega with tonos + +stringdef i:' '{U+0390}' // iota with dialytika and tonos +stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos + +stringdef i: '{U+03CA}' // iota with dialytika +stringdef u: '{U+03CB}' // upsilon with dialytika + +stringdef A' '{U+0386}' // Alpha with tonos +stringdef E' '{U+0388}' // Epsilon with tonos +stringdef I' '{U+0389}' // Eta with tonos +stringdef Y' '{U+038A}' // Iota with tonos +stringdef O' '{U+038C}' // Omicron with tonos +stringdef U' '{U+038E}' // Upsilon with tonos +stringdef OO' '{U+038F}' // Omega with tonos + +externals ( stem ) + +booleans ( test1 ) + +groupings ( v v2 ) + +routines ( tolower has_min_length + steps1 steps2 steps3 steps4 steps5 steps6 steps7 + steps8 steps9 steps10 + step1 step2a step2b step2c step2d step3 step4 + step5a step5b step5c step5d step5e step5f + step5g step5h step5i + step5j step5k step5l step5m + step6 step7 ) + +define v '{a}{e}{i}{y}{o}{u}{oo}' +define v2 '{a}{e}{i}{y}{o}{oo}' + +backwardmode ( + define has_min_length as ( + $(len >= 3) + ) + + define tolower as ( + repeat ( + [substring] among ( + '{A}' (<- '{a}') + '{V}' (<- '{v}') + '{G}' (<- '{g}') + '{D}' (<- '{d}') + '{E}' (<- '{e}') + '{Z}' (<- '{z}') + '{I}' (<- '{i}') + '{Th}' (<- '{th}') + '{Y}' (<- '{y}') + '{K}' (<- '{k}') + '{L}' (<- '{l}') + '{M}' (<- '{m}') + '{N}' (<- '{n}') + '{X}' (<- '{x}') + '{O}' (<- '{o}') + '{P}' (<- '{p}') + '{R}' (<- '{r}') + '{S}' (<- '{s}') + '{T}' (<- '{t}') + '{U}' (<- '{u}') + '{F}' (<- '{f}') + '{Ch}' (<- '{ch}') + '{Ps}' (<- '{ps}') + '{Oo}' (<- '{oo}') + '{Y:}' (<- '{y}') + '{U:}' (<- '{u}') + '{a'}' (<- '{a}') + '{e'}' (<- '{e}') + '{i'}' (<- '{i}') + '{y'}' (<- '{y}') + '{o'}' (<- '{o}') + '{u'}' (<- '{u}') + '{oo'}' (<- '{oo}') + '{i:'}' (<- '{i}') + '{u:'}' (<- '{u}') + '{i:}' (<- '{i}') + '{u:}' (<- '{u}') + '{A'}' (<- '{a}') + '{E'}' (<- '{e}') + '{I'}' (<- '{i}') + '{Y'}' (<- '{y}') + 
'{O'}' (<- '{o}') + '{U'}' (<- '{u}') + '{OO'}' (<- '{oo}') + '{ss}' (<- '{s}') + '' (next) + ) + ) + ) + + define step1 as ( + [substring] among ( + '{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}') + '{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}') + '{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}') + '{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}') + '{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}') + '{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}') + '{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}') + '{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}') + '{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}') + '{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}') + '{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}') + ) + unset test1 + ) + + define steps1 as ( + [substring] among ( + '{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}' + '{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' ( + delete + unset test1 + ([] substring atlimit among ( + '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' + (<- '{y}') + )) or + ([] substring atlimit among ( + '{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' 
'{v}{o}{l}{v}{o}{r}' '{g}{k}{r}' + '{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}' + '{r}' '{p}{y}{p}{e}{r}{o}{r}' + (<- '{y}{z}') + )) + ) + ) + ) + + define steps2 as ( + [substring] among ( + '{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' ( + delete + unset test1 + [] substring atlimit among ( + '{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') + ) + ) + ) + ) + + define steps3 as ( + [substring] among ( + '{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' ( + delete + unset test1 + ('{y}{s}{a}' atlimit <- '{y}{s}') or + ([] substring atlimit among ( + '{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' + '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' + '{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' + (<- '{y}') + )) or + ([] substring atlimit among ( + '{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}' + '{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}' + (<- '{y}{s}') + )) + ) + ) + ) + + define steps4 as ( + [substring] among ( + '{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' ( + delete + unset test1 + [] substring atlimit among ( + '{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' + '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' 
'{a}{p}{e}{k}{l}{e}' + '{e}{k}{l}{e}' '{p}{e}' + (<- '{y}') + ) + ) + ) + ) + + define steps5 as ( + [substring] among ( + '{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}' + '{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' ( + delete + unset test1 + ([] substring atlimit among ( + '{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}' + (<- '{y}') + )) or + ([] substring atlimit among ( + '{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}' + '{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}' + '{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}' + (<- '{y}{s}{t}') + )) + ) + ) + ) + + define steps6 as ( + [substring] among ( + '{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' ( + delete + unset test1 + ([] substring atlimit among ( + '{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}' + (<- '{y}{s}{m}') + )) or + ([] substring atlimit among ( + '{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}' + (<- '{y}') + )) or + ([substring] among ( + '{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}') + '{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}') + '{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}') + '{e}{th}{n}{y}{k}' (<- '{e}{th}{n}') + '{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}') + '{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}') + '{t}{o}{p}{y}{k}' (<- '{t}{o}{p}') + '{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}') + '{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}') + '{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}') + )) + ) + ) + ) + + define steps7 as ( + [substring] among ( + '{a}{r}{a}{k}{y}' 
'{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' ( + delete + unset test1 + [] substring atlimit among ( + '{s}' '{ch}' + (<- '{a}{r}{a}{k}') + ) + ) + ) + ) + + define steps8 as ( + [substring] among ( + '{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' ( + delete + unset test1 + ([] substring atlimit among ( + '{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}' + '{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}' + '{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}' + '{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}' + (<- '{a}{k}') + )) or + ([] substring atlimit among ( + '{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{k}{o}{n}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}' + '{p}{a}{t}{e}{r}' '{p}' '{s}{k}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}' + (<- '{y}{t}{s}') + )) or + ([] '{k}{o}{r}' <- '{y}{t}{s}') + ) + ) + ) + + define steps9 as ( + [substring] among ( + '{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' ( + delete + unset test1 + ([] substring atlimit among ( + '{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}') + )) or + ([] substring among ( + '{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}') + )) + ) + ) + ) + + define steps10 as ( + [substring] among ( + '{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' ( + delete + unset test1 + [] substring atlimit among ( + '{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}' + (<- '{y}{s}{k}') + ) + ) + ) + ) + + define step2a as ( + [substring] among ( + '{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) + ) + not ([substring] among ( + '{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' 
'{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}' + )) + insert '{a}{d}' + ) + + define step2b as ( + [substring] among ( + '{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete) + ) + [] substring among ( + '{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') + ) + ) + + define step2c as ( + [substring] among ( + '{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete) + ) + [] substring among ( + '{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}' + '{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') + ) + ) + + define step2d as ( + [substring] among ( + '{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) + ) + [] substring atlimit among ( + '{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') + ) + ) + + define step3 as ( + [substring] among ( + '{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) + ) + ([] v <- '{y}') + ) + + define step4 as ( + [substring] among ( + '{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) + ) + ([] v <- '{y}{k}') or + [] substring atlimit among ( + '{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}' + '{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}' + '{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}' + '{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}' + (<- '{y}{k}') + ) + ) + + define step5a as ( + do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}') + do ( + [substring] among ( + '{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' 
(delete unset test1) + ) + ) + ['{a}{m}{e}'] + delete + unset test1 + [] substring atlimit among ( + '{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}' + (<- '{a}{m}') + ) + ) + + define step5b as ( + do ( + [substring] among ( + '{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}' + '{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' ( + delete + unset test1 + [] substring atlimit among ( + '{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') + ) + ) + ) + ) + ['{a}{n}{e}'] + delete + unset test1 + ([] v2 <- '{a}{n}') or + [] substring atlimit among ( + '{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}' + '{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}' + '{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}' + '{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}' + '{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}' + '{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}' + '{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}' + '{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}' + '{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}' + '{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}' + '{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}' + '{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' 
'{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}' + '{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}' + '{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}' + '{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}' + '{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}' + (<- '{a}{n}') + ) + ) + + define step5c as ( + do ( + [substring] among ( + '{i}{s}{e}{t}{e}' (delete unset test1) + ) + ) + ['{e}{t}{e}'] + delete + unset test1 + ([] v2 <- '{e}{t}') or + ([] substring among ( + '{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}' + '{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}' + '{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}' + (<- '{e}{t}') + )) or + [] substring atlimit among ( + '{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}' + '{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}' + '{th}{a}{r}{r}' '{th}' + (<- '{e}{t}') + ) + ) + + define step5d as ( + [substring] among ( + '{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' ( + delete + unset test1 + ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or + ([] '{k}{r}{e}' <- '{oo}{n}{t}') + ) + ) + ) + + define step5e as ( + [substring] among ( + '{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' ( + delete + unset test1 + ([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}') + ) + ) + ) + + define step5f as ( + do ( + ['{y}{e}{s}{t}{e}'] + delete + unset test1 + [] substring atlimit among ( + '{p}' '{a}{p}' 
'{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}') + ) + ) + ['{e}{s}{t}{e}'] + delete + unset test1 + [] substring atlimit among ( + '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}' + (<- '{y}{e}{s}{t}') + ) + ) + + define step5g as ( + do ( + [substring] among ( + '{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1) + ) + ) + [substring] among ( + '{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' ( + delete + unset test1 + ([] substring among ( + '{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}') + )) or + ([] substring atlimit among ( + '{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}') + )) + ) + ) + ) + + define step5h as ( + [substring] among ( + '{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' ( + delete + unset test1 + ([] substring among ( + '{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}' + '{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}') + + )) or + ([] substring atlimit among ( + '{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}' + '{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}' + '{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}' + (<- '{o}{u}{s}') + )) + ) + ) + ) + + define step5i as ( + [substring] among ( + '{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' ( + delete + unset test1 + ([] '{k}{o}{l}{l}' <- '{a}{g}') or ( + not ([substring] among ('{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}')) + ([] substring among ( + '{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}' + 
(<- '{a}{g}') + )) or + ([] substring atlimit among ( + '{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}' + '{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}' + '{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}' + '{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}' + '{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}' + (<- '{a}{g}') + )) + ) + ) + ) + ) + + define step5j as ( + [substring] among ( + '{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1) + ) + [] substring atlimit among ( + '{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}') + ) + ) + + define step5k as ( + [substring] among ( + '{i}{s}{t}{e}' (delete unset test1) + ) + [] substring atlimit among ( + '{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}' + (<- '{i}{s}{t}') + ) + ) + + define step5l as ( + [substring] among ( + '{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1) + ) + [] substring atlimit among ( + '{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}') + ) + ) + + define step5m as ( + [substring] among ( + '{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1) + ) + [] substring atlimit among ( + '{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}' + (<- '{o}{u}{m}') + ) + ) + + define step6 as ( + do ( + [substring] 
among ( + '{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') + ) + ) + test1 + [substring] among ( + '{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}' + '{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}' + '{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}' + '{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}' + '{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}' + '{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}' + '{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}' + '{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}' + '{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}' + '{oo}{n}' (delete) + ) + ) + + define step7 as ( + [substring] among ( + '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete) + ) + ) +) + +define stem as ( + backwards ( + do tolower + has_min_length + set test1 + do step1 + do steps1 + do steps2 + do steps3 + do steps4 + do steps5 + do steps6 + do steps7 + do steps8 + do steps9 + do steps10 + do step2a + do step2b + do step2c + do 
step2d + do step3 + do step4 + do step5a + do step5b + do step5c + do step5d + do step5e + do step5f + do step5g + do step5h + do step5j + do step5i + do step5k + do step5l + do step5m + do step6 + do step7 + ) +) diff --git a/contrib/snowball/algorithms/hindi.sbl b/contrib/snowball/algorithms/hindi.sbl new file mode 100644 index 000000000..bfdfac0e8 --- /dev/null +++ b/contrib/snowball/algorithms/hindi.sbl @@ -0,0 +1,323 @@ +// An implementation of "A Lightweight Stemmer for Hindi": +// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf + +externals ( stem ) + +stringescapes {} + +// The transliteration scheme used for our stringdefs matches that used in the +// paper, as documented in the appendix. It appears to match the WX notation +// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently +// uses 'z' for Anunasika whereas the paper uses Mh. +// +// We discriminate dependent vowels by adding a leading "_" to their stringdef +// names (mnemonic: the _ signifies removing the implicit a from the preceding +// character). 
+ +// Vowels and sonorants: +stringdef a '{U+0905}' +stringdef A '{U+0906}' +stringdef i '{U+0907}' +stringdef I '{U+0908}' +stringdef u '{U+0909}' +stringdef U '{U+090A}' +stringdef q '{U+090B}' +stringdef e '{U+090F}' +stringdef E '{U+0910}' +stringdef o '{U+0913}' +stringdef O '{U+0914}' + +// Vowel signs: +stringdef _A '{U+093E}' +stringdef _i '{U+093F}' +stringdef _I '{U+0940}' +stringdef _u '{U+0941}' +stringdef _U '{U+0942}' +stringdef _q '{U+0943}' +stringdef _e '{U+0947}' +stringdef _E '{U+0948}' +stringdef _o '{U+094B}' +stringdef _O '{U+094C}' + +// Diacritics: +stringdef M '{U+0902}' +stringdef H '{U+0903}' +stringdef Mh '{U+0901}' +stringdef Z '{U+093C}' // Nukta +stringdef virama '{U+094D}' + +// Velar consonants: +stringdef k '{U+0915}' +stringdef K '{U+0916}' +stringdef g '{U+0917}' +stringdef G '{U+0918}' +stringdef f '{U+0919}' + +// Palatal consonants: +stringdef c '{U+091A}' +stringdef C '{U+091B}' +stringdef j '{U+091C}' +stringdef J '{U+091D}' +stringdef F '{U+091E}' + +// Retroflex consonants: +stringdef t '{U+091F}' +stringdef T '{U+0920}' +stringdef d '{U+0921}' +stringdef D '{U+0922}' +stringdef N '{U+0923}' + +// Dental consonants: +stringdef w '{U+0924}' +stringdef W '{U+0925}' +stringdef x '{U+0926}' +stringdef X '{U+0927}' +stringdef n '{U+0928}' + +// Labial consonants: +stringdef p '{U+092A}' +stringdef P '{U+092B}' +stringdef b '{U+092C}' +stringdef B '{U+092D}' +stringdef m '{U+092E}' + +// Semi-vowels: +stringdef y '{U+092F}' +stringdef r '{U+0930}' +stringdef l '{U+0932}' +stringdef v '{U+0935}' + +// Fricatives: +stringdef S '{U+0936}' +stringdef R '{U+0937}' +stringdef s '{U+0938}' +stringdef h '{U+0939}' + +stringdef lY '{U+0933}' + +// Precomposed characters - letters + nukta: +stringdef nZ '{U+0929}' // ≡ {n}{Z} +stringdef rZ '{U+0931}' // ≡ {r}{Z} +stringdef lYZ '{U+0934}' // ≡ {lY}{Z} +stringdef kZ '{U+0958}' // ≡ {k}{Z} +stringdef KZ '{U+0959}' // ≡ {K}{Z} +stringdef gZ '{U+095A}' // ≡ {g}{Z} +stringdef jZ '{U+095B}' // ≡ 
{j}{Z} +stringdef dZ '{U+095C}' // ≡ {d}{Z} +stringdef DZ '{U+095D}' // ≡ {D}{Z} +stringdef PZ '{U+095E}' // ≡ {P}{Z} +stringdef yZ '{U+095F}' // ≡ {y}{Z} + +integers ( p ) + +groupings ( consonant ) + +routines ( CONSONANT ) + +define consonant '{k}{K}{g}{G}{f}' + + '{c}{C}{j}{J}{F}' + + '{t}{T}{d}{D}{N}' + + '{w}{W}{x}{X}{n}' + + '{p}{P}{b}{B}{m}' + + '{y}{r}{l}{v}' + + '{S}{R}{s}{h}' + + '{lY}' + + '{Z}' + // Nukta + // Precomposed characters - letter and nukta: + '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}' + +backwardmode ( define CONSONANT as ( consonant ) ) + +define stem as ( + test ( next setmark p ) + backwards ( + // We assume in this implementation that the whole word doesn't count + // as a valid suffix to remove, so we remove the longest suffix from + // the list which leaves at least one character. This change affects + // 47 words out of the 65,140 in the sample vocabulary from Hindi + // wikipedia. + setlimit tomark p for ([substring]) + among ( + // The list below is derived from figure 3 in the paper. + // + // We perform the stemming on the Devanagari characters rather than + // transliterating to Latin, so we have adapted the list below to + // reflect this by converting suffixes back to Devanagari as + // follows: + // + // * within the suffixes, "a" after a consonant is dropped since + // consonants have an implicit "a". + // + // * within the suffixes, a vowel other than "a" after a consonant + // is a dependent vowel (vowel sign); a vowel (including "a") + // after a non-consonant is an independent vowel. + // + // * to allow the vowel at the start of each suffix being dependent + // or independent, we include each suffix twice. For the + // dependent version, a leading "a" is dropped and we check that + // the suffix is preceded by a consonant (which will have an + // implicit "a"). 
+ // + // * we add '{a}', which is needed for the example given right at + // the end of section 5 to work (conflating BarawIya and + // BarawIyawA), and which 3.1 a.v strongly suggests should be in + // the list: + // + // Thus, the following suffix deletions (longest possible + // match) are required to reduce inflected forms of masculine + // nouns to a common stem: + // a A i [...] + // + // Adding '{a}' only affects 2 words out of the 65,140 in the + // sample vocabulary. + // + // * The transliterations of our stems would end with "a" when our + // stems end in a consonant, so we also include {virama} in the + // list of suffixes to remove (this affects 222 words from the + // sample vocabulary). + // + // We've also assumed that Mh in the suffix list always means {Mh} + // and never {M}{h}{virama}. Only one of the 65,140 words in the + // sample vocabulary stems differently due to this (and that word + // seems to be a typo). + + '{virama}' + + '{a}' + '{A}' + '{i}' + '{I}' + '{u}' + '{U}' + '{e}' + '{o}' + '{e}{M}' + '{o}{M}' + '{A}{M}' + '{u}{A}{M}' + '{u}{e}{M}' + '{u}{o}{M}' + '{A}{e}{M}' + '{A}{o}{M}' + '{i}{y}{_A}{M}' + '{i}{y}{_o}{M}' + '{A}{i}{y}{_A}{M}' + '{A}{i}{y}{_o}{M}' + '{A}{Mh}' + '{i}{y}{_A}{Mh}' + '{A}{i}{y}{_A}{Mh}' + '{a}{w}{_A}{e}{M}' + '{a}{w}{_A}{o}{M}' + '{a}{n}{_A}{e}{M}' + '{a}{n}{_A}{o}{M}' + '{a}{w}{_A}' + '{a}{w}{_I}' + '{I}{M}' + '{a}{w}{_I}{M}' + '{a}{w}{_e}' + '{A}{w}{_A}' + '{A}{w}{_I}' + '{A}{w}{_I}{M}' + '{A}{w}{_e}' + '{a}{n}{_A}' + '{a}{n}{_I}' + '{a}{n}{_e}' + '{A}{n}{_A}' + '{A}{n}{_e}' + '{U}{M}{g}{_A}' + '{U}{M}{g}{_I}' + '{A}{U}{M}{g}{_A}' + '{A}{U}{M}{g}{_I}' + '{e}{M}{g}{_e}' + '{e}{M}{g}{_I}' + '{A}{e}{M}{g}{_e}' + '{A}{e}{M}{g}{_I}' + '{o}{g}{_e}' + '{o}{g}{_I}' + '{A}{o}{g}{_e}' + '{A}{o}{g}{_I}' + '{e}{g}{_A}' + '{e}{g}{_I}' + '{A}{e}{g}{_A}' + '{A}{e}{g}{_I}' + '{A}{y}{_A}' + '{A}{e}' + '{A}{I}' + '{A}{I}{M}' + '{i}{e}' + '{A}{o}' + '{A}{i}{e}' + '{a}{k}{r}' + '{A}{k}{r}' + + '{_A}' + '{_i}' + '{_I}' + '{_u}' + 
'{_U}' + '{_e}' + '{_o}' + '{_e}{M}' + '{_o}{M}' + '{_A}{M}' + '{_u}{A}{M}' + '{_u}{e}{M}' + '{_u}{o}{M}' + '{_A}{e}{M}' + '{_A}{o}{M}' + '{_i}{y}{_A}{M}' + '{_i}{y}{_o}{M}' + '{_A}{i}{y}{_A}{M}' + '{_A}{i}{y}{_o}{M}' + '{_A}{Mh}' + '{_i}{y}{_A}{Mh}' + '{_A}{i}{y}{_A}{Mh}' + '{_I}{M}' + '{_A}{w}{_A}' + '{_A}{w}{_I}' + '{_A}{w}{_I}{M}' + '{_A}{w}{_e}' + '{_A}{n}{_A}' + '{_A}{n}{_e}' + '{_U}{M}{g}{_A}' + '{_U}{M}{g}{_I}' + '{_A}{U}{M}{g}{_A}' + '{_A}{U}{M}{g}{_I}' + '{_e}{M}{g}{_e}' + '{_e}{M}{g}{_I}' + '{_A}{e}{M}{g}{_e}' + '{_A}{e}{M}{g}{_I}' + '{_o}{g}{_e}' + '{_o}{g}{_I}' + '{_A}{o}{g}{_e}' + '{_A}{o}{g}{_I}' + '{_e}{g}{_A}' + '{_e}{g}{_I}' + '{_A}{e}{g}{_A}' + '{_A}{e}{g}{_I}' + '{_A}{y}{_A}' + '{_A}{e}' + '{_A}{I}' + '{_A}{I}{M}' + '{_i}{e}' + '{_A}{o}' + '{_A}{i}{e}' + '{_A}{k}{r}' + + /* Suffixes with a leading implicit a: */ + '{w}{_A}{e}{M}' CONSONANT + '{w}{_A}{o}{M}' CONSONANT + '{n}{_A}{e}{M}' CONSONANT + '{n}{_A}{o}{M}' CONSONANT + '{w}{_A}' CONSONANT + '{w}{_I}' CONSONANT + '{w}{_I}{M}' CONSONANT + '{w}{_e}' CONSONANT + '{n}{_A}' CONSONANT + '{n}{_I}' CONSONANT + '{n}{_e}' CONSONANT + '{k}{r}' CONSONANT + ) + delete + ) +) diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian.sbl index c812e055c..2d7885cf1 100644 --- a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl +++ b/contrib/snowball/algorithms/hungarian.sbl @@ -27,17 +27,17 @@ groupings ( v ) stringescapes {} -/* special characters (in Unicode) */ - -stringdef a' hex 'E1' //a-acute -stringdef e' hex 'E9' //e-acute -stringdef i' hex 'ED' //i-acute -stringdef o' hex 'F3' //o-acute -stringdef o" hex 'F6' //o-umlaut -stringdef oq hex '151' //o-double acute -stringdef u' hex 'FA' //u-acute -stringdef u" hex 'FC' //u-umlaut -stringdef uq hex '171' //u-double acute +/* special characters */ + +stringdef a' '{U+00E1}' //a-acute +stringdef e' '{U+00E9}' //e-acute +stringdef i' '{U+00ED}' //i-acute +stringdef o' '{U+00F3}' //o-acute +stringdef o" 
'{U+00F6}' //o-umlaut +stringdef oq '{U+0151}' //o-double acute +stringdef u' '{U+00FA}' //u-acute +stringdef u" '{U+00FC}' //u-umlaut +stringdef uq '{U+0171}' //u-double acute define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' diff --git a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl deleted file mode 100644 index d1a644931..000000000 --- a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl +++ /dev/null @@ -1,241 +0,0 @@ -/* -Hungarian Stemmer -Removes noun inflections -*/ - -routines ( - mark_regions - R1 - v_ending - case - case_special - case_other - plural - owned - sing_owner - plur_owner - instrum - factive - undouble - double -) - -externals ( stem ) - -integers ( p1 ) -groupings ( v ) - -stringescapes {} - -/* special characters (in ISO Latin 2) */ - -stringdef a' hex 'E1' //a-acute -stringdef e' hex 'E9' //e-acute -stringdef i' hex 'ED' //i-acute -stringdef o' hex 'F3' //o-acute -stringdef o" hex 'F6' //o-umlaut -stringdef oq hex 'F5' //o-double acute -stringdef u' hex 'FA' //u-acute -stringdef u" hex 'FC' //u-umlaut -stringdef uq hex 'FB' //u-double acute - -define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' - -define mark_regions as ( - - $p1 = limit - - (v goto non-v - among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next - setmark p1) - or - - (non-v gopast v setmark p1) -) - -backwardmode ( - - define R1 as $p1 <= cursor - - define v_ending as ( - [substring] R1 among( - '{a'}' (<- 'a') - '{e'}' (<- 'e') - ) - ) - - define double as ( - test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' - 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') - ) - - define undouble as ( - next [hop 1] delete - ) - - define instrum as( - [substring] R1 among( - 'al' (double) - 'el' (double) - ) - delete - undouble - ) - - - define case as ( - [substring] R1 among( - 'ban' 'ben' - 'ba' 'be' - 'ra' 're' - 'nak' 'nek' - 'val' 'vel' - 't{o'}l' 't{oq}l' - 
'r{o'}l' 'r{oq}l' - 'b{o'}l' 'b{oq}l' - 'hoz' 'hez' 'h{o"}z' - 'n{a'}l' 'n{e'}l' - 'ig' - 'at' 'et' 'ot' '{o"}t' - '{e'}rt' - 'k{e'}pp' 'k{e'}ppen' - 'kor' - 'ul' '{u"}l' - 'v{a'}' 'v{e'}' - 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' - 'k{e'}nt' - 'en' 'on' 'an' '{o"}n' - 'n' - 't' - ) - delete - v_ending - ) - - define case_special as( - [substring] R1 among( - '{e'}n' (<- 'e') - '{a'}n' (<- 'a') - '{a'}nk{e'}nt' (<- 'a') - ) - ) - - define case_other as( - [substring] R1 among( - 'astul' 'est{u"}l' (delete) - 'stul' 'st{u"}l' (delete) - '{a'}stul' (<- 'a') - '{e'}st{u"}l' (<- 'e') - ) - ) - - define factive as( - [substring] R1 among( - '{a'}' (double) - '{e'}' (double) - ) - delete - undouble - ) - - define plural as ( - [substring] R1 among( - '{a'}k' (<- 'a') - '{e'}k' (<- 'e') - '{o"}k' (delete) - 'ak' (delete) - 'ok' (delete) - 'ek' (delete) - 'k' (delete) - ) - ) - - define owned as ( - [substring] R1 among ( - 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) - '{e'}k{e'}' (<- 'e') - '{a'}k{e'}' (<- 'a') - 'k{e'}' (delete) - '{e'}{e'}i' (<- 'e') - '{a'}{e'}i' (<- 'a') - '{e'}i' (delete) - '{e'}{e'}' (<- 'e') - '{e'}' (delete) - ) - ) - - define sing_owner as ( - [substring] R1 among( - '{u"}nk' 'unk' (delete) - '{a'}nk' (<- 'a') - '{e'}nk' (<- 'e') - 'nk' (delete) - '{a'}juk' (<- 'a') - '{e'}j{u"}k' (<- 'e') - 'juk' 'j{u"}k' (delete) - 'uk' '{u"}k' (delete) - 'em' 'om' 'am' (delete) - '{a'}m' (<- 'a') - '{e'}m' (<- 'e') - 'm' (delete) - 'od' 'ed' 'ad' '{o"}d' (delete) - '{a'}d' (<- 'a') - '{e'}d' (<- 'e') - 'd' (delete) - 'ja' 'je' (delete) - 'a' 'e' 'o' (delete) - '{a'}' (<- 'a') - '{e'}' (<- 'e') - ) - ) - - define plur_owner as ( - [substring] R1 among( - 'jaim' 'jeim' (delete) - '{a'}im' (<- 'a') - '{e'}im' (<- 'e') - 'aim' 'eim' (delete) - 'im' (delete) - 'jaid' 'jeid' (delete) - '{a'}id' (<- 'a') - '{e'}id' (<- 'e') - 'aid' 'eid' (delete) - 'id' (delete) - 'jai' 'jei' (delete) - '{a'}i' (<- 'a') - '{e'}i' (<- 'e') - 'ai' 'ei' (delete) - 'i' (delete) - 'jaink' 
'jeink' (delete) - 'eink' 'aink' (delete) - '{a'}ink' (<- 'a') - '{e'}ink' (<- 'e') - 'ink' - 'jaitok' 'jeitek' (delete) - 'aitok' 'eitek' (delete) - '{a'}itok' (<- 'a') - '{e'}itek' (<- 'e') - 'itek' (delete) - 'jeik' 'jaik' (delete) - 'aik' 'eik' (delete) - '{a'}ik' (<- 'a') - '{e'}ik' (<- 'e') - 'ik' (delete) - ) - ) -) - -define stem as ( - do mark_regions - backwards ( - do instrum - do case - do case_special - do case_other - do factive - do owned - do sing_owner - do plur_owner - do plural - ) -) diff --git a/contrib/snowball/algorithms/indonesian.sbl b/contrib/snowball/algorithms/indonesian.sbl new file mode 100644 index 000000000..ac0ee3655 --- /dev/null +++ b/contrib/snowball/algorithms/indonesian.sbl @@ -0,0 +1,192 @@ +// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: +// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf + +integers ( + // The paper defines measure as the number of vowels in the word. We + // count this initially, then adjust the count each time we remove a + // prefix or suffix. + measure + + // Numeric code for the type of prefix removed: + // + // 0 other/none + // 1 'di' or 'meng' or 'ter' + // 2 'per' + // 3 'ke' or 'peng' + // 4 'ber' + // + // Some of these have variant forms, so e.g. "meng" includes "men", "me", + // "meny", "mem". + // + // Note that the value of prefix is only used in remove_suffix (and + // routines it calls) so we don't need to worry about + // remove_second_order_prefix overwriting a value of prefix set by + // remove_first_order_prefix since remove_suffix gets called between + // the two. 
+ prefix +) + +groupings ( vowel ) + +routines ( + remove_particle + remove_possessive_pronoun + remove_first_order_prefix + remove_second_order_prefix + remove_suffix + KER + SUFFIX_KAN_OK + SUFFIX_AN_OK + SUFFIX_I_OK + VOWEL +) + +externals ( stem ) + +stringescapes {} + +backwardmode ( + + define remove_particle as ( + [substring] among ( + 'kah' 'lah' 'pun' (delete $measure-=1) + ) + ) + + define remove_possessive_pronoun as ( + [substring] among ( + 'ku' 'mu' 'nya' (delete $measure-=1) + ) + ) + + // prefix not in {ke, peng, per} + define SUFFIX_KAN_OK as ( + // On page 29, the example "kompas Q.31" says "Both Nazief and Porter + // stemmer converted the word peledakan (blast, explotion) to ledak (to + // blast, to explode)". However, the algorithm as described doesn't + // behave in this way - grammatically the prefix pe- occurs as a + // variation of both the first-order derivational prefix peng- and the + // second-order derivational prefix per-, but table 2.5 doesn't include + // "pe", only table 2.6 does, so "peledakan" is handled (incorrectly) + // as having prefix "per" not "peng", and so we remove derivational + // suffix "kan" rather than "an" to give stem leda. (Porter-style + // stemmers remove the longest suffix they can amongst those available, + // which this paper notes in the last paragraph on page 15). + // + // We resolve this by amending the condition on suffix "kan" to + // "prefix ∉ {ke, peng, per}", which seems to make the stemmer's + // behaviour match all the examples in the paper except for one: + // "perbaikan" is shown in table 3.4 as stemming to "bai", but with + // this change it now stems to "baik". The table notes that "baik" is + // the actual root so this deviation is an improvement. In a sample + // vocabulary derived from the most common words in id.wikipedia.org, + // this change only affects 0.12% of words (76 out of 64,587, including + // "peledakan" and "perbaikan"). 
+ $prefix != 3 and $prefix != 2 + ) + + // prefix not in {di, meng, ter} + define SUFFIX_AN_OK as ( $prefix != 1 ) + + define SUFFIX_I_OK as ( + // prefix not in {ke, peng, ber} + $prefix <= 2 + + // The rest of the condition from the paper is: + // V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i + // + // The meaning of this is unclear in several ways, and none of the + // examples given of the stemmer's behaviour in the paper help to + // resolve these issues. + // + // Notice that c₂ isn't actually used - the most obvious explanation + // seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁". + // + // Elsewhere the paper defines V... as meaning "the stem starts with + // a vowel" and K... as meaning "the stem starts with a consonant". + // + // In other places where it says X|Y... it seems the | binds more + // tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit + // odd as the first letter must be either a vowel or a consonant, so + // that really just means "ends cᵢcⱼ". However, nowhere in the paper + // uses or defines a notation such as ...X, which may explain this + // seemingly redundant way of specifying this. + // + // The conditions elsewhere on prefix removal (e.g. V...) are clearly + // on the stem left after the prefix is removed. None of the other + // rules for suffix removal have conditions on the stem, but for + // consistency with the prefix rules we might expect that the cᵢcⱼ + // test is on what's left *after* removing the "i" suffix. + // + // However, studying Indonesian wordlists and discussion with a native + // speaker leads us to conclude that the purpose of this check is to + // protect words of foreign origin (e.g. "televisi", "organisasi", + // "komunikasi") from stemming, and the common feature of these is + // that the word ends "-si", so we conclude that the condition here + // should be read as "word does not end -si", and this is what we + // have implemented. 
+ not 's' + ) + + define remove_suffix as ( + [substring] among ( + 'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK + (delete $measure-=1) + ) + ) +) + +define vowel 'aeiou' + +define VOWEL as ( vowel ) + +define KER as ( non-vowel 'er' ) + +define remove_first_order_prefix as ( + [substring] among ( + 'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1) + 'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1) + 'meny' VOWEL ($prefix=1 <-'s' $measure-=1) + 'peny' VOWEL ($prefix=3 <-'s' $measure-=1) + 'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) + 'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) + ) +) + +define remove_second_order_prefix as ( + // The paper has the condition on removal of prefix "bel" and "pel" as + // just "ajar" not "ajar..." but it seems that the latter must be what + // is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". + // This change only affects a very small number of words (11 out of + // 64,587) and only for the better. + [substring] among ( + 'per' 'pe' (delete $prefix=2 $measure-=1) + 'pelajar' (<-'ajar' $measure-=1) + 'ber' (delete $prefix=4 $measure-=1) + 'belajar' (<-'ajar' $prefix=4 $measure-=1) + 'be' KER (delete $prefix=4 $measure-=1) + ) +) + +define stem as ( + $measure = 0 + do ( repeat ( gopast vowel $measure+=1 ) ) + $measure > 2 + $prefix = 0 + backwards ( + do remove_particle + $measure > 2 + do remove_possessive_pronoun + ) + $measure > 2 + test ( + remove_first_order_prefix + do ( + test ($measure > 2 backwards remove_suffix) + $measure > 2 remove_second_order_prefix + ) + ) or ( + do remove_second_order_prefix + do ($measure > 2 backwards remove_suffix) + ) +) diff --git a/contrib/snowball/algorithms/irish.sbl b/contrib/snowball/algorithms/irish.sbl new file mode 100644 index 000000000..0b1288a16 --- /dev/null +++ b/contrib/snowball/algorithms/irish.sbl @@ -0,0 +1,151 @@ +routines ( + R1 R2 RV + initial_morph + mark_regions + noun_sfx + deriv + verb_sfx +) + +externals ( stem ) + 
+integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* Accented characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef e' '{U+00E9}' // e-acute +stringdef i' '{U+00ED}' // i-acute +stringdef o' '{U+00F3}' // o-acute +stringdef u' '{U+00FA}' // u-acute + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + gopast v setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define initial_morph as ( + [substring] among ( + 'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic + (delete) + + // verbs + 'd{'}' + (delete) + 'd{'}fh' + (<- 'f') + // other contractions + 'm{'}' 'b{'}' + (delete) + + 'sh' + (<- 's') + + 'mb' + (<- 'b') + 'gc' + (<- 'c') + 'nd' + (<- 'd') + 'bhf' + (<- 'f') + 'ng' + (<- 'g') + 'bp' + (<- 'p') + 'ts' + (<- 's') + 'dt' + (<- 't') + + // Lenition + 'bh' + (<- 'b') + 'ch' + (<- 'c') + 'dh' + (<- 'd') + 'fh' + (<- 'f') + 'gh' + (<- 'g') + 'mh' + (<- 'm') + 'ph' + (<- 'p') + 'th' + (<- 't') + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define noun_sfx as ( + [substring] among ( + 'amh' 'eamh' 'abh' 'eabh' + 'aibh' 'ibh' 'aimh' 'imh' + 'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta' + (R1 delete) + 'ire' 'ir{i'}' 'aire' 'air{i'}' + (R2 delete) + ) + ) + define deriv as ( + [substring] among ( + 'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta' + (R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl + 'arcacht' 'arcachta{i'}' 'arcachta' + (<- 'arc') // monarcacht -> monarc + 'gineach' 'gineas' 'ginis' + (<- 'gin') + 'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}' + (<- 'graf') + 'paite' 'patach' 'pataigh' 'patacha' + (<- 'paite') + '{o'}ideach' '{o'}ideacha' '{o'}idigh' + (<- '{o'}id') + ) + ) + define verb_sfx as ( + [substring] among ( + 'imid' 'aimid' '{i'}mid' 
'a{i'}mid' + 'faidh' 'fidh' + (RV delete) + 'ain' + 'eadh' 'adh' + '{a'}il' + 'tear' 'tar' + (R1 delete) + ) + ) +) + +define stem as ( + do initial_morph + do mark_regions + backwards ( + do noun_sfx + do deriv + do verb_sfx + ) +) diff --git a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/italian.sbl index b43295c09..bf0c16176 100644 --- a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl +++ b/contrib/snowball/algorithms/italian.sbl @@ -16,18 +16,18 @@ groupings ( v AEIO CG ) stringescapes {} -/* special characters (in MS-DOS Latin I) */ - -stringdef a' hex 'A0' -stringdef a` hex '85' -stringdef e' hex '82' -stringdef e` hex '8A' -stringdef i' hex 'A1' -stringdef i` hex '8D' -stringdef o' hex 'A2' -stringdef o` hex '95' -stringdef u' hex 'A3' -stringdef u` hex '97' +/* special characters */ + +stringdef a' '{U+00E1}' +stringdef a` '{U+00E0}' +stringdef e' '{U+00E9}' +stringdef e` '{U+00E8}' +stringdef i' '{U+00ED}' +stringdef i` '{U+00EC}' +stringdef o' '{U+00F3}' +stringdef o` '{U+00F2}' +stringdef u' '{U+00FA}' +stringdef u` '{U+00F9}' define v 'aeiou{a`}{e`}{i`}{o`}{u`}' diff --git a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl deleted file mode 100644 index 8d25cf64f..000000000 --- a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl +++ /dev/null @@ -1,195 +0,0 @@ - -routines ( - prelude postlude mark_regions - RV R1 R2 - attached_pronoun - standard_suffix - verb_suffix - vowel_suffix -) - -externals ( stem ) - -integers ( pV p1 p2 ) - -groupings ( v AEIO CG ) - -stringescapes {} - -/* special characters (in ISO Latin I) */ - -stringdef a' hex 'E1' -stringdef a` hex 'E0' -stringdef e' hex 'E9' -stringdef e` hex 'E8' -stringdef i' hex 'ED' -stringdef i` hex 'EC' -stringdef o' hex 'F3' -stringdef o` hex 'F2' -stringdef u' hex 'FA' -stringdef u` hex 'F9' - -define v 'aeiou{a`}{e`}{i`}{o`}{u`}' - -define prelude as ( - test repeat ( - 
[substring] among( - '{a'}' (<- '{a`}') - '{e'}' (<- '{e`}') - '{i'}' (<- '{i`}') - '{o'}' (<- '{o`}') - '{u'}' (<- '{u`}') - 'qu' (<- 'qU') - '' (next) - ) - ) - repeat goto ( - v [ ('u' ] v <- 'U') or - ('i' ] v <- 'I') - ) -) - -define mark_regions as ( - - $pV = limit - $p1 = limit - $p2 = limit // defaults - - do ( - ( v (non-v gopast v) or (v gopast non-v) ) - or - ( non-v (non-v gopast v) or (v next) ) - setmark pV - ) - do ( - gopast v gopast non-v setmark p1 - gopast v gopast non-v setmark p2 - ) -) - -define postlude as repeat ( - - [substring] among( - 'I' (<- 'i') - 'U' (<- 'u') - '' (next) - ) - -) - -backwardmode ( - - define RV as $pV <= cursor - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define attached_pronoun as ( - [substring] among( - 'ci' 'gli' 'la' 'le' 'li' 'lo' - 'mi' 'ne' 'si' 'ti' 'vi' - // the compound forms are: - 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' - 'mela' 'mele' 'meli' 'melo' 'mene' - 'tela' 'tele' 'teli' 'telo' 'tene' - 'cela' 'cele' 'celi' 'celo' 'cene' - 'vela' 'vele' 'veli' 'velo' 'vene' - ) - among( (RV) - 'ando' 'endo' (delete) - 'ar' 'er' 'ir' (<- 'e') - ) - ) - - define standard_suffix as ( - [substring] among( - - 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' - 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' - 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' - 'atrice' 'atrici' - 'ante' 'anti' // Note 1 - ( R2 delete ) - 'azione' 'azioni' 'atore' 'atori' - ( R2 delete - try ( ['ic'] R2 delete ) - ) - 'logia' 'logie' - ( R2 <- 'log' ) - 'uzione' 'uzioni' 'usione' 'usioni' - ( R2 <- 'u' ) - 'enza' 'enze' - ( R2 <- 'ente' ) - 'amento' 'amenti' 'imento' 'imenti' - ( RV delete ) - 'amente' ( - R1 delete - try ( - [substring] R2 delete among( - 'iv' ( ['at'] R2 delete ) - 'os' 'ic' 'abil' - ) - ) - ) - 'it{a`}' ( - R2 delete - try ( - [substring] among( - 'abil' 'ic' 'iv' (R2 delete) - ) - ) - ) - 'ivo' 'ivi' 'iva' 'ive' ( - R2 delete - try ( ['at'] R2 delete 
['ic'] R2 delete ) - ) - ) - ) - - define verb_suffix as setlimit tomark pV for ( - [substring] among( - 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' - 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' - 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' - 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' - 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' - 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' - 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' - 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' - 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' - 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' - 'ono' 'uta' 'ute' 'uti' 'uto' - - 'ar' 'ir' // but 'er' is problematical - (delete) - ) - ) - - define AEIO 'aeio{a`}{e`}{i`}{o`}' - define CG 'cg' - - define vowel_suffix as ( - try ( - [AEIO] RV delete - ['i'] RV delete - ) - try ( - ['h'] CG RV delete - ) - ) -) - -define stem as ( - do prelude - do mark_regions - backwards ( - do attached_pronoun - do (standard_suffix or verb_suffix) - do vowel_suffix - ) - do postlude -) - -/* - Note 1: additions of 15 Jun 2005 -*/ - diff --git a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/kraaij_pohlmann.sbl index cd79d1280..409bf0ff7 100644 --- a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/kraaij_pohlmann.sbl @@ -1,5 +1,5 @@ strings ( ch ) -integers ( x p1 p2 ) +integers ( p1 p2 ) booleans ( Y_found stemmed GE_removed ) routines ( @@ -20,8 +20,6 @@ groupings ( v v_WX AOU AIOU ) stringescapes {} -stringdef ' hex '27' // yuk - define v 'aeiouy' define v_WX v + 'wx' define AOU 'aou' @@ -29,8 +27,8 @@ define AIOU 'aiou' backwardmode ( - define R1 as (setmark x $x >= p1) - define R2 as (setmark x $x >= p2) + define R1 as ($p1 <= cursor) + define R2 as ($p2 <= cursor) define V as test (v or 'ij') define VX as test 
(next v or 'ij') @@ -46,7 +44,7 @@ backwardmode ( define Step_1 as ( - [among ( (]) + [substring] among ( '{'}s' (delete) 's' (R1 not ('t' R1) C delete) @@ -68,7 +66,7 @@ backwardmode ( define Step_2 as ( - [among ( (]) + [substring] among ( 'je' (('{'}t' ] delete) or ('et' ] R1 C delete) or ('rnt' ] <-'rn') or @@ -92,7 +90,7 @@ backwardmode ( define Step_3 as ( - [among ( (]) + [substring] among ( 'atie' (R1 <-'eer') 'iteit' (R1 delete lengthen_V) 'heid' @@ -112,7 +110,7 @@ backwardmode ( define Step_4 as ( - ( [among ( (]) + ( [substring] among ( 'ioneel' (R1 <-'ie') 'atief' (R1 <-'eer') 'baar' (R1 delete) @@ -132,7 +130,7 @@ backwardmode ( ) ) or - ( [among ( (]) + ( [substring] among ( 'iger' 'igst' 'ig' (R1 C delete lengthen_V) @@ -142,7 +140,7 @@ backwardmode ( define Step_7 as ( - [among ( (]) + [substring] among ( 'kt' (<-'k') 'ft' (<-'f') 'pt' (<-'p') @@ -151,7 +149,7 @@ backwardmode ( define Step_6 as ( - [among ( (]) + [substring] among ( 'bb' (<-'b') 'cc' (<-'c') 'dd' (<-'d') @@ -179,7 +177,7 @@ backwardmode ( define Step_1c as ( - [among ( (] R1 C) + [substring] among ( (R1 C) 'd' (not ('n' R1) delete) 't' (not ('h' R1) delete) ) @@ -200,11 +198,8 @@ define Lose_infix as ( ) define measure as ( - do ( - tolimit - setmark p1 - setmark p2 - ) + $p1 = limit + $p2 = limit do( repeat non-v atleast 1 ('ij' or v) non-v setmark p1 repeat non-v atleast 1 ('ij' or v) non-v setmark p2 diff --git a/contrib/snowball/algorithms/lithuanian.sbl b/contrib/snowball/algorithms/lithuanian.sbl new file mode 100644 index 000000000..ff7fdb9da --- /dev/null +++ b/contrib/snowball/algorithms/lithuanian.sbl @@ -0,0 +1,373 @@ +externals ( stem ) + +// escape symbols for substituting lithuanian characters +stringescapes { } + +/* Special characters in Unicode Latin Extended-A */ +// ' nosine +stringdef a' '{U+0105}' // ą a + ogonek +stringdef e' '{U+0119}' // ę e + ogonek +stringdef i' '{U+012F}' // į i + ogonek +stringdef u' '{U+0173}' // ų u + ogonek + +// . 
taskas +stringdef e. '{U+0117}' // ė e + dot + +// - ilgoji +stringdef u- '{U+016B}' // ū u + macron + +// * varnele +stringdef c* '{U+010D}' // č c + caron (haček) +stringdef s* '{U+0161}' // š s + caron (haček) +stringdef z* '{U+017E}' // ž z + caron (haček) + +// [C](VC)^m[V|C] +// definitions of variables for +// p1 - position of m = 0 +integers ( p1 ) + +// groupings +// v - lithuanian vowels +groupings ( v ) + +// v - all lithuanian vowels +define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}' + +// all lithuanian stemmer routines: 4 steps +routines ( + step2 R1 step1 fix_chdz fix_gd fix_conflicts +) + +backwardmode ( + + define R1 as $p1 <= cursor + define step1 as ( + setlimit tomark p1 for ([substring]) R1 among ( + // Daiktavardžiai (Nouns) + // I linksniuotė (declension I) + 'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys + 'o' 'io' // vyro, kelio + 'ui' 'iui' // vyrui, keliui + '{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį + 'u' 'iu' // vyru, keliu + 'e' 'yje' // vyre, kelyje + 'y' 'au' 'i' // kely, brolau, broli, + 'an' // nusižengiman + + 'ai' 'iai' // vyrai, keliai + '{u'}' 'i{u'}' // vyrų, kelių + 'ams' 'am' // vyrams, vyram + 'iams' 'iam' // broliams, broliam + 'us' 'ius' // vyrus, brolius + 'ais' 'iais' // vyrais, keliais + 'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos + 'uosna' 'iuosna' // vyruosna, keliuosna + 'ysna' // žutysna + + 'asis' 'aisi' // sukimasis, sukimaisi + 'osi' '{u'}si' // sukimosi, sukimųsi + 'uisi' // sukimuisi + '{a'}si' // sukimąsi + 'usi' // sukimusi + 'esi' // sukimesi + + 'uo' // mėnuo + + + // II linksniuote (declension II) + 'a' 'ia' // galva, vysnios + 'os' 'ios' // galvos, vysnios + 'oj' 'oje' 'ioje' // galvoje, vysnioje + 'osna' 'iosna' // galvosna, vyšniosna + 'om' 'oms' 'ioms' // galvoms, vysnioms + 'omis' 'iomis' // galvomis, vysniomis + 'ose' 'iose' // galvose, vysniose + 'on' 'ion' // galvon, vyšnion + + + // III linksniuote (declension III) + '{e.}' // gervė + '{e.}s' // gervės + 'ei' // gervei + 
'{e'}' // gervę + '{e.}j' '{e.}je' // gervėj, gervėje + '{e.}ms' // gervėms + 'es' // gerves + '{e.}mis' // gervėmis + '{e.}se' // gervėse + '{e.}sna' // gervėsna + '{e.}n' // žydaitėn + + + // IV linksniuote (declension IV) + 'aus' 'iaus' // sūnaus, skaičiaus + 'umi' 'iumi' // sūnumi, skaičiumi + 'uje' 'iuje' // sūnuje, skaičiuje + 'iau' // skaičiau + + '{u-}s' // sūnūs + 'ums' // sūnums + 'umis' // sūnumis + 'un' 'iun' // sūnun, administratoriun + + + // V linksniuote (declension V) + 'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers + 'eniui' 'eriai' // vandeniui, eriai + 'en{i'}' 'er{i'}' // vandenį, seserį + 'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria + 'enyje' 'eryje' // vandenyje, seseryje + 'ie' 'enie' 'erie' // avie, vandenie, seserie + + 'enys' 'erys' // vandenys, seserys + // 'en{u'}' konfliktas su 'žandenų' 'antenų' + 'er{u'}' // seserų + 'ims' 'enims' 'erims' // avims, vandemins, seserims + 'enis' // vandenis + 'imis' // žebenkštimis + 'enimis' // vandenimis + 'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse + + + // Būdvardžiai (Adjectives) + // (i)a linksniuotė + 'iem' 'iems' // geriem, geriems + 'ame' 'iame' // naujame, mediniame + + + // Veiksmažodžiai (Verbs) + // Tiesioginė nuosaka (indicative mood) + // esamasis laikas (present tense) + // (i)a asmenuotė (declension (i)a) + 'uosi' 'iuosi' // dirbuosi, traukiuosi + 'iesi' // dirbiesi + 'asi' 'iasi' // dirbasi, traukiasi + 'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės + 'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate + 'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės + + // i asmenuotė (declension i) + 'isi' // tikisi + 'im' // mylim + // 'ime' konfliktassu daiktavardžiu vietininku, pvz. 'gėrime' + 'im{e.}s' // tikimės + 'it' 'ite' // mylit, mylite, tikitės + // 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. 
žydaitės + + // o asmenuotė (declension o) + 'ome' // mokome + 'ot' 'ote' // mokot, mokote + + // būtasis laikas + // o asmenuotė (declension o) + '{e.}jo' '{e.}josi' // tikėjo, tikėjosi + 'ot{e.}s' // tikėjotės/bijotės + + // ė asmenuotė (declension ė) + 'eisi' // mokeisi + '{e.}si' // mokėsi + '{e.}m' '{e.}me' // mokėm, mokėme + '{e.}m{e.}s' // mokėmės + '{e.}t' '{e.}te' // mokėt, mokėte + '{e.}t{e.}s' // mokėtės + + // būtasis dažninis laikas (frequentative past tense) + 'ausi' // mokydavausi + 'om{e.}s' // mokydavomės/bijomės + + + // būsimasis laikas (future tense) + 'siu' 'siuosi' // dirbsiu, mokysiuosi + 'si' 'siesi' // dirbsi, dirbsiesi + 's' 'ysis' // dirbs, mokysis + 'sim' 'sime' // dirbsim, dirbsime + 'sit' 'site' // gersit, gersite + + // tariamoji nuosaka (subjunctive mood) + '{c*}iau' '{c*}iausi' // dirbčiau + 'tum' 'tumei' // dirbtum, dirbtumei + 'tumeis' 'tumeisi' // mokytumeis, mokytumeisi + // 't{u'}' nes blogai batutų -> batų + 't{u'}si' // mokytųsi + // 'tume' konfliktas su 'šventume' + 'tum{e.}m' // dirbtumėm + 'tum{e.}me' // dirbtumėme + 'tum{e.}m{e.}s' // mokytumėmės + 'tute' 'tum{e.}t' // dirbtute, dirbtumėt + 'tum{e.}te' // dirbtumėte + 'tum{e.}t{e.}s' // mokytumėtės + + // liepiamoji nuosaka (imperative mood) + 'k' 'ki' // dirbk, dirbki, mokykis + // 'kis' konfliktas viln-išk-is + // 'kime' konfliktas, nes pirkime + 'kim{e.}s' // mokykimės + + // bendratis (infinitive) + 'uoti' 'iuoti' // meluoti, dygsniuoti + 'auti' 'iauti' // draugauti, girtuokliauti + 'oti' 'ioti' // dovanoti, meškerioti + '{e.}ti' // auklėti + 'yti' // akyti + 'inti' // auginti + 'in{e.}ti' // blusinėti + 'enti' // gyventi + 'tel{e.}ti' // bumbtelėti + 'ter{e.}ti' // bumbterėti + + 'ti' // skalbti + // 'tis' konfliktas, nes rytme-tis -> rytme + + // dalyviai (participles) + '{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs + 't{u'}s' // suktųs -> suk + 'sim{e.}s' // suksimės + 'sit{e.}s' // suksitės + 'kite' // supkite + ) + + delete + ) + + define step2 as repeat ( 
+ setlimit tomark p1 for ([substring]) among ( + // daiktavardziu priesagos (Noun suffixes) + + // budvardziu priesagos (Adjective suffixes) + // 'in' // konfliktas su 'augintinis' ir 'akiniais' // lauk-in-is + 'ing' // tvark-ing-as + 'i{s*}k' // lenk-išk-as + '{e.}t' // dem-ėt-as + 'ot' // garban-ot-as + 'uot' 'iuot' // lang-uot-as, akin-iuot-as + // 'tin', nes augintinis // dirb-tin-is + // 'ut', nes batutas, degutas etc. // maž-ut-is + 'yt' // maž-yt-is + 'iuk' // maž-iuk-as + 'iul' // maž-ul-is + '{e.}l' // maž-ėl-is + 'yl' // maž-yl-is + 'u{c*}iuk' // maž-učiuk-as + 'uliuk' // maž-uliuk-as + 'ut{e.}ait' // maž-utėlait-is + 'ok' // did-ok-as + 'iok' // višč-iok-as + 'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as + 'op' 'iop' // dvej-op-as, viener-iop-as + 'ain' // apval-ain-as + 'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias + + // laisniai + 'esn' // did-esn-is + 'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias + + // ivardziuotiniai budvardziai (Pronominal adjectives) + // vyriska gimine (Male gender) + 'ias' // žaliasis + 'oj' 'ioj' // gerojo, žaliojo + 'aj' 'iaj' // gerajam, žaliajam + '{a'}j' 'i{a'}j' // garąjį, žaliąjį + 'uoj' 'iuoj' // geruoju, žaliuoju + 'iej' // gerieji + '{u'}j' 'i{u'}j' // gerųjų, žaliųjų + 'ies' // geriesiems + 'uos' 'iuos' // geruosius, žaliuosius + 'ais' 'iais' // geraisiais, žaliaisiais + + // moteriska gimine (Female gender) + 'os' 'ios' // gerosios, žaliosios + '{a'}s' 'i{a'}s' // gerąsios, žaliąsias + + // būtasis dažninis laikas (frequentative past tense) + 'dav' // ei-dav-o + + // dalyvių priesagos (particple suffix) + 'ant' 'iant' + 'int' // tur-int-is + '{e.}j' // tur-ėj-o + '{e'}' // + '{e.}j{e'}' + '{e'}s' // dirb-ęs-is + + 'siant' // dirb-siant + + // pusdalyviai (participle) + 'dam' // bėg-dam-as + + 'auj' // ūkinink-auj-a + 'jam' + 'iau' + 'am' // baiminim-ams-i + ) + + delete + ) + + define fix_conflicts as ( + [substring] among ( + // 'lietuvaite' -> 'lietuvaitė', konfliktas su 'myl-ite' + 'aite' 
(<-'ait{e.}') + // 'lietuvaitės' -> 'lietuvaitė', konfliktas su 'myl-itės' + 'ait{e.}s' (<-'ait{e.}') + + // ''ūs-uotės' -> 'ūs-uotė', konfliktas 'mokotės' + 'uot{e.}s' (<-'uot{e.}') + // ''ūs-uote' -> 'ūs-uotė', konfliktas 'mokote' + 'uote' (<-'uot{e.}') + + // 'žerėjime' -> 'žėrėjimas', konfliktas su 'žais-ime' + '{e.}jime' (<-'{e.}jimas') + + // 'žvilgesiu' -> 'žvilgesys', konfliktas su 'dirb-siu' + 'esiu' (<-'esys') + // 'duobkasiu' -> 'duobkasys', konfliktas su 'pakasiu' + 'asius' (<-'asys') + + // 'žioravime' -> 'žioravimas', konfliktas su 'myl-ime' + 'avime' (<-'avimas') + 'ojime' (<-'ojimas') + + // 'advokatės' -> 'advokatė', konfliktas su 'dirb-atės' + 'okat{e.}s' (<-'okat{e.}') + // 'advokate' -> 'advokatė', konfliktas su 'dirb-ate' + 'okate' (<-'okat{e.}') + ) + ) + + define fix_chdz as ( + [substring] among ( + '{c*}' (<-'t') + 'd{z*}' (<-'d') + ) + ) + + define fix_gd as ( + [substring] among ( + 'gd' (<-'g') + // '{e.}k' (<-'{e.}g') + ) + ) + +) + +define stem as ( + + $p1 = limit + + do ( + // priešdėlis 'a' ilgeniuose nei 6 raidės žodžiuose, pvz. 'a-liejus'. + try (test 'a' $(len > 6) hop 1) + + gopast v gopast non-v setmark p1 + ) + + backwards ( + do fix_conflicts + do step1 + do fix_chdz + do step2 + do fix_chdz + do fix_gd + ) + +) diff --git a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/lovins.sbl index 3f69f1572..3f69f1572 100644 --- a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/lovins.sbl diff --git a/contrib/snowball/algorithms/nepali.sbl b/contrib/snowball/algorithms/nepali.sbl new file mode 100644 index 000000000..d3887486b --- /dev/null +++ b/contrib/snowball/algorithms/nepali.sbl @@ -0,0 +1,92 @@ +/* + * Authors: + * - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group + * - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd. 
+ * - Shreeya Singh Dhakal, Nepali NLP Group + */ + +routines ( + remove_category_1 + check_category_2 + remove_category_2 + remove_category_3 +) + +stringescapes {} + +stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU +stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA +stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I +stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II +stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E +stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA +stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA +stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA +stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA +stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA +stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA +stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA +stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA +stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA +stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA +stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA +stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA +stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA +stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA +stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA +stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA +stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA +stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA +stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA +stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I +stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II +stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U +stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU +stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E +stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI +stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O +stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU +stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA + +externals ( stem ) +backwardmode ( + define remove_category_1 as( + [substring] among ( + 
'{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' + '{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' + '{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' + (delete) + '{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) + ) + ) + + define check_category_2 as( + [substring] among( + '{dsc}' '{dsa}' '{dvsai}' + ) + ) + + define remove_category_2 as ( + [substring] among( + '{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) + '{dvsai}' ('{dlta}{dsv}{dlr}' delete) + ) + ) + + define remove_category_3 as( + [substring] among( + '{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' 
'{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' + (delete) + ) + ) + +) + +define stem as ( + backwards ( + do remove_category_1 + do ( + repeat (do (check_category_2 and remove_category_2) remove_category_3) + ) + ) +) diff --git a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/norwegian.sbl index 94a071653..39f4aff0f 100644 --- a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/norwegian.sbl @@ -13,11 +13,11 @@ groupings ( v s_ending ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef ae hex 'E6' -stringdef ao hex 'E5' -stringdef o/ hex 'F8' +stringdef ae '{U+00E6}' +stringdef ao '{U+00E5}' +stringdef o/ '{U+00F8}' define v 'aeiouy{ae}{ao}{o/}' diff --git a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index f57483354..000000000 --- a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,80 +0,0 @@ -routines ( - mark_regions - main_suffix - consonant_pair - other_suffix -) - -externals ( stem ) - -integers ( p1 x ) - -groupings ( v s_ending ) - -stringescapes {} - -/* special characters (in MS-DOS Latin I) */ - -stringdef ae hex '91' -stringdef ao hex 
'86' -stringdef o/ hex '9B' - -define v 'aeiouy{ae}{ao}{o/}' - -define s_ending 'bcdfghjlmnoprtvyz' - -define mark_regions as ( - - $p1 = limit - - test ( hop 3 setmark x ) - goto v gopast non-v setmark p1 - try ( $p1 < x $p1 = x ) -) - -backwardmode ( - - define main_suffix as ( - setlimit tomark p1 for ([substring]) - among( - - 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' - 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' - 'hetens' 'ers' 'ets' 'et' 'het' 'ast' - (delete) - 's' - (s_ending or ('k' non-v) delete) - 'erte' 'ert' - (<-'er') - ) - ) - - define consonant_pair as ( - test ( - setlimit tomark p1 for ([substring]) - among( - 'dt' 'vt' - ) - ) - next] delete - ) - - define other_suffix as ( - setlimit tomark p1 for ([substring]) - among( - 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' - 'hetslov' - (delete) - ) - ) -) - -define stem as ( - - do mark_regions - backwards ( - do main_suffix - do consonant_pair - do other_suffix - ) -) diff --git a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/porter.sbl index 9533b7932..9533b7932 100644 --- a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/porter.sbl diff --git a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/portuguese.sbl index 3e7da08d4..3fb14f195 100644 --- a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/portuguese.sbl @@ -15,20 +15,20 @@ groupings ( v ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef a' hex 'E1' // a-acute -stringdef a^ hex 'E2' // a-circumflex e.g. 
'bota^nico -stringdef e' hex 'E9' // e-acute -stringdef e^ hex 'EA' // e-circumflex -stringdef i' hex 'ED' // i-acute -stringdef o^ hex 'F4' // o-circumflex -stringdef o' hex 'F3' // o-acute -stringdef u' hex 'FA' // u-acute -stringdef c, hex 'E7' // c-cedilla +stringdef a' '{U+00E1}' // a-acute +stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico +stringdef e' '{U+00E9}' // e-acute +stringdef e^ '{U+00EA}' // e-circumflex +stringdef i' '{U+00ED}' // i-acute +stringdef o^ '{U+00F4}' // o-circumflex +stringdef o' '{U+00F3}' // o-acute +stringdef u' '{U+00FA}' // u-acute +stringdef c, '{U+00E7}' // c-cedilla -stringdef a~ hex 'E3' // a-tilde -stringdef o~ hex 'F5' // o-tilde +stringdef a~ '{U+00E3}' // a-tilde +stringdef o~ '{U+00F5}' // o-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' @@ -92,12 +92,12 @@ backwardmode ( ( R2 delete ) - 'log{i'}a' - 'log{i'}as' + 'logia' + 'logias' ( R2 <- 'log' ) - 'uci{o'}n' 'uciones' + 'u{c,}a~o' 'u{c,}o~es' ( R2 <- 'u' ) diff --git a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index 4d6c85214..000000000 --- a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,218 +0,0 @@ -routines ( - prelude postlude mark_regions - RV R1 R2 - standard_suffix - verb_suffix - residual_suffix - residual_form -) - -externals ( stem ) - -integers ( pV p1 p2 ) - -groupings ( v ) - -stringescapes {} - -/* special characters (in MS-DOS Latin I) */ - -stringdef a' hex 'A0' // a-acute -stringdef a^ hex '83' // a-circumflex e.g. 
'bota^nico -stringdef e' hex '82' // e-acute -stringdef e^ hex '88' // e-circumflex -stringdef i' hex 'A1' // i-acute -stringdef o^ hex '93' // o-circumflex -stringdef o' hex 'A2' // o-acute -stringdef u' hex 'A3' // u-acute -stringdef c, hex '87' // c-cedilla - -stringdef a~ hex 'C6' // a-tilde -stringdef o~ hex 'E4' // o-tilde - - -define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' - -define prelude as repeat ( - [substring] among( - '{a~}' (<- 'a~') - '{o~}' (<- 'o~') - '' (next) - ) //or next -) - -define mark_regions as ( - - $pV = limit - $p1 = limit - $p2 = limit // defaults - - do ( - ( v (non-v gopast v) or (v gopast non-v) ) - or - ( non-v (non-v gopast v) or (v next) ) - setmark pV - ) - do ( - gopast v gopast non-v setmark p1 - gopast v gopast non-v setmark p2 - ) -) - -define postlude as repeat ( - [substring] among( - 'a~' (<- '{a~}') - 'o~' (<- '{o~}') - '' (next) - ) //or next -) - -backwardmode ( - - define RV as $pV <= cursor - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define standard_suffix as ( - [substring] among( - - 'eza' 'ezas' - 'ico' 'ica' 'icos' 'icas' - 'ismo' 'ismos' - '{a'}vel' - '{i'}vel' - 'ista' 'istas' - 'oso' 'osa' 'osos' 'osas' - 'amento' 'amentos' - 'imento' 'imentos' - - 'adora' 'ador' 'a{c,}a~o' - 'adoras' 'adores' 'a{c,}o~es' // no -ic test - 'ante' 'antes' '{a^}ncia' // Note 1 - ( - R2 delete - ) - 'log{i'}a' - 'log{i'}as' - ( - R2 <- 'log' - ) - 'uci{o'}n' 'uciones' - ( - R2 <- 'u' - ) - '{e^}ncia' '{e^}ncias' - ( - R2 <- 'ente' - ) - 'amente' - ( - R1 delete - try ( - [substring] R2 delete among( - 'iv' (['at'] R2 delete) - 'os' - 'ic' - 'ad' - ) - ) - ) - 'mente' - ( - R2 delete - try ( - [substring] among( - 'ante' // Note 1 - 'avel' - '{i'}vel' (R2 delete) - ) - ) - ) - 'idade' - 'idades' - ( - R2 delete - try ( - [substring] among( - 'abil' - 'ic' - 'iv' (R2 delete) - ) - ) - ) - 'iva' 'ivo' - 'ivas' 'ivos' - ( - R2 delete - try ( - ['at'] R2 delete // but not a further ['ic'] R2 delete - ) - ) - 'ira' 
'iras' - ( - RV 'e' // -eira -eiras usually non-verbal - <- 'ir' - ) - ) - ) - - define verb_suffix as setlimit tomark pV for ( - [substring] among( - 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' - 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' - 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' - 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' - 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' - 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' - 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' - 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' - 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' - 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' - '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' - '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' - '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' - 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' - 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' - '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' - - 'ira' 'iras' - (delete) - ) - ) - - define residual_suffix as ( - [substring] among( - 'os' - 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' - ( RV delete ) - ) - ) - - define residual_form as ( - [substring] among( - 'e' '{e'}' '{e^}' - ( RV delete [('u'] test 'g') or - ('i'] test 'c') RV delete ) - '{c,}' (<-'c') - ) - ) -) - -define stem as ( - do prelude - do mark_regions - backwards ( - do ( - ( ( standard_suffix or verb_suffix ) - and do ( ['i'] test 'c' RV delete ) - ) - or residual_suffix - ) - do residual_form - ) - do postlude -) - -/* - Note 1: additions of 15 Jun 2005 -*/ diff --git a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/romanian.sbl index 48a148321..7db9e0a5a 100644 --- a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl +++ b/contrib/snowball/algorithms/romanian.sbl @@ -20,11 +20,11 @@ 
stringescapes {} /* special characters */ -stringdef a^ hex 'E2' // a circumflex -stringdef i^ hex 'EE' // i circumflex -stringdef a+ hex 'E3' // a breve -stringdef s, hex 'BA' // s cedilla -stringdef t, hex 'FE' // t cedilla +stringdef a^ '{U+00E2}' // a circumflex +stringdef i^ '{U+00EE}' // i circumflex +stringdef a+ '{U+0103}' // a breve +stringdef s, '{U+015F}' // s cedilla +stringdef t, '{U+0163}' // t cedilla define v 'aeiou{a^}{i^}{a+}' diff --git a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl deleted file mode 100644 index 09aec6429..000000000 --- a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl +++ /dev/null @@ -1,236 +0,0 @@ - -routines ( - prelude postlude mark_regions - RV R1 R2 - step_0 - standard_suffix combo_suffix - verb_suffix - vowel_suffix -) - -externals ( stem ) - -integers ( pV p1 p2 ) - -groupings ( v ) - -booleans ( standard_suffix_removed ) - -stringescapes {} - -/* special characters */ - -stringdef a^ hex '0E2' // a circumflex -stringdef i^ hex '0EE' // i circumflex -stringdef a+ hex '103' // a breve -stringdef s, hex '15F' // s cedilla -stringdef t, hex '163' // t cedilla - -define v 'aeiou{a^}{i^}{a+}' - -define prelude as ( - repeat goto ( - v [ ('u' ] v <- 'U') or - ('i' ] v <- 'I') - ) -) - -define mark_regions as ( - - $pV = limit - $p1 = limit - $p2 = limit // defaults - - do ( - ( v (non-v gopast v) or (v gopast non-v) ) - or - ( non-v (non-v gopast v) or (v next) ) - setmark pV - ) - do ( - gopast v gopast non-v setmark p1 - gopast v gopast non-v setmark p2 - ) -) - -define postlude as repeat ( - - [substring] among( - 'I' (<- 'i') - 'U' (<- 'u') - '' (next) - ) - -) - -backwardmode ( - - define RV as $pV <= cursor - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - define step_0 as ( - [substring] R1 among( - 'ul' 'ului' - ( delete ) - 'aua' - ( <-'a' ) - 'ea' 'ele' 'elor' - ( <-'e' ) - 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' - ( <-'i') - 'ile' - ( 
not 'ab' <- 'i' ) - 'atei' - ( <- 'at' ) - 'a{t,}ie' 'a{t,}ia' - ( <- 'a{t,}i' ) - ) - ) - - define combo_suffix as test ( - [substring] R1 ( - among( - /* 'IST'. alternative: include the following - 'alism' 'alisme' - 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( - <- 'al' - ) - */ - 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( - <- 'abil' - ) - 'ibilitate' ( - <- 'ibil' - ) - 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( - <- 'iv' - ) - 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' - 'icator' 'icatori' - 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' - 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( - <- 'ic' - ) - 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' - 'atoare' 'ator' 'atori' - '{a+}toare' '{a+}tor' '{a+}tori' ( - <- 'at' - ) - 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' - 'itoare' 'itor' 'itori' ( - <- 'it' - ) - ) - set standard_suffix_removed - ) - ) - - define standard_suffix as ( - unset standard_suffix_removed - repeat combo_suffix - [substring] R2 ( - among( - - // past participle is treated here, rather than - // as a verb ending: - 'at' 'ata' 'at{a+}' 'ati' 'ate' - 'ut' 'uta' 'ut{a+}' 'uti' 'ute' - 'it' 'ita' 'it{a+}' 'iti' 'ite' - - 'ic' 'ica' 'ice' 'ici' 'ic{a+}' - 'abil' 'abila' 'abile' 'abili' 'abil{a+}' - 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' - 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' - 'ant' 'anta' 'ante' 'anti' 'ant{a+}' - 'ator' 'atori' - 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' - 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( - delete - ) - 'iune' 'iuni' ( - '{t,}'] <- 't' - ) - 'ism' 'isme' - 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( - <- 'ist' - /* 'IST'. 
alternative: remove with <- '' */ - ) - ) - set standard_suffix_removed - ) - ) - - define verb_suffix as setlimit tomark pV for ( - [substring] among( - // 'long' infinitive: - 'are' 'ere' 'ire' '{a^}re' - - // gerund: - 'ind' '{a^}nd' - 'indu' '{a^}ndu' - - 'eze' - 'easc{a+}' - // present: - 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' - 'e{s,}te' - '{a+}sc' '{a+}{s,}ti' - '{a+}{s,}te' - - // imperfect: - 'am' 'ai' 'au' - 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' - 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' - - // past: // (not 'ii') - 'ui' - 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' - 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' - 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' - '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' - - // pluferfect: - 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' - 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' - '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' - '{a^}ser{a+}' - 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' - - ( non-v or 'u' delete ) - - // present: - '{a+}m' 'a{t,}i' - 'em' 'e{t,}i' - 'im' 'i{t,}i' - '{a^}m' '{a^}{t,}i' - - // past: - 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' - 'sei' 'se' - - // pluperfect: - 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' - (delete) - ) - ) - - define vowel_suffix as ( - [substring] RV among ( - 'a' 'e' 'i' 'ie' '{a+}' ( delete ) - ) - ) -) - -define stem as ( - do prelude - do mark_regions - backwards ( - do step_0 - do standard_suffix - do ( standard_suffix_removed or verb_suffix ) - do vowel_suffix - ) - do postlude -) - diff --git a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl b/contrib/snowball/algorithms/russian.sbl index cdacb19c4..20de6395e 100644 --- a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl +++ b/contrib/snowball/algorithms/russian.sbl @@ -1,41 +1,41 @@ stringescapes {} -/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented - in Latin characters following the 
conventions of the standard Library - of Congress transliteration: */ - -stringdef a hex 'C1' -stringdef b hex 'C2' -stringdef v hex 'D7' -stringdef g hex 'C7' -stringdef d hex 'C4' -stringdef e hex 'C5' -stringdef zh hex 'D6' -stringdef z hex 'DA' -stringdef i hex 'C9' -stringdef i` hex 'CA' -stringdef k hex 'CB' -stringdef l hex 'CC' -stringdef m hex 'CD' -stringdef n hex 'CE' -stringdef o hex 'CF' -stringdef p hex 'D0' -stringdef r hex 'D2' -stringdef s hex 'D3' -stringdef t hex 'D4' -stringdef u hex 'D5' -stringdef f hex 'C6' -stringdef kh hex 'C8' -stringdef ts hex 'C3' -stringdef ch hex 'DE' -stringdef sh hex 'DB' -stringdef shch hex 'DD' -stringdef " hex 'DF' -stringdef y hex 'D9' -stringdef ' hex 'D8' -stringdef e` hex 'DC' -stringdef iu hex 'C0' -stringdef ia hex 'D1' +/* the 33 Cyrillic letters represented in ASCII characters following the + * conventions of the standard Library of Congress transliteration: */ + +stringdef a '{U+0430}' +stringdef b '{U+0431}' +stringdef v '{U+0432}' +stringdef g '{U+0433}' +stringdef d '{U+0434}' +stringdef e '{U+0435}' +stringdef e" '{U+0451}' +stringdef zh '{U+0436}' +stringdef z '{U+0437}' +stringdef i '{U+0438}' +stringdef i` '{U+0439}' +stringdef k '{U+043A}' +stringdef l '{U+043B}' +stringdef m '{U+043C}' +stringdef n '{U+043D}' +stringdef o '{U+043E}' +stringdef p '{U+043F}' +stringdef r '{U+0440}' +stringdef s '{U+0441}' +stringdef t '{U+0442}' +stringdef u '{U+0443}' +stringdef f '{U+0444}' +stringdef kh '{U+0445}' +stringdef ts '{U+0446}' +stringdef ch '{U+0447}' +stringdef sh '{U+0448}' +stringdef shch '{U+0449}' +stringdef " '{U+044A}' +stringdef y '{U+044B}' +stringdef ' '{U+044C}' +stringdef e` '{U+044D}' +stringdef iu '{U+044E}' +stringdef ia '{U+044F}' routines ( mark_regions R2 perfective_gerund @@ -200,6 +200,10 @@ backwardmode ( define stem as ( + // Normalise {e"} to {e}. The documentation has long suggested the user + // should do this before calling the stemmer - we now do it for them. 
+ do repeat ( goto (['{e"}']) <- '{e}' ) + do mark_regions backwards setlimit tomark pV for ( do ( diff --git a/contrib/snowball/algorithms/russian/stem_Unicode.sbl b/contrib/snowball/algorithms/russian/stem_Unicode.sbl deleted file mode 100644 index 9e1a93f93..000000000 --- a/contrib/snowball/algorithms/russian/stem_Unicode.sbl +++ /dev/null @@ -1,215 +0,0 @@ -stringescapes {} - -/* the 32 Cyrillic letters in Unicode */ - -stringdef a hex '430' -stringdef b hex '431' -stringdef v hex '432' -stringdef g hex '433' -stringdef d hex '434' -stringdef e hex '435' -stringdef zh hex '436' -stringdef z hex '437' -stringdef i hex '438' -stringdef i` hex '439' -stringdef k hex '43A' -stringdef l hex '43B' -stringdef m hex '43C' -stringdef n hex '43D' -stringdef o hex '43E' -stringdef p hex '43F' -stringdef r hex '440' -stringdef s hex '441' -stringdef t hex '442' -stringdef u hex '443' -stringdef f hex '444' -stringdef kh hex '445' -stringdef ts hex '446' -stringdef ch hex '447' -stringdef sh hex '448' -stringdef shch hex '449' -stringdef " hex '44A' -stringdef y hex '44B' -stringdef ' hex '44C' -stringdef e` hex '44D' -stringdef iu hex '44E' -stringdef ia hex '44F' - -routines ( mark_regions R2 - perfective_gerund - adjective - adjectival - reflexive - verb - noun - derivational - tidy_up -) - -externals ( stem ) - -integers ( pV p2 ) - -groupings ( v ) - -define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' - -define mark_regions as ( - - $pV = limit - $p2 = limit - do ( - gopast v setmark pV gopast non-v - gopast v gopast non-v setmark p2 - ) -) - -backwardmode ( - - define R2 as $p2 <= cursor - - define perfective_gerund as ( - [substring] among ( - '{v}' - '{v}{sh}{i}' - '{v}{sh}{i}{s}{'}' - ('{a}' or '{ia}' delete) - '{i}{v}' - '{i}{v}{sh}{i}' - '{i}{v}{sh}{i}{s}{'}' - '{y}{v}' - '{y}{v}{sh}{i}' - '{y}{v}{sh}{i}{s}{'}' - (delete) - ) - ) - - define adjective as ( - [substring] among ( - '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' - '{e}{i`}' '{i}{i`}' '{y}{i`}' 
'{o}{i`}' '{e}{m}' '{i}{m}' - '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' - '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' - '{ia}{ia}' - // and - - '{o}{iu}' // - which is somewhat archaic - '{e}{iu}' // - soft form of {o}{iu} - (delete) - ) - ) - - define adjectival as ( - adjective - - /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. - nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of - errors. Removing im, uem, enn creates too many errors. - */ - - try ( - [substring] among ( - '{e}{m}' // present passive participle - '{n}{n}' // adjective from past passive participle - '{v}{sh}' // past active participle - '{iu}{shch}' '{shch}' // present active participle - ('{a}' or '{ia}' delete) - - //but not '{i}{m}' '{u}{e}{m}' // present passive participle - //or '{e}{n}{n}' // adjective from past passive participle - - '{i}{v}{sh}' '{y}{v}{sh}'// past active participle - '{u}{iu}{shch}' // present active participle - (delete) - ) - ) - - ) - - define reflexive as ( - [substring] among ( - '{s}{ia}' - '{s}{'}' - (delete) - ) - ) - - define verb as ( - [substring] among ( - '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' - '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' - '{n}{y}' '{t}{'}' '{e}{sh}{'}' - - '{n}{n}{o}' - ('{a}' or '{ia}' delete) - - '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' - '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' - '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' - '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' - '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' - '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' - (delete) - /* note the short passive participle tests: - '{n}{a}' '{n}' '{n}{o}' '{n}{y}' - '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' - */ - ) - ) - - define noun as ( - [substring] among ( - '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' - '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' - 
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' - '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' - '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' - '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' - (delete) - /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' - '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' - omitted - they only occur on 12 words. - */ - ) - ) - - define derivational as ( - [substring] R2 among ( - '{o}{s}{t}' - '{o}{s}{t}{'}' - (delete) - ) - ) - - define tidy_up as ( - [substring] among ( - - '{e}{i`}{sh}' - '{e}{i`}{sh}{e}' // superlative forms - (delete - ['{n}'] '{n}' delete - ) - '{n}' - ('{n}' delete) // e.g. -nno endings - '{'}' - (delete) // with some slight false conflations - ) - ) -) - -define stem as ( - - do mark_regions - backwards setlimit tomark pV for ( - do ( - perfective_gerund or - ( try reflexive - adjectival or verb or noun - ) - ) - try([ '{i}' ] delete) - // because noun ending -i{iu} is being treated as verb ending -{iu} - - do derivational - do tidy_up - ) -) diff --git a/contrib/snowball/algorithms/serbian.sbl b/contrib/snowball/algorithms/serbian.sbl new file mode 100644 index 000000000..bddf76b22 --- /dev/null +++ b/contrib/snowball/algorithms/serbian.sbl @@ -0,0 +1,2378 @@ +/* Stemmer for Serbian language, based on: + * + * Ljubesic, Nikola. Pandzic, Ivan. 
Stemmer for Croatian + * http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/ + * + * authors: Stefan Petkovic and Dragan Ivanovic + * emails: petkovic8 at gmail.com and dragan.ivanovic at uns.ac.rs + * version: 1.0 (20.04.2019) +*/ + +routines ( + cyr_to_lat + prelude + mark_regions + R1 R2 + Step_1 + Step_2 + Step_3 +) + +externals ( stem ) + +integers ( p1 p2 p3 ) + +groupings ( v ca sa rg ) + +stringescapes {} + +/* special characters - Unicode codepoints */ + +/* serbian cyrillic */ + +stringdef cyrA '{U+0430}' +stringdef cyrB '{U+0431}' +stringdef cyrV '{U+0432}' +stringdef cyrG '{U+0433}' +stringdef cyrD '{U+0434}' +stringdef cyrDx '{U+0452}' +stringdef cyrE '{U+0435}' +stringdef cyrZh '{U+0436}' +stringdef cyrZ '{U+0437}' +stringdef cyrI '{U+0438}' +stringdef cyrJ '{U+0458}' +stringdef cyrK '{U+043A}' +stringdef cyrL '{U+043B}' +stringdef cyrLJ '{U+0459}' +stringdef cyrM '{U+043C}' +stringdef cyrN '{U+043D}' +stringdef cyrNJ '{U+045A}' +stringdef cyrO '{U+043E}' +stringdef cyrP '{U+043F}' +stringdef cyrR '{U+0440}' +stringdef cyrS '{U+0441}' +stringdef cyrT '{U+0442}' +stringdef cyrCy '{U+045B}' +stringdef cyrU '{U+0443}' +stringdef cyrF '{U+0444}' +stringdef cyrH '{U+0445}' +stringdef cyrC '{U+0446}' +stringdef cyrCx '{U+0447}' +stringdef cyrDzx '{U+045F}' +stringdef cyrSx '{U+0448}' + +/* serbian latin with diacritics */ + +stringdef cx '{U+010D}' // small c with caron +stringdef cy '{U+0107}' // small c with acute +stringdef zx '{U+017E}' // small z with caron +stringdef sx '{U+0161}' // small s with caron +stringdef dx '{U+0111}' // small d with stroke + +define v 'aeiou' +define sa '{cx}{cy}{zx}{sx}{dx}' +define ca 'bvgdzjklmnprstfhc' + sa +define rg 'r' + + +define cyr_to_lat as ( + + do repeat goto ( + [substring] among ( + '{cyrA}' (<- 'a') + '{cyrB}' (<- 'b') + '{cyrV}' (<- 'v') + '{cyrG}' (<- 'g') + '{cyrD}' (<- 'd') + '{cyrDx}' (<- '{dx}') + '{cyrE}' (<- 'e') + '{cyrZh}' (<- '{zx}') + '{cyrZ}' (<- 'z') + '{cyrI}' (<- 'i') + '{cyrJ}' (<- 'j') 
+ '{cyrK}' (<- 'k') + '{cyrL}' (<- 'l') + '{cyrLJ}' (<- 'lj') + '{cyrM}' (<- 'm') + '{cyrN}' (<- 'n') + '{cyrNJ}' (<- 'nj') + '{cyrO}' (<- 'o') + '{cyrP}' (<- 'p') + '{cyrR}' (<- 'r') + '{cyrS}' (<- 's') + '{cyrT}' (<- 't') + '{cyrCy}' (<- '{cy}') + '{cyrU}' (<- 'u') + '{cyrF}' (<- 'f') + '{cyrH}' (<- 'h') + '{cyrC}' (<- 'c') + '{cyrCx}' (<- '{cx}') + '{cyrDzx}' (<- 'd{zx}') + '{cyrSx}' (<- '{sx}') + ) + ) + +) + +define prelude as ( + + do repeat goto ( + ca ['ije'] ca <- 'e' + ) + + do repeat goto ( + ca ['je'] ca <- 'e' + ) + + do repeat goto ( + ['dj'] <- '{dx}' + ) + +) + +define mark_regions as ( + + $p3 = 0 + + do ( + gopast sa setmark p3 + ) + + $p1 = limit + $p2 = 0 + + do ( + gopast v setmark p1 + ) + do ( + gopast 'r' setmark p2 + $(p1 - p2 > 1) ($p1 = p2) + ) + ($p1 < 2) ( + ($p1 == p2 gopast 'r' gopast non-rg) or ($p1 != p2 gopast v gopast non-v) + setmark p1 + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p3 == 0 + + define Step_1 as ( + [substring] among ( + 'lozi' + 'lozima' (<-'loga') + 'pesi' + 'pesima' (<-'peh') + 'vojci' (<-'vojka') + 'bojci' (<-'bojka') + 'jaci' + 'jacima' (<-'jak') + '{cx}ajan' (<-'{cx}ajni') + 'cajan' (R2 <-'cajni') + 'eran' (<-'erni') + 'laran' (<-'larni') + 'esan' (<-'esni') + 'anjac' (<-'anjca') + 'ajac' + 'ajaca' (<-'ajca') + 'ljaca' + 'ljac' (<-'ljca') + 'ejac' + 'ejaca' (<-'ejca') + 'ojac' + 'ojaca' (<-'ojca') + 'ajaka' (<-'ajka') + 'ojaka' (<-'ojka') + '{sx}aca' + '{sx}ac' (<-'{sx}ca') + 'inzima' + 'inzi' (<-'ing') + 'tvenici' (<-'tvenik') + 'tetici' + 'teticima' (<-'tetika') + 'nstava' (<-'nstva') + 'nicima' (<-'nik') + 'ticima' (<-'tik') + 'zicima' (<-'zik') + 'snici' (<-'snik') + 'kuse' (<-'kusi') + 'kusan' (<-'kusni') + 'kustava' (<-'kustva') + 'du{sx}an' (<-'du{sx}ni') + 'dusan' (R2 <-'dusni') + 'antan' (<-'antni') + 'bilan' (<-'bilni') + 'tilan' (<-'tilni') + 'avilan' (<-'avilni') + 'silan' (<-'silni') + 'gilan' (<-'gilni') + 'rilan' (<-'rilni') + 'nilan' (<-'nilni') + 'alan' (<-'alni') 
+ 'ozan' (<-'ozni') + 'rave' (<-'ravi') + 'stavan' (<-'stavni') + 'pravan' (<-'pravni') + 'tivan' (<-'tivni') + 'sivan' (<-'sivni') + 'atan' (<-'atni') + 'enat' (<-'enta') + 'tetan' (<-'tetni') + 'pletan' (<-'pletni') + '{sx}ave' (<-'{sx}avi') + 'save' (R2 <-'savi') + 'anata' (<-'anta') + 'a{cx}ak' + 'a{cx}aka' (<-'a{cx}ka') + 'acak' + 'acaka' (R2 <-'acka') + 'u{sx}ak' (<-'u{sx}ka') + 'usak' (R2 <-'uska') + 'atak' + 'ataka' + 'atci' + 'atcima' (<-'atka') + 'etak' + 'etaka' (<-'etka') + 'itak' + 'itaka' + 'itci' (<-'itka') + 'otak' + 'otaka' (<-'otka') + 'utak' + 'utaka' + 'utci' + 'utcima' (<-'utka') + 'eskan' (<-'eskna') + 'ti{cx}an' (<-'ti{cx}ni') + 'tican' (R2 <-'ticni') + 'ojsci' (<-'ojska') + 'esama' (<-'esma') + 'metar' + 'metara' (<-'metra') + 'centar' + 'centara' (<-'centra') + 'istar' + 'istara' (<-'istra') + 'o{sx}{cy}u' (<-'osti') + 'oscu' (R2 <-'osti') + 'daba' (<-'dba') + '{cx}cima' + '{cx}ci' (<-'{cx}ka') + 'mac' + 'maca' (<-'mca') + 'naca' + 'nac' (<-'nca') + 'voljan' (<-'voljni') + 'anaka' (<-'anki') + 'vac' + 'vaca' (<-'vca') + 'saca' + 'sac' (<-'sca') + 'raca' + 'rac' (<-'rca') + 'aoca' + 'alaca' + 'alac' (<-'alca') + 'elaca' + 'elac' (<-'elca') + 'olaca' + 'olac' + 'olce' (<-'olca') + 'njac' + 'njaca' (<-'njca') + 'ekata' + 'ekat' (<-'ekta') + 'izam' + 'izama' (<-'izma') + 'jebe' (<-'jebi') + 'baci' (<-'baci') + 'a{sx}an' (<-'a{sx}ni') + 'asan' (R2 <-'asni') + ) + ) + + define Step_2 as ( + [substring] R1 among ( + 'skijima' + 'skijega' + 'skijemu' + 'skijem' + 'skega' + 'skemu' + 'skem' + 'skijim' + 'skijih' + 'skijoj' + 'skijeg' + 'skiji' + 'skije' + 'skija' + 'skoga' + 'skome' + 'skomu' + 'skima' + 'skog' + 'skom' + 'skim' + 'skih' + 'skoj' + 'ski' + 'ske' + 'sko' + 'ska' + 'sku' (<-'sk') + '{sx}kijima' + '{sx}kijega' + '{sx}kijemu' + '{sx}kijem' + '{sx}kega' + '{sx}kemu' + '{sx}kem' + '{sx}kijim' + '{sx}kijih' + '{sx}kijoj' + '{sx}kijeg' + '{sx}kiji' + '{sx}kije' + '{sx}kija' + '{sx}koga' + '{sx}kome' + '{sx}komu' + '{sx}kima' + '{sx}kog' + 
'{sx}kom' + '{sx}kim' + '{sx}kih' + '{sx}koj' + '{sx}ki' + '{sx}ke' + '{sx}ko' + '{sx}ka' + '{sx}ku' (<-'{sx}k') + 'stvima' + 'stvom' + 'stvo' + 'stva' + 'stvu' (<-'stv') + '{sx}tvima' + '{sx}tvom' + '{sx}tvo' + '{sx}tva' + '{sx}tvu' (<-'{sx}tv') + 'tanijama' + 'tanijima' + 'tanijom' + 'tanija' + 'taniju' + 'tanije' + 'taniji' (<-'tanij') + 'manijama' + 'manijima' + 'manijom' + 'manija' + 'maniju' + 'manije' + 'maniji' (<-'manij') + 'panijama' + 'panijima' + 'panijom' + 'panija' + 'paniju' + 'panije' + 'paniji' (<-'panij') + 'ranijama' + 'ranijima' + 'ranijom' + 'ranija' + 'raniju' + 'ranije' + 'raniji' (<-'ranij') + 'ganijama' + 'ganijima' + 'ganijom' + 'ganija' + 'ganiju' + 'ganije' + 'ganiji' (<-'ganij') + 'aninom' + 'anina' + 'aninu' + 'anine' + 'anima' + 'anin' + 'anom' + 'anu' + 'ani' + 'ana' + 'ane' (<-'an') + 'inima' + 'inama' + 'inom' + 'ina' + 'ine' + 'ini' + 'inu' + 'ino' (<-'in') + 'onovima' + 'onova' + 'onove' + 'onovi' + 'onima' + 'onom' + 'ona' + 'one' + 'oni' + 'onu' (<-'on') + 'nijima' + 'nijega' + 'nijemu' + 'nijeg' + 'nijem' + 'nega' + 'nemu' + 'neg' + 'nem' + 'nijim' + 'nijih' + 'nijoj' + 'niji' + 'nije' + 'nija' + 'niju' + 'nima' + 'nome' + 'nomu' + 'noga' + 'noj' + 'nom' + 'nih' + 'nim' + 'nog' + 'no' + 'ne' + 'na' + 'nu' + 'ni' (<-'n') + 'a{cy}oga' + 'a{cy}ome' + 'a{cy}omu' + 'a{cy}ega' + 'a{cy}emu' + 'a{cy}ima' + 'a{cy}oj' + 'a{cy}ih' + 'a{cy}om' + 'a{cy}eg' + 'a{cy}em' + 'a{cy}og' + 'a{cy}uh' + 'a{cy}im' + 'a{cy}e' + 'a{cy}a' (<-'a{cy}') + 'e{cy}oga' + 'e{cy}ome' + 'e{cy}omu' + 'e{cy}ega' + 'e{cy}emu' + 'e{cy}ima' + 'e{cy}oj' + 'e{cy}ih' + 'e{cy}om' + 'e{cy}eg' + 'e{cy}em' + 'e{cy}og' + 'e{cy}uh' + 'e{cy}im' + 'e{cy}e' + 'e{cy}a' (<-'e{cy}') + 'u{cy}oga' + 'u{cy}ome' + 'u{cy}omu' + 'u{cy}ega' + 'u{cy}emu' + 'u{cy}ima' + 'u{cy}oj' + 'u{cy}ih' + 'u{cy}om' + 'u{cy}eg' + 'u{cy}em' + 'u{cy}og' + 'u{cy}uh' + 'u{cy}im' + 'u{cy}e' + 'u{cy}a' (<-'u{cy}') + 'ugovima' + 'ugovi' + 'ugove' + 'ugova' (<-'ugov') + 'ugama' + 'ugom' + 'uga' + 'uge' + 'ugi' 
+ 'ugu' + 'ugo' (<-'ug') + 'logama' + 'logom' + 'loga' + 'logu' + 'loge' (<-'log') + 'govima' + 'gama' + 'govi' + 'gove' + 'gova' + 'gom' + 'ga' + 'ge' + 'gi' + 'gu' + 'go' (<-'g') + 'rarijem' + 'rarija' + 'rariju' + 'rario' (<-'rari') + 'otijem' + 'otija' + 'otiju' + 'otio' (<-'oti') + 'sijem' + 'sija' + 'siju' + 'sio' (<-'si') + 'lijem' + 'lija' + 'liju' + 'lio' (<-'li') + 'uju{cy}i' + 'ujemo' + 'ujete' + 'ujmo' + 'ujem' + 'uje{sx}' + 'uje' + 'uju' (<-'uj') + 'cajevima' + 'cajevi' + 'cajeva' + 'cajeve' + 'cajama' + 'cajima' + 'cajem' + 'caja' + 'caje' + 'caji' + 'caju' (<-'caj') + '{cx}ajevima' + '{cx}ajevi' + '{cx}ajeva' + '{cx}ajeve' + '{cx}ajama' + '{cx}ajima' + '{cx}ajem' + '{cx}aja' + '{cx}aje' + '{cx}aji' + '{cx}aju' (<-'{cx}aj') + '{cy}ajevima' + '{cy}ajevi' + '{cy}ajeva' + '{cy}ajeve' + '{cy}ajama' + '{cy}ajima' + '{cy}ajem' + '{cy}aja' + '{cy}aje' + '{cy}aji' + '{cy}aju' (<-'{cy}aj') + '{dx}ajevima' + '{dx}ajevi' + '{dx}ajeva' + '{dx}ajeve' + '{dx}ajama' + '{dx}ajima' + '{dx}ajem' + '{dx}aja' + '{dx}aje' + '{dx}aji' + '{dx}aju' (<-'{dx}aj') + 'lajevima' + 'lajevi' + 'lajeva' + 'lajeve' + 'lajama' + 'lajima' + 'lajem' + 'laja' + 'laje' + 'laji' + 'laju' (<-'laj') + 'rajevima' + 'rajevi' + 'rajeva' + 'rajeve' + 'rajama' + 'rajima' + 'rajem' + 'raja' + 'raje' + 'raji' + 'raju' (<-'raj') + 'bijima' + 'bijama' + 'bijom' + 'bija' + 'bije' + 'biji' + 'biju' + 'bijo' (<-'bij') + 'cijima' + 'cijama' + 'cijom' + 'cija' + 'cije' + 'ciji' + 'ciju' + 'cijo' (<-'cij') + 'dijima' + 'dijama' + 'dijom' + 'dija' + 'dije' + 'diji' + 'diju' + 'dijo' (<-'dij') + 'lijima' + 'lijama' + 'lijom' + 'lije' + 'liji' + 'lijo' (<-'lij') + 'nijama' + 'nijom' + 'nijo' (<-'nij') + 'mijima' + 'mijama' + 'mijom' + 'mija' + 'mije' + 'miji' + 'miju' + 'mijo' (<-'mij') + '{zx}ijima' + '{zx}ijama' + '{zx}ijom' + '{zx}ija' + '{zx}ije' + '{zx}iji' + '{zx}iju' + '{zx}ijo' (<-'{zx}ij') + 'gijima' + 'gijama' + 'gijom' + 'gija' + 'gije' + 'giji' + 'giju' + 'gijo' (<-'gij') + 'fijima' + 'fijama' + 
'fijom' + 'fija' + 'fije' + 'fiji' + 'fiju' + 'fijo' (<-'fij') + 'pijima' + 'pijama' + 'pijom' + 'pija' + 'pije' + 'piji' + 'piju' + 'pijo' (<-'pij') + 'rijima' + 'rijama' + 'rijom' + 'rija' + 'rije' + 'riji' + 'riju' + 'rijo' (<-'rij') + 'sijima' + 'sijama' + 'sijom' + 'sije' + 'siji' + 'sijo' (<-'sij') + 'tijima' + 'tijama' + 'tijom' + 'tija' + 'tije' + 'tiji' + 'tiju' + 'tijo' (<-'tij') + 'zijima' + 'zijama' + 'zijom' + 'zija' + 'zije' + 'ziji' + 'ziju' + 'zijo' (<-'zij') + 'nalima' + 'nalama' + 'nalom' + 'nala' + 'nale' + 'nali' + 'nalu' + 'nalo' (<-'nal') + 'ijalima' + 'ijalama' + 'ijalom' + 'ijala' + 'ijale' + 'ijali' + 'ijalu' + 'ijalo' (<-'ijal') + 'ozilima' + 'ozilom' + 'ozila' + 'ozile' + 'ozilu' + 'ozili' (<-'ozil') + 'olovima' + 'olovi' + 'olova' + 'olove' (<-'olov') + 'olima' + 'olom' + 'ola' + 'olu' + 'ole' + 'oli' (<-'ol') + 'lemama' + 'lemima' + 'lemom' + 'lema' + 'leme' + 'lemi' + 'lemu' + 'lemo' (<-'lem') + 'ramama' + 'ramom' + 'rama' + 'rame' + 'rami' + 'ramu' + 'ramo' (<-'ram') + 'arama' + 'arima' + 'arom' + 'aru' + 'ara' + 'are' + 'ari' (<-'ar') + 'drama' + 'drima' + 'drom' + 'dru' + 'dra' + 'dre' + 'dri' (<-'dr') + 'erama' + 'erima' + 'erom' + 'eru' + 'era' + 'ere' + 'eri' (<-'er') + 'orama' + 'orima' + 'orom' + 'oru' + 'ora' + 'ore' + 'ori' (<-'or') + 'esima' + 'esom' + 'ese' + 'esa' + 'esu' (<-'es') + 'isima' + 'isom' + 'ise' + 'isa' + 'isu' (<-'is') + 'ta{sx}ama' + 'ta{sx}ima' + 'ta{sx}om' + 'ta{sx}em' + 'ta{sx}a' + 'ta{sx}u' + 'ta{sx}i' + 'ta{sx}e' (<-'ta{sx}') + 'na{sx}ama' + 'na{sx}ima' + 'na{sx}om' + 'na{sx}em' + 'na{sx}a' + 'na{sx}u' + 'na{sx}i' + 'na{sx}e' (<-'na{sx}') + 'ja{sx}ama' + 'ja{sx}ima' + 'ja{sx}om' + 'ja{sx}em' + 'ja{sx}a' + 'ja{sx}u' + 'ja{sx}i' + 'ja{sx}e' (<-'ja{sx}') + 'ka{sx}ama' + 'ka{sx}ima' + 'ka{sx}om' + 'ka{sx}em' + 'ka{sx}a' + 'ka{sx}u' + 'ka{sx}i' + 'ka{sx}e' (<-'ka{sx}') + 'ba{sx}ama' + 'ba{sx}ima' + 'ba{sx}om' + 'ba{sx}em' + 'ba{sx}a' + 'ba{sx}u' + 'ba{sx}i' + 'ba{sx}e' (<-'ba{sx}') + 'ga{sx}ama' + 'ga{sx}ima' 
+ 'ga{sx}om' + 'ga{sx}em' + 'ga{sx}a' + 'ga{sx}u' + 'ga{sx}i' + 'ga{sx}e' (<-'ga{sx}') + 'va{sx}ama' + 'va{sx}ima' + 'va{sx}om' + 'va{sx}em' + 'va{sx}a' + 'va{sx}u' + 'va{sx}i' + 'va{sx}e' (<-'va{sx}') + 'e{sx}ima' + 'e{sx}ama' + 'e{sx}om' + 'e{sx}em' + 'e{sx}i' + 'e{sx}e' + 'e{sx}a' + 'e{sx}u' (<-'e{sx}') + 'i{sx}ima' + 'i{sx}ama' + 'i{sx}om' + 'i{sx}em' + 'i{sx}i' + 'i{sx}e' + 'i{sx}a' + 'i{sx}u' (<-'i{sx}') + 'ikatima' + 'ikatom' + 'ikata' + 'ikate' + 'ikati' + 'ikatu' + 'ikato' (<-'ikat') + 'latima' + 'latom' + 'lata' + 'late' + 'lati' + 'latu' + 'lato' (<-'lat') + 'etama' + 'etima' + 'etom' + 'eta' + 'ete' + 'eti' + 'etu' + 'eto' (<-'et') + 'estima' + 'estama' + 'estom' + 'esta' + 'este' + 'esti' + 'estu' + 'esto' (<-'est') + 'istima' + 'istama' + 'istom' + 'ista' + 'iste' + 'isti' + 'istu' + 'isto' (<-'ist') + 'kstima' + 'kstama' + 'kstom' + 'ksta' + 'kste' + 'ksti' + 'kstu' + 'ksto' (<-'kst') + 'ostima' + 'ostama' + 'ostom' + 'osta' + 'oste' + 'osti' + 'ostu' + 'osto' (<-'ost') + 'i{sx}tima' + 'i{sx}tem' + 'i{sx}ta' + 'i{sx}te' + 'i{sx}tu' (<-'i{sx}t') + 'ovasmo' + 'ovaste' + 'ovahu' + 'ovati' + 'ova{sx}e' + 'ovali' + 'ovala' + 'ovale' + 'ovalo' + 'ovat' + 'ovah' + 'ovao' (<-'ova') + 'avijemu' + 'avijima' + 'avijega' + 'avijeg' + 'avijem' + 'avemu' + 'avega' + 'aveg' + 'avem' + 'avijim' + 'avijih' + 'avijoj' + 'avoga' + 'avome' + 'avomu' + 'avima' + 'avama' + 'aviji' + 'avije' + 'avija' + 'aviju' + 'avim' + 'avih' + 'avoj' + 'avom' + 'avog' + 'avi' + 'ava' + 'avu' + 'ave' + 'avo' (<-'av') + 'evijemu' + 'evijima' + 'evijega' + 'evijeg' + 'evijem' + 'evemu' + 'evega' + 'eveg' + 'evem' + 'evijim' + 'evijih' + 'evijoj' + 'evoga' + 'evome' + 'evomu' + 'evima' + 'evama' + 'eviji' + 'evije' + 'evija' + 'eviju' + 'evim' + 'evih' + 'evoj' + 'evom' + 'evog' + 'evi' + 'eva' + 'evu' + 'eve' + 'evo' (<-'ev') + 'ivijemu' + 'ivijima' + 'ivijega' + 'ivijeg' + 'ivijem' + 'ivemu' + 'ivega' + 'iveg' + 'ivem' + 'ivijim' + 'ivijih' + 'ivijoj' + 'ivoga' + 'ivome' + 'ivomu' + 
'ivima' + 'ivama' + 'iviji' + 'ivije' + 'ivija' + 'iviju' + 'ivim' + 'ivih' + 'ivoj' + 'ivom' + 'ivog' + 'ivi' + 'iva' + 'ivu' + 'ive' + 'ivo' (<-'iv') + 'ovijemu' + 'ovijima' + 'ovijega' + 'ovijeg' + 'ovijem' + 'ovemu' + 'ovega' + 'oveg' + 'ovijim' + 'ovijih' + 'ovijoj' + 'ovoga' + 'ovome' + 'ovomu' + 'ovima' + 'oviji' + 'ovije' + 'ovija' + 'oviju' + 'ovim' + 'ovih' + 'ovoj' + 'ovom' + 'ovog' + 'ovi' + 'ova' + 'ovu' + 'ove' + 'ovo' (<-'ov') + 'movima' + 'movom' + 'mova' + 'movu' + 'move' + 'movi' (<-'mov') + 'lovima' + 'lovom' + 'lova' + 'lovu' + 'love' + 'lovi' (<-'lov') + 'elijemu' + 'elijima' + 'elijega' + 'elijeg' + 'elijem' + 'elemu' + 'elega' + 'eleg' + 'elem' + 'elijim' + 'elijih' + 'elijoj' + 'eloga' + 'elome' + 'elomu' + 'elima' + 'eliji' + 'elije' + 'elija' + 'eliju' + 'elim' + 'elih' + 'eloj' + 'elom' + 'elog' + 'eli' + 'ela' + 'elu' + 'ele' + 'elo' (<-'el') + 'anjijemu' + 'anjijima' + 'anjijega' + 'anjijeg' + 'anjijem' + 'anjemu' + 'anjega' + 'anjeg' + 'anjem' + 'anjijim' + 'anjijih' + 'anjijoj' + 'anjoga' + 'anjome' + 'anjomu' + 'anjima' + 'anjiji' + 'anjije' + 'anjija' + 'anjiju' + 'anjim' + 'anjih' + 'anjoj' + 'anjom' + 'anjog' + 'anja' + 'anje' + 'anji' + 'anjo' + 'anju' (<-'anj') + 'enjijemu' + 'enjijima' + 'enjijega' + 'enjijeg' + 'enjijem' + 'enjemu' + 'enjega' + 'enjeg' + 'enjem' + 'enjijim' + 'enjijih' + 'enjijoj' + 'enjoga' + 'enjome' + 'enjomu' + 'enjima' + 'enjiji' + 'enjije' + 'enjija' + 'enjiju' + 'enjim' + 'enjih' + 'enjoj' + 'enjom' + 'enjog' + 'enja' + 'enje' + 'enji' + 'enjo' + 'enju' (<-'enj') + '{sx}njijemu' + '{sx}njijima' + '{sx}njijega' + '{sx}njijeg' + '{sx}njijem' + '{sx}njemu' + '{sx}njega' + '{sx}njeg' + '{sx}njem' + '{sx}njijim' + '{sx}njijih' + '{sx}njijoj' + '{sx}njoga' + '{sx}njome' + '{sx}njomu' + '{sx}njima' + '{sx}njiji' + '{sx}njije' + '{sx}njija' + '{sx}njiju' + '{sx}njim' + '{sx}njih' + '{sx}njoj' + '{sx}njom' + '{sx}njog' + '{sx}nja' + '{sx}nje' + '{sx}nji' + '{sx}njo' + '{sx}nju' (<-'{sx}nj') + 'anemu' + 'anega' + 
'aneg' + 'anem' (<-'an') + 'enemu' + 'enega' + 'eneg' + 'enem' (<-'en') + '{sx}nemu' + '{sx}nega' + '{sx}neg' + '{sx}nem' (<-'{sx}n') + '{cx}inama' + '{cx}inome' + '{cx}inomu' + '{cx}inoga' + '{cx}inima' + '{cx}inog' + '{cx}inom' + '{cx}inim' + '{cx}inih' + '{cx}inoj' + '{cx}ina' + '{cx}inu' + '{cx}ini' + '{cx}ino' + '{cx}ine' (<-'{cx}in') + 'ro{sx}iv{sx}i' + 'ro{sx}ismo' + 'ro{sx}iste' + 'ro{sx}i{sx}e' + 'ro{sx}imo' + 'ro{sx}ite' + 'ro{sx}iti' + 'ro{sx}ili' + 'ro{sx}ila' + 'ro{sx}ilo' + 'ro{sx}ile' + 'ro{sx}im' + 'ro{sx}i{sx}' + 'ro{sx}it' + 'ro{sx}ih' + 'ro{sx}io' (<-'ro{sx}i') + 'o{sx}ijemu' + 'o{sx}ijima' + 'o{sx}ijega' + 'o{sx}ijeg' + 'o{sx}ijem' + 'o{sx}emu' + 'o{sx}ega' + 'o{sx}eg' + 'o{sx}em' + 'o{sx}ijim' + 'o{sx}ijih' + 'o{sx}ijoj' + 'o{sx}oga' + 'o{sx}ome' + 'o{sx}omu' + 'o{sx}ima' + 'o{sx}iji' + 'o{sx}ije' + 'o{sx}ija' + 'o{sx}iju' + 'o{sx}im' + 'o{sx}ih' + 'o{sx}oj' + 'o{sx}om' + 'o{sx}og' + 'o{sx}i' + 'o{sx}a' + 'o{sx}u' + 'o{sx}e' (<-'o{sx}') + 'evitijima' + 'evitijega' + 'evitijemu' + 'evitijem' + 'evitega' + 'evitemu' + 'evitem' + 'evitijim' + 'evitijih' + 'evitijoj' + 'evitijeg' + 'evitiji' + 'evitije' + 'evitija' + 'evitoga' + 'evitome' + 'evitomu' + 'evitima' + 'evitog' + 'evitom' + 'evitim' + 'evitih' + 'evitoj' + 'eviti' + 'evite' + 'evito' + 'evita' + 'evitu' (<-'evit') + 'ovitijima' + 'ovitijega' + 'ovitijemu' + 'ovitijem' + 'ovitega' + 'ovitemu' + 'ovitem' + 'ovitijim' + 'ovitijih' + 'ovitijoj' + 'ovitijeg' + 'ovitiji' + 'ovitije' + 'ovitija' + 'ovitoga' + 'ovitome' + 'ovitomu' + 'ovitima' + 'ovitog' + 'ovitom' + 'ovitim' + 'ovitih' + 'ovitoj' + 'oviti' + 'ovite' + 'ovito' + 'ovita' + 'ovitu' (<-'ovit') + 'astijima' + 'astijega' + 'astijemu' + 'astijem' + 'astega' + 'astemu' + 'astem' + 'astijim' + 'astijih' + 'astijoj' + 'astijeg' + 'astiji' + 'astije' + 'astija' + 'astoga' + 'astome' + 'astomu' + 'astima' + 'astog' + 'astom' + 'astim' + 'astih' + 'astoj' + 'asti' + 'aste' + 'asto' + 'asta' + 'astu' (<-'ast') + 'kijemu' + 'kijima' + 
'kijega' + 'kijeg' + 'kijem' + 'kemu' + 'kega' + 'keg' + 'kem' + 'kijim' + 'kijih' + 'kijoj' + 'koga' + 'kome' + 'komu' + 'kima' + 'kiji' + 'kije' + 'kija' + 'kiju' + 'kim' + 'kih' + 'koj' + 'kom' + 'kog' + 'kov' + 'ki' + 'ka' + 'ku' + 'ke' + 'ko' (<-'k') + 'evaju{cy}i' + 'evasmo' + 'evaste' + 'evajmo' + 'evajte' + 'evaju' + 'evala' + 'evale' + 'evali' + 'evalo' + 'evamo' + 'evana' + 'evane' + 'evani' + 'evano' + 'evate' + 'evati' + 'eva{sx}e' + 'evahu' + 'evah' + 'evaj' + 'evam' + 'evan' + 'evao' + 'evat' + 'evav' + 'eva{sx}' (<-'eva') + 'avaju{cy}i' + 'avasmo' + 'avaste' + 'avajmo' + 'avajte' + 'avaju' + 'avala' + 'avale' + 'avali' + 'avalo' + 'avamo' + 'avana' + 'avane' + 'avani' + 'avano' + 'avate' + 'avati' + 'ava{sx}e' + 'avahu' + 'avah' + 'avaj' + 'avam' + 'avan' + 'avao' + 'avat' + 'avav' + 'ava{sx}' (<-'ava') + 'ivaju{cy}i' + 'ivasmo' + 'ivaste' + 'ivajmo' + 'ivajte' + 'ivaju' + 'ivala' + 'ivale' + 'ivali' + 'ivalo' + 'ivamo' + 'ivana' + 'ivane' + 'ivani' + 'ivano' + 'ivate' + 'ivati' + 'iva{sx}e' + 'ivahu' + 'ivah' + 'ivaj' + 'ivam' + 'ivan' + 'ivao' + 'ivat' + 'ivav' + 'iva{sx}' (<-'iva') + 'uvaju{cy}i' + 'uvasmo' + 'uvaste' + 'uvajmo' + 'uvajte' + 'uvaju' + 'uvala' + 'uvale' + 'uvali' + 'uvalo' + 'uvamo' + 'uvana' + 'uvane' + 'uvani' + 'uvano' + 'uvate' + 'uvati' + 'uva{sx}e' + 'uvahu' + 'uvah' + 'uvaj' + 'uvam' + 'uvan' + 'uvao' + 'uvat' + 'uvav' + 'uva{sx}' (<-'uva') + 'irujemo' + 'irujete' + 'iruju{cy}i' + 'iraju{cy}i' + 'irivat' + 'irujem' + 'iruje{sx}' + 'irujmo' + 'irujte' + 'irav{sx}i' + 'irasmo' + 'iraste' + 'irati' + 'iramo' + 'irate' + 'iraju' + 'ira{sx}e' + 'irahu' + 'irala' + 'iralo' + 'irali' + 'irale' + 'iruje' + 'iruju' + 'iruj' + 'iral' + 'iran' + 'iram' + 'ira{sx}' + 'irat' + 'irah' + 'irao' (<-'ir') + 'a{cx}ismo' + 'a{cx}iste' + 'a{cx}iti' + 'a{cx}imo' + 'a{cx}ite' + 'a{cx}i{sx}e' + 'a{cx}e{cy}i' + 'a{cx}ila' + 'a{cx}ilo' + 'a{cx}ili' + 'a{cx}ile' + 'a{cx}ena' + 'a{cx}eno' + 'a{cx}eni' + 'a{cx}ene' + 'a{cx}io' + 'a{cx}im' + 
'a{cx}i{sx}' + 'a{cx}it' + 'a{cx}ih' + 'a{cx}en' + 'a{cx}i' + 'a{cx}e' (<-'a{cx}') + 'a{cx}av{sx}i' + 'a{cx}asmo' + 'a{cx}aste' + 'a{cx}ahu' + 'a{cx}ati' + 'a{cx}amo' + 'a{cx}ate' + 'a{cx}a{sx}e' + 'a{cx}ala' + 'a{cx}alo' + 'a{cx}ali' + 'a{cx}ale' + 'a{cx}aju' + 'a{cx}ana' + 'a{cx}ano' + 'a{cx}ani' + 'a{cx}ane' + 'a{cx}ao' + 'a{cx}am' + 'a{cx}a{sx}' + 'a{cx}at' + 'a{cx}ah' + 'a{cx}an' (<-'a{cx}a') + 'nuv{sx}i' + 'nusmo' + 'nuste' + 'nu{cy}i' + 'nimo' + 'nite' + 'nemo' + 'nete' + 'nula' + 'nulo' + 'nule' + 'nuli' + 'nuto' + 'nuti' + 'nuta' + 'ne{sx}' + 'nuo' + 'nut' (<-'n') + 'niv{sx}i' + 'nismo' + 'niste' + 'niti' + 'nila' + 'nilo' + 'nile' + 'nili' + 'ni{sx}' + 'nio' (<-'ni') + 'aju{cy}i' + 'av{sx}i' + 'asmo' + 'ajmo' + 'ajte' + 'ajem' + 'aloj' + 'amo' + 'ate' + 'aje' + 'aju' + 'ati' + 'a{sx}e' + 'ahu' + 'ala' + 'ali' + 'ale' + 'alo' + 'ano' + 'at' + 'ah' + 'ao' + 'aj' + 'an' + 'am' + 'a{sx}' (<-'a') + 'uraju{cy}i' + 'urasmo' + 'uraste' + 'urajmo' + 'urajte' + 'uramo' + 'urate' + 'uraju' + 'urati' + 'ura{sx}e' + 'urahu' + 'urala' + 'urali' + 'urale' + 'uralo' + 'urana' + 'urano' + 'urani' + 'urane' + 'ural' + 'urat' + 'urah' + 'urao' + 'uraj' + 'uran' + 'uram' + 'ura{sx}' (<-'ur') + 'astajasmo' + 'astajaste' + 'astajahu' + 'astajati' + 'astajemo' + 'astajete' + 'astaja{sx}e' + 'astajali' + 'astaju{cy}i' + 'astajala' + 'astajalo' + 'astajale' + 'astajmo' + 'astajao' + 'astajem' + 'astaje{sx}' + 'astajat' + 'astajah' + 'astajte' + 'astaje' + 'astaju' (<-'astaj') + 'istajasmo' + 'istajaste' + 'istajahu' + 'istajati' + 'istajemo' + 'istajete' + 'istaja{sx}e' + 'istajali' + 'istaju{cy}i' + 'istajala' + 'istajalo' + 'istajale' + 'istajmo' + 'istajao' + 'istajem' + 'istaje{sx}' + 'istajat' + 'istajah' + 'istajte' + 'istaje' + 'istaju' (<-'istaj') + 'ostajasmo' + 'ostajaste' + 'ostajahu' + 'ostajati' + 'ostajemo' + 'ostajete' + 'ostaja{sx}e' + 'ostajali' + 'ostaju{cy}i' + 'ostajala' + 'ostajalo' + 'ostajale' + 'ostajmo' + 'ostajao' + 'ostajem' + 'ostaje{sx}' + 'ostajat' + 
'ostajah' + 'ostajte' + 'ostaje' + 'ostaju' (<-'ostaj') + 'alama' + 'alima' + 'alom' + 'alu' + 'al' (<-'a') + 'ajevima' + 'ajevi' + 'ajeva' + 'ajeve' + 'ajama' + 'ajima' + 'aja' + 'aji' (<-'aj') + 'astadosmo' + 'astadoste' + 'astado{sx}e' + 'astanemo' + 'astademo' + 'astanete' + 'astadete' + 'astanimo' + 'astanite' + 'astanila' + 'astav{sx}i' + 'astanem' + 'astadem' + 'astane{sx}' + 'astade{sx}' + 'astadoh' + 'astade' + 'astati' + 'astane' + 'astanu' + 'astadu' + 'astala' + 'astali' + 'astalo' + 'astale' + 'astat' + 'astao' (<-'asta') + 'istadosmo' + 'istadoste' + 'istado{sx}e' + 'istanemo' + 'istademo' + 'istanete' + 'istadete' + 'istanimo' + 'istanite' + 'istanila' + 'istav{sx}i' + 'istanem' + 'istadem' + 'istane{sx}' + 'istade{sx}' + 'istadoh' + 'istade' + 'istati' + 'istane' + 'istanu' + 'istadu' + 'istala' + 'istali' + 'istalo' + 'istale' + 'istat' + 'istao' (<-'ista') + 'ostadosmo' + 'ostadoste' + 'ostado{sx}e' + 'ostanemo' + 'ostademo' + 'ostanete' + 'ostadete' + 'ostanimo' + 'ostanite' + 'ostanila' + 'ostav{sx}i' + 'ostanem' + 'ostadem' + 'ostane{sx}' + 'ostade{sx}' + 'ostadoh' + 'ostade' + 'ostati' + 'ostane' + 'ostanu' + 'ostadu' + 'ostala' + 'ostali' + 'ostalo' + 'ostale' + 'ostat' + 'ostao' (<-'osta') + 'tasmo' + 'taste' + 'tajmo' + 'tajte' + 'tav{sx}i' + 'tati' + 'tamo' + 'tate' + 'taju' + 'tala' + 'talo' + 'tale' + 'tali' + 'tana' + 'tano' + 'tani' + 'tane' + 'tan' + 'taj' + 'tao' + 'tam' + 'ta{sx}' + 'tat' + 'tah' (<-'ta') + 'injasmo' + 'injaste' + 'injati' + 'injemo' + 'injete' + 'injali' + 'injala' + 'injalo' + 'injale' + 'inja{sx}e' + 'injahu' + 'injem' + 'inje{sx}' + 'injat' + 'injah' + 'injao' (<-'inj') + 'astemo' + 'astete' + 'astimo' + 'astite' + 'astu{cy}i' + 'aste{sx}' + 'asli' + 'asla' + 'aslo' + 'asle' (<-'as') + 'iv{sx}i' + 'ie{cy}i' + 'ismo' + 'imo' + 'ite' + 'iti' + 'ili' + 'ila' + 'ilo' + 'ile' + 'im' + 'i{sx}' + 'it' + 'ih' + 'io' (<-'i') + 'ijemo' + 'ijete' + 'ijem' + 'ije{sx}' + 'ijmo' + 'ijte' + 'iju' + 'ije' + 'ij' + 'ilu' (<-'i') 
+ 'lu{cx}ujete' + 'lu{cx}uju{cy}i' + 'lu{cx}ujemo' + 'lu{cx}ujem' + 'lu{cx}uje{sx}' + 'lu{cx}ismo' + 'lu{cx}iste' + 'lu{cx}ujmo' + 'lu{cx}ujte' + 'lu{cx}uje' + 'lu{cx}uju' + 'lu{cx}i{sx}e' + 'lu{cx}iti' + 'lu{cx}imo' + 'lu{cx}ite' + 'lu{cx}ila' + 'lu{cx}ilo' + 'lu{cx}ili' + 'lu{cx}ile' + 'lu{cx}ena' + 'lu{cx}eno' + 'lu{cx}eni' + 'lu{cx}ene' + 'lu{cx}uj' + 'lu{cx}io' + 'lu{cx}en' + 'lu{cx}im' + 'lu{cx}i{sx}' + 'lu{cx}it' + 'lu{cx}ih' + 'lu{cx}e' + 'lu{cx}i' (<-'lu{cx}') + 'jetismo' + 'jetiste' + 'jeti{sx}e' + 'jetimo' + 'jetite' + 'jetiti' + 'jetili' + 'jetila' + 'jetilo' + 'jetile' + 'jetim' + 'jeti{sx}' + 'jetit' + 'jetih' + 'jetio' (<-'jeti') + 'emo' + 'em' + 'e{sx}' + 'elama' + 'el' (<-'e') + 'ilama' + 'ilima' + 'ilom' + 'il' (<-'i') + 'atijega' + 'atijemu' + 'atijima' + 'atijeg' + 'atijem' + 'atega' + 'atemu' + 'ateg' + 'atem' + 'atijih' + 'atijim' + 'atima' + 'atoga' + 'atome' + 'atomu' + 'atiji' + 'atije' + 'atija' + 'atiju' + 'atoj' + 'atog' + 'atom' + 'atim' + 'atih' + 'ata' + 'atu' + 'ato' (<-'at') + 'etav{sx}i' + 'etu{cy}i' + 'etemo' + 'etimo' + 'etem' + 'ete{sx}' (<-'et') + 'lucujuci' + 'lucujemo' + 'lucujete' + 'lucujem' + 'lucujes' + 'lucujmo' + 'lucujte' + 'lucismo' + 'luciste' + 'luciti' + 'lucite' + 'lucise' + 'lucuje' + 'lucuju' + 'lucila' + 'lucile' + 'lucili' + 'lucilo' + 'lucena' + 'luceni' + 'lucene' + 'luceno' + 'lucimo' + 'lucim' + 'lucis' + 'lucih' + 'lucit' + 'lucio' + 'lucuj' + 'lucen' + 'luce' + 'luci' (R2 <-'luc') + 'snjijima' + 'snjijemu' + 'snjijega' + 'snjijim' + 'snjijih' + 'snjijeg' + 'snjijoj' + 'snjiji' + 'snjija' + 'snjije' + 'snjiju' + 'snjima' + 'snjemu' + 'snjomu' + 'snjome' + 'snjega' + 'snjoga' + 'snjih' + 'snjim' + 'snjem' + 'snjom' + 'snjeg' + 'snjog' + 'snjoj' + 'snja' + 'snje' + 'snji' + 'snjo' + 'snju' (R2 <-'snj') + 'osijima' + 'osijemu' + 'osijega' + 'snjijem' + 'osijih' + 'osijim' + 'osijem' + 'osijeg' + 'osijoj' + 'osima' + 'osemu' + 'osomu' + 'osome' + 'osega' + 'osoga' + 'osija' + 'osije' + 'osiji' + 'osiju' + 
'osih' + 'osim' + 'osem' + 'osom' + 'oseg' + 'osog' + 'osoj' + 'osa' + 'ose' + 'osi' + 'osu' (R2 <-'os') + 'acismo' + 'aciste' + 'acima' + 'acimo' + 'acome' + 'acomu' + 'acite' + 'aciti' + 'acise' + 'acila' + 'acile' + 'acili' + 'acilo' + 'acega' + 'acene' + 'aceci' + 'aceni' + 'acemu' + 'acena' + 'aceno' + 'acoga' + 'acoj' + 'acih' + 'acem' + 'acom' + 'acen' + 'acog' + 'acit' + 'acio' + 'aceg' + 'acim' + 'acuh' + 'acis' + 'ace' + 'aca' + 'aci' (R2 <-'ac') + 'ecome' + 'ecoga' + 'ecemu' + 'ecima' + 'ecega' + 'ecomu' + 'ecoj' + 'ecuh' + 'ecom' + 'ecog' + 'eceg' + 'ecih' + 'ecem' + 'ecim' + 'eca' + 'ece' (R2 <-'ec') + 'ucomu' + 'ucome' + 'ucima' + 'ucoga' + 'ucega' + 'ucemu' + 'ucih' + 'ucog' + 'uceg' + 'ucom' + 'ucem' + 'ucim' + 'ucuh' + 'ucoj' + 'uca' + 'uce' (R2 <-'uc') + 'rosismo' + 'rosivsi' + 'rosiste' + 'rositi' + 'rosili' + 'rosise' + 'rosite' + 'rosilo' + 'rosimo' + 'rosile' + 'rosila' + 'rosit' + 'rosis' + 'rosio' + 'rosim' + 'rosih' (R2 <-'rosi') + 'acavsi' + 'acaste' + 'acasmo' + 'acaju' + 'acane' + 'acate' + 'acali' + 'acani' + 'acati' + 'acale' + 'acahu' + 'acase' + 'acano' + 'acamo' + 'acalo' + 'acana' + 'acala' + 'acam' + 'acan' + 'acao' + 'acas' + 'acat' + 'acah' (R2 <-'aca') + 'jasima' + 'jasama' + 'jasem' + 'jasom' + 'jase' + 'jasi' + 'jasa' + 'jasu' (R2 <-'jas') + 'tasima' + 'tasama' + 'tasem' + 'tasom' + 'tase' + 'tasa' + 'tasu' + 'tasi' (R2 <-'tas') + 'gasima' + 'gasama' + 'gasem' + 'gasom' + 'gasi' + 'gasu' + 'gase' + 'gasa' (R2 <-'gas') + 'nasama' + 'nasima' + 'nasem' + 'nasom' + 'nasu' + 'nasi' + 'nase' + 'nasa' (R2 <-'nas') + 'kasama' + 'kasima' + 'kasom' + 'kasem' + 'kasi' + 'kasu' + 'kase' + 'kasa' (R2 <-'kas') + 'vasama' + 'vasima' + 'vasom' + 'vasem' + 'vasi' + 'vase' + 'vasa' + 'vasu' (R2 <-'vas') + 'basama' + 'basima' + 'basom' + 'basem' + 'basi' + 'base' + 'basu' + 'basa' (R2 <-'bas') + 'astuci' + 'astes' (R2 <-'as') + 'cinima' + 'cinome' + 'cinama' + 'cinomu' + 'cinoga' + 'cinom' + 'cinih' + 'cinim' + 'cinog' + 'cinoj' + 'cino' + 
'cini' + 'cinu' + 'cine' + 'cina' (R2 <-'cin') + 'astajase' + 'astajuci' + 'astajes' (R2 <-'astaj') + 'istajase' + 'istajuci' + 'istajes' (R2 <-'istaj') + 'ostajase' + 'ostajuci' + 'ostajes' (R2 <-'ostaj') + 'astadose' + 'astades' + 'astanes' + 'astavsi' (R2 <-'asta') + 'istadose' + 'istades' + 'istanes' + 'istavsi' (R2 <-'ista') + 'ostadose' + 'ostades' + 'ostanes' + 'ostavsi' (R2 <-'osta') + 'avajuci' + 'avase' + 'avas' (R2 <-'ava') + 'evajuci' + 'evase' + 'evas' (R2 <-'eva') + 'ivajuci' + 'ivase' + 'ivas' (R2 <-'iva') + 'uvajuci' + 'uvase' + 'uvas' (R2 <-'uva') + 'ovase' (R2 <-'ova') + 'jetise' + 'jetis' (R2 <-'jeti') + 'injase' + 'injes' (R2 <-'inj') + 'istem' (R2 <-'ist') + 'esama' + 'esem' + 'esi' (R2 <-'es') + 'etavsi' + 'etuci' + 'etes' (R2 <-'et') + 'isama' + 'isem' + 'isi' (R2 <-'is') + 'irajuci' + 'irujuci' + 'irujes' + 'iravsi' + 'irase' + 'iras' (R2 <-'ir') + 'urajuci' + 'urase' + 'uras' (R2 <-'ur') + 'ujuci' + 'ujes' (R2 <-'uj') + 'nivsi' + 'nis' (R2 <-'ni') + 'snega' + 'snemu' + 'snem' + 'sneg' (R2 <-'sn') + 'tavsi' + 'tas' (R2 <-'ta') + 'ajuci' + 'avsi' + 'ase' + 'as' (R2 <-'a') + 'ijes' + 'ivsi' + 'ieci' + 'is' (R2 <-'i') + 'es' (R2 <-'e') + 'nuvsi' + 'nuci' + 'nes' (R2 <-'n') + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'enom' + 'enoj' + 'enog' + 'enim' + 'enih' + 'anoj' + 'anog' + 'anim' + 'anih' + 'ost' + 'eno' + 'eni' + 'oga' + 'ima' + 'enu' + 'ena' + 'ama' + 'ano' + 'ani' + 'om' + 'og' + 'u' + 'o' + 'i' + 'e' + 'a' (<-'') + ) + ) +) + +define stem as ( + do cyr_to_lat + do prelude + do mark_regions + backwards ( + do Step_1 + do (Step_2 or Step_3) + ) +) diff --git a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/spanish.sbl index 9dee289cc..6638f5f70 100644 --- a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/spanish.sbl @@ -16,15 +16,15 @@ groupings ( v ) stringescapes {} -/* special characters (in ISO Latin I) */ - -stringdef a' hex 'E1' // 
a-acute -stringdef e' hex 'E9' // e-acute -stringdef i' hex 'ED' // i-acute -stringdef o' hex 'F3' // o-acute -stringdef u' hex 'FA' // u-acute -stringdef u" hex 'FC' // u-diaeresis -stringdef n~ hex 'F1' // n-tilde +/* special characters */ + +stringdef a' '{U+00E1}' // a-acute +stringdef e' '{U+00E9}' // e-acute +stringdef i' '{U+00ED}' // i-acute +stringdef o' '{U+00F3}' // o-acute +stringdef u' '{U+00FA}' // u-acute +stringdef u" '{U+00FC}' // u-diaeresis +stringdef n~ '{U+00F1}' // n-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' diff --git a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index db7a46201..000000000 --- a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,230 +0,0 @@ -routines ( - postlude mark_regions - RV R1 R2 - attached_pronoun - standard_suffix - y_verb_suffix - verb_suffix - residual_suffix -) - -externals ( stem ) - -integers ( pV p1 p2 ) - -groupings ( v ) - -stringescapes {} - -/* special characters (in MS-DOS Latin I) */ - -stringdef a' hex 'A0' // a-acute -stringdef e' hex '82' // e-acute -stringdef i' hex 'A1' // i-acute -stringdef o' hex 'A2' // o-acute -stringdef u' hex 'A3' // u-acute -stringdef u" hex '81' // u-diaeresis -stringdef n~ hex 'A4' // n-tilde - -define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' - -define mark_regions as ( - - $pV = limit - $p1 = limit - $p2 = limit // defaults - - do ( - ( v (non-v gopast v) or (v gopast non-v) ) - or - ( non-v (non-v gopast v) or (v next) ) - setmark pV - ) - do ( - gopast v gopast non-v setmark p1 - gopast v gopast non-v setmark p2 - ) -) - -define postlude as repeat ( - [substring] among( - '{a'}' (<- 'a') - '{e'}' (<- 'e') - '{i'}' (<- 'i') - '{o'}' (<- 'o') - '{u'}' (<- 'u') - // and possibly {u"}->u here, or in prelude - '' (next) - ) //or next -) - -backwardmode ( - - define RV as $pV <= cursor - define R1 as $p1 <= cursor - define R2 as $p2 <= cursor - - 
define attached_pronoun as ( - [substring] among( - 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' - 'las' 'les' 'los' 'nos' - ) - substring RV among( - 'i{e'}ndo' (] <- 'iendo') - '{a'}ndo' (] <- 'ando') - '{a'}r' (] <- 'ar') - '{e'}r' (] <- 'er') - '{i'}r' (] <- 'ir') - 'ando' - 'iendo' - 'ar' 'er' 'ir' - (delete) - 'yendo' ('u' delete) - ) - ) - - define standard_suffix as ( - [substring] among( - - 'anza' 'anzas' - 'ico' 'ica' 'icos' 'icas' - 'ismo' 'ismos' - 'able' 'ables' - 'ible' 'ibles' - 'ista' 'istas' - 'oso' 'osa' 'osos' 'osas' - 'amiento' 'amientos' - 'imiento' 'imientos' - ( - R2 delete - ) - 'adora' 'ador' 'aci{o'}n' - 'adoras' 'adores' 'aciones' - 'ante' 'antes' 'ancia' 'ancias'// Note 1 - ( - R2 delete - try ( ['ic'] R2 delete ) - ) - 'log{i'}a' - 'log{i'}as' - ( - R2 <- 'log' - ) - 'uci{o'}n' 'uciones' - ( - R2 <- 'u' - ) - 'encia' 'encias' - ( - R2 <- 'ente' - ) - 'amente' - ( - R1 delete - try ( - [substring] R2 delete among( - 'iv' (['at'] R2 delete) - 'os' - 'ic' - 'ad' - ) - ) - ) - 'mente' - ( - R2 delete - try ( - [substring] among( - 'ante' // Note 1 - 'able' - 'ible' (R2 delete) - ) - ) - ) - 'idad' - 'idades' - ( - R2 delete - try ( - [substring] among( - 'abil' - 'ic' - 'iv' (R2 delete) - ) - ) - ) - 'iva' 'ivo' - 'ivas' 'ivos' - ( - R2 delete - try ( - ['at'] R2 delete // but not a further ['ic'] R2 delete - ) - ) - ) - ) - - define y_verb_suffix as ( - setlimit tomark pV for ([substring]) among( - 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' - 'yas' 'yes' 'yais' 'yamos' - ('u' delete) - ) - ) - - define verb_suffix as ( - setlimit tomark pV for ([substring]) among( - - 'en' 'es' '{e'}is' 'emos' - (try ('u' test 'g') ] delete) - - 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' - 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' - 'ar{e'}' - 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' - 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' - 'er{e'}' - 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' - 
'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' - 'ir{e'}' - - 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' - 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' - 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' - 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' - 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' - 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' - 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' - 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' - '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' - (delete) - ) - ) - - define residual_suffix as ( - [substring] among( - 'os' - 'a' 'o' '{a'}' '{i'}' '{o'}' - ( RV delete ) - 'e' '{e'}' - ( RV delete try( ['u'] test 'g' RV delete ) ) - ) - ) -) - -define stem as ( - do mark_regions - backwards ( - do attached_pronoun - do ( standard_suffix or - y_verb_suffix or - verb_suffix - ) - do residual_suffix - ) - do postlude -) - -/* - Note 1: additions of 15 Jun 2005 -*/ diff --git a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/swedish.sbl index 03ce1e22f..2cbb88596 100644 --- a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl +++ b/contrib/snowball/algorithms/swedish.sbl @@ -13,11 +13,11 @@ groupings ( v s_ending ) stringescapes {} -/* special characters (in ISO Latin I) */ +/* special characters */ -stringdef a" hex 'E4' -stringdef ao hex 'E5' -stringdef o" hex 'F6' +stringdef a" '{U+00E4}' +stringdef ao '{U+00E5}' +stringdef o" '{U+00F6}' define v 'aeiouy{a"}{ao}{o"}' diff --git a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl deleted file mode 100644 index 1631f401a..000000000 --- a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl +++ /dev/null @@ -1,72 +0,0 @@ -routines ( - mark_regions - main_suffix - consonant_pair - other_suffix -) - -externals ( stem ) - -integers ( p1 x ) - -groupings ( v s_ending ) - -stringescapes {} - -/* special characters (in MS-DOS 
Latin I) */ - -stringdef a" hex '84' -stringdef ao hex '86' -stringdef o" hex '94' - -define v 'aeiouy{a"}{ao}{o"}' - -define s_ending 'bcdfghjklmnoprtvy' - -define mark_regions as ( - - $p1 = limit - test ( hop 3 setmark x ) - goto v gopast non-v setmark p1 - try ( $p1 < x $p1 = x ) -) - -backwardmode ( - - define main_suffix as ( - setlimit tomark p1 for ([substring]) - among( - - 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' - 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' - 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' - 'hetens' 'erns' 'at' 'andet' 'het' 'ast' - (delete) - 's' - (s_ending delete) - ) - ) - - define consonant_pair as setlimit tomark p1 for ( - among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') - and ([next] delete) - ) - - define other_suffix as setlimit tomark p1 for ( - [substring] among( - 'lig' 'ig' 'els' (delete) - 'l{o"}st' (<-'l{o"}s') - 'fullt' (<-'full') - ) - ) -) - -define stem as ( - - do mark_regions - backwards ( - do main_suffix - do consonant_pair - do other_suffix - ) -) diff --git a/contrib/snowball/algorithms/tamil.sbl b/contrib/snowball/algorithms/tamil.sbl new file mode 100644 index 000000000..9635777ab --- /dev/null +++ b/contrib/snowball/algorithms/tamil.sbl @@ -0,0 +1,405 @@ +/* +* Affix stripping stemming algorithm for Tamil +* By Damodharan Rajalingam +*/ + +stringescapes {} + +/* Aytham */ +stringdef aytham '{U+0B83}' + +/* Uyir - independent vowels */ +stringdef a '{U+0B85}' +stringdef aa '{U+0B86}' +stringdef i '{U+0B87}' +stringdef ii '{U+0B88}' +stringdef u '{U+0B89}' +stringdef uu '{U+0B8A}' +stringdef e '{U+0B8E}' +stringdef ee '{U+0B8F}' +stringdef ai '{U+0B90}' +stringdef o '{U+0B92}' +stringdef oo '{U+0B93}' +stringdef au '{U+0B94}' + +/* Consonants */ +stringdef ka '{U+0B95}' +stringdef nga '{U+0B99}' +stringdef ca '{U+0B9A}' +stringdef ja '{U+0B9C}' +stringdef nya '{U+0B9E}' +stringdef tta '{U+0B9F}' +stringdef nna '{U+0BA3}' +stringdef ta '{U+0BA4}' 
+stringdef tha '{U+0BA4}' +stringdef na '{U+0BA8}' +stringdef nnna '{U+0BA9}' +stringdef pa '{U+0BAA}' +stringdef ma '{U+0BAE}' +stringdef ya '{U+0BAF}' +stringdef ra '{U+0BB0}' +stringdef rra '{U+0BB1}' +stringdef la '{U+0BB2}' +stringdef lla '{U+0BB3}' +stringdef llla '{U+0BB4}' +stringdef zha '{U+0BB4}' +stringdef va '{U+0BB5}' + +/* Vatamozi - borrowed */ +stringdef sha '{U+0BB6}' +stringdef ssa '{U+0BB7}' +stringdef sa '{U+0BB8}' +stringdef ha '{U+0BB9}' + + +/* Dependent vowel signs (kombu etc.) */ +stringdef vs_aa '{U+0BBE}' +stringdef vs_i '{U+0BBF}' +stringdef vs_ii '{U+0BC0}' +stringdef vs_u '{U+0BC1}' +stringdef vs_uu '{U+0BC2}' +stringdef vs_e '{U+0BC6}' +stringdef vs_ee '{U+0BC7}' +stringdef vs_ai '{U+0BC8}' +stringdef vs_o '{U+0BCA}' +stringdef vs_oo '{U+0BCB}' +stringdef vs_au '{U+0BCC}' + +/* Pulli */ +stringdef pulli '{U+0BCD}' + +/* AU length markk */ +stringdef au_lmark '{U+0BD7}' + + +routines ( + remove_plural_suffix + remove_question_suffixes + remove_question_prefixes + remove_pronoun_prefixes + remove_command_suffixes + remove_um + remove_vetrumai_urupukal + fix_va_start + fix_ending + fix_endings + remove_tense_suffix + remove_tense_suffixes + remove_common_word_endings + has_min_length +) + +externals ( stem ) + +booleans ( + found_a_match + found_vetrumai_urupu +) + +define has_min_length as ( + $(len > 4) +) + +define fix_va_start as ( + (try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or + (try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or + (try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or + (try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' ) +) + +define fix_endings as ( + do repeat fix_ending +) + +define remove_question_prefixes as ( + [ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete + do fix_va_start +) + +// Gives signal t if an ending was fixed, signal f otherwise. 
+define fix_ending as ( + $(len > 3) + backwards ( + ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete ) + or + ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete ) + or + ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' ) + or + ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' ) + or +// ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' ) + ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' ) + or + ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' ) + or + ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ] ) + or + ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' ) + or + ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) + or + ( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' ) + or + ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) + or + ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' ) + or + ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete ) + or + ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete ) + or + ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' ) + or + ( [ '{nga}{pulli}' ] delete ) + or + ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete ) + ) +) + +define remove_pronoun_prefixes as ( + unset found_a_match + [ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete + (set found_a_match) + do fix_va_start +) + +define remove_plural_suffix as ( + unset found_a_match + backwards ( + ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' 
'{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or + ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or + ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or + ( [ '{ka}{lla}{pulli}' ] delete ) + (set found_a_match) + ) +) + +define remove_question_suffixes as ( + has_min_length + unset found_a_match + backwards ( + do ( + [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}' + (set found_a_match) + ) + ) + do fix_endings +) + +define remove_command_suffixes as ( + has_min_length + unset found_a_match + backwards ( + [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete + (set found_a_match) + ) +) + +define remove_um as ( + unset found_a_match + has_min_length + backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' + (set found_a_match) + ) + do fix_ending +) + +define remove_common_word_endings as ( + // These are not suffixes actually but are + // some words that are attached to other words + // but can be removed for stemming + unset found_a_match + has_min_length + backwards ( + test ( [ '{vs_u}{tta}{nnna}{pulli}' or + '{vs_i}{la}{pulli}{la}{vs_ai}' or + '{vs_i}{tta}{ma}{pulli}' or + '{vs_i}{nnna}{pulli}{rra}{vs_i}' or + '{vs_aa}{ka}{vs_i}' or + '{vs_aa}{ka}{vs_i}{ya}' or + '{vs_e}{nnna}{pulli}{rra}{vs_u}' or + '{vs_u}{lla}{pulli}{lla}' or + '{vs_u}{tta}{vs_ai}{ya}' or + '{vs_u}{tta}{vs_ai}' or + '{vs_e}{nnna}{vs_u}{ma}{pulli}' or + ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or + '{vs_e}{nnna}' or + '{vs_aa}{ka}{vs_i}' ] <- '{pulli}' + (set found_a_match) + ) + or + test ( [ among('{pa}{tta}{vs_u}' + '{pa}{tta}{pulli}{tta}' + '{pa}{tta}{pulli}{tta}{vs_u}' + '{pa}{tta}{pulli}{tta}{ta}{vs_u}' + '{pa}{tta}{pulli}{tta}{nna}' + '{ka}{vs_u}{ra}{vs_i}{ya}' + '{pa}{rra}{pulli}{rra}{vs_i}' + '{va}{vs_i}{tta}{vs_u}' + '{va}{vs_i}{tta}{pulli}{tta}{vs_u}' + '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' + '{pa}{tta}{vs_i}' + '{ta}{vs_aa}{nnna}' + '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}') + ] delete + (set 
found_a_match) + ) + ) + do fix_endings +) + +define remove_vetrumai_urupukal as ( + unset found_a_match + unset found_vetrumai_urupu + has_min_length + backwards ( + ( + test ( ['{nnna}{vs_ai}'] delete ) + or + test ([ ( '{vs_i}{nnna}{vs_ai}' or + '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or + ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))) + ] <- '{pulli}' + ) + or + test ( [ + '{vs_o}{tta}{vs_u}' or + '{vs_oo}{tta}{vs_u}' or + '{vs_i}{la}{pulli}' or + '{vs_i}{rra}{pulli}' or + ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or + '{vs_i}{nnna}{pulli}{rra}{vs_u}' or + '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or + '{va}{vs_i}{tta}' or + ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or + '{vs_aa}{la}{pulli}' or + '{vs_u}{tta}{vs_ai}' or + '{vs_aa}{ma}{la}{pulli}' or + ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or + '{vs_u}{lla}{pulli}' + ] <- '{pulli}' + ) + or + test ( [ + '{ka}{nna}{pulli}' or + '{ma}{vs_u}{nnna}{pulli}' or + '{ma}{vs_ee}{la}{pulli}' or + '{ma}{vs_ee}{rra}{pulli}' or + '{ka}{vs_ii}{llla}{pulli}' or + '{pa}{vs_i}{nnna}{pulli}' or + ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) + ] delete + ) + or + test ([ '{vs_ii}' ] <- '{vs_i}') + ) + (set found_a_match) + (set found_vetrumai_urupu) + do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) + ) + do fix_endings +) + +define remove_tense_suffixes as ( + set found_a_match + repeat ( found_a_match (do remove_tense_suffix) ) +) + +define remove_tense_suffix as ( + unset found_a_match + has_min_length + backwards ( + do ( + test ( [among( + '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' + '{pa}{tta}{vs_u}' + )] delete + (set found_a_match) + ) + or + test ( [ + '{ma}{vs_aa}{ra}{pulli}' or + '{ma}{vs_i}{nnna}{pulli}' or + '{nnna}{nnna}{pulli}' or + '{nnna}{vs_aa}{nnna}{pulli}' or + '{nnna}{vs_aa}{lla}{pulli}' or + 
'{nnna}{vs_aa}{ra}{pulli}' or + ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or + '{nnna}{lla}{pulli}' or + '{va}{lla}{pulli}' or + '{nnna}{ra}{pulli}' or + '{va}{ra}{pulli}' or + '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or + '{pa}{nnna}{pulli}' or + '{pa}{lla}{pulli}' or + '{pa}{ra}{pulli}' or + ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or + '{vs_i}{rra}{pulli}{rra}{vs_u}' or + '{pa}{ma}{pulli}' or + '{nnna}{ma}{pulli}' or + '{ta}{vs_u}{ma}{pulli}' or + '{rra}{vs_u}{ma}{pulli}' or + '{ka}{vs_u}{ma}{pulli}' or + '{nnna}{vs_e}{nnna}{pulli}' or + '{nnna}{vs_ai}' or + '{va}{vs_ai}' + ] delete + (set found_a_match) + ) + or + test ( [ + ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or + '{vs_aa}{lla}{pulli}' or + '{vs_aa}{ra}{pulli}' or + '{vs_ee}{nnna}{pulli}' or + '{vs_aa}' or + '{vs_aa}{ma}{pulli}' or + '{vs_e}{ma}{pulli}' or + '{vs_ee}{ma}{pulli}' or + '{vs_oo}{ma}{pulli}' or + '{ka}{vs_u}{ma}{pulli}' or + '{ta}{vs_u}{ma}{pulli}' or + '{tta}{vs_u}{ma}{pulli}' or + '{rra}{vs_u}{ma}{pulli}' or + '{vs_aa}{ya}{pulli}' or + '{nnna}{vs_e}{nnna}{pulli}' or + '{nnna}{vs_i}{ra}{pulli}' or + '{vs_ii}{ra}{pulli}' or + '{vs_ii}{ya}{ra}{pulli}' + ] <- '{pulli}' + (set found_a_match) + ) + or + test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete + (set found_a_match) + ) + ) + do ([among( + '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' + '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' + '{ka}{vs_i}{nnna}{pulli}{rra}' + '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' + '{ka}{vs_i}{rra}' + '{ka}{vs_i}{rra}{pulli}' + )] delete + (set found_a_match) + ) + ) + do fix_endings +) + +define stem as ( + unset found_vetrumai_urupu + do fix_ending + has_min_length + do remove_question_prefixes + do remove_pronoun_prefixes + do remove_question_suffixes + do remove_um + do remove_common_word_endings + do remove_vetrumai_urupukal + do 
remove_plural_suffix + do remove_command_suffixes + do remove_tense_suffixes +) diff --git a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl b/contrib/snowball/algorithms/turkish.sbl index 16c02a51f..eadd61d02 100644 --- a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl +++ b/contrib/snowball/algorithms/turkish.sbl @@ -2,7 +2,7 @@ * author: Evren (Kapusuz) Çilden * email: evren.kapusuz at gmail.com * version: 1.0 (15.01.2007) - + * stems nominal verb suffixes * stems nominal inflections @@ -10,13 +10,13 @@ * (y,n,s,U) context check * vowel harmony check * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) - + * The stemming algorithm is based on the paper "An Affix Stripping * Morphological Analyzer for Turkish" by Gülşen Eryiğit and * Eşref Adalı (Proceedings of the IAESTED International Conference * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, * Innsbruck, Austria - + * Turkish is an agglutinative language and has a very rich morphological * structure. In Turkish, you can form many different words from a single stem * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means @@ -59,14 +59,14 @@ routines ( mark_yken // nominal verb suffix mark_ymUs_ // nominal verb suffix mark_ysA // nominal verb suffix - + mark_suffix_with_optional_y_consonant mark_suffix_with_optional_U_vowel mark_suffix_with_optional_n_consonant mark_suffix_with_optional_s_consonant - + more_than_one_syllable_word - + post_process_last_consonants postlude @@ -75,34 +75,32 @@ routines ( stem_suffix_chain_before_ki ) -/* Special characters in Unicode Latin-1 and Latin Extended-A */ -stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA -stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE -stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT -stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS -stringdef s. 
hex '015F' // LATIN SMALL LETTER S WITH CEDILLA -stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS - -stringescapes { } +stringescapes { } -integers ( strlen ) // length of a string +/* Special characters in Unicode Latin-1 and Latin Extended-A */ +stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA +stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE +stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT +stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS +stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA +stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS booleans ( continue_stemming_noun_suffixes ) -groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) +groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) -define vowel 'ae{i'}io{o"}u{u"}' +define vowel 'ae{i'}io{o"}u{u"}' define U '{i'}iu{u"}' // the vowel grouping definitions below are used for checking vowel harmony -define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' -define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' -define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' -define vowel4 'ei' // vowels that can end with suffixes containing 'i' -define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' -define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' +define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' +define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' +define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' +define vowel4 'ei' // vowels that can end with suffixes containing 'i' +define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' +define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' -externals ( stem ) +externals ( stem ) backwardmode ( // checks vowel harmony for 
possible suffixes, @@ -124,165 +122,165 @@ backwardmode ( ) ) ) - + // if the last consonant before suffix is vowel and n then advance and delete // if the last consonant before suffix is non vowel and n do nothing // if the last consonant before suffix is not n then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_n_consonant as ( - ((test 'n') next (test vowel)) + ('n' (test vowel)) or - ((not(test 'n')) test(next (test vowel))) + ((not(test 'n')) test(next vowel)) ) - + // if the last consonant before suffix is vowel and s then advance and delete // if the last consonant before suffix is non vowel and s do nothing // if the last consonant before suffix is not s then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_s_consonant as ( - ((test 's') next (test vowel)) + ('s' (test vowel)) or - ((not(test 's')) test(next (test vowel))) + ((not(test 's')) test(next vowel)) ) - + // if the last consonant before suffix is vowel and y then advance and delete // if the last consonant before suffix is non vowel and y do nothing // if the last consonant before suffix is not y then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_y_consonant as ( - ((test 'y') next (test vowel)) + ('y' (test vowel)) or - ((not(test 'y')) test(next (test vowel))) + ((not(test 'y')) test(next vowel)) ) - + define mark_suffix_with_optional_U_vowel as ( - ((test U) next (test non-vowel)) + (U (test non-vowel)) or - ((not(test U)) test(next (test non-vowel))) + ((not(test U)) test(next non-vowel)) ) - + define mark_possessives as ( among ('m{i'}z' 'miz' 'muz' 'm{u"}z' 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') (mark_suffix_with_optional_U_vowel) ) - + define mark_sU as ( check_vowel_harmony U (mark_suffix_with_optional_s_consonant) ) - + define mark_lArI as ( among ('leri' 'lar{i'}') ) - + define mark_yU as ( check_vowel_harmony U - 
(mark_suffix_with_optional_y_consonant) + (mark_suffix_with_optional_y_consonant) ) - + define mark_nU as ( check_vowel_harmony - among ('n{i'}' 'ni' 'nu' 'n{u"}') + among ('n{i'}' 'ni' 'nu' 'n{u"}') ) - + define mark_nUn as ( check_vowel_harmony - among ('{i'}n' 'in' 'un' '{u"}n') + among ('{i'}n' 'in' 'un' '{u"}n') (mark_suffix_with_optional_n_consonant) ) - + define mark_yA as ( check_vowel_harmony among('a' 'e') (mark_suffix_with_optional_y_consonant) ) - + define mark_nA as ( check_vowel_harmony among('na' 'ne') ) - + define mark_DA as ( check_vowel_harmony among('da' 'de' 'ta' 'te') ) - + define mark_ndA as ( check_vowel_harmony among('nda' 'nde') ) - + define mark_DAn as ( check_vowel_harmony among('dan' 'den' 'tan' 'ten') ) - + define mark_ndAn as ( check_vowel_harmony among('ndan' 'nden') ) - + define mark_ylA as ( check_vowel_harmony among('la' 'le') (mark_suffix_with_optional_y_consonant) ) - + define mark_ki as ( 'ki' ) - + define mark_ncA as ( check_vowel_harmony - among('ca' 'ce') + among('ca' 'ce') (mark_suffix_with_optional_n_consonant) ) - + define mark_yUm as ( check_vowel_harmony among ('{i'}m' 'im' 'um' '{u"}m') (mark_suffix_with_optional_y_consonant) ) - + define mark_sUn as ( check_vowel_harmony among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) ) - + define mark_yUz as ( check_vowel_harmony among ('{i'}z' 'iz' 'uz' '{u"}z') (mark_suffix_with_optional_y_consonant) ) - + define mark_sUnUz as ( among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') ) - + define mark_lAr as ( check_vowel_harmony among ('ler' 'lar') ) - + define mark_nUz as ( check_vowel_harmony among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') ) - + define mark_DUr as ( check_vowel_harmony among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') ) - + define mark_cAsInA as ( among ('cas{i'}na' 'cesine') ) - + define mark_yDU as ( check_vowel_harmony among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' @@ -292,24 +290,24 @@ backwardmode ( (mark_suffix_with_optional_y_consonant) ) - 
// does not fully obey vowel harmony + // does not fully obey vowel harmony define mark_ysA as ( among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') (mark_suffix_with_optional_y_consonant) ) - + define mark_ymUs_ as ( check_vowel_harmony - among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}') + among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}') (mark_suffix_with_optional_y_consonant) ) - + define mark_yken as ( 'ken' (mark_suffix_with_optional_y_consonant) ) - + define stem_nominal_verb_suffixes as ( - [ + [ set continue_stemming_noun_suffixes (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) or @@ -327,7 +325,7 @@ backwardmode ( (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) ]delete ) - + // stems noun suffix chains ending with -ki define stem_suffix_chain_before_ki as ( [ @@ -337,7 +335,7 @@ backwardmode ( (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) - + )) or (mark_nUn] delete try([ @@ -348,7 +346,7 @@ backwardmode ( (stem_suffix_chain_before_ki) )) or - (mark_ndA ( + (mark_ndA ( (mark_lArI] delete) or ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) @@ -357,7 +355,7 @@ backwardmode ( )) ) ) - + define stem_noun_suffixes as ( ([mark_lAr] delete try(stem_suffix_chain_before_ki)) or @@ -373,24 +371,24 @@ backwardmode ( or ([(mark_ndA or mark_nA) ( - (mark_lArI] delete) - or - (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) - or - (stem_suffix_chain_before_ki) - ) + (mark_lArI] delete) + or + (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + ) ) or ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) or ( [mark_DAn] delete try ([ ( - (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) - or - (mark_lAr] delete 
try(stem_suffix_chain_before_ki)) - or - (stem_suffix_chain_before_ki) - )) + (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) ) or ([mark_nUn or mark_ylA] delete @@ -404,18 +402,18 @@ backwardmode ( ) or ([mark_lArI] delete) - or + or (stem_suffix_chain_before_ki) or ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) ) - - define post_process_last_consonants as ( + + define post_process_last_consonants as ( [substring] among ( 'b' (<- 'p') - 'c' (<- '{c.}') + 'c' (<- '{c,}') 'd' (<- 't') '{g~}' (<- 'k') ) @@ -424,7 +422,7 @@ backwardmode ( // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed // like in 'kedim' -> 'ked' // Turkish words don't usually end with 'd' or 'g' - // some very well known words are ignored (like 'ad' 'soyad' + // some very well known words are ignored (like 'ad' 'soyad' // appends U to stems ending with d or g, decides which vowel to add // based on the last vowel in the stem define append_U_to_stems_ending_with_d_or_g as ( @@ -437,7 +435,10 @@ backwardmode ( or (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') ) - + + define is_reserved_word as ( + 'ad' try 'soy' atlimit + ) ) // Tests if there are more than one syllables @@ -446,18 +447,12 @@ define more_than_one_syllable_word as ( test (atleast 2 (gopast vowel)) ) -define is_reserved_word as ( - test(gopast 'ad' ($strlen = 2) ($strlen == limit)) - or - test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) -) - define postlude as ( - not(is_reserved_word) backwards ( + not(is_reserved_word) do append_U_to_stems_ending_with_d_or_g do post_process_last_consonants - + ) ) @@ -469,9 +464,7 @@ define stem as ( continue_stemming_noun_suffixes do 
stem_noun_suffixes ) - + postlude ) ) - - diff --git a/contrib/snowball/charsets/ISO-8859-2.sbl b/contrib/snowball/charsets/ISO-8859-2.sbl new file mode 100644 index 000000000..5829ea8a9 --- /dev/null +++ b/contrib/snowball/charsets/ISO-8859-2.sbl @@ -0,0 +1,98 @@ +// ISO-8859-2 character mappings. + +stringdef U+00A0 hex 'A0' +stringdef U+0104 hex 'A1' +stringdef U+02D8 hex 'A2' +stringdef U+0141 hex 'A3' +stringdef U+00A4 hex 'A4' +stringdef U+013D hex 'A5' +stringdef U+015A hex 'A6' +stringdef U+00A7 hex 'A7' +stringdef U+00A8 hex 'A8' +stringdef U+0160 hex 'A9' +stringdef U+015E hex 'AA' +stringdef U+0164 hex 'AB' +stringdef U+0179 hex 'AC' +stringdef U+00AD hex 'AD' +stringdef U+017D hex 'AE' +stringdef U+017B hex 'AF' +stringdef U+00B0 hex 'B0' +stringdef U+0105 hex 'B1' +stringdef U+02DB hex 'B2' +stringdef U+0142 hex 'B3' +stringdef U+00B4 hex 'B4' +stringdef U+013E hex 'B5' +stringdef U+015B hex 'B6' +stringdef U+02C7 hex 'B7' +stringdef U+00B8 hex 'B8' +stringdef U+0161 hex 'B9' +stringdef U+015F hex 'BA' +stringdef U+0165 hex 'BB' +stringdef U+017A hex 'BC' +stringdef U+02DD hex 'BD' +stringdef U+017E hex 'BE' +stringdef U+017C hex 'BF' +stringdef U+0154 hex 'C0' +stringdef U+00C1 hex 'C1' +stringdef U+00C2 hex 'C2' +stringdef U+0102 hex 'C3' +stringdef U+00C4 hex 'C4' +stringdef U+0139 hex 'C5' +stringdef U+0106 hex 'C6' +stringdef U+00C7 hex 'C7' +stringdef U+010C hex 'C8' +stringdef U+00C9 hex 'C9' +stringdef U+0118 hex 'CA' +stringdef U+00CB hex 'CB' +stringdef U+011A hex 'CC' +stringdef U+00CD hex 'CD' +stringdef U+00CE hex 'CE' +stringdef U+010E hex 'CF' +stringdef U+0110 hex 'D0' +stringdef U+0143 hex 'D1' +stringdef U+0147 hex 'D2' +stringdef U+00D3 hex 'D3' +stringdef U+00D4 hex 'D4' +stringdef U+0150 hex 'D5' +stringdef U+00D6 hex 'D6' +stringdef U+00D7 hex 'D7' +stringdef U+0158 hex 'D8' +stringdef U+016E hex 'D9' +stringdef U+00DA hex 'DA' +stringdef U+0170 hex 'DB' +stringdef U+00DC hex 'DC' +stringdef U+00DD hex 'DD' +stringdef U+0162 hex 
'DE' +stringdef U+00DF hex 'DF' +stringdef U+0155 hex 'E0' +stringdef U+00E1 hex 'E1' +stringdef U+00E2 hex 'E2' +stringdef U+0103 hex 'E3' +stringdef U+00E4 hex 'E4' +stringdef U+013A hex 'E5' +stringdef U+0107 hex 'E6' +stringdef U+00E7 hex 'E7' +stringdef U+010D hex 'E8' +stringdef U+00E9 hex 'E9' +stringdef U+0119 hex 'EA' +stringdef U+00EB hex 'EB' +stringdef U+011B hex 'EC' +stringdef U+00ED hex 'ED' +stringdef U+00EE hex 'EE' +stringdef U+010F hex 'EF' +stringdef U+0111 hex 'F0' +stringdef U+0144 hex 'F1' +stringdef U+0148 hex 'F2' +stringdef U+00F3 hex 'F3' +stringdef U+00F4 hex 'F4' +stringdef U+0151 hex 'F5' +stringdef U+00F6 hex 'F6' +stringdef U+00F7 hex 'F7' +stringdef U+0159 hex 'F8' +stringdef U+016F hex 'F9' +stringdef U+00FA hex 'FA' +stringdef U+0171 hex 'FB' +stringdef U+00FC hex 'FC' +stringdef U+00FD hex 'FD' +stringdef U+0163 hex 'FE' +stringdef U+02D9 hex 'FF' diff --git a/contrib/snowball/charsets/KOI8-R.sbl b/contrib/snowball/charsets/KOI8-R.sbl new file mode 100644 index 000000000..46854e825 --- /dev/null +++ b/contrib/snowball/charsets/KOI8-R.sbl @@ -0,0 +1,74 @@ +// KOI8-R character mappings. 
+ +stringdef U+00A0 hex '9A' +stringdef U+00A9 hex 'BF' +stringdef U+00B0 hex '9C' +stringdef U+00B2 hex '9D' +stringdef U+00B7 hex '9E' +stringdef U+00F7 hex '9F' +stringdef U+0401 hex 'B3' +stringdef U+0410 hex 'E1' +stringdef U+0411 hex 'E2' +stringdef U+0412 hex 'F7' +stringdef U+0413 hex 'E7' +stringdef U+0414 hex 'E4' +stringdef U+0415 hex 'E5' +stringdef U+0416 hex 'F6' +stringdef U+0417 hex 'FA' +stringdef U+0418 hex 'E9' +stringdef U+0419 hex 'EA' +stringdef U+041A hex 'EB' +stringdef U+041B hex 'EC' +stringdef U+041C hex 'ED' +stringdef U+041D hex 'EE' +stringdef U+041E hex 'EF' +stringdef U+041F hex 'F0' +stringdef U+0420 hex 'F2' +stringdef U+0421 hex 'F3' +stringdef U+0422 hex 'F4' +stringdef U+0423 hex 'F5' +stringdef U+0424 hex 'E6' +stringdef U+0425 hex 'E8' +stringdef U+0426 hex 'E3' +stringdef U+0427 hex 'FE' +stringdef U+0428 hex 'FB' +stringdef U+0429 hex 'FD' +stringdef U+042A hex 'FF' +stringdef U+042B hex 'F9' +stringdef U+042C hex 'F8' +stringdef U+042D hex 'FC' +stringdef U+042E hex 'E0' +stringdef U+042F hex 'F1' +stringdef U+0430 hex 'C1' +stringdef U+0431 hex 'C2' +stringdef U+0432 hex 'D7' +stringdef U+0433 hex 'C7' +stringdef U+0434 hex 'C4' +stringdef U+0435 hex 'C5' +stringdef U+0436 hex 'D6' +stringdef U+0437 hex 'DA' +stringdef U+0438 hex 'C9' +stringdef U+0439 hex 'CA' +stringdef U+043A hex 'CB' +stringdef U+043B hex 'CC' +stringdef U+043C hex 'CD' +stringdef U+043D hex 'CE' +stringdef U+043E hex 'CF' +stringdef U+043F hex 'D0' +stringdef U+0440 hex 'D2' +stringdef U+0441 hex 'D3' +stringdef U+0442 hex 'D4' +stringdef U+0443 hex 'D5' +stringdef U+0444 hex 'C6' +stringdef U+0445 hex 'C8' +stringdef U+0446 hex 'C3' +stringdef U+0447 hex 'DE' +stringdef U+0448 hex 'DB' +stringdef U+0449 hex 'DD' +stringdef U+044A hex 'DF' +stringdef U+044B hex 'D9' +stringdef U+044C hex 'D8' +stringdef U+044D hex 'DC' +stringdef U+044E hex 'C0' +stringdef U+044F hex 'D1' +stringdef U+0451 hex 'A3' diff --git a/contrib/snowball/charsets/cp850.sbl 
b/contrib/snowball/charsets/cp850.sbl new file mode 100644 index 000000000..b78022037 --- /dev/null +++ b/contrib/snowball/charsets/cp850.sbl @@ -0,0 +1,130 @@ +// Code page 850 (MSDOS Latin 1) character mappings. + +stringdef U+00A0 hex 'FF' +stringdef U+00A1 hex 'AD' +stringdef U+00A2 hex 'BD' +stringdef U+00A3 hex '9C' +stringdef U+00A4 hex 'CF' +stringdef U+00A5 hex 'BE' +stringdef U+00A6 hex 'DD' +stringdef U+00A7 hex 'F5' +stringdef U+00A8 hex 'F9' +stringdef U+00A9 hex 'B8' +stringdef U+00AA hex 'A6' +stringdef U+00AB hex 'AE' +stringdef U+00AC hex 'AA' +stringdef U+00AD hex 'F0' +stringdef U+00AE hex 'A9' +stringdef U+00AF hex 'EE' +stringdef U+00B0 hex 'F8' +stringdef U+00B1 hex 'F1' +stringdef U+00B2 hex 'FD' +stringdef U+00B3 hex 'FC' +stringdef U+00B4 hex 'EF' +stringdef U+00B5 hex 'E6' +stringdef U+00B6 hex 'F4' +stringdef U+00B7 hex 'FA' +stringdef U+00B8 hex 'F7' +stringdef U+00B9 hex 'FB' +stringdef U+00BA hex 'A7' +stringdef U+00BB hex 'AF' +stringdef U+00BC hex 'AC' +stringdef U+00BD hex 'AB' +stringdef U+00BE hex 'F3' +stringdef U+00BF hex 'A8' +stringdef U+00C0 hex 'B7' +stringdef U+00C1 hex 'B5' +stringdef U+00C2 hex 'B6' +stringdef U+00C3 hex 'C7' +stringdef U+00C4 hex '8E' +stringdef U+00C5 hex '8F' +stringdef U+00C6 hex '92' +stringdef U+00C7 hex '80' +stringdef U+00C8 hex 'D4' +stringdef U+00C9 hex '90' +stringdef U+00CA hex 'D2' +stringdef U+00CB hex 'D3' +stringdef U+00CC hex 'DE' +stringdef U+00CD hex 'D6' +stringdef U+00CE hex 'D7' +stringdef U+00CF hex 'D8' +stringdef U+00D0 hex 'D1' +stringdef U+00D1 hex 'A5' +stringdef U+00D2 hex 'E3' +stringdef U+00D3 hex 'E0' +stringdef U+00D4 hex 'E2' +stringdef U+00D5 hex 'E5' +stringdef U+00D6 hex '99' +stringdef U+00D7 hex '9E' +stringdef U+00D8 hex '9D' +stringdef U+00D9 hex 'EB' +stringdef U+00DA hex 'E9' +stringdef U+00DB hex 'EA' +stringdef U+00DC hex '9A' +stringdef U+00DD hex 'ED' +stringdef U+00DE hex 'E8' +stringdef U+00DF hex 'E1' +stringdef U+00E0 hex '85' +stringdef U+00E1 hex 'A0' 
+stringdef U+00E2 hex '83' +stringdef U+00E3 hex 'C6' +stringdef U+00E4 hex '84' +stringdef U+00E5 hex '86' +stringdef U+00E6 hex '91' +stringdef U+00E7 hex '87' +stringdef U+00E8 hex '8A' +stringdef U+00E9 hex '82' +stringdef U+00EA hex '88' +stringdef U+00EB hex '89' +stringdef U+00EC hex '8D' +stringdef U+00ED hex 'A1' +stringdef U+00EE hex '8C' +stringdef U+00EF hex '8B' +stringdef U+00F0 hex 'D0' +stringdef U+00F1 hex 'A4' +stringdef U+00F2 hex '95' +stringdef U+00F3 hex 'A2' +stringdef U+00F4 hex '93' +stringdef U+00F5 hex 'E4' +stringdef U+00F6 hex '94' +stringdef U+00F7 hex 'F6' +stringdef U+00F8 hex '9B' +stringdef U+00F9 hex '97' +stringdef U+00FA hex 'A3' +stringdef U+00FB hex '96' +stringdef U+00FC hex '81' +stringdef U+00FD hex 'EC' +stringdef U+00FE hex 'E7' +stringdef U+00FF hex '98' +stringdef U+0131 hex 'D5' +stringdef U+0192 hex '9F' +stringdef U+2017 hex 'F2' +stringdef U+2500 hex 'C4' +stringdef U+2502 hex 'B3' +stringdef U+250C hex 'DA' +stringdef U+2510 hex 'BF' +stringdef U+2514 hex 'C0' +stringdef U+2518 hex 'D9' +stringdef U+251C hex 'C3' +stringdef U+2524 hex 'B4' +stringdef U+252C hex 'C2' +stringdef U+2534 hex 'C1' +stringdef U+253C hex 'C5' +stringdef U+2550 hex 'CD' +stringdef U+2551 hex 'BA' +stringdef U+2554 hex 'C9' +stringdef U+2557 hex 'BB' +stringdef U+255A hex 'C8' +stringdef U+255D hex 'BC' +stringdef U+2560 hex 'CC' +stringdef U+2563 hex 'B9' +stringdef U+2566 hex 'CB' +stringdef U+2569 hex 'CA' +stringdef U+256C hex 'CE' +stringdef U+2580 hex 'DF' +stringdef U+2584 hex 'DC' +stringdef U+2588 hex 'DB' +stringdef U+2591 hex 'B0' +stringdef U+2592 hex 'B1' +stringdef U+2593 hex 'B2' +stringdef U+25A0 hex 'FE' diff --git a/contrib/snowball/compiler/analyser.c b/contrib/snowball/compiler/analyser.c index ed712f64f..dffa5552d 100644 --- a/contrib/snowball/compiler/analyser.c +++ b/contrib/snowball/compiler/analyser.c @@ -4,31 +4,53 @@ #include <string.h> /* memmove */ #include "header.h" +typedef enum { + e_token_omitted = 0, + 
e_unexpected_token = 1, + e_string_omitted = 2, + e_unexpected_token_in_among = 3, + /* For codes above here, report "after " t->previous_token after the error. */ + e_unresolved_substring = 14, + e_not_allowed_inside_reverse = 15, + e_empty_grouping = 16, + e_already_backwards = 17, + e_empty_among = 18, + e_adjacent_bracketed_in_among = 19, + e_substring_preceded_by_substring = 20, + /* For codes below here, tokeniser->b is printed before the error. */ + e_redeclared = 30, + e_undeclared = 31, + e_declared_as_different_mode = 32, + e_not_of_type_x = 33, + e_not_of_type_string_or_integer = 34, + e_misplaced = 35, + e_redefined = 36, + e_misused = 37 +} error_code; + /* recursive usage: */ static void read_program_(struct analyser * a, int terminator); static struct node * read_C(struct analyser * a); -static struct node * C_style(struct analyser * a, char * s, int token); - +static struct node * C_style(struct analyser * a, const char * s, int token); -static void fault(int n) { fprintf(stderr, "fault %d\n", n); exit(1); } static void print_node_(struct node * p, int n, const char * s) { int i; for (i = 0; i < n; i++) fputs(i == n - 1 ? 
s : " ", stdout); printf("%s ", name_of_token(p->type)); - unless (p->name == 0) report_b(stdout, p->name->b); - unless (p->literalstring == 0) { + if (p->name) report_b(stdout, p->name->b); + if (p->literalstring) { printf("'"); report_b(stdout, p->literalstring); printf("'"); } printf("\n"); - unless (p->AE == 0) print_node_(p->AE, n+1, "# "); - unless (p->left == 0) print_node_(p->left, n+1, " "); - unless (p->right == 0) print_node_(p->right, n, " "); - if (p->aux != 0) print_node_(p->aux, n+1, "@ "); + if (p->AE) print_node_(p->AE, n+1, "# "); + if (p->left) print_node_(p->left, n+1, " "); + if (p->aux) print_node_(p->aux, n+1, "@ "); + if (p->right) print_node_(p->right, n, " "); } extern void print_program(struct analyser * a) { @@ -52,22 +74,37 @@ static struct node * new_node(struct analyser * a, int type) { static const char * name_of_mode(int n) { switch (n) { - default: fault(0); - case m_backward: return "string backward"; - case m_forward: return "string forward"; - /* case m_integer: return "integer"; */ + case m_backward: return "string backward"; + case m_forward: return "string forward"; + /* case m_integer: return "integer"; */ } + fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n); + exit(1); } static const char * name_of_type(int n) { switch (n) { - default: fault(1); - case 's': return "string"; - case 'i': return "integer"; - case 'r': return "routine"; - case 'R': return "routine or grouping"; - case 'g': return "grouping"; + case 's': return "string"; + case 'i': return "integer"; + case 'r': return "routine"; + case 'R': return "routine or grouping"; + case 'g': return "grouping"; + } + fprintf(stderr, "Invalid type %d in name_of_type()\n", n); + exit(1); +} + +static const char * name_of_name_type(int code) { + switch (code) { + case t_string: return "string"; + case t_boolean: return "boolean"; + case t_integer: return "integer"; + case t_routine: return "routine"; + case t_external: return "external"; + case t_grouping: return 
"grouping"; } + fprintf(stderr, "Invalid type code %d in name_of_name_type()\n", code); + exit(1); } static void count_error(struct analyser * a) { @@ -76,86 +113,78 @@ static void count_error(struct analyser * a) { t->error_count++; } -static void error2(struct analyser * a, int n, int x) { +static void error2(struct analyser * a, error_code n, int x) { struct tokeniser * t = a->tokeniser; count_error(a); fprintf(stderr, "%s:%d: ", t->file, t->line_number); - if (n >= 30) report_b(stderr, t->b); + if ((int)n >= (int)e_redeclared) report_b(stderr, t->b); switch (n) { - case 0: + case e_token_omitted: fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; - case 3: + case e_unexpected_token_in_among: fprintf(stderr, "in among(...), "); - case 1: + /* fall through */ + case e_unexpected_token: fprintf(stderr, "unexpected %s", name_of_token(t->token)); if (t->token == c_number) fprintf(stderr, " %d", t->number); if (t->token == c_name) { fprintf(stderr, " "); report_b(stderr, t->b); } break; - case 2: + case e_string_omitted: fprintf(stderr, "string omitted"); break; - case 14: + case e_unresolved_substring: fprintf(stderr, "unresolved substring on line %d", x); break; - case 15: + case e_not_allowed_inside_reverse: fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break; - case 16: + case e_empty_grouping: fprintf(stderr, "empty grouping"); break; - case 17: + case e_already_backwards: fprintf(stderr, "backwards used when already in this mode"); break; - case 18: + case e_empty_among: fprintf(stderr, "empty among(...)"); break; - case 19: + case e_adjacent_bracketed_in_among: fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break; - case 20: + case e_substring_preceded_by_substring: fprintf(stderr, "substring preceded by another substring on line %d", x); break; - case 30: + case e_redeclared: fprintf(stderr, " re-declared"); break; - case 31: + case e_undeclared: fprintf(stderr, " undeclared"); break; - 
case 32: + case e_declared_as_different_mode: fprintf(stderr, " declared as %s mode; used as %s mode", name_of_mode(a->mode), name_of_mode(x)); break; - case 33: + case e_not_of_type_x: fprintf(stderr, " not of type %s", name_of_type(x)); break; - case 34: + case e_not_of_type_string_or_integer: fprintf(stderr, " not of type string or integer"); break; - case 35: + case e_misplaced: fprintf(stderr, " misplaced"); break; - case 36: + case e_redefined: fprintf(stderr, " redefined"); break; - case 37: + case e_misused: fprintf(stderr, " mis-used as %s mode", name_of_mode(x)); break; - default: - fprintf(stderr, " error %d", n); break; - } - if (n <= 13 && t->previous_token > 0) + if ((int)n < (int)e_unresolved_substring && t->previous_token > 0) fprintf(stderr, " after %s", name_of_token(t->previous_token)); fprintf(stderr, "\n"); } -static void error(struct analyser * a, int n) { error2(a, n, 0); } - -static void error3(struct analyser * a, struct node * p, symbol * b) { - count_error(a); - fprintf(stderr, "%s:%d: among(...) 
has repeated string '", a->tokeniser->file, p->line_number); - report_b(stderr, b); - fprintf(stderr, "'\n"); -} +static void error(struct analyser * a, error_code n) { error2(a, n, 0); } static void error4(struct analyser * a, struct name * q) { count_error(a); + fprintf(stderr, "%s:%d: ", a->tokeniser->file, q->used->line_number); report_b(stderr, q->b); fprintf(stderr, " undefined\n"); } static void omission_error(struct analyser * a, int n) { a->tokeniser->omission = n; - error(a, 0); + error(a, e_token_omitted); } static int check_token(struct analyser * a, int code) { @@ -169,71 +198,120 @@ static int get_token(struct analyser * a, int code) { read_token(t); { int x = check_token(a, code); - unless (x) t->token_held = true; + if (!x) t->token_held = true; return x; } } static struct name * look_for_name(struct analyser * a) { - struct name * p = a->names; symbol * q = a->tokeniser->b; - repeat { - if (p == 0) return 0; - { symbol * b = p->b; - int n = SIZE(b); - if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) { - p->referenced = true; - return p; - } + struct name * p; + for (p = a->names; p; p = p->next) { + symbol * b = p->b; + int n = SIZE(b); + if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) { + p->referenced = true; + return p; } - p = p->next; } + return 0; } static struct name * find_name(struct analyser * a) { struct name * p = look_for_name(a); - if (p == 0) error(a, 31); + if (p == 0) error(a, e_undeclared); return p; } static void check_routine_mode(struct analyser * a, struct name * p, int mode) { if (p->mode < 0) p->mode = mode; else - unless (p->mode == mode) error2(a, 37, mode); + if (p->mode != mode) error2(a, e_misused, mode); } static void check_name_type(struct analyser * a, struct name * p, int type) { switch (type) { - case 's': if (p->type == t_string) return; break; - case 'i': if (p->type == t_integer) return; break; - case 'b': if (p->type == t_boolean) return; break; - case 'R': if (p->type == t_grouping) 
return; - case 'r': if (p->type == t_routine || p->type == t_external) return; break; - case 'g': if (p->type == t_grouping) return; break; + case 's': + if (p->type == t_string) return; + break; + case 'i': + if (p->type == t_integer) return; + break; + case 'b': + if (p->type == t_boolean) return; + break; + case 'R': + if (p->type == t_grouping) return; + /* FALLTHRU */ + case 'r': + if (p->type == t_routine || p->type == t_external) return; + break; + case 'g': + if (p->type == t_grouping) return; + break; } - error2(a, 33, type); + error2(a, e_not_of_type_x, type); } static void read_names(struct analyser * a, int type) { struct tokeniser * t = a->tokeniser; - unless (get_token(a, c_bra)) return; - repeat { - if (read_token(t) != c_name) break; - if (look_for_name(a) != 0) error(a, 30); else { - NEW(name, p); - p->b = copy_b(t->b); - p->type = type; - p->mode = -1; /* routines, externals */ - p->count = a->name_count[type]; - p->referenced = false; - p->used = false; - p->grouping = 0; - p->definition = 0; - a->name_count[type] ++; - p->next = a->names; - a->names = p; - } - } - unless (check_token(a, c_ket)) t->token_held = true; + if (!get_token(a, c_bra)) return; + while (true) { + int token = read_token(t); + switch (token) { + case c_len: { + /* Context-sensitive token - once declared as a name, it loses + * its special meaning, for compatibility with older versions + * of snowball. + */ + static const symbol c_len_lit[] = { + 'l', 'e', 'n' + }; + MOVE_TO_B(t->b, c_len_lit); + goto handle_as_name; + } + case c_lenof: { + /* Context-sensitive token - once declared as a name, it loses + * its special meaning, for compatibility with older versions + * of snowball. 
+ */ + static const symbol c_lenof_lit[] = { + 'l', 'e', 'n', 'o', 'f' + }; + MOVE_TO_B(t->b, c_lenof_lit); + goto handle_as_name; + } + case c_name: +handle_as_name: + if (look_for_name(a) != 0) error(a, e_redeclared); else { + NEW(name, p); + p->b = copy_b(t->b); + p->type = type; + p->mode = -1; /* routines, externals */ + /* We defer assigning counts until after we've eliminated + * variables whose values are never used. */ + p->count = -1; + p->referenced = false; + p->used_in_among = false; + p->used = 0; + p->value_used = false; + p->initialised = false; + p->used_in_definition = false; + p->local_to = 0; + p->grouping = 0; + p->definition = 0; + p->declaration_line_number = t->line_number; + p->next = a->names; + a->names = p; + if (token != c_name) { + disable_token(t, token); + } + } + break; + default: + if (!check_token(a, c_ket)) t->token_held = true; + return; + } + } } static symbol * new_literalstring(struct analyser * a) { @@ -259,7 +337,7 @@ static int read_AE_test(struct analyser * a) { case c_ge: case c_ls: case c_le: return t->token; - default: error(a, 1); t->token_held = true; return c_eq; + default: error(a, e_unexpected_token); t->token_held = true; return c_eq; } } @@ -271,11 +349,23 @@ static int binding(int t) { } } +static void mark_used_in(struct analyser * a, struct name * q, struct node * p) { + if (!q->used) { + q->used = p; + q->local_to = a->program_end->name; + } else if (q->local_to) { + if (q->local_to != a->program_end->name) { + /* Used in more than one routine/external. 
*/ + q->local_to = NULL; + } + } +} + static void name_to_node(struct analyser * a, struct node * p, int type) { struct name * q = find_name(a); - unless (q == 0) { + if (q) { check_name_type(a, q, type); - q->used = true; + mark_used_in(a, q, p); } p->name = q; } @@ -286,8 +376,18 @@ static struct node * read_AE(struct analyser * a, int B) { struct node * q; switch (read_token(t)) { case c_minus: /* monadic */ + q = read_AE(a, 100); + if (q->type == c_neg) { + /* Optimise away double negation, which avoids generators + * having to worry about generating "--" (decrement operator + * in many languages). + */ + p = q->right; + /* Don't free q, it's in the linked list a->nodes. */ + break; + } p = new_node(a, c_neg); - p->right = read_AE(a, 100); + p->right = q; break; case c_bra: p = read_AE(a, 0); @@ -296,11 +396,15 @@ static struct node * read_AE(struct analyser * a, int B) { case c_name: p = new_node(a, c_name); name_to_node(a, p, 'i'); + if (p->name) p->name->value_used = true; break; case c_maxint: case c_minint: + a->int_limits_used = true; + /* fall through */ case c_cursor: case c_limit: + case c_len: case c_size: p = new_node(a, t->token); break; @@ -308,18 +412,19 @@ static struct node * read_AE(struct analyser * a, int B) { p = new_node(a, c_number); p->number = t->number; break; + case c_lenof: case c_sizeof: - p = C_style(a, "s", c_sizeof); + p = C_style(a, "s", t->token); break; default: - error(a, 1); + error(a, e_unexpected_token); t->token_held = true; return 0; } - repeat { + while (true) { int token = read_token(t); int b = binding(token); - unless (binding(token) > B) { + if (binding(token) <= B) { t->token_held = true; return p; } @@ -335,14 +440,11 @@ static struct node * read_C_connection(struct analyser * a, struct node * q, int struct node * p = new_node(a, op); struct node * p_end = q; p->left = q; - repeat { + do { q = read_C(a); p_end->right = q; p_end = q; - if (read_token(t) != op) { - t->token_held = true; - break; - } - } + } while 
(read_token(t) == op); + t->token_held = true; return p; } @@ -350,14 +452,14 @@ static struct node * read_C_list(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_bra); struct node * p_end = 0; - repeat { + while (true) { int token = read_token(t); if (token == c_ket) return p; if (token < 0) { omission_error(a, c_ket); return p; } t->token_held = true; { struct node * q = read_C(a); - repeat { + while (true) { token = read_token(t); if (token != c_and && token != c_or) { t->token_held = true; @@ -371,10 +473,10 @@ static struct node * read_C_list(struct analyser * a) { } } -static struct node * C_style(struct analyser * a, char * s, int token) { +static struct node * C_style(struct analyser * a, const char * s, int token) { int i; struct node * p = new_node(a, token); - for (i = 0; s[i] != 0; i++) switch(s[i]) { + for (i = 0; s[i] != 0; i++) switch (s[i]) { case 'C': p->left = read_C(a); continue; case 'D': @@ -388,7 +490,7 @@ static struct node * C_style(struct analyser * a, char * s, int token) { int str_token = read_token(a->tokeniser); if (str_token == c_name) name_to_node(a, p, 's'); else if (str_token == c_literalstring) p->literalstring = new_literalstring(a); - else error(a, 2); + else error(a, e_string_omitted); } continue; case 'b': @@ -408,7 +510,7 @@ static struct node * read_literalstring(struct analyser * a) { static void reverse_b(symbol * b) { int i = 0; int j = SIZE(b) - 1; - until (i >= j) { + while (i < j) { int ch1 = b[i]; int ch2 = b[j]; b[i++] = ch2; b[j--] = ch1; } @@ -423,7 +525,74 @@ static int compare_amongvec(const void *pv, const void *qv) { int i; for (i = 0; i < smaller_size; i++) if (b_p[i] != b_q[i]) return b_p[i] - b_q[i]; - return p_size - q_size; + if (p_size - q_size) + return p_size - q_size; + return p->line_number - q->line_number; +} + +#define PTR_NULL_CHECK(P, Q) do {\ + if ((Q) == NULL) {\ + if ((P) != NULL) return 1;\ + } else {\ + if ((P) == NULL) return -1;\ + }\ + } while (0) 
+ +static int compare_node(const struct node *p, const struct node *q) { + PTR_NULL_CHECK(p, q); + if (q == NULL) { + /* p must be NULL too. */ + return 0; + } + + if (p->type != q->type) return p->type > q->type ? 1 : -1; + if (p->mode != q->mode) return p->mode > q->mode ? 1 : -1; + if (p->type == c_number) { + if (p->number != q->number) + return p->number > q->number ? 1 : -1; + } + + PTR_NULL_CHECK(p->left, q->left); + if (p->left) { + int r = compare_node(p->left, q->left); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->AE, q->AE); + if (p->AE) { + int r = compare_node(p->AE, q->AE); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->aux, q->aux); + if (p->aux) { + int r = compare_node(p->aux, q->aux); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->name, q->name); + if (p->name) { + int r; + if (SIZE(p->name->b) != SIZE(q->name->b)) { + return SIZE(p->name->b) - SIZE(q->name->b); + } + r = memcmp(p->name->b, q->name->b, + SIZE(p->name->b) * sizeof(symbol)); + if (r != 0) return r; + } + + PTR_NULL_CHECK(p->literalstring, q->literalstring); + if (p->literalstring) { + int r; + if (SIZE(p->literalstring) != SIZE(q->literalstring)) { + return SIZE(p->literalstring) - SIZE(q->literalstring); + } + r = memcmp(p->literalstring, q->literalstring, + SIZE(p->literalstring) * sizeof(symbol)); + if (r != 0) return r; + } + + return compare_node(p->right, q->right); } static void make_among(struct analyser * a, struct node * p, struct node * substring) { @@ -443,38 +612,84 @@ static void make_among(struct analyser * a, struct node * p, struct node * subst x->next = 0; x->b = v; x->number = a->among_count++; + x->function_count = 0; x->starter = 0; + x->nocommand_count = 0; + x->amongvar_needed = false; if (q->type == c_bra) { x->starter = q; q = q->right; } - until (q == 0) { + while (q) { if (q->type == c_literalstring) { symbol * b = q->literalstring; w1->b = b; /* pointer to case string */ - w1->p = 0; /* pointer to corresponding case expression */ + w1->action = 
NULL; /* action gets filled in below */ + w1->line_number = q->line_number; w1->size = SIZE(b); /* number of characters in string */ w1->i = -1; /* index of longest substring */ w1->result = -1; /* number of corresponding case expression */ - w1->function = q->left == 0 ? 0 : q->left->name; - unless (w1->function == 0) - check_routine_mode(a, w1->function, direction); + if (q->left) { + struct name * function = q->left->name; + w1->function = function; + function->used_in_among = true; + check_routine_mode(a, function, direction); + x->function_count++; + } else { + w1->function = 0; + } w1++; - } - else - if (q->left == 0) /* empty command: () */ + } else if (q->left == 0) { + /* empty command: () */ w0 = w1; - else { - until (w0 == w1) { - w0->p = q; - w0->result = result; + } else { + /* Check for previous action which is the same as this one and use + * the same action code if we find one. + */ + int among_result = -1; + struct amongvec * w; + for (w = v; w < w0; ++w) { + if (w->action && compare_node(w->action->left, q->left) == 0) { + if (w->result <= 0) { + printf("Among code %d isn't positive\n", w->result); + exit(1); + } + among_result = w->result; + break; + } + } + if (among_result < 0) { + among_result = result++; + } + + while (w0 != w1) { + w0->action = q; + w0->result = among_result; w0++; } - result++; } q = q->right; } - unless (w1-v == p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); } - if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b); + if (w1-v != p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); } + x->command_count = result - 1; + { + NEWVEC(node*, commands, x->command_count); + memset(commands, 0, x->command_count * sizeof(struct node*)); + for (w0 = v; w0 < w1; w0++) { + if (w0->result > 0) { + /* result == -1 when there's no command. 
*/ + if (w0->result > x->command_count) { + fprintf(stderr, "More among codes than expected\n"); + exit(1); + } + if (!commands[w0->result - 1]) + commands[w0->result - 1] = w0->action; + } else { + ++x->nocommand_count; + } + if (backward) reverse_b(w0->b); + } + x->commands = commands; + } qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec); /* the following loop is O(n squared) */ @@ -494,15 +709,30 @@ static void make_among(struct analyser * a, struct node * p, struct node * subst for (w0 = v; w0 < w1 - 1; w0++) if (w0->size == (w0 + 1)->size && - memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) error3(a, p, w0->b); + memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) { + count_error(a); + fprintf(stderr, "%s:%d: among(...) has repeated string '", + a->tokeniser->file, (w0 + 1)->line_number); + report_b(stderr, (w0 + 1)->b); + fprintf(stderr, "'\n"); + count_error(a); + fprintf(stderr, "%s:%d: previously seen here\n", + a->tokeniser->file, w0->line_number); + } x->literalstring_count = p->number; - x->command_count = result - 1; p->among = x; x->substring = substring; if (substring != 0) substring->among = x; - unless (x->command_count == 0 && x->starter == 0) a->amongvar_needed = true; + if (x->command_count > 1 || + (x->command_count == 1 && x->nocommand_count > 0) || + x->starter != 0) { + /* We need to set among_var rather than just checking if find_among*() + * returns zero or not. 
+ */ + x->amongvar_needed = a->amongvar_needed = true; + } } static struct node * read_among(struct analyser * a) { @@ -514,8 +744,8 @@ static struct node * read_among(struct analyser * a) { a->substring = 0; p->number = 0; /* counts the number of literals */ - unless (get_token(a, c_bra)) return p; - repeat { + if (!get_token(a, c_bra)) return p; + while (true) { struct node * q; int token = read_token(t); switch (token) { @@ -529,12 +759,14 @@ static struct node * read_among(struct analyser * a) { else t->token_held = true; p->number++; break; case c_bra: - if (previous_token == c_bra) error(a, 19); + if (previous_token == c_bra) error(a, e_adjacent_bracketed_in_among); q = read_C_list(a); break; default: - error(a, 3); + error(a, e_unexpected_token_in_among); + previous_token = token; + continue; case c_ket: - if (p->number == 0) error(a, 18); + if (p->number == 0) error(a, e_empty_among); if (t->error_count == 0) make_among(a, p, substring); return p; } @@ -547,13 +779,13 @@ static struct node * read_among(struct analyser * a) { static struct node * read_substring(struct analyser * a) { struct node * p = new_node(a, c_substring); - if (a->substring != 0) error2(a, 20, a->substring->line_number); + if (a->substring != 0) error2(a, e_substring_preceded_by_substring, a->substring->line_number); a->substring = p; return p; } static void check_modifyable(struct analyser * a) { - unless (a->modifyable) error(a, 15); + if (!a->modifyable) error(a, e_not_allowed_inside_reverse); } static struct node * read_C(struct analyser * a) { @@ -565,7 +797,7 @@ static struct node * read_C(struct analyser * a) { case c_backwards: { int mode = a->mode; - if (a->mode == m_backward) error(a, 17); else a->mode = m_backward; + if (a->mode == m_backward) error(a, e_already_backwards); else a->mode = m_backward; { struct node * p = C_style(a, "C", token); a->mode = mode; return p; @@ -596,14 +828,18 @@ static struct node * read_C(struct analyser * a) { case c_loop: case c_atleast: return 
C_style(a, "AC", token); - case c_setmark: - return C_style(a, "i", token); + case c_setmark: { + struct node * n = C_style(a, "i", token); + if (n->name) n->name->initialised = true; + return n; + } case c_tomark: case c_atmark: case c_hop: return C_style(a, "A", token); case c_delete: check_modifyable(a); + /* fall through */ case c_next: case c_tolimit: case c_atlimit: @@ -612,60 +848,147 @@ static struct node * read_C(struct analyser * a) { case c_true: case c_false: case c_debug: - return C_style(a, "", token); + return new_node(a, token); case c_assignto: - case c_sliceto: + case c_sliceto: { + struct node *n; check_modifyable(a); - return C_style(a, "s", token); + n = C_style(a, "s", token); + if (n->name) n->name->initialised = true; + return n; + } case c_assign: case c_insert: case c_attach: - case c_slicefrom: + case c_slicefrom: { + struct node *n; check_modifyable(a); - return C_style(a, "S", token); + n = C_style(a, "S", token); + if (n->name) n->name->value_used = true; + return n; + } case c_setlimit: return C_style(a, "CfD", token); case c_set: - case c_unset: - return C_style(a, "b", token); - case c_dollar: - get_token(a, c_name); - { + case c_unset: { + struct node * n = C_style(a, "b", token); + if (n->name) n->name->initialised = true; + return n; + } + case c_dollar: { + struct tokeniser * t = a->tokeniser; + read_token(t); + if (t->token == c_bra) { + /* Handle newer $(AE REL_OP AE) syntax. */ + struct node * n = read_AE(a, 0); + read_token(t); + switch (t->token) { + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: { + struct node * lhs = n; + n = new_node(a, t->token); + n->left = lhs; + n->AE = read_AE(a, 0); + get_token(a, c_ket); + break; + } + default: + error(a, e_unexpected_token); + t->token_held = true; + break; + } + return n; + } + + if (t->token == c_name) { struct node * p; struct name * q = find_name(a); int mode = a->mode; int modifyable = a->modifyable; - switch (q ? 
q->type : t_string) - /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */ - { - default: error(a, 34); - case t_string: - a->mode = m_forward; - a->modifyable = true; - p = new_node(a, c_dollar); - p->left = read_C(a); break; - case t_integer: - /* a->mode = m_integer; */ - p = new_node(a, read_AE_test(a)); - p->AE = read_AE(a, 0); break; + if (q && q->type == t_string) { + /* Assume for now that $ on string both initialises and + * uses the string variable. FIXME: Can we do better? + */ + q->initialised = true; + q->value_used = true; + a->mode = m_forward; + a->modifyable = true; + p = new_node(a, c_dollar); + p->left = read_C(a); + p->name = q; + } else { + if (q && q->type != t_integer) { + /* If $ is used on an unknown name or a name which + * isn't a string or an integer then we assume the + * unknown name is an integer as $ is used more often + * on integers than strings, so hopefully this it less + * likely to cause an error avalanche. + * + * For an unknown name, we'll already have reported an + * error. + */ + error(a, e_not_of_type_string_or_integer); + q = NULL; + } + p = new_node(a, read_AE_test(a)); + p->AE = read_AE(a, 0); + + if (q) { + switch (p->type) { + case c_mathassign: + q->initialised = true; + p->name = q; + break; + default: + /* +=, etc don't "initialise" as they only + * amend an existing value. Similarly, they + * don't count as using the value. 
+ */ + p->name = q; + break; + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + p->left = new_node(a, c_name); + p->left->name = q; + q->value_used = true; + break; + } + } } - p->name = q; + if (q) mark_used_in(a, q, p); a->mode = mode; a->modifyable = modifyable; return p; } + + error(a, e_unexpected_token); + t->token_held = true; + return new_node(a, c_dollar); + } case c_name: { struct name * q = find_name(a); struct node * p = new_node(a, c_name); - unless (q == 0) { - q->used = true; + if (q) { + mark_used_in(a, q, p); switch (q->type) { case t_boolean: - p->type = c_booltest; break; + p->type = c_booltest; + q->value_used = true; + break; case t_integer: - error(a, 35); /* integer name misplaced */ + error(a, e_misplaced); /* integer name misplaced */ + break; case t_string: + q->value_used = true; break; case t_routine: case t_external: @@ -684,7 +1007,7 @@ static struct node * read_C(struct analyser * a) { struct node * p = new_node(a, token); read_token(t); if (t->token == c_minus) read_token(t); - unless (check_token(a, c_name)) { omission_error(a, c_name); return p; } + if (!check_token(a, c_name)) { omission_error(a, c_name); return p; } name_to_node(a, p, 'g'); return p; } @@ -692,7 +1015,7 @@ static struct node * read_C(struct analyser * a) { return read_literalstring(a); case c_among: return read_among(a); case c_substring: return read_substring(a); - default: error(a, 1); return 0; + default: error(a, e_unexpected_token); return 0; } } @@ -739,26 +1062,27 @@ static void read_define_grouping(struct analyser * a, struct name * q) { NEW(grouping, p); if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p; a->groupings_end = p; - q->grouping = p; + if (q) q->grouping = p; p->next = 0; p->name = q; - p->number = q->count; + p->line_number = a->tokeniser->line_number; p->b = create_b(0); - repeat { + while (true) { switch (read_token(t)) { case c_name: { struct name * r = find_name(a); - unless (r == 0) { + 
if (r) { check_name_type(a, r, 'g'); p->b = alter_grouping(p->b, r->grouping->b, style, false); + r->used_in_definition = true; } } break; case c_literalstring: - p->b = alter_grouping(p->b, t->b, style, a->utf8); + p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8)); break; - default: error(a, 1); return; + default: error(a, e_unexpected_token); return; } switch (read_token(t)) { case c_plus: @@ -777,7 +1101,7 @@ static void read_define_grouping(struct analyser * a, struct name * q) { } p->largest_ch = max; p->smallest_ch = min; - if (min == 1<<16) error(a, 16); + if (min == 1<<16) error(a, e_empty_grouping); } t->token_held = true; return; } @@ -786,32 +1110,49 @@ static void read_define_grouping(struct analyser * a, struct name * q) { static void read_define_routine(struct analyser * a, struct name * q) { struct node * p = new_node(a, c_define); a->amongvar_needed = false; - unless (q == 0) { + if (q) { check_name_type(a, q, 'R'); - if (q->definition != 0) error(a, 36); + if (q->definition != 0) error(a, e_redefined); if (q->mode < 0) q->mode = a->mode; else - if (q->mode != a->mode) error2(a, 32, q->mode); + if (q->mode != a->mode) error2(a, e_declared_as_different_mode, q->mode); } p->name = q; if (a->program == 0) a->program = p; else a->program_end->right = p; a->program_end = p; get_token(a, c_as); p->left = read_C(a); - unless (q == 0) q->definition = p->left; + if (q) q->definition = p->left; if (a->substring != 0) { - error2(a, 14, a->substring->line_number); - a->substring = 0; + error2(a, e_unresolved_substring, a->substring->line_number); + a->substring = 0; } p->amongvar_needed = a->amongvar_needed; } static void read_define(struct analyser * a) { - unless (get_token(a, c_name)) return; - { + if (get_token(a, c_name)) { struct name * q = find_name(a); - if (q != 0 && q->type == t_grouping) read_define_grouping(a, q); - else read_define_routine(a, q); + int type; + if (q) { + type = q->type; + } else { + /* No declaration, so sniff 
next token - if it is 'as' then parse + * as a routine, otherwise as a grouping. + */ + if (read_token(a->tokeniser) == c_as) { + type = t_routine; + } else { + type = t_grouping; + } + a->tokeniser->token_held = true; + } + + if (type == t_grouping) { + read_define_grouping(a, q); + } else { + read_define_routine(a, q); + } } } @@ -827,7 +1168,7 @@ static void read_backwardmode(struct analyser * a) { static void read_program_(struct analyser * a, int terminator) { struct tokeniser * t = a->tokeniser; - repeat { + while (true) { switch (read_token(t)) { case c_strings: read_names(a, t_string); break; case c_booleans: read_names(a, t_boolean); break; @@ -839,25 +1180,60 @@ static void read_program_(struct analyser * a, int terminator) { case c_backwardmode:read_backwardmode(a); break; case c_ket: if (terminator == c_ket) return; + /* fall through */ default: - error(a, 1); break; + error(a, e_unexpected_token); break; case -1: - unless (terminator < 0) omission_error(a, c_ket); + if (terminator >= 0) omission_error(a, c_ket); return; } } } +static void remove_dead_assignments(struct node * p, struct name * q) { + if (p->name == q) { + switch (p->type) { + case c_assignto: + case c_sliceto: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_setmark: + case c_set: + case c_unset: + case c_dollar: + /* c_true is a no-op. */ + p->type = c_true; + break; + default: + /* There are no read accesses to this variable, so any + * references must be assignments. 
+ */ + fprintf(stderr, "Unhandled type of dead assignment via %s\n", + name_of_token(p->type)); + exit(1); + } + } + if (p->AE) remove_dead_assignments(p->AE, q); + if (p->left) remove_dead_assignments(p->left, q); + if (p->aux) remove_dead_assignments(p->aux, q); + if (p->right) remove_dead_assignments(p->right, q); +} + extern void read_program(struct analyser * a) { read_program_(a, -1); { struct name * q = a->names; - until (q == 0) { - switch(q->type) { + while (q) { + switch (q->type) { case t_external: case t_routine: - if (q->used && q->definition == 0) { error4(a, q); } break; + if (q->used && q->definition == 0) error4(a, q); + break; case t_grouping: - if (q->used && q->grouping == 0) { error4(a, q); } break; + if (q->used && q->grouping == 0) error4(a, q); + break; } q = q->next; } @@ -865,33 +1241,77 @@ extern void read_program(struct analyser * a) { if (a->tokeniser->error_count == 0) { struct name * q = a->names; - int warned = false; - until (q == 0) { - unless (q->referenced) { - unless (warned) { - fprintf(stderr, "Declared but not used:"); - warned = true; + struct name ** ptr = &(a->names); + while (q) { + if (!q->referenced) { + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + if (q->type == t_routine || + q->type == t_external || + q->type == t_grouping) { + fprintf(stderr, "' declared but not defined\n"); + } else { + fprintf(stderr, "' defined but not used\n"); + q = q->next; + *ptr = q; + continue; } - fprintf(stderr, " "); report_b(stderr, q->b); + } else if (q->type == t_routine || q->type == t_grouping) { + /* It's OK to define a grouping but only use it to define other + * groupings. 
+ */ + if (!q->used && !q->used_in_definition) { + int line_num; + if (q->type == t_routine) { + line_num = q->definition->line_number; + } else { + line_num = q->grouping->line_number; + } + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + line_num, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' defined but not used\n"); + } + } else if (q->type == t_external) { + /* Unused is OK. */ + } else if (!q->initialised) { + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' is never initialised\n"); + } else if (!q->value_used) { + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' is set but never used\n"); + remove_dead_assignments(a->program, q); + q = q->next; + *ptr = q; + continue; } + ptr = &(q->next); q = q->next; } - if (warned) fprintf(stderr, "\n"); - q = a->names; - warned = false; - until (q == 0) { - if (! q->used && (q->type == t_routine || - q->type == t_grouping)) { - unless (warned) { - fprintf(stderr, "Declared and defined but not used:"); - warned = true; - } - fprintf(stderr, " "); report_b(stderr, q->b); + { + /* Now we've eliminated variables whose values are never used we + * can number the variables, which is used by some generators. 
+ */ + int * name_count = a->name_count; + struct name * n; + for (n = a->names; n; n = n->next) { + n->count = name_count[n->type]++; } - q = q->next; } - if (warned) fprintf(stderr, "\n"); } } @@ -909,13 +1329,14 @@ extern struct analyser * create_analyser(struct tokeniser * t) { a->modifyable = true; { int i; for (i = 0; i < t_size; i++) a->name_count[i] = 0; } a->substring = 0; + a->int_limits_used = false; return a; } extern void close_analyser(struct analyser * a) { { struct node * q = a->nodes; - until (q == 0) { + while (q) { struct node * q_next = q->next; FREE(q); q = q_next; @@ -923,7 +1344,7 @@ extern void close_analyser(struct analyser * a) { } { struct name * q = a->names; - until (q == 0) { + while (q) { struct name * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; @@ -931,7 +1352,7 @@ extern void close_analyser(struct analyser * a) { } { struct literalstring * q = a->literalstrings; - until (q == 0) { + while (q) { struct literalstring * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; @@ -939,15 +1360,17 @@ extern void close_analyser(struct analyser * a) { } { struct among * q = a->amongs; - until (q == 0) { + while (q) { struct among * q_next = q->next; - FREE(q->b); FREE(q); + FREE(q->b); + FREE(q->commands); + FREE(q); q = q_next; } } { struct grouping * q = a->groupings; - until (q == 0) { + while (q) { struct grouping * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; @@ -955,4 +1378,3 @@ extern void close_analyser(struct analyser * a) { } FREE(a); } - diff --git a/contrib/snowball/compiler/driver.c b/contrib/snowball/compiler/driver.c index fbb1e9cae..9cbe447a5 100644 --- a/contrib/snowball/compiler/driver.c +++ b/contrib/snowball/compiler/driver.c @@ -1,48 +1,86 @@ +#include <ctype.h> /* for toupper etc */ #include <stdio.h> /* for fprintf etc */ #include <stdlib.h> /* for free etc */ -#include <string.h> /* for strlen */ +#include <string.h> /* for strcmp */ #include "header.h" -#define DEFAULT_PACKAGE 
"org.tartarus.snowball.ext" -#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram" -#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among" -#define DEFAULT_STRING_CLASS "java.lang.StringBuilder" +#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext" +#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram" +#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among" +#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder" + +#define DEFAULT_GO_PACKAGE "snowball" +#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go" + +#define DEFAULT_CS_NAMESPACE "Snowball" +#define DEFAULT_CS_BASE_CLASS "Stemmer" +#define DEFAULT_CS_AMONG_CLASS "Among" +#define DEFAULT_CS_STRING_CLASS "StringBuilder" + +#define DEFAULT_JS_BASE_CLASS "BaseStemmer" + +#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer" static int eq(const char * s1, const char * s2) { - int s1_len = strlen(s1); - int s2_len = strlen(s2); - return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0; + return strcmp(s1, s2) == 0; } -static void print_arglist(void) { - fprintf(stderr, "Usage: snowball <file> [options]\n\n" - "options are: [-o[utput] file]\n" - " [-s[yntax]]\n" +static void print_arglist(int exit_code) { + FILE * f = exit_code ? stderr : stdout; + fprintf(f, "Usage: snowball SOURCE_FILE... 
[OPTIONS]\n\n" + "Supported options:\n" + " -o[utput] file\n" + " -s[yntax]\n" + " -comments\n" #ifndef DISABLE_JAVA - " [-j[ava]]\n" + " -j[ava]\n" #endif - " [-c++]\n" - " [-w[idechars]]\n" - " [-u[tf8]]\n" - " [-n[ame] class name]\n" - " [-ep[refix] string]\n" - " [-vp[refix] string]\n" - " [-i[nclude] directory]\n" - " [-r[untime] path to runtime headers]\n" -#ifndef DISABLE_JAVA - " [-p[arentclassname] fully qualified parent class name]\n" - " [-P[ackage] package name for stemmers]\n" - " [-S[tringclass] StringBuffer-compatible class]\n" - " [-a[mongclass] fully qualified name of the Among class]\n" +#ifndef DISABLE_CSHARP + " -cs[harp]\n" +#endif + " -c++\n" +#ifndef DISABLE_PASCAL + " -pascal\n" +#endif +#ifndef DISABLE_PYTHON + " -py[thon]\n" #endif +#ifndef DISABLE_JS + " -js\n" +#endif +#ifndef DISABLE_RUST + " -rust\n" +#endif +#ifndef DISABLE_GO + " -go\n" +#endif + " -w[idechars]\n" + " -u[tf8]\n" + " -n[ame] class name\n" + " -ep[refix] string\n" + " -vp[refix] string\n" + " -i[nclude] directory\n" + " -r[untime] path to runtime headers\n" + " -p[arentclassname] fully qualified parent class name\n" +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) + " -P[ackage] package name for stemmers\n" + " -S[tringclass] StringBuffer-compatible class\n" + " -a[mongclass] fully qualified name of the Among class\n" +#endif +#ifndef DISABLE_GO + " -gop[ackage] Go package name for stemmers\n" + " -gor[untime] Go snowball runtime package\n" +#endif + " --help display this help and exit\n" + " --version output version information and exit\n" ); - exit(1); + exit(exit_code); } static void check_lim(int i, int argc) { if (i >= argc) { fprintf(stderr, "argument list is one short\n"); - print_arglist(); + print_arglist(1); } } @@ -57,35 +95,47 @@ static FILE * get_output(symbol * b) { return output; } -static void read_options(struct options * o, int argc, char * argv[]) { +static int read_options(struct options * o, int argc, char * argv[]) { char * s; - int i = 2; 
+ int i = 1; + int new_argc = 1; + /* Note down the last option used to specify an explicit encoding so + * we can warn we ignored it for languages with a fixed encoding. + */ + const char * encoding_opt = NULL; /* set defaults: */ o->output_file = 0; o->syntax_tree = false; - o->externals_prefix = ""; + o->comments = false; + o->externals_prefix = NULL; o->variables_prefix = 0; o->runtime_path = 0; - o->parent_class_name = DEFAULT_BASE_CLASS; - o->string_class = DEFAULT_STRING_CLASS; - o->among_class = DEFAULT_AMONG_CLASS; - o->package = DEFAULT_PACKAGE; - o->name = ""; + o->parent_class_name = NULL; + o->string_class = NULL; + o->among_class = NULL; + o->package = NULL; + o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; + o->name = NULL; o->make_lang = LANG_C; - o->widechars = false; o->includes = 0; o->includes_end = 0; - o->utf8 = false; + o->encoding = ENC_SINGLEBYTE; /* read options: */ - repeat { - if (i >= argc) break; + while (i < argc) { s = argv[i++]; - { if (eq(s, "-o") || eq(s, "-output")) { - check_lim(i, argc); + if (s[0] != '-') { + /* Non-option argument - shuffle down. 
*/ + argv[new_argc++] = s; + continue; + } + + { + if (eq(s, "-o") || eq(s, "-output")) { + check_lim(i, argc); o->output_file = argv[i++]; continue; } @@ -94,10 +144,33 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->name = argv[i++]; continue; } +#ifndef DISABLE_JS + if (eq(s, "-js")) { + o->make_lang = LANG_JAVASCRIPT; + continue; + } +#endif +#ifndef DISABLE_RUST + if (eq(s, "-rust")) { + o->make_lang = LANG_RUST; + continue; + } +#endif +#ifndef DISABLE_GO + if (eq(s, "-go")) { + o->make_lang = LANG_GO; + continue; + } +#endif #ifndef DISABLE_JAVA if (eq(s, "-j") || eq(s, "-java")) { o->make_lang = LANG_JAVA; - o->widechars = true; + continue; + } +#endif +#ifndef DISABLE_CSHARP + if (eq(s, "-cs") || eq(s, "-csharp")) { + o->make_lang = LANG_CSHARP; continue; } #endif @@ -105,15 +178,31 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->make_lang = LANG_CPLUSPLUS; continue; } +#ifndef DISABLE_PASCAL + if (eq(s, "-pascal")) { + o->make_lang = LANG_PASCAL; + continue; + } +#endif +#ifndef DISABLE_PYTHON + if (eq(s, "-py") || eq(s, "-python")) { + o->make_lang = LANG_PYTHON; + continue; + } +#endif if (eq(s, "-w") || eq(s, "-widechars")) { - o->widechars = true; - o->utf8 = false; + encoding_opt = s; + o->encoding = ENC_WIDECHARS; continue; } if (eq(s, "-s") || eq(s, "-syntax")) { o->syntax_tree = true; continue; } + if (eq(s, "-comments")) { + o->comments = true; + continue; + } if (eq(s, "-ep") || eq(s, "-eprefix")) { check_lim(i, argc); o->externals_prefix = argv[i++]; @@ -145,16 +234,16 @@ static void read_options(struct options * o, int argc, char * argv[]) { continue; } if (eq(s, "-u") || eq(s, "-utf8")) { - o->utf8 = true; - o->widechars = false; + encoding_opt = s; + o->encoding = ENC_UTF8; continue; } -#ifndef DISABLE_JAVA if (eq(s, "-p") || eq(s, "-parentclassname")) { check_lim(i, argc); o->parent_class_name = argv[i++]; continue; } +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) if 
(eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; @@ -171,44 +260,216 @@ static void read_options(struct options * o, int argc, char * argv[]) { continue; } #endif +#ifndef DISABLE_GO + if (eq(s, "-gop") || eq(s, "-gopackage")) { + check_lim(i, argc); + o->package = argv[i++]; + continue; + } + if (eq(s, "-gor") || eq(s, "-goruntime")) { + check_lim(i, argc); + o->go_snowball_runtime = argv[i++]; + continue; + } +#endif + if (eq(s, "--help")) { + print_arglist(0); + } + + if (eq(s, "--version")) { + printf("Snowball compiler version " SNOWBALL_VERSION "\n"); + exit(0); + } + fprintf(stderr, "'%s' misplaced\n", s); - print_arglist(); + print_arglist(1); } } + if (new_argc == 1) { + fprintf(stderr, "no source files specified\n"); + print_arglist(1); + } + argv[new_argc] = NULL; + + /* Set language-dependent defaults. */ + switch (o->make_lang) { + case LANG_C: + case LANG_CPLUSPLUS: + encoding_opt = NULL; + break; + case LANG_CSHARP: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_CS_BASE_CLASS; + if (!o->string_class) + o->string_class = DEFAULT_CS_STRING_CLASS; + if (!o->among_class) + o->among_class = DEFAULT_CS_AMONG_CLASS; + if (!o->package) + o->package = DEFAULT_CS_NAMESPACE; + break; + case LANG_GO: + o->encoding = ENC_UTF8; + if (!o->package) + o->package = DEFAULT_GO_PACKAGE; + break; + case LANG_JAVA: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; + if (!o->string_class) + o->string_class = DEFAULT_JAVA_STRING_CLASS; + if (!o->among_class) + o->among_class = DEFAULT_JAVA_AMONG_CLASS; + if (!o->package) + o->package = DEFAULT_JAVA_PACKAGE; + break; + case LANG_JAVASCRIPT: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_JS_BASE_CLASS; + break; + case LANG_PYTHON: + o->encoding = ENC_WIDECHARS; + if (!o->parent_class_name) + o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; + 
break; + case LANG_RUST: + o->encoding = ENC_UTF8; + break; + default: + break; + } + + if (encoding_opt) { + fprintf(stderr, "warning: %s only meaningful for C and C++\n", + encoding_opt); + } + + if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { + if (o->runtime_path) { + fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); + } + if (o->externals_prefix) { + fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n"); + } + } + if (!o->externals_prefix) o->externals_prefix = ""; + + if (!o->name && o->output_file) { + /* Default class name to basename of output_file - this is the standard + * convention for at least Java and C#. + */ + const char * slash = strrchr(o->output_file, '/'); + size_t len; + const char * leaf = (slash == NULL) ? o->output_file : slash + 1; + + slash = strrchr(leaf, '\\'); + if (slash != NULL) leaf = slash + 1; + + { + const char * dot = strchr(leaf, '.'); + len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); + } + + { + char * new_name = malloc(len + 1); + switch (o->make_lang) { + case LANG_CSHARP: + case LANG_PASCAL: + /* Upper case initial letter. */ + memcpy(new_name, leaf, len); + new_name[0] = toupper(new_name[0]); + break; + case LANG_JAVASCRIPT: + case LANG_PYTHON: { + /* Upper case initial letter and change each + * underscore+letter or hyphen+letter to an upper case + * letter. + */ + size_t i, j = 0; + int uc_next = true; + for (i = 0; i != len; ++i) { + unsigned char ch = leaf[i]; + if (ch == '_' || ch == '-') { + uc_next = true; + } else { + if (uc_next) { + new_name[j] = toupper(ch); + uc_next = false; + } else { + new_name[j] = ch; + } + ++j; + } + } + len = j; + break; + } + default: + /* Just copy. 
*/ + memcpy(new_name, leaf, len); + break; + } + new_name[len] = '\0'; + o->name = new_name; + } + } + + return new_argc; } extern int main(int argc, char * argv[]) { + int i; NEW(options, o); - if (argc == 1) print_arglist(); - read_options(o, argc, argv); + argc = read_options(o, argc, argv); { - symbol * filename = add_s_to_b(0, argv[1]); - char * file; - symbol * u = get_input(filename, &file); + char * file = argv[1]; + symbol * u = get_input(file); if (u == 0) { - fprintf(stderr, "Can't open input %s\n", argv[1]); + fprintf(stderr, "Can't open input %s\n", file); exit(1); } { struct tokeniser * t = create_tokeniser(u, file); struct analyser * a = create_analyser(t); - t->widechars = o->widechars; + struct input ** next_input_ptr = &(t->next); + a->encoding = t->encoding = o->encoding; t->includes = o->includes; - a->utf8 = t->utf8 = o->utf8; + /* If multiple source files are specified, set up the others to be + * read after the first in order, using the same mechanism as + * 'get' uses. 
*/ + for (i = 2; i != argc; ++i) { + NEW(input, q); + file = argv[i]; + u = get_input(file); + if (u == 0) { + fprintf(stderr, "Can't open input %s\n", file); + exit(1); + } + q->p = u; + q->c = 0; + q->file = file; + q->file_needs_freeing = false; + q->line_number = 1; + *next_input_ptr = q; + next_input_ptr = &(q->next); + } + *next_input_ptr = NULL; read_program(a); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); close_tokeniser(t); - unless (o->syntax_tree) { + if (!o->syntax_tree) { struct generator * g; - char * s = o->output_file; - unless (s) { + const char * s = o->output_file; + if (!s) { fprintf(stderr, "Please include the -o option\n"); - print_arglist(); - exit(1); + print_arglist(1); } + g = create_generator(a, o); if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".h"); @@ -217,41 +478,96 @@ extern int main(int argc, char * argv[]) { if (o->make_lang == LANG_CPLUSPLUS) { b = add_s_to_b(b, "c"); } - o->output_c = get_output(b); + o->output_src = get_output(b); lose_b(b); - g = create_generator_c(a, o); generate_program_c(g); - close_generator_c(g); - fclose(o->output_c); + fclose(o->output_src); fclose(o->output_h); } #ifndef DISABLE_JAVA if (o->make_lang == LANG_JAVA) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".java"); - o->output_java = get_output(b); + o->output_src = get_output(b); lose_b(b); - g = create_generator_java(a, o); generate_program_java(g); - close_generator_java(g); - fclose(o->output_java); + fclose(o->output_src); } #endif +#ifndef DISABLE_PASCAL + if (o->make_lang == LANG_PASCAL) { + symbol *b = add_s_to_b(0, s); + b = add_s_to_b(b, ".pas"); + o->output_src = get_output(b); + lose_b(b); + generate_program_pascal(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_PYTHON + if (o->make_lang == LANG_PYTHON) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".py"); + o->output_src = get_output(b); + lose_b(b); + 
generate_program_python(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_JS + if (o->make_lang == LANG_JAVASCRIPT) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".js"); + o->output_src = get_output(b); + lose_b(b); + generate_program_js(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_CSHARP + if (o->make_lang == LANG_CSHARP) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".cs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_csharp(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_RUST + if (o->make_lang == LANG_RUST) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".rs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_rust(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_GO + if (o->make_lang == LANG_GO) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".go"); + o->output_src = get_output(b); + lose_b(b); + generate_program_go(g); + fclose(o->output_src); + } +#endif + close_generator(g); } close_analyser(a); } lose_b(u); - lose_b(filename); } { struct include * p = o->includes; - until (p == 0) - { struct include * q = p->next; + while (p) { + struct include * q = p->next; lose_b(p->b); FREE(p); p = q; } } FREE(o); - unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count); + if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } - diff --git a/contrib/snowball/compiler/generator.c b/contrib/snowball/compiler/generator.c index 7294b0471..eed86c117 100644 --- a/contrib/snowball/compiler/generator.c +++ b/contrib/snowball/compiler/generator.c @@ -12,286 +12,349 @@ static void generate(struct generator * g, struct node * p); -enum special_labels { - - x_return = -1 - -}; - static int new_label(struct generator * g) { return g->next_label++; } -/* Output routines */ -static void output_str(FILE * outfile, struct str * str) { +/* Write routines for simple entities */ - char * s = b_to_s(str_data(str)); - fprintf(outfile, 
"%s", s); - free(s); +/* Write a space if the preceding character was not whitespace */ +static void ws_opt_space(struct generator * g, const char * s) { + int ch = str_back(g->outbuf); + if (ch != ' ' && ch != '\n' && ch != '\t' && ch != -1) + write_char(g, ' '); + write_string(g, s); } -static void wch(struct generator * g, int ch) { - str_append_ch(g->outbuf, ch); /* character */ +static void wi3(struct generator * g, int i) { + if (i < 100) write_char(g, ' '); + if (i < 10) write_char(g, ' '); + write_int(g, i); /* integer (width 3) */ } -static void wnl(struct generator * g) { - str_append_ch(g->outbuf, '\n'); /* newline */ - g->line_count++; -} -static void ws(struct generator * g, const char * s) { - str_append_string(g->outbuf, s); /* string */ -} +/* Write routines for items from the syntax tree */ -static void wi(struct generator * g, int i) { - str_append_int(g->outbuf, i); /* integer */ -} +static void write_varname(struct generator * g, struct name * p) { -static void wh_ch(struct generator * g, int i) { - str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */ + int ch = "SIIrxg"[p->type]; + switch (p->type) { + case t_external: + write_string(g, g->options->externals_prefix); break; + case t_string: + case t_boolean: + case t_integer: { + int count = p->count; + if (count < 0) { + fprintf(stderr, "Reference to optimised out variable "); + report_b(stderr, p->b); + fprintf(stderr, " attempted\n"); + exit(1); + } + if (p->type == t_boolean) { + /* We use a single array for booleans and integers, with the + * integers first. 
+ */ + count += g->analyser->name_count[t_integer]; + } + write_char(g, ch); + write_char(g, '['); + write_int(g, count); + write_char(g, ']'); + return; + } + default: + write_char(g, ch); write_char(g, '_'); + } + write_b(g, p->b); } -static void wh(struct generator * g, int i) { - if (i >> 4) wh(g, i >> 4); - wh_ch(g, i); /* hex integer */ +static void write_varref(struct generator * g, struct name * p) { /* reference to variable */ + if (p->type < t_routine) write_string(g, "z->"); + write_varname(g, p); } -static void wi3(struct generator * g, int i) { - if (i < 100) wch(g, ' '); - if (i < 10) wch(g, ' '); - wi(g, i); /* integer (width 3) */ +static void write_hexdigit(struct generator * g, int i) { + str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */ } -static void wvn(struct generator * g, struct name * p) { /* variable name */ - - int ch = "SBIrxg"[p->type]; - switch (p->type) { - case t_string: - case t_boolean: - case t_integer: - wch(g, ch); wch(g, '['); wi(g, p->count); wch(g, ']'); return; - case t_external: - ws(g, g->options->externals_prefix); break; - default: - wch(g, ch); wch(g, '_'); - } - str_append_b(g->outbuf, p->b); +static void write_hex(struct generator * g, int i) { + if (i >> 4) write_hex(g, i >> 4); + write_hexdigit(g, i); /* hex integer */ } -static void wv(struct generator * g, struct name * p) { /* reference to variable */ - if (p->type < t_routine) ws(g, "z->"); - wvn(g, p); +/* write character literal */ +static void wlitch(struct generator * g, int ch) { + if (32 <= ch && ch < 127) { + write_char(g, '\''); + if (ch == '\'' || ch == '\\') { + write_char(g, '\\'); + } + write_char(g, ch); + write_char(g, '\''); + } else { + write_string(g, "0x"); write_hex(g, ch); + } } static void wlitarray(struct generator * g, symbol * p) { /* write literal array */ - ws(g, "{ "); + write_string(g, "{ "); { int i; for (i = 0; i < SIZE(p); i++) { - int ch = p[i]; - if (32 <= ch && ch < 127) { - wch(g, '\''); - switch (ch) { - 
case '\'': - case '\\': wch(g, '\\'); - default: wch(g, ch); - } - wch(g, '\''); - } else { - wch(g, '0'); wch(g, 'x'); wh(g, ch); - } - if (i < SIZE(p) - 1) ws(g, ", "); + wlitch(g, p[i]); + if (i < SIZE(p) - 1) write_string(g, ", "); } } - ws(g, " }"); + write_string(g, " }"); } static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */ - if (SIZE(p) == 0) ws(g, "0"); else { + if (SIZE(p) == 0) { + write_char(g, '0'); + } else { struct str * s = g->outbuf; g->outbuf = g->declarations; - ws(g, "static const symbol s_"); wi(g, g->literalstring_count); ws(g, "[] = "); + write_string(g, "static const symbol s_"); write_int(g, g->literalstring_count); write_string(g, "[] = "); wlitarray(g, p); - ws(g, ";\n"); + write_string(g, ";\n"); g->outbuf = s; - ws(g, "s_"); wi(g, g->literalstring_count); + write_string(g, "s_"); write_int(g, g->literalstring_count); g->literalstring_count++; } } - -static void wm(struct generator * g) { /* margin */ +static void write_margin(struct generator * g) { int i; - for (i = 0; i < g->margin; i++) ws(g, " "); + for (i = 0; i < g->margin; i++) write_string(g, " "); } -static void wc(struct generator * g, struct node * p) { /* comment */ - - ws(g, " /* "); +void write_comment_content(struct generator * g, struct node * p) { switch (p->type) { case c_mathassign: case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: + if (p->name) { + write_char(g, '$'); + write_b(g, p->name->b); + write_char(g, ' '); + } + write_string(g, name_of_token(p->type)); + write_string(g, " <integer expression>"); + break; case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: - if (p->name) { - wch(g, '$'); - str_append_b(g->outbuf, p->name->b); - wch(g, ' '); - } - ws(g, name_of_token(p->type)); - ws(g, " <integer expression>"); + write_string(g, "$(<integer expression> "); + write_string(g, name_of_token(p->type)); + write_string(g, " <integer expression>)"); break; default: - ws(g, 
name_of_token(p->type)); + write_string(g, name_of_token(p->type)); if (p->name) { - wch(g, ' '); - str_append_b(g->outbuf, p->name->b); + write_char(g, ' '); + write_b(g, p->name->b); } } - ws(g, ", line "); wi(g, p->line_number); ws(g, " */"); - wnl(g); + write_string(g, ", line "); + write_int(g, p->line_number); +} + +static void write_comment(struct generator * g, struct node * p) { + if (g->options->comments) { + ws_opt_space(g, "/* "); + write_comment_content(g, p); + write_string(g, " */"); + } + write_newline(g); } static void wms(struct generator * g, const char * s) { - wm(g); ws(g, s); } /* margin + string */ + write_margin(g); write_string(g, s); } /* margin + string */ -static void wbs(struct generator * g) { /* block start */ +static void write_block_start(struct generator * g) { /* block start */ wms(g, "{ "); g->margin++; } -static void wbe(struct generator * g) { /* block end */ +static void write_block_end(struct generator * g) { /* block end */ - if (g->line_labelled == g->line_count) { wms(g, ";"); wnl(g); } + if (g->line_labelled == g->line_count) { wms(g, ";"); write_newline(g); } g->margin--; - wms(g, "}"); wnl(g); + wms(g, "}"); write_newline(g); } -static void wk(struct generator * g, struct node * p) { /* keep c */ +static void w(struct generator * g, const char * s); + +/* keep c */ +static void wk(struct generator * g, struct node * p, int keep_limit) { ++g->keep_count; if (p->mode == m_forward) { - ws(g, "int c"); wi(g, g->keep_count); ws(g, " = z->c;"); + write_string(g, "int c"); + write_int(g, g->keep_count); + write_string(g, " = z->c"); + if (keep_limit) { + write_string(g, ", mlimit"); + write_int(g, g->keep_count); + } + write_char(g, ';'); } else { - ws(g, "int m"); wi(g, g->keep_count); ws(g, " = z->l - z->c; (void)m"); - wi(g, g->keep_count); ws(g, ";"); + write_string(g, "int m"); + write_int(g, g->keep_count); + write_string(g, " = z->l - z->c"); + if (keep_limit) { + write_string(g, ", mlimit"); + write_int(g, 
g->keep_count); + } + write_string(g, "; (void)m"); + write_int(g, g->keep_count); + write_char(g, ';'); } } static void wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */ if (p->mode == m_forward) { - ws(g, "z->c = c"); + write_string(g, "z->c = c"); } else { - ws(g, "z->c = z->l - m"); + write_string(g, "z->c = z->l - m"); } - wi(g, keep_token); ws(g, ";"); + write_int(g, keep_token); write_char(g, ';'); +} + +static void wrestorelimit(struct generator * g, struct node * p, int keep_token) { /* restore limit */ + if (p->mode == m_forward) { + w(g, "z->l += mlimit"); + } else { + w(g, "z->lb = mlimit"); + } + write_int(g, keep_token); write_string(g, ";"); } static void winc(struct generator * g, struct node * p) { /* increment c */ - ws(g, p->mode == m_forward ? "z->c++;" : + write_string(g, p->mode == m_forward ? "z->c++;" : "z->c--;"); } static void wsetl(struct generator * g, int n) { g->margin--; - wms(g, "lab"); wi(g, n); wch(g, ':'); wnl(g); + wms(g, "lab"); write_int(g, n); write_char(g, ':'); write_newline(g); g->line_labelled = g->line_count; g->margin++; } static void wgotol(struct generator * g, int n) { - wms(g, "goto lab"); wi(g, n); wch(g, ';'); wnl(g); + wms(g, "goto lab"); write_int(g, n); write_char(g, ';'); write_newline(g); } -static void wf(struct generator * g) { /* fail */ - if (g->failure_string != 0) { ws(g, "{ "); ws(g, g->failure_string); wch(g, ' '); } - switch (g->failure_label) - { +static void write_failure(struct generator * g, struct node * p) { /* fail */ + if (g->failure_keep_count != 0) { + write_string(g, "{ "); + if (g->failure_keep_count > 0) { + wrestore(g, p, g->failure_keep_count); + } else { + wrestorelimit(g, p, -g->failure_keep_count); + } + write_char(g, ' '); + } + switch (g->failure_label) { case x_return: - ws(g, "return 0;"); - break; + write_string(g, "return 0;"); + break; default: - ws(g, "goto lab"); - wi(g, g->failure_label); - wch(g, ';'); - g->label_used = 1; + write_string(g, 
"goto lab"); + write_int(g, g->failure_label); + write_char(g, ';'); + g->label_used = 1; } - if (g->failure_string != 0) ws(g, " }"); + if (g->failure_keep_count != 0) write_string(g, " }"); } -static void wlim(struct generator * g, struct node * p) { /* if at limit fail */ - ws(g, p->mode == m_forward ? "if (z->c >= z->l) " : +/* if at limit fail */ +static void write_check_limit(struct generator * g, struct node * p) { + + write_string(g, p->mode == m_forward ? "if (z->c >= z->l) " : "if (z->c <= z->lb) "); - wf(g); + write_failure(g, p); } -static void wp(struct generator * g, const char * s, struct node * p) { /* formatted write */ +static void write_data_address(struct generator * g, struct node * p) { + symbol * b = p->literalstring; + if (b != 0) { + write_int(g, SIZE(b)); w(g, ", "); + wlitref(g, b); + } else { + write_varref(g, p->name); + } +} + +/* Formatted write. */ +static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; - int l = strlen(s); - until (i >= l) { - int ch = s[i++]; - if (ch != '~') wch(g, ch); else - switch(s[i++]) { - default: wch(g, s[i - 1]); continue; - case 'C': wc(g, p); continue; - case 'k': wk(g, p); continue; - case 'K': /* keep for c_test */ - ws(g, p->mode == m_forward ? "int c_test = z->c;" : - "int m_test = z->l - z->c;"); - continue; - case 'R': /* restore for c_test */ - ws(g, p->mode == m_forward ? 
"z->c = c_test;" : - "z->c = z->l - m_test;"); - continue; + int l = strlen(input); + + while (i < l) { + int ch = input[i++]; + if (ch != '~') { + write_char(g, ch); + continue; + } + switch (input[i++]) { + default: write_char(g, input[i - 1]); continue; + case 'C': write_comment(g, p); continue; + case 'k': wk(g, p, false); continue; + case 'K': wk(g, p, true); continue; case 'i': winc(g, p); continue; - case 'l': wlim(g, p); continue; - case 'f': wf(g); continue; - case 'M': wm(g); continue; - case 'N': wnl(g); continue; - case '{': wbs(g); continue; - case '}': wbe(g); continue; - case 'S': ws(g, g->S[s[i++] - '0']); continue; - case 'I': wi(g, g->I[s[i++] - '0']); continue; - case 'J': wi3(g, g->I[s[i++] - '0']); continue; - case 'V': wv(g, g->V[s[i++] - '0']); continue; - case 'W': wvn(g, g->V[s[i++] - '0']); continue; - case 'L': wlitref(g, g->L[s[i++] - '0']); continue; - case 'A': wlitarray(g, g->L[s[i++] - '0']); continue; + case 'l': write_check_limit(g, p); continue; + case 'f': write_failure(g, p); continue; + case 'M': write_margin(g); continue; + case 'N': write_newline(g); continue; + case '{': write_block_start(g); continue; + case '}': write_block_end(g); continue; + case 'S': write_string(g, g->S[input[i++] - '0']); continue; + case 'I': write_int(g, g->I[input[i++] - '0']); continue; + case 'J': wi3(g, g->I[input[i++] - '0']); continue; + case 'V': write_varref(g, g->V[input[i++] - '0']); continue; + case 'W': write_varname(g, g->V[input[i++] - '0']); continue; + case 'L': wlitref(g, g->L[input[i++] - '0']); continue; + case 'A': wlitarray(g, g->L[input[i++] - '0']); continue; + case 'c': wlitch(g, g->I[input[i++] - '0']); continue; + case 'a': write_data_address(g, p); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case '$': /* insert_s, insert_v etc */ - wch(g, p->literalstring == 0 ? 'v' : 's'); + write_char(g, p->literalstring == 0 ? 
'v' : 's'); continue; - case 'p': ws(g, g->options->externals_prefix); continue; + case 'p': write_string(g, g->options->externals_prefix); continue; } } } -static void w(struct generator * g, const char * s) { wp(g, s, 0); } +static void w(struct generator * g, const char * s) { + writef(g, s, 0); +} static void generate_AE(struct generator * g, struct node * p) { - char * s; + const char * s; switch (p->type) { case c_name: - wv(g, p->name); break; + write_varref(g, p->name); break; case c_number: - wi(g, p->number); break; + write_int(g, p->number); break; case c_maxint: - ws(g, "MAXINT"); break; + write_string(g, "MAXINT"); break; case c_minint: - ws(g, "MININT"); break; + write_string(g, "MININT"); break; case c_neg: - wch(g, '-'); generate_AE(g, p->right); break; + write_char(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: @@ -301,28 +364,45 @@ static void generate_AE(struct generator * g, struct node * p) { case c_divide: s = " / "; label0: - wch(g, '('); generate_AE(g, p->left); - ws(g, s); generate_AE(g, p->right); wch(g, ')'); break; - case c_sizeof: - g->V[0] = p->name; - w(g, "SIZE(~V0)"); break; + write_char(g, '('); generate_AE(g, p->left); + write_string(g, s); generate_AE(g, p->right); write_char(g, ')'); break; case c_cursor: w(g, "z->c"); break; case c_limit: w(g, p->mode == m_forward ? "z->l" : "z->lb"); break; + case c_len: + if (g->options->encoding == ENC_UTF8) { + w(g, "len_utf8(z->p)"); + break; + } + /* FALLTHRU */ case c_size: - w(g, "SIZE(z->p)"); break; + w(g, "SIZE(z->p)"); + break; + case c_lenof: + if (g->options->encoding == ENC_UTF8) { + g->V[0] = p->name; + w(g, "len_utf8(~V0)"); + break; + } + /* FALLTHRU */ + case c_sizeof: + g->V[0] = p->name; + w(g, "SIZE(~V0)"); + break; } } /* K_needed() tests to see if we really need to keep c. Not true when the - the command does not touch the cursor. This and repeat_score() could be + command does not touch the cursor. 
This and repeat_score() could be elaborated almost indefinitely. */ -static int K_needed(struct generator * g, struct node * p) { - until (p == 0) { +static int K_needed_(struct generator * g, struct node * p, int call_depth) { + while (p) { switch (p->type) { + case c_atlimit: + case c_do: case c_dollar: case c_leftslice: case c_rightslice: @@ -338,17 +418,28 @@ static int K_needed(struct generator * g, struct node * p) { case c_ls: case c_le: case c_sliceto: + case c_booltest: + case c_set: + case c_unset: case c_true: case c_false: case c_debug: break; case c_call: - if (K_needed(g, p->name->definition)) return true; + /* Recursive functions aren't typical in snowball programs, so + * make the pessimistic assumption that keep is needed if we + * hit a generous limit on recursion. It's not likely to make + * a difference to any real world program, but means we won't + * recurse until we run out of stack for pathological cases. + */ + if (call_depth >= 100) return true; + if (K_needed_(g, p->name->definition, call_depth + 1)) + return true; break; case c_bra: - if (K_needed(g, p->left)) return true; + if (K_needed_(g, p->left, call_depth)) return true; break; default: return true; @@ -358,10 +449,13 @@ static int K_needed(struct generator * g, struct node * p) { return false; } -static int repeat_score(struct generator * g, struct node * p) { +extern int K_needed(struct generator * g, struct node * p) { + return K_needed_(g, p, 0); +} + +static int repeat_score(struct generator * g, struct node * p, int call_depth) { int score = 0; - until (p == 0) - { + while (p) { switch (p->type) { case c_dollar: case c_leftslice: @@ -382,11 +476,25 @@ static int repeat_score(struct generator * g, struct node * p) { break; case c_call: - score += repeat_score(g, p->name->definition); + /* Recursive functions aren't typical in snowball programs, so + * make the pessimistic assumption that repeat requires cursor + * reinstatement if we hit a generous limit on recursion. 
It's + * not likely to make a difference to any real world program, + * but means we won't recurse until we run out of stack for + * pathological cases. + */ + if (call_depth >= 100) { + return 2; + } + score += repeat_score(g, p->name->definition, call_depth + 1); + if (score >= 2) + return score; break; case c_bra: - score += repeat_score(g, p->left); + score += repeat_score(g, p->left, call_depth); + if (score >= 2) + return score; break; case c_name: @@ -395,9 +503,12 @@ static int repeat_score(struct generator * g, struct node * p) { case c_grouping: case c_non: case c_hop: - score = score + 1; break; + if (++score >= 2) + return score; + break; - default: score = 2; break; + default: + return 2; } p = p->right; } @@ -406,25 +517,28 @@ static int repeat_score(struct generator * g, struct node * p) { /* tests if an expression requires cursor reinstatement in a repeat */ -static int repeat_restore(struct generator * g, struct node * p) { - return repeat_score(g, p) >= 2; +extern int repeat_restore(struct generator * g, struct node * p) { + return repeat_score(g, p, 0) >= 2; } static void generate_bra(struct generator * g, struct node * p) { p = p->left; - until (p == 0) { generate(g, p); p = p->right; } + while (p) { + generate(g, p); + p = p->right; + } } static void generate_and(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { - wp(g, "~{~k~C", p); + writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { - wp(g, "~M~C", p); + writef(g, "~M~C", p); } p = p->left; - until (p == 0) { + while (p) { generate(g, p); if (keep_c && p->right != 0) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); @@ -439,19 +553,19 @@ static void generate_or(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; - const char * a1 = g->failure_string; + int a1 = g->failure_keep_count; int out_lab = new_label(g); if (K_needed(g, p->left)) { - wp(g, "~{~k~C", p); + writef(g, "~{~k~C", p); keep_c = g->keep_count; 
} else { - wp(g, "~M~C", p); + writef(g, "~M~C", p); } p = p->left; - g->failure_string = 0; - until (p->right == 0) { + g->failure_keep_count = 0; + while (p->right) { g->failure_label = new_label(g); g->label_used = 0; generate(g, p); @@ -465,7 +579,7 @@ static void generate_or(struct generator * g, struct node * p) { } g->label_used = used; g->failure_label = a0; - g->failure_string = a1; + g->failure_keep_count = a1; generate(g, p); if (keep_c) w(g, "~}"); @@ -474,7 +588,7 @@ static void generate_or(struct generator * g, struct node * p) { static void generate_backwards(struct generator * g, struct node * p) { - wp(g,"~Mz->lb = z->c; z->c = z->l;~C~N", p); + writef(g, "~Mz->lb = z->c; z->c = z->l;~C~N", p); generate(g, p->left); w(g, "~Mz->c = z->lb;~N"); } @@ -485,18 +599,18 @@ static void generate_not(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; - const char * a1 = g->failure_string; + int a1 = g->failure_keep_count; if (K_needed(g, p->left)) { - wp(g, "~{~k~C", p); + writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { - wp(g, "~M~C", p); + writef(g, "~M~C", p); } g->failure_label = new_label(g); g->label_used = 0; - g->failure_string = 0; + g->failure_keep_count = 0; generate(g, p->left); { @@ -505,9 +619,9 @@ static void generate_not(struct generator * g, struct node * p) { g->label_used = used; g->failure_label = a0; - g->failure_string = a1; + g->failure_keep_count = a1; - w(g, "~M~f~N"); + writef(g, "~M~f~N", p); if (u) wsetl(g, l); } @@ -518,20 +632,14 @@ static void generate_not(struct generator * g, struct node * p) { static void generate_try(struct generator * g, struct node * p) { - int keep_c = K_needed(g, p->left); - - if (keep_c) { - if (p->mode == m_forward) { - wp(g, "~{int c_keep = z->c;~C", p); - g->failure_string = "z->c = c_keep;"; - } else { - wp(g, "~{int m_keep = z->l - z->c;/* (void) m_keep;*/~C", p); - g->failure_string = "z->c = z->l - m_keep;"; - } + int keep_c = 0; + if 
(K_needed(g, p->left)) { + writef(g, "~{~k~C", p); + keep_c = g->keep_count; } else { - wp(g, "~M~C", p); - g->failure_string = 0; + writef(g, "~M~C", p); } + g->failure_keep_count = keep_c; g->failure_label = new_label(g); g->label_used = 0; @@ -544,47 +652,65 @@ static void generate_try(struct generator * g, struct node * p) { } static void generate_set(struct generator * g, struct node * p) { - g->V[0] = p->name; wp(g, "~M~V0 = 1;~C", p); + g->V[0] = p->name; writef(g, "~M~V0 = 1;~C", p); } static void generate_unset(struct generator * g, struct node * p) { - g->V[0] = p->name; wp(g, "~M~V0 = 0;~C", p); + g->V[0] = p->name; writef(g, "~M~V0 = 0;~C", p); } static void generate_fail(struct generator * g, struct node * p) { generate(g, p->left); - wp(g, "~M~f~C", p); + writef(g, "~M~f~C", p); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { - int keep_c = K_needed(g, p->left); - if (keep_c) wp(g, "~{~K~C", p); - else wp(g, "~M~C", p); + int keep_c = 0; + if (K_needed(g, p->left)) { + keep_c = ++g->keep_count; + w(g, p->mode == m_forward ? "~{int c_test" : + "~{int m_test"); + write_int(g, keep_c); + w(g, p->mode == m_forward ? " = z->c;" : + " = z->l - z->c;"); + writef(g, "~C", p); + } else writef(g, "~M~C", p); generate(g, p->left); - if (keep_c) wp(g, "~M~R~N" - "~}", p); + if (keep_c) { + w(g, p->mode == m_forward ? 
"~Mz->c = c_test" : + "~Mz->c = z->l - m_test"); + write_int(g, keep_c); + writef(g, ";~N~}", p); + } } static void generate_do(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { - wp(g, "~{~k~C", p); + writef(g, "~{~k~C", p); keep_c = g->keep_count; } else { - wp(g, "~M~C", p); + writef(g, "~M~C", p); } - g->failure_label = new_label(g); - g->label_used = 0; - g->failure_string = 0; - generate(g, p->left); + if (p->left->type == c_call) { + /* Optimise do <call> */ + g->V[0] = p->left->name; + writef(g, "~{int ret = ~V0(z);~C", p->left); + w(g, "~Mif (ret < 0) return ret;~N~}"); + } else { + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_keep_count = 0; + generate(g, p->left); - if (g->label_used) - wsetl(g, g->failure_label); + if (g->label_used) + wsetl(g, g->failure_label); + } if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}"); @@ -592,17 +718,17 @@ static void generate_do(struct generator * g, struct node * p) { } static void generate_next(struct generator * g, struct node * p) { - if (g->options->utf8) { + if (g->options->encoding == ENC_UTF8) { if (p->mode == m_forward) w(g, "~{int ret = skip_utf8(z->p, z->c, 0, z->l, 1"); else w(g, "~{int ret = skip_utf8(z->p, z->c, z->lb, 0, -1"); - wp(g, ");~N" + writef(g, ");~N" "~Mif (ret < 0) ~f~N" "~Mz->c = ret;~C" "~}", p); } else - wp(g, "~M~l~N" + writef(g, "~M~l~N" "~M~i~C", p); } @@ -611,21 +737,21 @@ static void generate_GO_grouping(struct generator * g, struct node * p, int is_g struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; - g->S[2] = g->options->utf8 ? "_U" : ""; + g->S[2] = g->options->encoding == ENC_UTF8 ? 
"_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { - wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f /* goto */~C", p); + writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f~C", p); } else { - wp(g, "~{ /* gopast */~C" - "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" - "~Mif (ret < 0) ~f~N", p); - if (p->mode == m_forward) - w(g, "~Mz->c += ret;~N"); - else - w(g, "~Mz->c -= ret;~N"); - w(g, "~}"); + writef(g, "~{~C" + "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" + "~Mif (ret < 0) ~f~N", p); + if (p->mode == m_forward) + w(g, "~Mz->c += ret;~N"); + else + w(g, "~Mz->c -= ret;~N"); + w(g, "~}"); } } @@ -634,23 +760,26 @@ static void generate_GO(struct generator * g, struct node * p, int style) { int used = g->label_used; int a0 = g->failure_label; - const char * a1 = g->failure_string; + int a1 = g->failure_keep_count; if (p->left->type == c_grouping || p->left->type == c_non) { - /* Special case for "goto" or "gopast" when used on a grouping or an - * inverted grouping - the movement of c by the matching action is - * exactly what we want! */ + /* Special case for "goto" or "gopast" when used on a grouping or an + * inverted grouping - the movement of c by the matching action is + * exactly what we want! */ #ifdef OPTIMISATION_WARNINGS - printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping"); + printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? 
"non" : "grouping"); #endif - generate_GO_grouping(g, p->left, style, p->left->type == c_non); - return; + if (g->options->comments) { + writef(g, "~M~C", p); + } + generate_GO_grouping(g, p->left, style, p->left->type == c_non); + return; } - w(g, "~Mwhile(1) {"); wp(g, "~C~+", p); + w(g, "~Mwhile(1) {"); writef(g, "~C~+", p); if (style == 1 || repeat_restore(g, p->left)) { - wp(g, "~M~k~N", p); + writef(g, "~M~k~N", p); keep_c = g->keep_count; } @@ -671,16 +800,16 @@ static void generate_GO(struct generator * g, struct node * p, int style) { g->label_used = used; g->failure_label = a0; - g->failure_string = a1; + g->failure_keep_count = a1; -/* wp(g, "~M~l~N" +/* writef(g, "~M~l~N" "~M~i~N", p); */ generate_next(g, p); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { - w(g, "~{int i; for (i = "); generate_AE(g, p->AE); wp(g, "; i > 0; i--)~C" + w(g, "~{int i; for (i = "); generate_AE(g, p->AE); writef(g, "; i > 0; i--)~C" "~{", p); generate(g, p->left); @@ -689,18 +818,22 @@ static void generate_loop(struct generator * g, struct node * p) { "~}"); } -static void generate_repeat(struct generator * g, struct node * p, int atleast_case) { +static void generate_repeat_or_atleast(struct generator * g, struct node * p, int atleast_case) { int keep_c = 0; - wp(g, "~Mwhile(1) {~C~+", p); + if (atleast_case) { + writef(g, "~Mwhile(1) {~+~N", p); + } else { + writef(g, "~Mwhile(1) {~+~C", p); + } if (repeat_restore(g, p->left)) { - wp(g, "~M~k~N", p); + writef(g, "~M~k~N", p); keep_c = g->keep_count; } g->failure_label = new_label(g); g->label_used = 0; - g->failure_string = 0; + g->failure_keep_count = 0; generate(g, p->left); if (atleast_case) w(g, "~Mi--;~N"); @@ -717,130 +850,120 @@ static void generate_repeat(struct generator * g, struct node * p, int atleast_c "~}"); } +static void generate_repeat(struct generator * g, struct node * p) { + generate_repeat_or_atleast(g, p, false); +} + static void generate_atleast(struct generator * g, 
struct node * p) { - w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~N"); + w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~C"); { int used = g->label_used; int a0 = g->failure_label; - const char * a1 = g->failure_string; + int a1 = g->failure_keep_count; - generate_repeat(g, p, true); + generate_repeat_or_atleast(g, p, true); g->label_used = used; g->failure_label = a0; - g->failure_string = a1; + g->failure_keep_count = a1; } - w(g, "~Mif (i > 0) ~f~N" - "~}"); + writef(g, "~Mif (i > 0) ~f~N" + "~}", p); } static void generate_setmark(struct generator * g, struct node * p) { g->V[0] = p->name; - wp(g, "~M~V0 = z->c;~C", p); + writef(g, "~M~V0 = z->c;~C", p); } static void generate_tomark(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? ">" : "<"; - w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); w(g, ") ~f~N"); - w(g, "~Mz->c = "); generate_AE(g, p->AE); wp(g, ";~C", p); + w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); writef(g, ") ~f~N", p); + w(g, "~Mz->c = "); generate_AE(g, p->AE); writef(g, ";~C", p); } static void generate_atmark(struct generator * g, struct node * p) { - w(g, "~Mif (z->c != "); generate_AE(g, p->AE); wp(g, ") ~f~C", p); + w(g, "~Mif (z->c != "); generate_AE(g, p->AE); writef(g, ") ~f~C", p); } static void generate_hop(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "+" : "-"; g->S[1] = p->mode == m_forward ? 
"0" : "z->lb"; - if (g->options->utf8) { + if (g->options->encoding == ENC_UTF8) { w(g, "~{int ret = skip_utf8(z->p, z->c, ~S1, z->l, ~S0 "); - generate_AE(g, p->AE); wp(g, ");~C", p); - w(g, "~Mif (ret < 0) ~f~N"); + generate_AE(g, p->AE); writef(g, ");~C", p); + writef(g, "~Mif (ret < 0) ~f~N", p); } else { w(g, "~{int ret = z->c ~S0 "); - generate_AE(g, p->AE); wp(g, ";~C", p); - w(g, "~Mif (~S1 > ret || ret > z->l) ~f~N"); + generate_AE(g, p->AE); writef(g, ";~C", p); + writef(g, "~Mif (~S1 > ret || ret > z->l) ~f~N", p); } - wp(g, "~Mz->c = ret;~C" + writef(g, "~Mz->c = ret;~N" "~}", p); } static void generate_delete(struct generator * g, struct node * p) { - wp(g, "~{int ret = slice_del(z);~C", p); - wp(g, "~Mif (ret < 0) return ret;~N" + writef(g, "~{int ret = slice_del(z);~C", p); + writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_tolimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "b"; - wp(g, "~Mz->c = z->l~S0;~C", p); + writef(g, "~Mz->c = z->l~S0;~C", p); } static void generate_atlimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "b"; g->S[1] = p->mode == m_forward ? "<" : ">"; - wp(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p); + writef(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p); } static void generate_leftslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "bra" : "ket"; - wp(g, "~Mz->~S0 = z->c;~C", p); + writef(g, "~Mz->~S0 = z->c;~C", p); } static void generate_rightslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; - wp(g, "~Mz->~S0 = z->c;~C", p); + writef(g, "~Mz->~S0 = z->c;~C", p); } static void generate_assignto(struct generator * g, struct node * p) { g->V[0] = p->name; - wp(g, "~M~V0 = assign_to(z, ~V0);~C" + writef(g, "~M~V0 = assign_to(z, ~V0);~C" "~Mif (~V0 == 0) return -1;~C", p); } static void generate_sliceto(struct generator * g, struct node * p) { g->V[0] = p->name; - wp(g, "~M~V0 = slice_to(z, ~V0);~C" - "~Mif (~V0 == 0) return -1;~C", p); -} - -static void generate_data_address(struct generator * g, struct node * p) { - - symbol * b = p->literalstring; - if (b != 0) { - wi(g, SIZE(b)); w(g, ", "); - wlitref(g, b); - } else - wv(g, p->name); + writef(g, "~M~V0 = slice_to(z, ~V0);~C" + "~Mif (~V0 == 0) return -1;~N", p); } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; - wp(g, "~{", p); - if (keep_c) w(g, "int c_keep = z->c;~N~M"); - wp(g, "int ret = insert_~$(z, z->c, z->c, ", p); - generate_data_address(g, p); - wp(g, ");~C", p); - if (keep_c) w(g, "~Mz->c = c_keep;~N"); - wp(g, "~Mif (ret < 0) return ret;~N" + writef(g, "~{int ret;~N", p); + if (keep_c) w(g, "~{int saved_c = z->c;~N"); + writef(g, "~Mret = insert_~$(z, z->c, z->c, ~a);~C", p); + if (keep_c) w(g, "~Mz->c = saved_c;~N~}"); + writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ - wp(g, "~{", p); - if (keep_c) wp(g, "int c_keep = z->c;~N" - "~Mret = insert_~$(z, z->c, z->l, ", p); - else wp(g, "ret = insert_~$(z, z->lb, z->c, ", p); - generate_data_address(g, p); - wp(g, ");~C", p); - if (keep_c) w(g, "~Mz->c = c_keep;~N"); - wp(g, "~Mif (ret < 0) return ret;~N" + writef(g, "~{int ret;~N", p); + if (keep_c) writef(g, "~{int saved_c = z->c;~N", p); + w(g, "~Mret = "); + writef(g, keep_c ? 
"insert_~$(z, z->c, z->l, ~a);~C" : "insert_~$(z, z->lb, z->c, ~a);~C", p); + if (keep_c) w(g, "~Mz->c = saved_c;~N~}"); + writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } @@ -849,81 +972,132 @@ static void generate_assignfrom(struct generator * g, struct node * p) { static void generate_slicefrom(struct generator * g, struct node * p) { /* w(g, "~Mslice_from_s(z, "); <============= bug! should be: */ - wp(g, "~{int ret = slice_from_~$(z, ", p); - generate_data_address(g, p); - wp(g, ");~C", p); - wp(g, "~Mif (ret < 0) return ret;~N" + writef(g, "~{int ret = slice_from_~$(z, ~a);~C", p); + writef(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_setlimit(struct generator * g, struct node * p) { int keep_c; - wp(g, "~{int mlimit;~C" - "~M~k~N" - , p); - keep_c = g->keep_count; - generate(g, p->left); - if (p->mode == m_forward) w(g, "~Mmlimit = z->l - z->c; z->l = z->c;~N"); - else w(g, "~Mmlimit = z->lb; z->lb = z->c;~N"); - w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); - g->failure_string = p->mode == m_forward ? "z->l += mlimit;" : - "z->lb = mlimit;"; + if (p->left && p->left->type == c_tomark) { + /* Special case for: + * + * setlimit tomark AE for C + * + * All uses of setlimit in the current stemmers we ship follow this + * pattern, and by special-casing we can avoid having to save and + * restore c. + */ + struct node * q = p->left; + + ++g->keep_count; + writef(g, "~N~{int mlimit", p); + write_int(g, g->keep_count); + writef(g, ";~C", p); + keep_c = g->keep_count; + + g->S[0] = q->mode == m_forward ? 
">" : "<"; + + w(g, "~Mif (z->c ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q); + w(g, "~Mmlimit"); + write_int(g, keep_c); + if (p->mode == m_forward) { + w(g, " = z->l - z->c; z->l = "); + } else { + w(g, " = z->lb; z->lb = "); + } + generate_AE(g, q->AE); + w(g, ";~N"); + } else { + writef(g, "~{~K~C", p); + keep_c = g->keep_count; + generate(g, p->left); + + w(g, "~Mmlimit"); + write_int(g, keep_c); + if (p->mode == m_forward) + w(g, " = z->l - z->c; z->l = z->c;~N"); + else + w(g, " = z->lb; z->lb = z->c;~N"); + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + + g->failure_keep_count = -keep_c; generate(g, p->aux); - wms(g, g->failure_string); + w(g, "~M"); + wrestorelimit(g, p, -g->failure_keep_count); w(g, "~N" "~}"); } +/* dollar sets snowball up to operate on a string variable as if it were the + * current string */ static void generate_dollar(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; - const char * a1 = g->failure_string; + int a1 = g->failure_keep_count; + int keep_token; g->failure_label = new_label(g); g->label_used = 0; - g->failure_string = 0; + g->failure_keep_count = 0; + keep_token = ++g->keep_count; + g->I[0] = keep_token; + writef(g, "~{struct SN_env env~I0 = * z;~C", p); g->V[0] = p->name; - wp(g, "~{struct SN_env env = * z;~C" - "~Mint failure = 1; /* assume failure */~N" - "~Mz->p = ~V0;~N" - "~Mz->lb = z->c = 0;~N" - "~Mz->l = SIZE(z->p);~N", p); + /* Assume failure. */ + writef(g, "~Mint failure = 1;~N" + "~Mz->p = ~V0;~N" + "~Mz->lb = z->c = 0;~N" + "~Mz->l = SIZE(z->p);~N", p); generate(g, p->left); - w(g, "~Mfailure = 0; /* mark success */~N"); + /* Mark success. 
*/ + w(g, "~Mfailure = 0;~N"); if (g->label_used) wsetl(g, g->failure_label); g->V[0] = p->name; /* necessary */ g->label_used = used; g->failure_label = a0; - g->failure_string = a1; + g->failure_keep_count = a1; - w(g, "~M~V0 = z->p;~N" - "~M* z = env;~N" - "~Mif (failure) ~f~N~}"); + g->I[0] = keep_token; + writef(g, "~M~V0 = z->p;~N" + "~M* z = env~I0;~N" + "~Mif (failure) ~f~N~}", p); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; - w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); wp(g, ";~C", p); + w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); writef(g, ";~C", p); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { - g->V[0] = p->name; - g->S[0] = s; - w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); wp(g, ")) ~f~C", p); + w(g, "~Mif (!("); + generate_AE(g, p->left); + write_char(g, ' '); + write_string(g, s); + write_char(g, ' '); + generate_AE(g, p->AE); + writef(g, ")) ~f~C", p); } static void generate_call(struct generator * g, struct node * p) { g->V[0] = p->name; - wp(g, "~{int ret = ~V0(z);~C" - "~Mif (ret == 0) ~f~N" - "~Mif (ret < 0) return ret;~N~}", p); + writef(g, "~{int ret = ~V0(z);~C", p); + if (g->failure_keep_count == 0 && g->failure_label == x_return) { + /* Combine the two tests in this special case for better optimisation + * and clearer generated code. */ + writef(g, "~Mif (ret <= 0) return ret;~N~}", p); + } else { + writef(g, "~Mif (ret == 0) ~f~N" + "~Mif (ret < 0) return ret;~N~}", p); + } } static void generate_grouping(struct generator * g, struct node * p, int complement) { @@ -931,27 +1105,47 @@ static void generate_grouping(struct generator * g, struct node * p, int complem struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; - g->S[2] = g->options->utf8 ? "_U" : ""; + g->S[2] = g->options->encoding == ENC_UTF8 ? 
"_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; - wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p); + writef(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p); } static void generate_namedstring(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; - wp(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p); + writef(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; - g->S[0] = p->mode == m_forward ? "" : "_b"; - g->I[0] = SIZE(b); - g->L[0] = b; + if (SIZE(b) == 1) { + /* It's quite common to compare with a single character literal string, + * so just inline the simpler code for this case rather than making a + * function call. In UTF-8 mode, only do this for the ASCII subset, + * since multi-byte characters are more complex to test against. + */ + if (g->options->encoding == ENC_UTF8 && *b >= 128) { + printf("single byte %d\n", *b); + exit(1); + } + g->I[0] = *b; + if (p->mode == m_forward) { + writef(g, "~Mif (z->c == z->l || z->p[z->c] != ~c0) ~f~C" + "~Mz->c++;~N", p); + } else { + writef(g, "~Mif (z->c <= z->lb || z->p[z->c - 1] != ~c0) ~f~C" + "~Mz->c--;~N", p); + } + } else { + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->I[0] = SIZE(b); + g->L[0] = b; - wp(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p); + writef(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p); + } } static void generate_define(struct generator * g, struct node * p) { @@ -961,9 +1155,13 @@ static void generate_define(struct generator * g, struct node * p) { g->S[0] = q->type == t_routine ? "static" : "extern"; g->V[0] = q; - w(g, "~N~S0 int ~V0(struct SN_env * z) {~N~+"); + w(g, "~N~S0 int ~V0(struct SN_env * z) {"); + if (g->options->comments) { + write_string(g, p->mode == m_forward ? 
" /* forwardmode */" : " /* backwardmode */"); + } + w(g, "~N~+"); if (p->amongvar_needed) w(g, "~Mint among_var;~N"); - g->failure_string = 0; + g->failure_keep_count = 0; g->failure_label = x_return; g->label_used = 0; g->keep_count = 0; @@ -982,6 +1180,7 @@ static void generate_substring(struct generator * g, struct node * p) { int n_cases = 0; symbol cases[2]; int shortest_size = INT_MAX; + int shown_comment = 0; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; @@ -1019,19 +1218,19 @@ static void generate_substring(struct generator * g, struct node * p) { if (n_cases > 2) break; } if (block == -1) { - if (ch == cases[0]) continue; + if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { - cases[n_cases++] = ch; + cases[n_cases++] = ch; } else if (ch != cases[1]) { - ++n_cases; - break; + ++n_cases; + break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { - bitmap |= 1u << (ch & 0x1f); - if (n_cases < 2) - cases[n_cases] = ch; - ++n_cases; + bitmap |= 1u << (ch & 0x1f); + if (n_cases < 2) + cases[n_cases] = ch; + ++n_cases; } } } @@ -1045,16 +1244,16 @@ static void generate_substring(struct generator * g, struct node * p) { sprintf(buf, "z->p[z->c + %d]", shortest_size - 1); g->S[1] = buf; if (shortest_size == 1) { - wp(g, "~Mif (z->c >= z->l || ", p); + writef(g, "~Mif (z->c >= z->l", p); } else { - wp(g, "~Mif (z->c + ~I4 >= z->l || ", p); + writef(g, "~Mif (z->c + ~I4 >= z->l", p); } } else { g->S[1] = "z->p[z->c - 1]"; if (shortest_size == 1) { - wp(g, "~Mif (z->c <= z->lb || ", p); + writef(g, "~Mif (z->c <= z->lb", p); } else { - wp(g, "~Mif (z->c - ~I4 <= z->lb || ", p); + writef(g, "~Mif (z->c - ~I4 <= z->lb", p); } } if (n_cases == 0) { @@ -1062,81 +1261,83 @@ static void generate_substring(struct generator * g, struct node * p) { * This doesn't seem to be a useful construct, but it is * syntactically valid. 
*/ - wp(g, "0", p); } else if (n_cases == 1) { g->I[4] = cases[0]; - wp(g, "~S1 != ~I4", p); + writef(g, " || ~S1 != ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; - wp(g, "(~S1 != ~I4 && ~S1 != ~I5)", p); + writef(g, " || (~S1 != ~I4 && ~S1 != ~I5)", p); } else { - wp(g, "~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); + writef(g, " || ~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); } - ws(g, ") "); + write_string(g, ") "); if (empty_case != -1) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. */ g->I[4] = among_cases[empty_case].result; - wp(g, "among_var = ~I4; else~C", p); + writef(g, "among_var = ~I4; else~C", p); } else { - wp(g, "~f~C", p); + writef(g, "~f~C", p); } + shown_comment = 1; } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } - if (x->command_count == 0 && x->starter == 0) - wp(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f~C", p); - else - wp(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);~C" - "~Mif (!(among_var)) ~f~N", p); + if (!x->amongvar_needed) { + writef(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f", p); + writef(g, shown_comment ? "~N" : "~C", p); + } else { + writef(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);", p); + writef(g, shown_comment ? 
"~N" : "~C", p); + writef(g, "~Mif (!(among_var)) ~f~N", p); + } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; - int case_number = 1; if (x->substring == 0) generate_substring(g, p); - if (x->command_count == 0 && x->starter == 0) return; - unless (x->starter == 0) generate(g, x->starter); + if (x->starter != 0) generate(g, x->starter); - p = p->left; - if (p != 0 && p->type != c_literalstring) p = p->right; - w(g, "~Mswitch(among_var) {~N~+" - "~Mcase 0: ~f~N"); - - until (p == 0) { - if (p->type == c_bra && p->left != 0) { - g->I[0] = case_number++; - w(g, "~Mcase ~I0:~N~+"); generate(g, p); w(g, "~Mbreak;~N~-"); - } - p = p->right; + if (x->command_count == 1 && x->nocommand_count == 0) { + /* Only one outcome ("no match" already handled). */ + generate(g, x->commands[0]); + } else if (x->command_count > 0) { + int i; + writef(g, "~Mswitch (among_var) {~C~+", p); + for (i = 1; i <= x->command_count; i++) { + g->I[0] = i; + w(g, "~Mcase ~I0:~N~+"); + generate(g, x->commands[i - 1]); + w(g, "~Mbreak;~N~-"); + } + w(g, "~}"); } - w(g, "~}"); } static void generate_booltest(struct generator * g, struct node * p) { g->V[0] = p->name; - wp(g, "~Mif (!(~V0)) ~f~C", p); + writef(g, "~Mif (!(~V0)) ~f~C", p); } static void generate_false(struct generator * g, struct node * p) { - wp(g, "~M~f~C", p); + writef(g, "~M~f~C", p); } static void generate_debug(struct generator * g, struct node * p) { g->I[0] = g->debug_count++; g->I[1] = p->line_number; - wp(g, "~Mdebug(z, ~I0, ~I1);~C", p); + writef(g, "~Mdebug(z, ~I0, ~I1);~C", p); } @@ -1144,10 +1345,9 @@ static void generate(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; - const char * a1 = g->failure_string; + int a1 = g->failure_keep_count; - switch (p->type) - { + switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; @@ -1163,7 +1363,7 
@@ static void generate(struct generator * g, struct node * p) { case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; - case c_repeat: generate_repeat(g, p, false); break; + case c_repeat: generate_repeat(g, p); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; @@ -1213,30 +1413,40 @@ static void generate(struct generator * g, struct node * p) { if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; - g->failure_string = a1; + g->failure_keep_count = a1; } -static void generate_start_comment(struct generator * g) { +void write_generated_comment_content(struct generator * g) { + w(g, "Generated by Snowball " SNOWBALL_VERSION + " - https://snowballstem.org/"); +} - w(g, "~N/* This file was generated automatically by the Snowball to ANSI C compiler */~N"); +void write_start_comment(struct generator * g, + const char * comment_start, + const char * comment_end) { + write_margin(g); + w(g, comment_start); + write_generated_comment_content(g); + if (comment_end) { + w(g, comment_end); + } + w(g, "~N~N"); } static void generate_head(struct generator * g) { - if (g->options->runtime_path == 0) { - w(g, "~N#include \"header.h\"~N~N"); - } else { - w(g, "~N#include \""); - ws(g, g->options->runtime_path); + w(g, "#include \""); + if (g->options->runtime_path) { + write_string(g, g->options->runtime_path); if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/') - wch(g, '/'); - w(g, "header.h\"~N~N"); + write_char(g, '/'); } + w(g, "header.h\"~N~N"); } static void generate_routine_headers(struct generator * g) { - struct name * q = g->analyser->names; - until (q == 0) { + struct name * q; + for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; switch (q->type) { case t_routine: @@ -1254,7 +1464,6 @@ static void generate_routine_headers(struct generator * g) { ); break; } - q 
= q->next; } } @@ -1265,12 +1474,11 @@ static void generate_among_table(struct generator * g, struct among * x) { g->I[0] = x->number; { int i; - for (i = 0; i < x->literalstring_count; i++) - { + for (i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v->size; g->L[0] = v->b; - unless (v->size == 0) + if (v->size) w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N"); v++; } @@ -1289,12 +1497,21 @@ static void generate_among_table(struct generator * g, struct among * x) { g->I[4] = v->result; g->S[0] = i < x->literalstring_count - 1 ? "," : ""; - w(g, "/*~J1 */ { ~I2, "); - if (v->size == 0) w(g, "0,"); - else w(g, "s_~I0_~I1,"); + if (g->options->comments) { + w(g, "/*~J1 */ "); + } + w(g, "{ ~I2, "); + if (v->size == 0) { + w(g, "0,"); + } else { + w(g, "s_~I0_~I1,"); + } w(g, " ~I3, ~I4, "); - if (v->function == 0) w(g, "0"); else - wvn(g, v->function); + if (v->function == 0) { + write_char(g, '0'); + } else { + write_varname(g, v->function); + } w(g, "}~S0~N"); v++; } @@ -1303,10 +1520,9 @@ static void generate_among_table(struct generator * g, struct among * x) { } static void generate_amongs(struct generator * g) { - struct among * x = g->analyser->amongs; - until (x == 0) { + struct among * x; + for (x = g->analyser->amongs; x; x = x->next) { generate_among_table(g, x); - x = x->next; } } @@ -1323,24 +1539,22 @@ static void generate_grouping_table(struct generator * g, struct grouping * q) { for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); - { - g->V[0] = q->name; + g->V[0] = q->name; - w(g, "static const unsigned char ~V0[] = { "); - for (i = 0; i < size; i++) { - wi(g, map[i]); - if (i < size - 1) w(g, ", "); - } - w(g, " };~N~N"); + w(g, "static const unsigned char ~V0[] = { "); + for (i = 0; i < size; i++) { + write_int(g, map[i]); + if (i < size - 1) w(g, ", "); } + w(g, " };~N~N"); lose_b(map); } static void generate_groupings(struct generator * g) { - struct grouping * q = g->analyser->groupings; - until (q == 0) { - 
generate_grouping_table(g, q); - q = q->next; + struct grouping * q; + for (q = g->analyser->groupings; q; q = q->next) { + if (q->name->used) + generate_grouping_table(g, q); } } @@ -1348,10 +1562,9 @@ static void generate_create(struct generator * g) { int * p = g->analyser->name_count; g->I[0] = p[t_string]; - g->I[1] = p[t_integer]; - g->I[2] = p[t_boolean]; + g->I[1] = p[t_integer] + p[t_boolean]; w(g, "~N" - "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1, ~I2); }" + "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1); }" "~N"); } @@ -1371,36 +1584,47 @@ static void generate_create_and_close_templates(struct generator * g) { static void generate_header_file(struct generator * g) { - struct name * q = g->analyser->names; - char * vp = g->options->variables_prefix; + struct name * q; + const char * vp = g->options->variables_prefix; g->S[0] = vp; - w(g, "~N" - "#ifdef __cplusplus~N" + w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N"); /* for C++ */ generate_create_and_close_templates(g); - until (q == 0) { + for (q = g->analyser->names; q; q = q->next) { g->V[0] = q; - switch (q->type) - { + switch (q->type) { case t_external: w(g, "extern int ~W0(struct SN_env * z);~N"); break; - case t_string: g->S[1] = "S"; goto label0; - case t_integer: g->S[1] = "I"; goto label0; - case t_boolean: g->S[1] = "B"; - label0: + case t_string: + case t_integer: + case t_boolean: if (vp) { - g->I[0] = q->count; + int count = q->count; + if (count < 0) { + /* Unused variables should get removed from `names`. */ + fprintf(stderr, "Optimised out variable "); + report_b(stderr, q->b); + fprintf(stderr, " still in names list\n"); + exit(1); + } + if (q->type == t_boolean) { + /* We use a single array for booleans and integers, + * with the integers first. 
+ */ + count += g->analyser->name_count[t_integer]; + } + g->I[0] = count; + g->I[1] = "SIIrxg"[q->type]; w(g, "#define ~S0"); - str_append_b(g->outbuf, q->b); - w(g, " (~S1[~I0])~N"); + write_b(g, q->b); + w(g, " (~c1[~I0])~N"); } break; } - q = q->next; } w(g, "~N" @@ -1414,7 +1638,7 @@ static void generate_header_file(struct generator * g) { extern void generate_program_c(struct generator * g) { g->outbuf = str_new(); - generate_start_comment(g); + write_start_comment(g, "/* ", " */"); generate_head(g); generate_routine_headers(g); w(g, "#ifdef __cplusplus~N" @@ -1433,33 +1657,69 @@ extern void generate_program_c(struct generator * g) { g->literalstring_count = 0; { struct node * p = g->analyser->program; - until (p == 0) { generate(g, p); p = p->right; } + while (p) { generate(g, p); p = p->right; } } generate_create(g); generate_close(g); - output_str(g->options->output_c, g->declarations); + output_str(g->options->output_src, g->declarations); str_delete(g->declarations); - output_str(g->options->output_c, g->outbuf); + output_str(g->options->output_src, g->outbuf); str_clear(g->outbuf); - generate_start_comment(g); + write_start_comment(g, "/* ", " */"); generate_header_file(g); output_str(g->options->output_h, g->outbuf); str_delete(g->outbuf); } -extern struct generator * create_generator_c(struct analyser * a, struct options * o) { +/* Generator functions common to multiple languages. 
*/ + +extern struct generator * create_generator(struct analyser * a, struct options * o) { NEW(generator, g); g->analyser = a; g->options = o; g->margin = 0; g->debug_count = 0; + g->copy_from_count = 0; g->line_count = 0; + g->line_labelled = 0; + g->failure_label = -1; + g->unreachable = false; +#ifndef DISABLE_PYTHON + g->max_label = 0; +#endif return g; } -extern void close_generator_c(struct generator * g) { - +extern void close_generator(struct generator * g) { FREE(g); } +/* Write routines for simple entities */ + +extern void write_char(struct generator * g, int ch) { + str_append_ch(g->outbuf, ch); /* character */ +} + +extern void write_newline(struct generator * g) { + str_append_ch(g->outbuf, '\n'); /* newline */ + g->line_count++; +} + +extern void write_string(struct generator * g, const char * s) { + str_append_string(g->outbuf, s); +} + +extern void write_int(struct generator * g, int i) { + str_append_int(g->outbuf, i); +} + +extern void write_b(struct generator * g, symbol * b) { + + str_append_b(g->outbuf, b); +} + +extern void write_str(struct generator * g, struct str * str) { + + str_append(g->outbuf, str); +} diff --git a/contrib/snowball/compiler/generator_java.c b/contrib/snowball/compiler/generator_java.c deleted file mode 100644 index 07a4b8e2e..000000000 --- a/contrib/snowball/compiler/generator_java.c +++ /dev/null @@ -1,1452 +0,0 @@ - -#include <stdlib.h> /* for exit */ -#include <string.h> /* for strlen */ -#include <stdio.h> /* for fprintf etc */ -#include "header.h" - -/* prototypes */ - -static void generate(struct generator * g, struct node * p); -static void w(struct generator * g, const char * s); -static void writef(struct generator * g, const char * s, struct node * p); - - -enum special_labels { - x_return = -1 -}; - -static int new_label(struct generator * g) { - - return g->next_label++; -} - -static struct str * vars_newname(struct generator * g) { - - struct str * output; - g->var_number ++; - output = str_new(); - 
str_append_string(output, "v_"); - str_append_int(output, g->var_number); - return output; -} - -/* Output routines */ -static void output_str(FILE * outfile, struct str * str) { - - char * s = b_to_s(str_data(str)); - fprintf(outfile, "%s", s); - free(s); -} - -/* Write routines for simple entities */ - -static void write_char(struct generator * g, int ch) { - - str_append_ch(g->outbuf, ch); -} - -static void write_newline(struct generator * g) { - - str_append_string(g->outbuf, "\n"); -} - -static void write_string(struct generator * g, const char * s) { - - str_append_string(g->outbuf, s); -} - -static void write_b(struct generator * g, symbol * b) { - - str_append_b(g->outbuf, b); -} - -static void write_str(struct generator * g, struct str * str) { - - str_append(g->outbuf, str); -} - -static void write_int(struct generator * g, int i) { - - str_append_int(g->outbuf, i); -} - - -/* Write routines for items from the syntax tree */ - -static void write_varname(struct generator * g, struct name * p) { - - int ch = "SBIrxg"[p->type]; - if (p->type != t_external) - { - write_char(g, ch); - write_char(g, '_'); - } - str_append_b(g->outbuf, p->b); -} - -static void write_varref(struct generator * g, struct name * p) { - - /* In java, references look just the same */ - write_varname(g, p); -} - -static void write_hexdigit(struct generator * g, int n) { - - write_char(g, n < 10 ? 
n + '0' : n - 10 + 'A'); -} - -static void write_hex(struct generator * g, int ch) { - - write_string(g, "\\u"); - { - int i; - for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); - } -} - -static void write_literal_string(struct generator * g, symbol * p) { - - int i; - write_string(g, "\""); - for (i = 0; i < SIZE(p); i++) { - int ch = p[i]; - if (32 <= ch && ch <= 127) { - if (ch == '\"' || ch == '\\') write_string(g, "\\"); - write_char(g, ch); - } else { - write_hex(g, ch); - } - } - write_string(g, "\""); -} - -static void write_margin(struct generator * g) { - - int i; - for (i = 0; i < g->margin; i++) write_string(g, " "); -} - -/* Write a variable declaration. */ -static void write_declare(struct generator * g, - char * declaration, - struct node * p) { - - struct str * temp = g->outbuf; - g->outbuf = g->declarations; - write_string(g, " "); - writef(g, declaration, p); - write_string(g, ";"); - write_newline(g); - g->outbuf = temp; -} - -static void write_comment(struct generator * g, struct node * p) { - - write_margin(g); - write_string(g, "// "); - write_string(g, name_of_token(p->type)); - if (p->name != 0) { - write_string(g, " "); - str_append_b(g->outbuf, p->name->b); - } - write_string(g, ", line "); - write_int(g, p->line_number); - write_newline(g); -} - -static void write_block_start(struct generator * g) { - - w(g, "~M{~+~N"); -} - -static void write_block_end(struct generator * g) /* block end */ { - - w(g, "~-~M}~N"); -} - -static void write_savecursor(struct generator * g, struct node * p, - struct str * savevar) { - - g->B[0] = str_data(savevar); - g->S[1] = ""; - if (p->mode != m_forward) g->S[1] = "limit - "; - write_declare(g, "int ~B0", p); - writef(g, "~M~B0 = ~S1cursor;~N" , p); -} - -static void restore_string(struct node * p, struct str * out, struct str * savevar) { - - str_clear(out); - str_append_string(out, "cursor = "); - if (p->mode != m_forward) str_append_string(out, "limit - "); - str_append(out, savevar); - 
str_append_string(out, ";"); -} - -static void write_restorecursor(struct generator * g, struct node * p, - struct str * savevar) { - - struct str * temp = str_new(); - write_margin(g); - restore_string(p, temp, savevar); - write_str(g, temp); - write_newline(g); - str_delete(temp); -} - -static void write_inc_cursor(struct generator * g, struct node * p) { - - write_margin(g); - write_string(g, p->mode == m_forward ? "cursor++;" : "cursor--;"); - write_newline(g); -} - -static void wsetlab_begin(struct generator * g, int n) { - - w(g, "~Mlab"); - write_int(g, n); - w(g, ": do {~+~N"); -} - -static void wsetlab_end(struct generator * g) { - - w(g, "~-~M} while (false);~N"); -} - -static void wgotol(struct generator * g, int n) { - - write_margin(g); - write_string(g, "break lab"); - write_int(g, n); - write_string(g, ";"); - write_newline(g); -} - -static void write_failure(struct generator * g) { - - if (str_len(g->failure_str) != 0) { - write_margin(g); - write_str(g, g->failure_str); - write_newline(g); - } - write_margin(g); - switch (g->failure_label) - { - case x_return: - write_string(g, "return false;"); - break; - default: - write_string(g, "break lab"); - write_int(g, g->failure_label); - write_string(g, ";"); - g->unreachable = true; - } - write_newline(g); -} - -static void write_failure_if(struct generator * g, char * s, struct node * p) { - - writef(g, "~Mif (", p); - writef(g, s, p); - writef(g, ")~N", p); - write_block_start(g); - write_failure(g); - write_block_end(g); - g->unreachable = false; -} - -/* if at limit fail */ -static void write_check_limit(struct generator * g, struct node * p) { - - if (p->mode == m_forward) { - write_failure_if(g, "cursor >= limit", p); - } else { - write_failure_if(g, "cursor <= limit_backward", p); - } -} - -/* Formatted write. 
*/ -static void writef(struct generator * g, const char * input, struct node * p) { - - int i = 0; - int l = strlen(input); - - while (i < l) { - int ch = input[i++]; - if (ch == '~') { - switch(input[i++]) { - default: write_char(g, input[i - 1]); continue; - case 'C': write_comment(g, p); continue; - case 'f': write_block_start(g); - write_failure(g); - g->unreachable = false; - write_block_end(g); - continue; - case 'M': write_margin(g); continue; - case 'N': write_newline(g); continue; - case '{': write_block_start(g); continue; - case '}': write_block_end(g); continue; - case 'S': write_string(g, g->S[input[i++] - '0']); continue; - case 'B': write_b(g, g->B[input[i++] - '0']); continue; - case 'I': write_int(g, g->I[input[i++] - '0']); continue; - case 'V': write_varref(g, g->V[input[i++] - '0']); continue; - case 'W': write_varname(g, g->V[input[i++] - '0']); continue; - case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; - case '+': g->margin++; continue; - case '-': g->margin--; continue; - case 'n': write_string(g, g->options->name); continue; - } - } else { - write_char(g, ch); - } - } -} - -static void w(struct generator * g, const char * s) { - writef(g, s, 0); -} - -static void generate_AE(struct generator * g, struct node * p) { - char * s; - switch (p->type) { - case c_name: - write_varref(g, p->name); break; - case c_number: - write_int(g, p->number); break; - case c_maxint: - write_string(g, "MAXINT"); break; - case c_minint: - write_string(g, "MININT"); break; - case c_neg: - write_string(g, "-"); generate_AE(g, p->right); break; - case c_multiply: - s = " * "; goto label0; - case c_plus: - s = " + "; goto label0; - case c_minus: - s = " - "; goto label0; - case c_divide: - s = " / "; - label0: - write_string(g, "("); generate_AE(g, p->left); - write_string(g, s); generate_AE(g, p->right); write_string(g, ")"); break; - case c_sizeof: - g->V[0] = p->name; - w(g, "(~V0.length())"); break; - case c_cursor: - w(g, "cursor"); break; 
- case c_limit: - w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; - case c_size: - w(g, "(current.length())"); break; - } -} - -/* K_needed() tests to see if we really need to keep c. Not true when the - the command does not touch the cursor. This and repeat_score() could be - elaborated almost indefinitely. -*/ - -static int K_needed(struct generator * g, struct node * p) { - - while (p != 0) { - switch (p->type) { - case c_dollar: - case c_leftslice: - case c_rightslice: - case c_mathassign: - case c_plusassign: - case c_minusassign: - case c_multiplyassign: - case c_divideassign: - case c_eq: - case c_ne: - case c_gr: - case c_ge: - case c_ls: - case c_le: - case c_sliceto: - case c_booltest: - case c_true: - case c_false: - case c_debug: - break; - - case c_call: - if (K_needed(g, p->name->definition)) return true; - break; - - case c_bra: - if (K_needed(g, p->left)) return true; - break; - - default: return true; - } - p = p->right; - } - return false; -} - -static int repeat_score(struct generator * g, struct node * p) { - - int score = 0; - while (p != 0) { - switch (p->type) { - case c_dollar: - case c_leftslice: - case c_rightslice: - case c_mathassign: - case c_plusassign: - case c_minusassign: - case c_multiplyassign: - case c_divideassign: - case c_eq: - case c_ne: - case c_gr: - case c_ge: - case c_ls: - case c_le: - case c_sliceto: /* case c_not: must not be included here! 
*/ - case c_debug: - break; - - case c_call: - score += repeat_score(g, p->name->definition); - break; - - case c_bra: - score += repeat_score(g, p->left); - break; - - case c_name: - case c_literalstring: - case c_next: - case c_grouping: - case c_non: - case c_hop: - score = score + 1; - break; - - default: - score = 2; - break; - } - p = p->right; - } - return score; -} - -/* tests if an expression requires cursor reinstatement in a repeat */ - -static int repeat_restore(struct generator * g, struct node * p) { - - return repeat_score(g, p) >= 2; -} - -static void generate_bra(struct generator * g, struct node * p) { - - write_comment(g, p); - p = p->left; - while (p != 0) { - generate(g, p); - p = p->right; - } -} - -static void generate_and(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - int keep_c = K_needed(g, p->left); - - write_comment(g, p); - - if (keep_c) write_savecursor(g, p, savevar); - - p = p->left; - while (p != 0) { - generate(g, p); - if (g->unreachable) break; - if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); - p = p->right; - } - str_delete(savevar); -} - -static void generate_or(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - int keep_c = K_needed(g, p->left); - - int a0 = g->failure_label; - struct str * a1 = str_copy(g->failure_str); - - int out_lab = new_label(g); - write_comment(g, p); - wsetlab_begin(g, out_lab); - - if (keep_c) write_savecursor(g, p, savevar); - - p = p->left; - str_clear(g->failure_str); - - if (p == 0) { - /* p should never be 0 after an or: there should be at least two - * sub nodes. 
*/ - fprintf(stderr, "Error: \"or\" node without children nodes."); - exit (1); - } - while (p->right != 0) { - g->failure_label = new_label(g); - wsetlab_begin(g, g->failure_label); - generate(g, p); - if (!g->unreachable) wgotol(g, out_lab); - wsetlab_end(g); - g->unreachable = false; - if (keep_c) write_restorecursor(g, p, savevar); - p = p->right; - } - - g->failure_label = a0; - str_delete(g->failure_str); - g->failure_str = a1; - - generate(g, p); - wsetlab_end(g); - str_delete(savevar); -} - -static void generate_backwards(struct generator * g, struct node * p) { - - write_comment(g, p); - writef(g,"~Mlimit_backward = cursor; cursor = limit;~N", p); - generate(g, p->left); - w(g, "~Mcursor = limit_backward;"); -} - - -static void generate_not(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - int keep_c = K_needed(g, p->left); - - int a0 = g->failure_label; - struct str * a1 = str_copy(g->failure_str); - - write_comment(g, p); - if (keep_c) { - write_block_start(g); - write_savecursor(g, p, savevar); - } - - g->failure_label = new_label(g); - str_clear(g->failure_str); - - wsetlab_begin(g, g->failure_label); - - generate(g, p->left); - - g->failure_label = a0; - str_delete(g->failure_str); - g->failure_str = a1; - - if (!g->unreachable) write_failure(g); - - wsetlab_end(g); - g->unreachable = false; - - if (keep_c) write_restorecursor(g, p, savevar); - if (keep_c) write_block_end(g); - str_delete(savevar); -} - - -static void generate_try(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - int keep_c = K_needed(g, p->left); - - write_comment(g, p); - if (keep_c) write_savecursor(g, p, savevar); - - g->failure_label = new_label(g); - if (keep_c) restore_string(p, g->failure_str, savevar); - - wsetlab_begin(g, g->failure_label); - generate(g, p->left); - wsetlab_end(g); - g->unreachable = false; - - str_delete(savevar); -} - -static void generate_set(struct generator * g, struct node * p) { 
- - write_comment(g, p); - g->V[0] = p->name; - writef(g, "~M~V0 = true;~N", p); -} - -static void generate_unset(struct generator * g, struct node * p) { - - write_comment(g, p); - g->V[0] = p->name; - writef(g, "~M~V0 = false;~N", p); -} - -static void generate_fail(struct generator * g, struct node * p) { - - write_comment(g, p); - generate(g, p->left); - if (!g->unreachable) write_failure(g); -} - -/* generate_test() also implements 'reverse' */ - -static void generate_test(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - int keep_c = K_needed(g, p->left); - - write_comment(g, p); - - if (keep_c) { - write_savecursor(g, p, savevar); - } - - generate(g, p->left); - - if (!g->unreachable) { - if (keep_c) { - write_restorecursor(g, p, savevar); - } - } - str_delete(savevar); -} - -static void generate_do(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - int keep_c = K_needed(g, p->left); - write_comment(g, p); - if (keep_c) write_savecursor(g, p, savevar); - - g->failure_label = new_label(g); - str_clear(g->failure_str); - - wsetlab_begin(g, g->failure_label); - generate(g, p->left); - wsetlab_end(g); - g->unreachable = false; - - if (keep_c) write_restorecursor(g, p, savevar); - str_delete(savevar); -} - -static void generate_GO(struct generator * g, struct node * p, int style) { - - int end_unreachable = false; - struct str * savevar = vars_newname(g); - int keep_c = style == 1 || repeat_restore(g, p->left); - - int a0 = g->failure_label; - struct str * a1 = str_copy(g->failure_str); - - int golab = new_label(g); - g->I[0] = golab; - write_comment(g, p); - w(g, "~Mgolab~I0: while(true)~N"); - w(g, "~{"); - - if (keep_c) write_savecursor(g, p, savevar); - - g->failure_label = new_label(g); - wsetlab_begin(g, g->failure_label); - generate(g, p->left); - - if (g->unreachable) { - /* Cannot break out of this loop: therefore the code after the - * end of the loop is unreachable.*/ - 
end_unreachable = true; - } else { - /* include for goto; omit for gopast */ - if (style == 1) write_restorecursor(g, p, savevar); - g->I[0] = golab; - w(g, "~Mbreak golab~I0;~N"); - } - g->unreachable = false; - wsetlab_end(g); - if (keep_c) write_restorecursor(g, p, savevar); - - g->failure_label = a0; - str_delete(g->failure_str); - g->failure_str = a1; - - write_check_limit(g, p); - write_inc_cursor(g, p); - write_block_end(g); - str_delete(savevar); - g->unreachable = end_unreachable; -} - -static void generate_loop(struct generator * g, struct node * p) { - - struct str * loopvar = vars_newname(g); - write_comment(g, p); - g->B[0] = str_data(loopvar); - write_declare(g, "int ~B0", p); - w(g, "~Mfor (~B0 = "); - generate_AE(g, p->AE); - g->B[0] = str_data(loopvar); - writef(g, "; ~B0 > 0; ~B0--)~N", p); - writef(g, "~{", p); - - generate(g, p->left); - - w(g, "~}"); - str_delete(loopvar); - g->unreachable = false; -} - -static void generate_repeat(struct generator * g, struct node * p, struct str * loopvar) { - - struct str * savevar = vars_newname(g); - int keep_c = repeat_restore(g, p->left); - int replab = new_label(g); - g->I[0] = replab; - write_comment(g, p); - writef(g, "~Mreplab~I0: while(true)~N~{", p); - - if (keep_c) write_savecursor(g, p, savevar); - - g->failure_label = new_label(g); - str_clear(g->failure_str); - wsetlab_begin(g, g->failure_label); - generate(g, p->left); - - if (!g->unreachable) { - if (loopvar != 0) { - g->B[0] = str_data(loopvar); - w(g, "~M~B0--;~N"); - } - - g->I[0] = replab; - w(g, "~Mcontinue replab~I0;~N"); - } - - wsetlab_end(g); - g->unreachable = false; - - if (keep_c) write_restorecursor(g, p, savevar); - - g->I[0] = replab; - w(g, "~Mbreak replab~I0;~N~}"); - str_delete(savevar); -} - -static void generate_atleast(struct generator * g, struct node * p) { - - struct str * loopvar = vars_newname(g); - write_comment(g, p); - w(g, "~{"); - g->B[0] = str_data(loopvar); - w(g, "~Mint ~B0 = "); - generate_AE(g, p->AE); - 
w(g, ";~N"); - { - int a0 = g->failure_label; - struct str * a1 = str_copy(g->failure_str); - - generate_repeat(g, p, loopvar); - - g->failure_label = a0; - str_delete(g->failure_str); - g->failure_str = a1; - } - g->B[0] = str_data(loopvar); - write_failure_if(g, "~B0 > 0", p); - w(g, "~}"); - str_delete(loopvar); -} - -static void generate_setmark(struct generator * g, struct node * p) { - - write_comment(g, p); - g->V[0] = p->name; - writef(g, "~M~V0 = cursor;~N", p); -} - -static void generate_tomark(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? ">" : "<"; - - w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); - write_block_start(g); - write_failure(g); - write_block_end(g); - g->unreachable = false; - w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); -} - -static void generate_atmark(struct generator * g, struct node * p) { - - write_comment(g, p); - w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); - write_block_start(g); - write_failure(g); - write_block_end(g); - g->unreachable = false; -} - - -static void generate_hop(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? "+" : "-"; - - w(g, "~{~Mint c = cursor ~S0 "); - generate_AE(g, p->AE); - w(g, ";~N"); - - g->S[0] = p->mode == m_forward ? "0" : "limit_backward"; - - write_failure_if(g, "~S0 > c || c > limit", p); - writef(g, "~Mcursor = c;~N", p); - writef(g, "~}", p); -} - -static void generate_delete(struct generator * g, struct node * p) { - - write_comment(g, p); - writef(g, "~Mslice_del();~N", p); -} - - -static void generate_next(struct generator * g, struct node * p) { - - write_comment(g, p); - write_check_limit(g, p); - write_inc_cursor(g, p); -} - -static void generate_tolimit(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? 
"limit" : "limit_backward"; - writef(g, "~Mcursor = ~S0;~N", p); -} - -static void generate_atlimit(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; - g->S[1] = p->mode == m_forward ? "<" : ">"; - write_failure_if(g, "cursor ~S1 ~S0", p); -} - -static void generate_leftslice(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? "bra" : "ket"; - writef(g, "~M~S0 = cursor;~N", p); -} - -static void generate_rightslice(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? "ket" : "bra"; - writef(g, "~M~S0 = cursor;~N", p); -} - -static void generate_assignto(struct generator * g, struct node * p) { - - write_comment(g, p); - g->V[0] = p->name; - writef(g, "~M~V0 = assign_to(~V0);~N", p); -} - -static void generate_sliceto(struct generator * g, struct node * p) { - - write_comment(g, p); - g->V[0] = p->name; - writef(g, "~M~V0 = slice_to(~V0);~N", p); -} - -static void generate_address(struct generator * g, struct node * p) { - - symbol * b = p->literalstring; - if (b != 0) { - write_literal_string(g, b); - } else { - write_varref(g, p->name); - } -} - -static void generate_insert(struct generator * g, struct node * p, int style) { - - int keep_c = style == c_attach; - write_comment(g, p); - if (p->mode == m_backward) keep_c = !keep_c; - if (keep_c) w(g, "~{~Mint c = cursor;~N"); - writef(g, "~Minsert(cursor, cursor, ", p); - generate_address(g, p); - writef(g, ");~N", p); - if (keep_c) w(g, "~Mcursor = c;~N~}"); -} - -static void generate_assignfrom(struct generator * g, struct node * p) { - - int keep_c = p->mode == m_forward; /* like 'attach' */ - - write_comment(g, p); - if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); - if (p->mode == m_forward) { - writef(g, "~Minsert(cursor, limit, ", p); - } else { - writef(g, "~Minsert(limit_backward, cursor, ", p); - } - generate_address(g, p); - 
writef(g, ");~N", p); - if (keep_c) w(g, "~Mcursor = c;~N~}"); -} - - -static void generate_slicefrom(struct generator * g, struct node * p) { - - write_comment(g, p); - w(g, "~Mslice_from("); - generate_address(g, p); - writef(g, ");~N", p); -} - -static void generate_setlimit(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - struct str * varname = vars_newname(g); - write_comment(g, p); - write_savecursor(g, p, savevar); - generate(g, p->left); - - if (!g->unreachable) { - g->B[0] = str_data(varname); - write_declare(g, "int ~B0", p); - if (p->mode == m_forward) { - w(g, "~M~B0 = limit - cursor;~N"); - w(g, "~Mlimit = cursor;~N"); - } else { - w(g, "~M~B0 = limit_backward;~N"); - w(g, "~Mlimit_backward = cursor;~N"); - } - write_restorecursor(g, p, savevar); - - if (p->mode == m_forward) { - str_assign(g->failure_str, "limit += "); - str_append(g->failure_str, varname); - str_append_ch(g->failure_str, ';'); - } else { - str_assign(g->failure_str, "limit_backward = "); - str_append(g->failure_str, varname); - str_append_ch(g->failure_str, ';'); - } - generate(g, p->aux); - - if (!g->unreachable) { - write_margin(g); - write_str(g, g->failure_str); - write_newline(g); - } - } - str_delete(varname); - str_delete(savevar); -} - -/* dollar sets snowball up to operate on a string variable as if it were the - * current string */ -static void generate_dollar(struct generator * g, struct node * p) { - - struct str * savevar = vars_newname(g); - write_comment(g, p); - g->V[0] = p->name; - - str_assign(g->failure_str, "copy_from("); - str_append(g->failure_str, savevar); - str_append_string(g->failure_str, ");"); - g->B[0] = str_data(savevar); - writef(g, "~{~M~n ~B0 = this;~N" - "~Mcurrent = new StringBuffer(~V0.toString());~N" - "~Mcursor = 0;~N" - "~Mlimit = (current.length());~N", p); - generate(g, p->left); - if (!g->unreachable) { - write_margin(g); - write_str(g, g->failure_str); - write_newline(g); - } - w(g, "~}"); - 
str_delete(savevar); -} - -static void generate_integer_assign(struct generator * g, struct node * p, char * s) { - - g->V[0] = p->name; - g->S[0] = s; - w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); -} - -static void generate_integer_test(struct generator * g, struct node * p, char * s) { - - g->V[0] = p->name; - g->S[0] = s; - w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); w(g, "))~N"); - write_block_start(g); - write_failure(g); - write_block_end(g); - g->unreachable = false; -} - -static void generate_call(struct generator * g, struct node * p) { - - write_comment(g, p); - g->V[0] = p->name; - write_failure_if(g, "!~V0()", p); -} - -static void generate_grouping(struct generator * g, struct node * p, int complement) { - - struct grouping * q = p->name->grouping; - g->S[0] = p->mode == m_forward ? "" : "_b"; - g->S[1] = complement ? "out" : "in"; - g->V[0] = p->name; - g->I[0] = q->smallest_ch; - g->I[1] = q->largest_ch; - if (q->no_gaps) - write_failure_if(g, "!(~S1_range~S0(~I0, ~I1))", p); - else - write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p); -} - -static void generate_namedstring(struct generator * g, struct node * p) { - - write_comment(g, p); - g->S[0] = p->mode == m_forward ? "" : "_b"; - g->V[0] = p->name; - write_failure_if(g, "!(eq_v~S0(~V0))", p); -} - -static void generate_literalstring(struct generator * g, struct node * p) { - - symbol * b = p->literalstring; - write_comment(g, p); - g->S[0] = p->mode == m_forward ? "" : "_b"; - g->I[0] = SIZE(b); - g->L[0] = b; - write_failure_if(g, "!(eq_s~S0(~I0, ~L0))", p); -} - -static void generate_define(struct generator * g, struct node * p) { - - struct name * q = p->name; - - struct str * saved_output = g->outbuf; - struct str * saved_declarations = g->declarations; - - g->S[0] = q->type == t_routine ? 
"private" : "public"; - g->V[0] = q; - w(g, "~+~+~N~M~S0 boolean ~V0() {~+~N"); - - g->outbuf = str_new(); - g->declarations = str_new(); - - g->next_label = 0; - g->var_number = 0; - - if (p->amongvar_needed) write_declare(g, "int among_var", p); - str_clear(g->failure_str); - g->failure_label = x_return; - g->unreachable = false; - generate(g, p->left); - if (!g->unreachable) w(g, "~Mreturn true;~N"); - w(g, "~}~-~-"); - - str_append(saved_output, g->declarations); - str_append(saved_output, g->outbuf); - str_delete(g->declarations); - str_delete(g->outbuf); - g->declarations = saved_declarations; - g->outbuf = saved_output; -} - -static void generate_substring(struct generator * g, struct node * p) { - - struct among * x = p->among; - - write_comment(g, p); - - g->S[0] = p->mode == m_forward ? "" : "_b"; - g->I[0] = x->number; - g->I[1] = x->literalstring_count; - - if (x->command_count == 0 && x->starter == 0) { - write_failure_if(g, "find_among~S0(a_~I0, ~I1) == 0", p); - } else { - writef(g, "~Mamong_var = find_among~S0(a_~I0, ~I1);~N", p); - write_failure_if(g, "among_var == 0", p); - } -} - -static void generate_among(struct generator * g, struct node * p) { - - struct among * x = p->among; - int case_number = 1; - - if (x->substring == 0) generate_substring(g, p); - if (x->command_count == 0 && x->starter == 0) return; - - if (x->starter != 0) generate(g, x->starter); - - p = p->left; - if (p != 0 && p->type != c_literalstring) p = p->right; - w(g, "~Mswitch(among_var) {~N~+"); - w(g, "~Mcase 0:~N~+"); - write_failure(g); - g->unreachable = false; - w(g, "~-"); - - while (p != 0) { - if (p->type == c_bra && p->left != 0) { - g->I[0] = case_number++; - w(g, "~Mcase ~I0:~N~+"); - generate(g, p); - if (!g->unreachable) w(g, "~Mbreak;~N"); - w(g, "~-"); - g->unreachable = false; - } - p = p->right; - } - write_block_end(g); -} - -static void generate_booltest(struct generator * g, struct node * p) { - - write_comment(g, p); - g->V[0] = p->name; - 
write_failure_if(g, "!(~V0)", p); -} - -static void generate_false(struct generator * g, struct node * p) { - - write_comment(g, p); - write_failure(g); -} - -static void generate_debug(struct generator * g, struct node * p) { - - write_comment(g, p); - g->I[0] = g->debug_count++; - g->I[1] = p->line_number; - writef(g, "~Mdebug(~I0, ~I1);~N", p); -} - -static void generate(struct generator * g, struct node * p) { - - int a0; - struct str * a1; - - if (g->unreachable) return; - - a0 = g->failure_label; - a1 = str_copy(g->failure_str); - - switch (p->type) - { - case c_define: generate_define(g, p); break; - case c_bra: generate_bra(g, p); break; - case c_and: generate_and(g, p); break; - case c_or: generate_or(g, p); break; - case c_backwards: generate_backwards(g, p); break; - case c_not: generate_not(g, p); break; - case c_set: generate_set(g, p); break; - case c_unset: generate_unset(g, p); break; - case c_try: generate_try(g, p); break; - case c_fail: generate_fail(g, p); break; - case c_reverse: - case c_test: generate_test(g, p); break; - case c_do: generate_do(g, p); break; - case c_goto: generate_GO(g, p, 1); break; - case c_gopast: generate_GO(g, p, 0); break; - case c_repeat: generate_repeat(g, p, 0); break; - case c_loop: generate_loop(g, p); break; - case c_atleast: generate_atleast(g, p); break; - case c_setmark: generate_setmark(g, p); break; - case c_tomark: generate_tomark(g, p); break; - case c_atmark: generate_atmark(g, p); break; - case c_hop: generate_hop(g, p); break; - case c_delete: generate_delete(g, p); break; - case c_next: generate_next(g, p); break; - case c_tolimit: generate_tolimit(g, p); break; - case c_atlimit: generate_atlimit(g, p); break; - case c_leftslice: generate_leftslice(g, p); break; - case c_rightslice: generate_rightslice(g, p); break; - case c_assignto: generate_assignto(g, p); break; - case c_sliceto: generate_sliceto(g, p); break; - case c_assign: generate_assignfrom(g, p); break; - case c_insert: - case c_attach: 
generate_insert(g, p, p->type); break; - case c_slicefrom: generate_slicefrom(g, p); break; - case c_setlimit: generate_setlimit(g, p); break; - case c_dollar: generate_dollar(g, p); break; - case c_mathassign: generate_integer_assign(g, p, "="); break; - case c_plusassign: generate_integer_assign(g, p, "+="); break; - case c_minusassign: generate_integer_assign(g, p, "-="); break; - case c_multiplyassign:generate_integer_assign(g, p, "*="); break; - case c_divideassign: generate_integer_assign(g, p, "/="); break; - case c_eq: generate_integer_test(g, p, "=="); break; - case c_ne: generate_integer_test(g, p, "!="); break; - case c_gr: generate_integer_test(g, p, ">"); break; - case c_ge: generate_integer_test(g, p, ">="); break; - case c_ls: generate_integer_test(g, p, "<"); break; - case c_le: generate_integer_test(g, p, "<="); break; - case c_call: generate_call(g, p); break; - case c_grouping: generate_grouping(g, p, false); break; - case c_non: generate_grouping(g, p, true); break; - case c_name: generate_namedstring(g, p); break; - case c_literalstring: generate_literalstring(g, p); break; - case c_among: generate_among(g, p); break; - case c_substring: generate_substring(g, p); break; - case c_booltest: generate_booltest(g, p); break; - case c_false: generate_false(g, p); break; - case c_true: break; - case c_debug: generate_debug(g, p); break; - default: fprintf(stderr, "%d encountered\n", p->type); - exit(1); - } - - g->failure_label = a0; - str_delete(g->failure_str); - g->failure_str = a1; -} - -static void generate_start_comment(struct generator * g) { - - w(g, "// This file was generated automatically by the Snowball to Java compiler~N"); - w(g, "~N"); -} - -static void generate_class_begin(struct generator * g) { - - w(g, "package " ); - w(g, g->options->package); - w(g, ";~N~N" ); - - w(g, "import "); - w(g, g->options->among_class ); - w(g, ";~N" - "~N" - " /**~N" - " * This class was automatically generated by a Snowball to Java compiler ~N" - " * 
It implements the stemming algorithm defined by a snowball script.~N" - " */~N" - "~N" - "public class ~n extends "); - - w(g, g->options->parent_class_name); - w(g, " {~N" - "~N" - "private static final long serialVersionUID = 1L;~N" - "~N" - "~+~+~Mprivate final static ~n methodObject = new ~n ();~N" - "~N"); -} - -static void generate_class_end(struct generator * g) { - - w(g, "~N}"); - w(g, "~N~N"); -} - -static void generate_equals(struct generator * g) { - - w(g, "~N" - "~Mpublic boolean equals( Object o ) {~N" - "~+~Mreturn o instanceof "); - w(g, g->options->name); - w(g, ";~N~-~M}~N" - "~N" - "~Mpublic int hashCode() {~N" - "~+~Mreturn "); - w(g, g->options->name); - w(g, ".class.getName().hashCode();~N" - "~-~M}~N"); - w(g, "~N~N"); -} - -static void generate_among_table(struct generator * g, struct among * x) { - - struct amongvec * v = x->b; - - g->I[0] = x->number; - g->I[1] = x->literalstring_count; - - w(g, "~+~+~Mprivate final static Among a_~I0[] = {~N~+"); - { - int i; - for (i = 0; i < x->literalstring_count; i++) { - g->I[0] = i; - g->I[1] = v->i; - g->I[2] = v->result; - g->L[0] = v->b; - g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; - - w(g, "~Mnew Among ( ~L0, ~I1, ~I2, \""); - if (v->function != 0) { - write_varname(g, v->function); - } - w(g, "\", methodObject )~S0~N"); - v++; - } - } - w(g, "~-~M};~-~-~N~N"); -} - -static void generate_amongs(struct generator * g) { - - struct among * x = g->analyser->amongs; - while (x != 0) { - generate_among_table(g, x); - x = x->next; - } -} - -static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } - -static int bit_is_set(symbol * b, int i) { return b[i/8] & 1 << i%8; } - -static void generate_grouping_table(struct generator * g, struct grouping * q) { - - int range = q->largest_ch - q->smallest_ch + 1; - int size = (range + 7)/ 8; /* assume 8 bits per symbol */ - symbol * b = q->b; - symbol * map = create_b(size); - int i; - for (i = 0; i < size; i++) map[i] = 0; - - /* Using unicode would require revision here */ - - for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); - - q->no_gaps = true; - for (i = 0; i < range; i++) unless (bit_is_set(map, i)) q->no_gaps = false; - - unless (q->no_gaps) { - g->V[0] = q->name; - - w(g, "~+~+~Mprivate static final char ~V0[] = {"); - for (i = 0; i < size; i++) { - write_int(g, map[i]); - if (i < size - 1) w(g, ", "); - } - w(g, " };~N~-~-~N"); - } - lose_b(map); -} - -static void generate_groupings(struct generator * g) { - struct grouping * q = g->analyser->groupings; - until (q == 0) { - generate_grouping_table(g, q); - q = q->next; - } -} - -static void generate_members(struct generator * g) { - - struct name * q = g->analyser->names; - until (q == 0) { - g->V[0] = q; - switch (q->type) { - case t_string: - w(g, " private "); - w(g, g->options->string_class ); - w(g, " ~W0 = new "); - w(g, g->options->string_class); - w(g, "();~N"); - break; - case t_integer: - w(g, " private int ~W0;~N"); - break; - case t_boolean: - w(g, " private boolean ~W0;~N"); - break; - } - q = q->next; - } - w(g, "~N"); -} - -static void generate_copyfrom(struct generator * g) { - - struct name * q; - 
w(g, "~+~+~Mprivate void copy_from(~n other) {~+~N"); - for (q = g->analyser->names; q != 0; q = q->next) { - g->V[0] = q; - switch (q->type) { - case t_string: - case t_integer: - case t_boolean: - w(g, "~M~W0 = other.~W0;~N"); - break; - } - } - w(g, "~Msuper.copy_from(other);~N"); - w(g, "~-~M}~-~-~N"); -} - -static void generate_methods(struct generator * g) { - - struct node * p = g->analyser->program; - while (p != 0) { - generate(g, p); - g->unreachable = false; - p = p->right; - } -} - -extern void generate_program_java(struct generator * g) { - - g->outbuf = str_new(); - g->failure_str = str_new(); - - generate_start_comment(g); - generate_class_begin(g); - - generate_amongs(g); - generate_groupings(g); - - generate_members(g); - generate_copyfrom(g); - generate_methods(g); - generate_equals(g); - - generate_class_end(g); - - output_str(g->options->output_java, g->outbuf); - str_delete(g->failure_str); - str_delete(g->outbuf); -} - -extern struct generator * create_generator_java(struct analyser * a, struct options * o) { - - NEW(generator, g); - g->analyser = a; - g->options = o; - g->margin = 0; - g->debug_count = 0; - g->unreachable = false; - return g; -} - -extern void close_generator_java(struct generator * g) { - - FREE(g); -} - diff --git a/contrib/snowball/compiler/header.h b/contrib/snowball/compiler/header.h index 9baf1d917..dadbee36c 100644 --- a/contrib/snowball/compiler/header.h +++ b/contrib/snowball/compiler/header.h @@ -1,49 +1,56 @@ +#include <stdio.h> + +#define SNOWBALL_VERSION "2.0.0" typedef unsigned char byte; typedef unsigned short symbol; #define true 1 #define false 0 -#define repeat while(true) -#define unless(C) if(!(C)) -#define until(C) while(!(C)) #define MALLOC check_malloc #define FREE check_free #define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) -#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * n) +#define NEWVEC(type, p, n) struct type * p = (struct 
type *) MALLOC(sizeof(struct type) * (n)) -#define STARTSIZE 10 #define SIZE(p) ((int *)(p))[-1] #define CAPACITY(p) ((int *)(p))[-2] extern symbol * create_b(int n); -extern void report_b(FILE * out, symbol * p); +extern void report_b(FILE * out, const symbol * p); extern void lose_b(symbol * p); extern symbol * increase_capacity(symbol * p, int n); -extern symbol * move_to_b(symbol * p, int n, symbol * q); -extern symbol * add_to_b(symbol * p, int n, symbol * q); -extern symbol * copy_b(symbol * p); -extern char * b_to_s(symbol * p); +extern symbol * move_to_b(symbol * p, int n, const symbol * q); +extern symbol * add_to_b(symbol * p, int n, const symbol * q); +extern symbol * copy_b(const symbol * p); +extern char * b_to_s(const symbol * p); extern symbol * add_s_to_b(symbol * p, const char * s); +#define MOVE_TO_B(B, LIT) \ + move_to_b(B, sizeof(LIT) / sizeof(LIT[0]), LIT) + struct str; /* defined in space.c */ extern struct str * str_new(void); extern void str_delete(struct str * str); -extern void str_append(struct str * str, struct str * add); +extern void str_append(struct str * str, const struct str * add); extern void str_append_ch(struct str * str, char add); -extern void str_append_b(struct str * str, symbol * q); +extern void str_append_b(struct str * str, const symbol * q); +extern void str_append_b_tail(struct str * str, const symbol * q, int skip); extern void str_append_string(struct str * str, const char * s); extern void str_append_int(struct str * str, int i); extern void str_clear(struct str * str); -extern void str_assign(struct str * str, char * s); -extern struct str * str_copy(struct str * old); -extern symbol * str_data(struct str * str); -extern int str_len(struct str * str); +extern void str_assign(struct str * str, const char * s); +extern struct str * str_copy(const struct str * old); +extern symbol * str_data(const struct str * str); +extern int str_len(const struct str * str); +extern int str_back(const struct str *str); extern int 
get_utf8(const symbol * p, int * slot); extern int put_utf8(int ch, symbol * p); +extern void output_str(FILE * outfile, struct str * str); + +typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc; struct m_pair { @@ -60,6 +67,7 @@ struct input { symbol * p; int c; char * file; + int file_needs_freeing; int line_number; }; @@ -71,6 +79,28 @@ struct include { }; +enum token_codes { + +#include "syswords2.h" + + c_mathassign, + c_name, + c_number, + c_literalstring, + c_neg, + c_call, + c_grouping, + c_booltest, + + NUM_TOKEN_CODES +}; + +enum uplus_modes { + UPLUS_NONE, + UPLUS_DEFINED, + UPLUS_UNICODE +}; + /* struct input must be a prefix of struct tokeniser. */ struct tokeniser { @@ -78,6 +108,7 @@ struct tokeniser { symbol * p; int c; char * file; + int file_needs_freeing; int line_number; symbol * b; symbol * b2; @@ -90,34 +121,28 @@ struct tokeniser { int token; int previous_token; byte token_held; - byte widechars; - byte utf8; + enc encoding; int omission; struct include * includes; + /* Mode in which U+ has been used: + * UPLUS_NONE - not used yet + * UPLUS_DEFINED - stringdef U+xxxx .... 
+ * UPLUS_UNICODE - {U+xxxx} used with implicit meaning + */ + int uplusmode; + + char token_disabled[NUM_TOKEN_CODES]; }; -extern symbol * get_input(symbol * p, char ** p_file); +extern symbol * get_input(const char * filename); extern struct tokeniser * create_tokeniser(symbol * b, char * file); extern int read_token(struct tokeniser * t); extern const char * name_of_token(int code); +extern void disable_token(struct tokeniser * t, int code); extern void close_tokeniser(struct tokeniser * t); -enum token_codes { - -#include "syswords2.h" - - c_mathassign, - c_name, - c_number, - c_literalstring, - c_neg, - c_call, - c_grouping, - c_booltest -}; - extern int space_count; extern void * check_malloc(int n); extern void check_free(void * p); @@ -134,7 +159,13 @@ struct name { int count; /* 0, 1, 2 for each type */ struct grouping * grouping; /* for grouping names */ byte referenced; - byte used; + byte used_in_among; /* Function used in among? */ + byte value_used; /* (For variables) is its value ever used? */ + byte initialised; /* (For variables) is it ever initialised? */ + byte used_in_definition; /* (grouping) used in grouping definition? */ + struct node * used; /* First use, or NULL if not used */ + struct name * local_to; /* Local to one routine/external */ + int declaration_line_number;/* Line number of declaration */ }; @@ -149,9 +180,10 @@ struct amongvec { symbol * b; /* the string giving the case */ int size; /* - and its size */ - struct node * p; /* the corresponding command */ + struct node * action; /* the corresponding action */ int i; /* the amongvec index of the longest substring of b */ int result; /* the numeric result for the case */ + int line_number; /* for diagnostics and stable sorting */ struct name * function; }; @@ -163,19 +195,22 @@ struct among { int number; /* amongs are numbered 0, 1, 2 ... 
*/ int literalstring_count; /* in this among */ int command_count; /* in this among */ + int nocommand_count; /* number of "no command" entries in this among */ + int function_count; /* in this among */ + int amongvar_needed; /* do we need to set among_var? */ struct node * starter; /* i.e. among( (starter) 'string' ... ) */ struct node * substring; /* i.e. substring ... among ( ... ) */ + struct node ** commands; /* array with command_count entries */ }; struct grouping { struct grouping * next; - int number; /* groupings are numbered 0, 1, 2 ... */ symbol * b; /* the characters of this group */ int largest_ch; /* character with max code */ int smallest_ch; /* character with min code */ - byte no_gaps; /* not used in generator.c after 11/5/05 */ struct name * name; /* so g->name->grouping == g */ + int line_number; }; struct node { @@ -234,7 +269,8 @@ struct analyser { struct grouping * groupings; struct grouping * groupings_end; struct node * substring; /* pending 'substring' in current routine definition */ - byte utf8; + enc encoding; + byte int_limits_used; /* are maxint or minint used? */ }; enum analyser_modes { @@ -259,16 +295,23 @@ struct generator { struct str * outbuf; /* temporary str to store output */ struct str * declarations; /* str storing variable declarations */ int next_label; +#ifndef DISABLE_PYTHON + int max_label; +#endif int margin; - const char * failure_string; /* String to output in case of a failure. */ -#ifndef DISABLE_JAVA - struct str * failure_str; /* This is used by the java generator instead of failure_string */ + /* if > 0, keep_count to restore in case of a failure; + * if < 0, the negated keep_count for the limit to restore in case of + * failure. 
*/ + int failure_keep_count; +#if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP) + struct str * failure_str; /* This is used by some generators instead of failure_keep_count */ #endif int label_used; /* Keep track of whether the failure label is used. */ int failure_label; int debug_count; + int copy_from_count; /* count of calls to copy_from() */ const char * S[10]; /* strings */ symbol * B[10]; /* blocks */ @@ -277,48 +320,92 @@ struct generator { symbol * L[5]; /* literals, used in formatted write */ int line_count; /* counts number of lines output */ - int line_labelled; /* in ANSI C, will need extra ';' if it is a block end */ + int line_labelled; /* in ISO C, will need extra ';' if it is a block end */ int literalstring_count; int keep_count; /* used to number keep/restore pairs to avoid compiler warnings about shadowed variables */ }; +/* Special values for failure_label in struct generator. */ +enum special_labels { + x_return = -1 +}; + struct options { /* for the command line: */ - char * output_file; - char * name; - FILE * output_c; + const char * output_file; + const char * name; + FILE * output_src; FILE * output_h; -#ifndef DISABLE_JAVA - FILE * output_java; -#endif byte syntax_tree; - byte widechars; - enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang; - char * externals_prefix; - char * variables_prefix; - char * runtime_path; - char * parent_class_name; - char * package; - char * string_class; - char * among_class; + byte comments; + enc encoding; + enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang; + const char * externals_prefix; + const char * variables_prefix; + const char * runtime_path; + const char * parent_class_name; + const char * package; + const char * go_snowball_runtime; + const char * string_class; + const char * among_class; struct include * includes; struct include * includes_end; - byte utf8; 
}; -/* Generator for C code. */ -extern struct generator * create_generator_c(struct analyser * a, struct options * o); -extern void close_generator_c(struct generator * g); +/* Generator functions common to several backends. */ + +extern struct generator * create_generator(struct analyser * a, struct options * o); +extern void close_generator(struct generator * g); + +extern void write_char(struct generator * g, int ch); +extern void write_newline(struct generator * g); +extern void write_string(struct generator * g, const char * s); +extern void write_int(struct generator * g, int i); +extern void write_b(struct generator * g, symbol * b); +extern void write_str(struct generator * g, struct str * str); + +extern void write_comment_content(struct generator * g, struct node * p); +extern void write_generated_comment_content(struct generator * g); +extern void write_start_comment(struct generator * g, + const char * comment_start, + const char * comment_end); + +extern int K_needed(struct generator * g, struct node * p); +extern int repeat_restore(struct generator * g, struct node * p); +/* Generator for C code. */ extern void generate_program_c(struct generator * g); #ifndef DISABLE_JAVA /* Generator for Java code. */ -extern struct generator * create_generator_java(struct analyser * a, struct options * o); -extern void close_generator_java(struct generator * g); - extern void generate_program_java(struct generator * g); #endif + +#ifndef DISABLE_CSHARP +/* Generator for C# code. */ +extern void generate_program_csharp(struct generator * g); +#endif + +#ifndef DISABLE_PASCAL +extern void generate_program_pascal(struct generator * g); +#endif + +#ifndef DISABLE_PYTHON +/* Generator for Python code. 
*/ +extern void generate_program_python(struct generator * g); +#endif + +#ifndef DISABLE_JS +extern void generate_program_js(struct generator * g); +#endif + +#ifndef DISABLE_RUST +extern void generate_program_rust(struct generator * g); +#endif + +#ifndef DISABLE_GO +extern void generate_program_go(struct generator * g); +#endif diff --git a/contrib/snowball/compiler/space.c b/contrib/snowball/compiler/space.c index 310024e76..5b058763a 100644 --- a/contrib/snowball/compiler/space.c +++ b/contrib/snowball/compiler/space.c @@ -57,9 +57,19 @@ extern symbol * create_b(int n) { return p; } -extern void report_b(FILE * out, symbol * p) { +extern void report_b(FILE * out, const symbol * p) { int i; - for (i = 0; i < SIZE(p); i++) fprintf(out, "%c", p[i]); + for (i = 0; i < SIZE(p); i++) { + if (p[i] > 255) { + printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); + exit(1); + } + putc(p[i], out); + } +} + +extern void output_str(FILE * outfile, struct str * str) { + report_b(outfile, str_data(str)); } extern void lose_b(symbol * p) { @@ -74,19 +84,19 @@ extern symbol * increase_capacity(symbol * p, int n) { lose_b(p); return q; } -extern symbol * move_to_b(symbol * p, int n, symbol * q) { +extern symbol * move_to_b(symbol * p, int n, const symbol * q) { int x = n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p; } -extern symbol * add_to_b(symbol * p, int n, symbol * q) { +extern symbol * add_to_b(symbol * p, int n, const symbol * q) { int x = SIZE(p) + n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; } -extern symbol * copy_b(symbol * p) { +extern symbol * copy_b(const symbol * p) { int n = SIZE(p); symbol * q = create_b(n); move_to_b(q, n, p); @@ -97,7 +107,7 @@ int space_count = 0; extern void * check_malloc(int n) { space_count++; - return calloc(1, n); + return malloc(n); } extern void 
check_free(void * p) { @@ -107,18 +117,18 @@ extern void check_free(void * p) { /* To convert a block to a zero terminated string: */ -extern char * b_to_s(symbol * p) { +extern char * b_to_s(const symbol * p) { int n = SIZE(p); - char * s = (char *)calloc(1, n + 1); + char * s = (char *)malloc(n + 1); { int i; for (i = 0; i < n; i++) { - if (p[i] > 255) { - printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); - exit(1); - } - s[i] = (char)p[i]; - } + if (p[i] > 255) { + printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); + exit(1); + } + s[i] = (char)p[i]; + } } s[n] = 0; return s; @@ -153,9 +163,9 @@ struct str { }; /* Create a new string. */ -extern struct str * str_new() { +extern struct str * str_new(void) { - struct str * output = (struct str *) calloc(1, sizeof(struct str)); + struct str * output = (struct str *) malloc(sizeof(struct str)); output->data = create_b(0); return output; } @@ -168,7 +178,7 @@ extern void str_delete(struct str * str) { } /* Append a str to this str. */ -extern void str_append(struct str * str, struct str * add) { +extern void str_append(struct str * str, const struct str * add) { symbol * q = add->data; str->data = add_to_b(str->data, SIZE(q), q); @@ -183,12 +193,19 @@ extern void str_append_ch(struct str * str, char add) { } /* Append a low level block to a str. */ -extern void str_append_b(struct str * str, symbol * q) { +extern void str_append_b(struct str * str, const symbol * q) { str->data = add_to_b(str->data, SIZE(q), q); } -/* Append a (char *, null teminated) string to a str. */ +/* Append the tail of a low level block to a str. */ +extern void str_append_b_tail(struct str * str, const symbol * q, int skip) { + if (skip < 0 || skip >= SIZE(q)) return; + + str->data = add_to_b(str->data, SIZE(q) - skip, q + skip); +} + +/* Append a (char *, null terminated) string to a str. 
*/ extern void str_append_string(struct str * str, const char * s) { str->data = add_s_to_b(str->data, s); @@ -209,14 +226,14 @@ extern void str_clear(struct str * str) { } /* Set a string */ -extern void str_assign(struct str * str, char * s) { +extern void str_assign(struct str * str, const char * s) { str_clear(str); str_append_string(str, s); } /* Copy a string. */ -extern struct str * str_copy(struct str * old) { +extern struct str * str_copy(const struct str * old) { struct str * newstr = str_new(); str_append(newstr, old); @@ -224,17 +241,25 @@ extern struct str * str_copy(struct str * old) { } /* Get the data stored in this str. */ -extern symbol * str_data(struct str * str) { +extern symbol * str_data(const struct str * str) { return str->data; } /* Get the length of the str. */ -extern int str_len(struct str * str) { +extern int str_len(const struct str * str) { return SIZE(str->data); } +/* Get the last character of the str. + * + * Or -1 if the string is empty. + */ +extern int str_back(const struct str *str) { + return SIZE(str->data) ? 
str->data[SIZE(str->data) - 1] : -1; +} + extern int get_utf8(const symbol * p, int * slot) { int b0, b1; b0 = *p++; @@ -260,4 +285,3 @@ extern int put_utf8(int ch, symbol * p) { p[1] = ((ch >> 6) & 0x3F) | 0x80; p[2] = (ch & 0x3F) | 0x80; return 3; } - diff --git a/contrib/snowball/compiler/syswords.h b/contrib/snowball/compiler/syswords.h index 917dd5751..c401d3e4f 100644 --- a/contrib/snowball/compiler/syswords.h +++ b/contrib/snowball/compiler/syswords.h @@ -1,5 +1,5 @@ -static const struct system_word vocab[80+1] = { - { 0, (const byte *)"", 80+1}, +static const struct system_word vocab[82+1] = { + { 0, (const byte *)"", 82+1}, { 1, (const byte *)"$", c_dollar }, { 1, (const byte *)"(", c_bra }, @@ -36,6 +36,7 @@ static const struct system_word vocab[80+1] = { { 3, (const byte *)"get", c_get }, { 3, (const byte *)"hex", c_hex }, { 3, (const byte *)"hop", c_hop }, + { 3, (const byte *)"len", c_len }, { 3, (const byte *)"non", c_non }, { 3, (const byte *)"not", c_not }, { 3, (const byte *)"set", c_set }, @@ -49,6 +50,7 @@ static const struct system_word vocab[80+1] = { { 4, (const byte *)"true", c_true }, { 5, (const byte *)"among", c_among }, { 5, (const byte *)"false", c_false }, + { 5, (const byte *)"lenof", c_lenof }, { 5, (const byte *)"limit", c_limit }, { 5, (const byte *)"unset", c_unset }, { 6, (const byte *)"atmark", c_atmark }, diff --git a/contrib/snowball/compiler/syswords2.h b/contrib/snowball/compiler/syswords2.h index eb8a91241..e853e32dc 100644 --- a/contrib/snowball/compiler/syswords2.h +++ b/contrib/snowball/compiler/syswords2.h @@ -4,8 +4,8 @@ c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get, c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert, - c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls, - c_maxint, c_minint, c_minus, c_minusassign, c_multiply, + c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop, + c_ls, c_maxint, 
c_minint, c_minus, c_minusassign, c_multiply, c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus, c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, diff --git a/contrib/snowball/compiler/tokeniser.c b/contrib/snowball/compiler/tokeniser.c index 3dae5f744..e6c63861a 100644 --- a/contrib/snowball/compiler/tokeniser.c +++ b/contrib/snowball/compiler/tokeniser.c @@ -16,57 +16,57 @@ struct system_word { #include "syswords.h" -static int smaller(int a, int b) { return a < b ? a : b; } +#define INITIAL_INPUT_BUFFER_SIZE 8192 + +static int hex_to_num(int ch); -extern symbol * get_input(symbol * p, char ** p_file) { +static int smaller(int a, int b) { return a < b ? a : b; } - char * s = b_to_s(p); +extern symbol * get_input(const char * filename) { + FILE * input = fopen(filename, "r"); + if (input == 0) { return 0; } { - FILE * input = fopen(s, "r"); - if (input == 0) { free(s); return 0; } - *p_file = s; - { - symbol * u = create_b(STARTSIZE); - int size = 0; - repeat - { int ch = getc(input); - if (ch == EOF) break; - if (size >= CAPACITY(u)) u = increase_capacity(u, size/2); - u[size++] = ch; - } - fclose(input); - SIZE(u) = size; return u; + symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE); + int size = 0; + while (true) { + int ch = getc(input); + if (ch == EOF) break; + if (size >= CAPACITY(u)) u = increase_capacity(u, size); + u[size++] = ch; } + fclose(input); + SIZE(u) = size; + return u; } } -static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) { +static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) { if (t->error_count == 20) { fprintf(stderr, "... 
etc\n"); exit(1); } fprintf(stderr, "%s:%d: ", t->file, t->line_number); - unless (s1 == 0) fprintf(stderr, "%s", s1); - unless (p == 0) { + if (s1) fprintf(stderr, "%s", s1); + if (p) { int i; for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); } - unless (s2 == 0) fprintf(stderr, "%s", s2); + if (s2) fprintf(stderr, "%s", s2); fprintf(stderr, "\n"); t->error_count++; } -static void error1(struct tokeniser * t, char * s) { +static void error1(struct tokeniser * t, const char * s) { error(t, s, 0,0, 0); } -static void error2(struct tokeniser * t, char * s) { +static void error2(struct tokeniser * t, const char * s) { error(t, "unexpected end of text after ", 0,0, s); } static int compare_words(int m, symbol * p, int n, const byte * q) { - unless (m == n) return m - n; + if (m != n) return m - n; { int i; for (i = 0; i < n; i++) { int diff = p[i] - q[i]; - unless (diff == 0) return diff; + if (diff) return diff; } } return 0; @@ -74,14 +74,13 @@ static int compare_words(int m, symbol * p, int n, const byte * q) { static int find_word(int n, symbol * p) { int i = 0; int j = vocab->code; - repeat { + do { int k = i + (j - i)/2; const struct system_word * w = vocab + k; int diff = compare_words(n, p, w->s_size, w->s); if (diff == 0) return w->code; if (diff < 0) j = k; else i = k; - if (j - i == 1) break; - } + } while (j - i != 1); return -1; } @@ -91,7 +90,7 @@ static int get_number(int n, symbol * p) { return x; } -static int eq_s(struct tokeniser * t, char * s) { +static int eq_s(struct tokeniser * t, const char * s) { int l = strlen(s); if (SIZE(t->p) - t->c < l) return false; { @@ -103,64 +102,141 @@ static int eq_s(struct tokeniser * t, char * s) { static int white_space(struct tokeniser * t, int ch) { switch (ch) { - case '\n': t->line_number++; + case '\n': + t->line_number++; + /* fall through */ case '\r': case '\t': - case ' ': return true; + case ' ': + return true; } return false; } static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) { - 
struct m_pair * q = t->m_pairs; - repeat { - if (q == 0) return 0; - { - symbol * name = q->name; - if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; - } - q = q->next; + struct m_pair * q; + for (q = t->m_pairs; q; q = q->next) { + symbol * name = q->name; + if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; } + return 0; } static int read_literal_string(struct tokeniser * t, int c) { symbol * p = t->p; int ch; SIZE(t->b) = 0; - repeat { + while (true) { if (c >= SIZE(p)) { error2(t, "'"); return c; } ch = p[c]; if (ch == '\n') { error1(t, "string not terminated"); return c; } c++; if (ch == t->m_start) { + /* Inside insert characters. */ int c0 = c; int newlines = false; /* no newlines as yet */ int black_found = false; /* no printing chars as yet */ - repeat { + while (true) { if (c >= SIZE(p)) { error2(t, "'"); return c; } ch = p[c]; c++; if (ch == t->m_end) break; - unless (white_space(t, ch)) black_found = true; + if (!white_space(t, ch)) black_found = true; if (ch == '\n') newlines = true; if (newlines && black_found) { error1(t, "string not terminated"); return c; } } - unless (newlines) { + if (!newlines) { int n = c - c0 - 1; /* macro size */ int firstch = p[c0]; symbol * q = find_in_m(t, n, p + c0); if (q == 0) { if (n == 1 && (firstch == '\'' || firstch == t->m_start)) t->b = add_to_b(t->b, 1, p + c0); - else + else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') { + int codepoint = 0; + int x; + if (t->uplusmode == UPLUS_DEFINED) { + /* See if found with xxxx upper-cased. 
*/ + symbol * uc = create_b(n); + int i; + for (i = 0; i != n; ++i) { + uc[i] = toupper(p[c0 + i]); + } + q = find_in_m(t, n, uc); + lose_b(uc); + if (q != 0) { + t->b = add_to_b(t->b, SIZE(q), q); + continue; + } + error1(t, "Some U+xxxx stringdefs seen but not this one"); + } else { + t->uplusmode = UPLUS_UNICODE; + } + for (x = c0 + 2; x != c - 1; ++x) { + int hex = hex_to_num(p[x]); + if (hex < 0) { + error1(t, "Bad hex digit following U+"); + break; + } + codepoint = (codepoint << 4) | hex; + } + if (t->encoding == ENC_UTF8) { + if (codepoint < 0 || codepoint > 0x01ffff) { + error1(t, "character values exceed 0x01ffff"); + } + /* Ensure there's enough space for a max length + * UTF-8 sequence. */ + if (CAPACITY(t->b) < SIZE(t->b) + 3) { + t->b = increase_capacity(t->b, 3); + } + SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b)); + } else { + symbol sym; + if (t->encoding == ENC_SINGLEBYTE) { + /* Only ISO-8859-1 is handled this way - for + * other single-byte character sets you need + * stringdef all the U+xxxx codes you use + * like - e.g.: + * + * stringdef U+0171 hex 'FB' + */ + if (codepoint < 0 || codepoint > 0xff) { + error1(t, "character values exceed 256"); + } + } else { + if (codepoint < 0 || codepoint > 0xffff) { + error1(t, "character values exceed 64K"); + } + } + sym = codepoint; + t->b = add_to_b(t->b, 1, &sym); + } + } else error(t, "string macro '", n, p + c0, "' undeclared"); } else t->b = add_to_b(t->b, SIZE(q), q); } } else { if (ch == '\'') return c; + if (ch < 0 || ch >= 0x80) { + if (t->encoding != ENC_WIDECHARS) { + /* We don't really want people using non-ASCII literal + * strings, but historically it's worked for single-byte + * and UTF-8 if the source encoding matches what the + * generated stemmer works in and it seems unfair to just + * suddenly make this a hard error.` + */ + fprintf(stderr, + "%s:%d: warning: Non-ASCII literal strings aren't " + "portable - use stringdef instead\n", + t->file, t->line_number); + } else { + 
error1(t, "Non-ASCII literal strings aren't " + "portable - use stringdef instead"); + } + } t->b = add_to_b(t->b, 1, p + c - 1); } } @@ -171,7 +247,7 @@ static int next_token(struct tokeniser * t) { int c = t->c; int ch; int code = -1; - repeat { + while (true) { if (c >= SIZE(p)) { t->c = c; return -1; } ch = p[c]; if (white_space(t, ch)) { c++; continue; } @@ -179,7 +255,7 @@ static int next_token(struct tokeniser * t) { int c0 = c; while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; code = find_word(c - c0, p + c0); - if (code < 0) { + if (code < 0 || t->token_disabled[code]) { t->b = move_to_b(t->b, c - c0, p + c0); code = c_name; } @@ -218,10 +294,9 @@ static int next_char(struct tokeniser * t) { } static int next_real_char(struct tokeniser * t) { - repeat { + while (true) { int ch = next_char(t); - if (white_space(t, ch)) continue; - return ch; + if (!white_space(t, ch)) return ch; } } @@ -230,7 +305,7 @@ static void read_chars(struct tokeniser * t) { if (ch < 0) { error2(t, "stringdef"); return; } { int c0 = t->c-1; - repeat { + while (true) { ch = next_char(t); if (white_space(t, ch) || ch < 0) break; } @@ -246,19 +321,20 @@ static int decimal_to_num(int ch) { static int hex_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; + if ('A' <= ch && ch <= 'F') return ch - 'A' + 10; return -1; } static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { int c = 0; int d = 0; - repeat { + while (true) { while (c < SIZE(p) && p[c] == ' ') c++; if (c == SIZE(p)) break; { int number = 0; - repeat { + while (c != SIZE(p)) { int ch = p[c]; - if (c == SIZE(p) || ch == ' ') break; + if (ch == ' ') break; if (base == 10) { ch = decimal_to_num(ch); if (ch < 0) { @@ -266,7 +342,7 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { return; } } else { - ch = hex_to_num(tolower(ch)); + ch = hex_to_num(ch); if (ch < 0) { error1(t, "hex string contains 
non-hex characters"); return; @@ -275,18 +351,18 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { number = base * number + ch; c++; } - if (t->widechars || t->utf8) { - unless (0 <= number && number <= 0xffff) { - error1(t, "character values exceed 64K"); + if (t->encoding == ENC_SINGLEBYTE) { + if (number < 0 || number > 0xff) { + error1(t, "character values exceed 256"); return; } } else { - unless (0 <= number && number <= 0xff) { - error1(t, "character values exceed 256"); + if (number < 0 || number > 0xffff) { + error1(t, "character values exceed 64K"); return; } } - if (t->utf8) + if (t->encoding == ENC_UTF8) d += put_utf8(number, p + d); else p[d++] = number; @@ -300,104 +376,118 @@ extern int read_token(struct tokeniser * t) { int held = t->token_held; t->token_held = false; if (held) return t->token; - repeat { + while (true) { int code = next_token(t); switch (code) { case c_comment1: /* slash-slash comment */ - while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; - continue; + while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; + continue; case c_comment2: /* slash-star comment */ - repeat { - if (t->c >= SIZE(p)) { - error1(t, "/* comment not terminated"); - t->token = -1; - return -1; - } - if (p[t->c] == '\n') t->line_number++; - if (eq_s(t, "*/")) break; - t->c++; - } - continue; - case c_stringescapes: - { - int ch1 = next_real_char(t); - int ch2 = next_real_char(t); - if (ch2 < 0) - { error2(t, "stringescapes"); continue; } - if (ch1 == '\'') - { error1(t, "first stringescape cannot be '"); continue; } - t->m_start = ch1; - t->m_end = ch2; - } - continue; - case c_stringdef: - { - int base = 0; - read_chars(t); - code = read_token(t); - if (code == c_hex) { base = 16; code = read_token(t); } else - if (code == c_decimal) { base = 10; code = read_token(t); } - unless (code == c_literalstring) - { error1(t, "string omitted after stringdef"); continue; } - if (base > 0) convert_numeric_string(t, t->b, base); - { 
NEW(m_pair, q); - q->next = t->m_pairs; - q->name = copy_b(t->b2); - q->value = copy_b(t->b); - t->m_pairs = q; - } - } - continue; + while (true) { + if (t->c >= SIZE(p)) { + error1(t, "/* comment not terminated"); + t->token = -1; + return -1; + } + if (p[t->c] == '\n') t->line_number++; + if (eq_s(t, "*/")) break; + t->c++; + } + continue; + case c_stringescapes: { + int ch1 = next_real_char(t); + int ch2 = next_real_char(t); + if (ch2 < 0) { + error2(t, "stringescapes"); + continue; + } + if (ch1 == '\'') { + error1(t, "first stringescape cannot be '"); + continue; + } + t->m_start = ch1; + t->m_end = ch2; + continue; + } + case c_stringdef: { + int base = 0; + read_chars(t); + code = read_token(t); + if (code == c_hex) { base = 16; code = read_token(t); } else + if (code == c_decimal) { base = 10; code = read_token(t); } + if (code != c_literalstring) { + error1(t, "string omitted after stringdef"); + continue; + } + if (base > 0) convert_numeric_string(t, t->b, base); + { NEW(m_pair, q); + q->next = t->m_pairs; + q->name = copy_b(t->b2); + q->value = copy_b(t->b); + t->m_pairs = q; + if (t->uplusmode != UPLUS_DEFINED && + (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) { + if (t->uplusmode == UPLUS_UNICODE) { + error1(t, "U+xxxx already used with implicit meaning"); + } else { + t->uplusmode = UPLUS_DEFINED; + } + } + } + continue; + } case c_get: - code = read_token(t); - unless (code == c_literalstring) { - error1(t, "string omitted after get"); continue; - } - t->get_depth++; - if (t->get_depth > 10) { - fprintf(stderr, "get directives go 10 deep. 
Looping?\n"); - exit(1); - } - { - char * file; - NEW(input, q); - symbol * u = get_input(t->b, &file); - if (u == 0) { - struct include * r = t->includes; - until (r == 0) { - symbol * b = copy_b(r->b); - b = add_to_b(b, SIZE(t->b), t->b); - u = get_input(b, &file); - lose_b(b); - unless (u == 0) break; - r = r->next; - } - } - if (u == 0) { - error(t, "Can't get '", SIZE(t->b), t->b, "'"); - exit(1); - } - memmove(q, t, sizeof(struct input)); - t->next = q; - t->p = u; - t->c = 0; - t->file = file; - t->line_number = 1; - } - p = t->p; - continue; + code = read_token(t); + if (code != c_literalstring) { + error1(t, "string omitted after get"); continue; + } + t->get_depth++; + if (t->get_depth > 10) { + fprintf(stderr, "get directives go 10 deep. Looping?\n"); + exit(1); + } + { + NEW(input, q); + char * file = b_to_s(t->b); + symbol * u = get_input(file); + if (u == 0) { + struct include * r; + for (r = t->includes; r; r = r->next) { + symbol * b = copy_b(r->b); + b = add_to_b(b, SIZE(t->b), t->b); + free(file); + file = b_to_s(b); + u = get_input(file); + lose_b(b); + if (u != 0) break; + } + } + if (u == 0) { + error(t, "Can't get '", SIZE(t->b), t->b, "'"); + exit(1); + } + memmove(q, t, sizeof(struct input)); + t->next = q; + t->p = u; + t->c = 0; + t->file = file; + t->file_needs_freeing = true; + t->line_number = 1; + } + p = t->p; + continue; case -1: - unless (t->next == 0) { - lose_b(p); - { - struct input * q = t->next; - memmove(t, q, sizeof(struct input)); p = t->p; - FREE(q); - } - t->get_depth--; - continue; - } - /* drop through */ + if (t->next) { + lose_b(p); + { + struct input * q = t->next; + memmove(t, q, sizeof(struct input)); p = t->p; + FREE(q); + } + t->get_depth--; + continue; + } + /* fall through */ default: t->previous_token = t->token; t->token = code; @@ -425,12 +515,17 @@ extern const char * name_of_token(int code) { } } +extern void disable_token(struct tokeniser * t, int code) { + t->token_disabled[code] = 1; +} + extern struct 
tokeniser * create_tokeniser(symbol * p, char * file) { NEW(tokeniser, t); t->next = 0; t->p = p; t->c = 0; t->file = file; + t->file_needs_freeing = false; t->line_number = 1; t->b = create_b(0); t->b2 = create_b(0); @@ -441,6 +536,8 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) { t->token_held = false; t->token = -2; t->previous_token = -2; + t->uplusmode = UPLUS_NONE; + memset(t->token_disabled, 0, sizeof(t->token_disabled)); return t; } @@ -449,7 +546,7 @@ extern void close_tokeniser(struct tokeniser * t) { lose_b(t->b2); { struct m_pair * q = t->m_pairs; - until (q == 0) { + while (q) { struct m_pair * q_next = q->next; lose_b(q->name); lose_b(q->value); @@ -459,12 +556,12 @@ extern void close_tokeniser(struct tokeniser * t) { } { struct input * q = t->next; - until (q == 0) { + while (q) { struct input * q_next = q->next; FREE(q); q = q_next; } } - free(t->file); + if (t->file_needs_freeing) free(t->file); FREE(t); } diff --git a/contrib/snowball/examples/stemwords.c b/contrib/snowball/examples/stemwords.c deleted file mode 100644 index 995bf40dc..000000000 --- a/contrib/snowball/examples/stemwords.c +++ /dev/null @@ -1,209 +0,0 @@ -/* This is a simple program which uses libstemmer to provide a command - * line interface for stemming using any of the algorithms provided. 
- */ - -#include <stdio.h> -#include <stdlib.h> /* for malloc, free */ -#include <string.h> /* for memmove */ -#include <ctype.h> /* for isupper, tolower */ - -#include "libstemmer.h" - -const char * progname; -static int pretty = 1; - -static void -stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) -{ -#define INC 10 - int lim = INC; - sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); - - while(1) { - int ch = getc(f_in); - if (ch == EOF) { - free(b); return; - } - { - int i = 0; - int inlen = 0; - while(1) { - if (ch == '\n' || ch == EOF) break; - if (i == lim) { - sb_symbol * newb; - newb = (sb_symbol *) - realloc(b, (lim + INC) * sizeof(sb_symbol)); - if (newb == 0) goto error; - b = newb; - lim = lim + INC; - } - /* Update count of utf-8 characters. */ - if (ch < 0x80 || ch > 0xBF) inlen += 1; - /* force lower case: */ - if (isupper(ch)) ch = tolower(ch); - - b[i] = ch; - i++; - ch = getc(f_in); - } - - { - const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); - if (stemmed == NULL) - { - fprintf(stderr, "Out of memory"); - exit(1); - } - else - { - if (pretty == 1) { - fwrite(b, i, 1, f_out); - fputs(" -> ", f_out); - } else if (pretty == 2) { - fwrite(b, i, 1, f_out); - if (sb_stemmer_length(stemmer) > 0) { - int j; - if (inlen < 30) { - for (j = 30 - inlen; j > 0; j--) - fputs(" ", f_out); - } else { - fputs("\n", f_out); - for (j = 30; j > 0; j--) - fputs(" ", f_out); - } - } - } - - fputs((const char *)stemmed, f_out); - putc('\n', f_out); - } - } - } - } -error: - if (b != 0) free(b); - return; -} - -/** Display the command line syntax, and then exit. - * @param n The value to exit with. - */ -static void -usage(int n) -{ - printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n" - "\n" - "The input file consists of a list of words to be stemmed, one per\n" - "line. 
Words should be in lower case, but (for English) A-Z letters\n" - "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" - "used.\n" - "\n" - "If -c is given, the argument is the character encoding of the input\n" - "and output files. If it is omitted, the UTF-8 encoding is used.\n" - "\n" - "If -p is given the output file consists of each word of the input\n" - "file followed by \"->\" followed by its stemmed equivalent.\n" - "If -p2 is given the output file is a two column layout containing\n" - "the input words in the first column and the stemmed equivalents in\n" - "the second column.\n" - "Otherwise, the output file consists of the stemmed words, one per\n" - "line.\n" - "\n" - "-h displays this help\n", - progname); - exit(n); -} - -int -main(int argc, char * argv[]) -{ - char * in = 0; - char * out = 0; - FILE * f_in; - FILE * f_out; - struct sb_stemmer * stemmer; - - char * language = "english"; - char * charenc = NULL; - - char * s; - int i = 1; - pretty = 0; - - progname = argv[0]; - - while(i < argc) { - s = argv[i++]; - if (s[0] == '-') { - if (strcmp(s, "-o") == 0) { - if (i >= argc) { - fprintf(stderr, "%s requires an argument\n", s); - exit(1); - } - out = argv[i++]; - } else if (strcmp(s, "-i") == 0) { - if (i >= argc) { - fprintf(stderr, "%s requires an argument\n", s); - exit(1); - } - in = argv[i++]; - } else if (strcmp(s, "-l") == 0) { - if (i >= argc) { - fprintf(stderr, "%s requires an argument\n", s); - exit(1); - } - language = argv[i++]; - } else if (strcmp(s, "-c") == 0) { - if (i >= argc) { - fprintf(stderr, "%s requires an argument\n", s); - exit(1); - } - charenc = argv[i++]; - } else if (strcmp(s, "-p2") == 0) { - pretty = 2; - } else if (strcmp(s, "-p") == 0) { - pretty = 1; - } else if (strcmp(s, "-h") == 0) { - usage(0); - } else { - fprintf(stderr, "option %s unknown\n", s); - usage(1); - } - } else { - fprintf(stderr, "unexpected parameter %s\n", s); - usage(1); - } - } - - /* prepare the files */ - f_in = (in == 0) 
? stdin : fopen(in, "r"); - if (f_in == 0) { - fprintf(stderr, "file %s not found\n", in); - exit(1); - } - f_out = (out == 0) ? stdout : fopen(out, "w"); - if (f_out == 0) { - fprintf(stderr, "file %s cannot be opened\n", out); - exit(1); - } - - /* do the stemming process: */ - stemmer = sb_stemmer_new(language, charenc); - if (stemmer == 0) { - if (charenc == NULL) { - fprintf(stderr, "language `%s' not available for stemming\n", language); - exit(1); - } else { - fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); - exit(1); - } - } - stem_file(stemmer, f_in, f_out); - sb_stemmer_delete(stemmer); - - if (in != 0) (void) fclose(f_in); - if (out != 0) (void) fclose(f_out); - - return 0; -} - diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h index 9d86b8581..98051e108 100644 --- a/contrib/snowball/include/libstemmer.h +++ b/contrib/snowball/include/libstemmer.h @@ -32,9 +32,9 @@ const char ** sb_stemmer_list(void); * * @param charenc The character encoding. NULL may be passed as * this value, in which case UTF-8 encoding will be assumed. Otherwise, - * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1), - * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that - * case is significant in this parameter. + * the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), + * "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is + * significant in this parameter. * * @return NULL if the specified algorithm is not recognised, or the * algorithm is not available for the requested encoding. Otherwise, @@ -66,7 +66,7 @@ void sb_stemmer_delete(struct sb_stemmer * stemmer); * If an out-of-memory error occurs, this will return NULL. */ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, - const sb_symbol * word, int size); + const sb_symbol * word, int size); /** Get the length of the result of the last stemmed word. 
* This should not be called before sb_stemmer_stem() has been called. @@ -76,4 +76,3 @@ int sb_stemmer_length(struct sb_stemmer * stemmer); #ifdef __cplusplus } #endif - diff --git a/contrib/snowball/java/org/tartarus/snowball/Among.java b/contrib/snowball/java/org/tartarus/snowball/Among.java deleted file mode 100644 index 5ed37b503..000000000 --- a/contrib/snowball/java/org/tartarus/snowball/Among.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.tartarus.snowball; - -import java.lang.reflect.Method; - -public class Among { - public Among (String s, int substring_i, int result, - String methodname, SnowballProgram methodobject) { - this.s_size = s.length(); - this.s = s.toCharArray(); - this.substring_i = substring_i; - this.result = result; - this.methodobject = methodobject; - if (methodname.length() == 0) { - this.method = null; - } else { - try { - this.method = methodobject.getClass(). - getDeclaredMethod(methodname, new Class[0]); - } catch (NoSuchMethodException e) { - throw new RuntimeException(e); - } - } - } - - public final int s_size; /* search string */ - public final char[] s; /* search string */ - public final int substring_i; /* index to longest matching substring */ - public final int result; /* result of the lookup */ - public final Method method; /* method to use if substring matches */ - public final SnowballProgram methodobject; /* object to invoke method on */ -}; diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java deleted file mode 100644 index 52d6baa78..000000000 --- a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java +++ /dev/null @@ -1,432 +0,0 @@ - -package org.tartarus.snowball; -import java.lang.reflect.InvocationTargetException; - -public class SnowballProgram { - protected SnowballProgram() - { - current = new StringBuffer(); - setCurrent(""); - } - - /** - * Set the current string. 
- */ - public void setCurrent(String value) - { - current.replace(0, current.length(), value); - cursor = 0; - limit = current.length(); - limit_backward = 0; - bra = cursor; - ket = limit; - } - - /** - * Get the current string. - */ - public String getCurrent() - { - String result = current.toString(); - // Make a new StringBuffer. If we reuse the old one, and a user of - // the library keeps a reference to the buffer returned (for example, - // by converting it to a String in a way which doesn't force a copy), - // the buffer size will not decrease, and we will risk wasting a large - // amount of memory. - // Thanks to Wolfram Esser for spotting this problem. - current = new StringBuffer(); - return result; - } - - // current string - protected StringBuffer current; - - protected int cursor; - protected int limit; - protected int limit_backward; - protected int bra; - protected int ket; - - protected void copy_from(SnowballProgram other) - { - current = other.current; - cursor = other.cursor; - limit = other.limit; - limit_backward = other.limit_backward; - bra = other.bra; - ket = other.ket; - } - - protected boolean in_grouping(char [] s, int min, int max) - { - if (cursor >= limit) return false; - char ch = current.charAt(cursor); - if (ch > max || ch < min) return false; - ch -= min; - if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; - cursor++; - return true; - } - - protected boolean in_grouping_b(char [] s, int min, int max) - { - if (cursor <= limit_backward) return false; - char ch = current.charAt(cursor - 1); - if (ch > max || ch < min) return false; - ch -= min; - if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; - cursor--; - return true; - } - - protected boolean out_grouping(char [] s, int min, int max) - { - if (cursor >= limit) return false; - char ch = current.charAt(cursor); - if (ch > max || ch < min) { - cursor++; - return true; - } - ch -= min; - if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { - cursor ++; - return true; - 
} - return false; - } - - protected boolean out_grouping_b(char [] s, int min, int max) - { - if (cursor <= limit_backward) return false; - char ch = current.charAt(cursor - 1); - if (ch > max || ch < min) { - cursor--; - return true; - } - ch -= min; - if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { - cursor--; - return true; - } - return false; - } - - protected boolean in_range(int min, int max) - { - if (cursor >= limit) return false; - char ch = current.charAt(cursor); - if (ch > max || ch < min) return false; - cursor++; - return true; - } - - protected boolean in_range_b(int min, int max) - { - if (cursor <= limit_backward) return false; - char ch = current.charAt(cursor - 1); - if (ch > max || ch < min) return false; - cursor--; - return true; - } - - protected boolean out_range(int min, int max) - { - if (cursor >= limit) return false; - char ch = current.charAt(cursor); - if (!(ch > max || ch < min)) return false; - cursor++; - return true; - } - - protected boolean out_range_b(int min, int max) - { - if (cursor <= limit_backward) return false; - char ch = current.charAt(cursor - 1); - if(!(ch > max || ch < min)) return false; - cursor--; - return true; - } - - protected boolean eq_s(int s_size, String s) - { - if (limit - cursor < s_size) return false; - int i; - for (i = 0; i != s_size; i++) { - if (current.charAt(cursor + i) != s.charAt(i)) return false; - } - cursor += s_size; - return true; - } - - protected boolean eq_s_b(int s_size, String s) - { - if (cursor - limit_backward < s_size) return false; - int i; - for (i = 0; i != s_size; i++) { - if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false; - } - cursor -= s_size; - return true; - } - - protected boolean eq_v(CharSequence s) - { - return eq_s(s.length(), s.toString()); - } - - protected boolean eq_v_b(CharSequence s) - { return eq_s_b(s.length(), s.toString()); - } - - protected int find_among(Among v[], int v_size) - { - int i = 0; - int j = v_size; - - int c = cursor; - int l 
= limit; - - int common_i = 0; - int common_j = 0; - - boolean first_key_inspected = false; - - while(true) { - int k = i + ((j - i) >> 1); - int diff = 0; - int common = common_i < common_j ? common_i : common_j; // smaller - Among w = v[k]; - int i2; - for (i2 = common; i2 < w.s_size; i2++) { - if (c + common == l) { - diff = -1; - break; - } - diff = current.charAt(c + common) - w.s[i2]; - if (diff != 0) break; - common++; - } - if (diff < 0) { - j = k; - common_j = common; - } else { - i = k; - common_i = common; - } - if (j - i <= 1) { - if (i > 0) break; // v->s has been inspected - if (j == i) break; // only one item in v - - // - but now we need to go round once more to get - // v->s inspected. This looks messy, but is actually - // the optimal approach. - - if (first_key_inspected) break; - first_key_inspected = true; - } - } - while(true) { - Among w = v[i]; - if (common_i >= w.s_size) { - cursor = c + w.s_size; - if (w.method == null) return w.result; - boolean res; - try { - Object resobj = w.method.invoke(w.methodobject, - new Object[0]); - res = resobj.toString().equals("true"); - } catch (InvocationTargetException e) { - res = false; - // FIXME - debug message - } catch (IllegalAccessException e) { - res = false; - // FIXME - debug message - } - cursor = c + w.s_size; - if (res) return w.result; - } - i = w.substring_i; - if (i < 0) return 0; - } - } - - // find_among_b is for backwards processing. Same comments apply - protected int find_among_b(Among v[], int v_size) - { - int i = 0; - int j = v_size; - - int c = cursor; - int lb = limit_backward; - - int common_i = 0; - int common_j = 0; - - boolean first_key_inspected = false; - - while(true) { - int k = i + ((j - i) >> 1); - int diff = 0; - int common = common_i < common_j ? 
common_i : common_j; - Among w = v[k]; - int i2; - for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) { - if (c - common == lb) { - diff = -1; - break; - } - diff = current.charAt(c - 1 - common) - w.s[i2]; - if (diff != 0) break; - common++; - } - if (diff < 0) { - j = k; - common_j = common; - } else { - i = k; - common_i = common; - } - if (j - i <= 1) { - if (i > 0) break; - if (j == i) break; - if (first_key_inspected) break; - first_key_inspected = true; - } - } - while(true) { - Among w = v[i]; - if (common_i >= w.s_size) { - cursor = c - w.s_size; - if (w.method == null) return w.result; - - boolean res; - try { - Object resobj = w.method.invoke(w.methodobject, - new Object[0]); - res = resobj.toString().equals("true"); - } catch (InvocationTargetException e) { - res = false; - // FIXME - debug message - } catch (IllegalAccessException e) { - res = false; - // FIXME - debug message - } - cursor = c - w.s_size; - if (res) return w.result; - } - i = w.substring_i; - if (i < 0) return 0; - } - } - - /* to replace chars between c_bra and c_ket in current by the - * chars in s. - */ - protected int replace_s(int c_bra, int c_ket, String s) - { - int adjustment = s.length() - (c_ket - c_bra); - current.replace(c_bra, c_ket, s); - limit += adjustment; - if (cursor >= c_ket) cursor += adjustment; - else if (cursor > c_bra) cursor = c_bra; - return adjustment; - } - - protected void slice_check() - { - if (bra < 0 || - bra > ket || - ket > limit || - limit > current.length()) // this line could be removed - { - System.err.println("faulty slice operation"); - // FIXME: report error somehow. 
- /* - fprintf(stderr, "faulty slice operation:\n"); - debug(z, -1, 0); - exit(1); - */ - } - } - - protected void slice_from(String s) - { - slice_check(); - replace_s(bra, ket, s); - } - - protected void slice_from(CharSequence s) - { - slice_from(s.toString()); - } - - protected void slice_del() - { - slice_from(""); - } - - protected void insert(int c_bra, int c_ket, String s) - { - int adjustment = replace_s(c_bra, c_ket, s); - if (c_bra <= bra) bra += adjustment; - if (c_bra <= ket) ket += adjustment; - } - - protected void insert(int c_bra, int c_ket, CharSequence s) - { - insert(c_bra, c_ket, s.toString()); - } - - /* Copy the slice into the supplied StringBuffer */ - protected StringBuffer slice_to(StringBuffer s) - { - slice_check(); - int len = ket - bra; - s.replace(0, s.length(), current.substring(bra, ket)); - return s; - } - - /* Copy the slice into the supplied StringBuilder */ - protected StringBuilder slice_to(StringBuilder s) - { - slice_check(); - int len = ket - bra; - s.replace(0, s.length(), current.substring(bra, ket)); - return s; - } - - protected StringBuffer assign_to(StringBuffer s) - { - s.replace(0, s.length(), current.substring(0, limit)); - return s; - } - - protected StringBuilder assign_to(StringBuilder s) - { - s.replace(0, s.length(), current.substring(0, limit)); - return s; - } - -/* -extern void debug(struct SN_env * z, int number, int line_count) -{ int i; - int limit = SIZE(z->p); - //if (number >= 0) printf("%3d (line %4d): '", number, line_count); - if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); - for (i = 0; i <= limit; i++) - { if (z->lb == i) printf("{"); - if (z->bra == i) printf("["); - if (z->c == i) printf("|"); - if (z->ket == i) printf("]"); - if (z->l == i) printf("}"); - if (i < limit) - { int ch = z->p[i]; - if (ch == 0) ch = '#'; - printf("%c", ch); - } - } - printf("'\n"); -} -*/ - -}; diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java 
b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java deleted file mode 100644 index 960bd55f6..000000000 --- a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java +++ /dev/null @@ -1,7 +0,0 @@ - -package org.tartarus.snowball; -import java.lang.reflect.InvocationTargetException; - -public abstract class SnowballStemmer extends SnowballProgram { - public abstract boolean stem(); -}; diff --git a/contrib/snowball/java/org/tartarus/snowball/TestApp.java b/contrib/snowball/java/org/tartarus/snowball/TestApp.java deleted file mode 100644 index 38803f673..000000000 --- a/contrib/snowball/java/org/tartarus/snowball/TestApp.java +++ /dev/null @@ -1,77 +0,0 @@ - -package org.tartarus.snowball; - -import java.lang.reflect.Method; -import java.io.Reader; -import java.io.Writer; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.OutputStream; -import java.io.FileOutputStream; - -public class TestApp { - private static void usage() - { - System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]"); - } - - public static void main(String [] args) throws Throwable { - if (args.length < 2) { - usage(); - return; - } - - Class stemClass = Class.forName("org.tartarus.snowball.ext." 
+ - args[0] + "Stemmer"); - SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance(); - - Reader reader; - reader = new InputStreamReader(new FileInputStream(args[1])); - reader = new BufferedReader(reader); - - StringBuffer input = new StringBuffer(); - - OutputStream outstream; - - if (args.length > 2) { - if (args.length >= 4 && args[2].equals("-o")) { - outstream = new FileOutputStream(args[3]); - } else { - usage(); - return; - } - } else { - outstream = System.out; - } - Writer output = new OutputStreamWriter(outstream); - output = new BufferedWriter(output); - - int repeat = 1; - if (args.length > 4) { - repeat = Integer.parseInt(args[4]); - } - - Object [] emptyArgs = new Object[0]; - int character; - while ((character = reader.read()) != -1) { - char ch = (char) character; - if (Character.isWhitespace((char) ch)) { - if (input.length() > 0) { - stemmer.setCurrent(input.toString()); - for (int i = repeat; i != 0; i--) { - stemmer.stem(); - } - output.write(stemmer.getCurrent()); - output.write('\n'); - input.delete(0, input.length()); - } - } else { - input.append(Character.toLowerCase(ch)); - } - } - output.flush(); - } -} diff --git a/contrib/snowball/libstemmer/libstemmer_c.in b/contrib/snowball/libstemmer/libstemmer_c.in index 4de579874..2aa918d61 100644 --- a/contrib/snowball/libstemmer/libstemmer_c.in +++ b/contrib/snowball/libstemmer/libstemmer_c.in @@ -22,10 +22,10 @@ sb_stemmer_list(void) static stemmer_encoding_t sb_getenc(const char * charenc) { - struct stemmer_encoding * encoding; + const struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { - if (strcmp(encoding->name, charenc) == 0) break; + if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; @@ -35,14 +35,14 @@ extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; - struct 
stemmer_modules * module; + const struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { - if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; + if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; @@ -67,9 +67,10 @@ void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; - if (stemmer->close == 0) return; - stemmer->close(stemmer->env); - stemmer->close = 0; + if (stemmer->close) { + stemmer->close(stemmer->env); + stemmer->close = 0; + } free(stemmer); } diff --git a/contrib/snowball/libstemmer/mkmodules.pl b/contrib/snowball/libstemmer/mkmodules.pl index ff8c19e7c..dd6678759 100755 --- a/contrib/snowball/libstemmer/mkmodules.pl +++ b/contrib/snowball/libstemmer/mkmodules.pl @@ -1,10 +1,12 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl use strict; +use 5.006; +use warnings; my $progname = $0; if (scalar @ARGV < 4 || scalar @ARGV > 5) { - print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<extn>]\n"; + print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n"; exit 1; } @@ -12,9 +14,11 @@ my $outname = shift(@ARGV); my $c_src_dir = shift(@ARGV); my $descfile = shift(@ARGV); my $srclistfile = shift(@ARGV); +my $enc_only; my $extn = ''; if (@ARGV) { - $extn = '_'.shift(@ARGV); + $enc_only = shift(@ARGV); + $extn = '_'.$enc_only; } my %aliases = (); @@ -27,6 +31,14 @@ sub addalgenc($$) { my $alg = shift(); my $enc = shift(); + if (defined $enc_only) { + my $norm_enc = lc $enc; + $norm_enc =~ s/_//g; + if ($norm_enc ne $enc_only) { + return; + } + } + if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; @@ -42,7 +54,7 @@ sub readinput() { open DESCFILE, $descfile; my $line; - while($line = <DESCFILE>) + 
while ($line = <DESCFILE>) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; @@ -123,7 +135,7 @@ struct stemmer_encoding { const char * name; stemmer_encoding_t enc; }; -static struct stemmer_encoding encodings[] = { +static const struct stemmer_encoding encodings[] = { EOS for $enc (sort keys %encs) { print OUT " {\"${enc}\", ENC_${enc}},\n"; @@ -139,7 +151,7 @@ struct stemmer_modules { void (*close)(struct SN_env *); int (*stem)(struct SN_env *); }; -static struct stemmer_modules modules[] = { +static const struct stemmer_modules modules[] = { EOS for $lang (sort keys %aliases) { @@ -162,7 +174,6 @@ static const char * algorithm_names[] = { EOS for $lang (@algorithms) { - my $l = $aliases{$lang}; print OUT " \"$lang\", \n"; } diff --git a/contrib/snowball/libstemmer/modules.txt b/contrib/snowball/libstemmer/modules.txt index d237c1d8d..f6dcc7e92 100644 --- a/contrib/snowball/libstemmer/modules.txt +++ b/contrib/snowball/libstemmer/modules.txt @@ -9,27 +9,35 @@ # List all the main algorithms for each language, in UTF-8, and also with # the most commonly used encoding. 
-danish UTF_8,ISO_8859_1 danish,da,dan -dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld -english UTF_8,ISO_8859_1 english,en,eng -finnish UTF_8,ISO_8859_1 finnish,fi,fin -french UTF_8,ISO_8859_1 french,fr,fre,fra -german UTF_8,ISO_8859_1 german,de,ger,deu -hungarian UTF_8,ISO_8859_2 hungarian,hu,hun -italian UTF_8,ISO_8859_1 italian,it,ita -norwegian UTF_8,ISO_8859_1 norwegian,no,nor -portuguese UTF_8,ISO_8859_1 portuguese,pt,por -romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron -russian UTF_8,KOI8_R russian,ru,rus -spanish UTF_8,ISO_8859_1 spanish,es,esl,spa -swedish UTF_8,ISO_8859_1 swedish,sv,swe +arabic UTF_8 arabic,ar,ara +danish UTF_8 danish,da,dan +dutch UTF_8 dutch,nl,dut,nld +english UTF_8 english,en,eng +finnish UTF_8 finnish,fi,fin +french UTF_8 french,fr,fre,fra +german UTF_8 german,de,ger,deu +greek UTF_8 greek,el,gre,ell +hindi UTF_8 hindi,hi,hin +hungarian UTF_8 hungarian,hu,hun +indonesian UTF_8 indonesian,id,ind +italian UTF_8 italian,it,ita +lithuanian UTF_8 lithuanian,lt,lit +nepali UTF_8 nepali,ne,nep +norwegian UTF_8 norwegian,no,nor +portuguese UTF_8 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8 russian,ru,rus +serbian UTF_8 serbian,sr,srp +spanish UTF_8 spanish,es,esl,spa +swedish UTF_8 swedish,sv,swe +tamil UTF_8 tamil,ta,tam turkish UTF_8 turkish,tr,tur # Also include the traditional porter algorithm for english. # The porter algorithm is included in the libstemmer distribution to assist # with backwards compatibility, but for new systems the english algorithm # should be used in preference. -porter UTF_8,ISO_8859_1 porter +porter UTF_8 porter english # Some other stemmers in the snowball project are not included in the standard # distribution. To compile a libstemmer with them in, add them to this list, @@ -39,12 +47,12 @@ porter UTF_8,ISO_8859_1 porter # algorithms are: # # german2 - This is a slight modification of the german stemmer. 
-#german2 UTF_8,ISO_8859_1 german2 +#german2 UTF_8,ISO_8859_1 german2 german # # kraaij_pohlmann - This is a different dutch stemmer. -#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann +#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch # # lovins - This is an english stemmer, but fairly outdated, and # only really applicable to a restricted type of input text # (keywords in academic publications). -#lovins UTF_8,ISO_8859_1 lovins +#lovins UTF_8,ISO_8859_1 lovins english diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c index 40039ef4a..21ea5f246 100644 --- a/contrib/snowball/runtime/api.c +++ b/contrib/snowball/runtime/api.c @@ -2,7 +2,7 @@ #include <stdlib.h> /* for calloc, free */ #include "header.h" -extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) +extern struct SN_env * SN_create_env(int S_size, int I_size) { struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); if (z == NULL) return NULL; @@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) if (z->I == NULL) goto error; } - if (B_size) - { - z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char)); - if (z->B == NULL) goto error; - } - return z; error: SN_close_env(z, S_size); @@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size) free(z->S); } free(z->I); - free(z->B); if (z->p) lose_s(z->p); free(z); } @@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s) z->c = 0; return err; } - diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h index 8b997f0c2..ba9d1c14b 100644 --- a/contrib/snowball/runtime/api.h +++ b/contrib/snowball/runtime/api.h @@ -16,11 +16,17 @@ struct SN_env { int c; int l; int lb; int bra; int ket; symbol * * S; int * I; - unsigned char * B; }; -extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); +#ifdef __cplusplus +extern "C" { +#endif + +extern struct SN_env * 
SN_create_env(int S_size, int I_size); extern void SN_close_env(struct SN_env * z, int S_size); extern int SN_set_current(struct SN_env * z, int size, const symbol * s); +#ifdef __cplusplus +} +#endif diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h index 4d3078f50..85a42fdb8 100644 --- a/contrib/snowball/runtime/header.h +++ b/contrib/snowball/runtime/header.h @@ -54,5 +54,6 @@ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); extern symbol * slice_to(struct SN_env * z, symbol * p); extern symbol * assign_to(struct SN_env * z, symbol * p); -extern void debug(struct SN_env * z, int number, int line_count); +extern int len_utf8(const symbol * p); +extern void debug(struct SN_env * z, int number, int line_count); diff --git a/contrib/snowball/runtime/utilities.c b/contrib/snowball/runtime/utilities.c index 1840f0280..1cfd1a17d 100644 --- a/contrib/snowball/runtime/utilities.c +++ b/contrib/snowball/runtime/utilities.c @@ -5,8 +5,6 @@ #include "header.h" -#define unless(C) if(!(C)) - #define CREATE_SIZE 1 extern symbol * create_s(void) { @@ -15,7 +13,7 @@ extern symbol * create_s(void) { if (mem == NULL) return NULL; p = (symbol *) (HEAD + (char *) mem); CAPACITY(p) = CREATE_SIZE; - SET_SIZE(p, CREATE_SIZE); + SET_SIZE(p, 0); return p; } @@ -27,7 +25,7 @@ extern void lose_s(symbol * p) { /* new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new - position, or 0 on failure. + position, or -1 on failure. -- used to implement hop and next in the utf8 case. 
*/ @@ -66,77 +64,95 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { /* Code for character groupings: utf8 cases */ static int get_utf8(const symbol * p, int c, int l, int * slot) { - int b0, b1; + int b0, b1, b2; if (c >= l) return 0; b0 = p[c++]; if (b0 < 0xC0 || c == l) { /* 1100 0000 */ - * slot = b0; return 1; + *slot = b0; + return 1; } - b1 = p[c++]; + b1 = p[c++] & 0x3F; if (b0 < 0xE0 || c == l) { /* 1110 0000 */ - * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; + *slot = (b0 & 0x1F) << 6 | b1; + return 2; + } + b2 = p[c++] & 0x3F; + if (b0 < 0xF0 || c == l) { /* 1111 0000 */ + *slot = (b0 & 0xF) << 12 | b1 << 6 | b2; + return 3; } - * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3; + *slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F); + return 4; } static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { - int b0, b1; + int a, b; if (c <= lb) return 0; - b0 = p[--c]; - if (b0 < 0x80 || c == lb) { /* 1000 0000 */ - * slot = b0; return 1; + b = p[--c]; + if (b < 0x80 || c == lb) { /* 1000 0000 */ + *slot = b; + return 1; } - b1 = p[--c]; - if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */ - * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2; + a = b & 0x3F; + b = p[--c]; + if (b >= 0xC0 || c == lb) { /* 1100 0000 */ + *slot = (b & 0x1F) << 6 | a; + return 2; } - * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3; + a |= (b & 0x3F) << 6; + b = p[--c]; + if (b >= 0xE0 || c == lb) { /* 1110 0000 */ + *slot = (b & 0xF) << 12 | a; + return 3; + } + *slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a; + return 4; } extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_utf8(z->p, z->c, z->l, & ch); - unless (w) return -1; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c += w; + int ch; + int w = get_utf8(z->p, z->c, z->l, & ch); + if (!w) return -1; + if (ch 
> max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c += w; } while (repeat); return 0; } extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_b_utf8(z->p, z->c, z->lb, & ch); - unless (w) return -1; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c -= w; + int ch; + int w = get_b_utf8(z->p, z->c, z->lb, & ch); + if (!w) return -1; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c -= w; } while (repeat); return 0; } extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_utf8(z->p, z->c, z->l, & ch); - unless (w) return -1; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c += w; + int ch; + int w = get_utf8(z->p, z->c, z->l, & ch); + if (!w) return -1; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return w; + z->c += w; } while (repeat); return 0; } extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - int w = get_b_utf8(z->p, z->c, z->lb, & ch); - unless (w) return -1; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return w; - z->c -= w; + int ch; + int w = get_b_utf8(z->p, z->c, z->lb, & ch); + if (!w) return -1; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return w; + z->c -= w; } while (repeat); return 0; } @@ -145,48 +161,48 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c >= z->l) return -1; - ch = z->p[z->c]; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - 
z->c++; + int ch; + if (z->c >= z->l) return -1; + ch = z->p[z->c]; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c++; } while (repeat); return 0; } extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c <= z->lb) return -1; - ch = z->p[z->c - 1]; - if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c--; + int ch; + if (z->c <= z->lb) return -1; + ch = z->p[z->c - 1]; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c--; } while (repeat); return 0; } extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c >= z->l) return -1; - ch = z->p[z->c]; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c++; + int ch; + if (z->c >= z->l) return -1; + ch = z->p[z->c]; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return 1; + z->c++; } while (repeat); return 0; } extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { do { - int ch; - if (z->c <= z->lb) return -1; - ch = z->p[z->c - 1]; - unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) - return 1; - z->c--; + int ch; + if (z->c <= z->lb) return -1; + ch = z->p[z->c - 1]; + if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) + return 1; + z->c--; } while (repeat); return 0; } @@ -215,7 +231,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { int j = v_size; int c = z->c; int l = z->l; - symbol * q = z->p + c; + const symbol * q = z->p + c; const struct among * w; @@ -224,7 +240,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { int first_key_inspected = 0; - while(1) { + while (1) { int k = i + ((j - i) 
>> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; /* smaller */ @@ -237,8 +253,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { common++; } } - if (diff < 0) { j = k; common_j = common; } - else { i = k; common_i = common; } + if (diff < 0) { + j = k; + common_j = common; + } else { + i = k; + common_i = common; + } if (j - i <= 1) { if (i > 0) break; /* v->s has been inspected */ if (j == i) break; /* only one item in v */ @@ -251,7 +272,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { first_key_inspected = 1; } } - while(1) { + while (1) { w = v + i; if (common_i >= w->s_size) { z->c = c + w->s_size; @@ -275,7 +296,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { int j = v_size; int c = z->c; int lb = z->lb; - symbol * q = z->p + c - 1; + const symbol * q = z->p + c - 1; const struct among * w; @@ -284,7 +305,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { int first_key_inspected = 0; - while(1) { + while (1) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; @@ -306,7 +327,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { first_key_inspected = 1; } } - while(1) { + while (1) { w = v + i; if (common_i >= w->s_size) { z->c = c - w->s_size; @@ -367,11 +388,10 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const z->l += adjustment; if (z->c >= c_ket) z->c += adjustment; - else - if (z->c > c_bra) - z->c = c_bra; + else if (z->c > c_bra) + z->c = c_bra; } - unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); + if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); if (adjptr != NULL) *adjptr = adjustment; return 0; @@ -417,12 +437,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo } extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { - int adjustment; - if (replace_s(z, bra, ket, SIZE(p), p, &adjustment)) - return -1; - if (bra <= z->bra) z->bra += adjustment; - if (bra <= z->ket) z->ket += adjustment; - return 0; + return insert_s(z, bra, ket, SIZE(p), p); } extern symbol * slice_to(struct SN_env * z, symbol * p) { @@ -455,6 +470,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) { return p; } +extern int len_utf8(const symbol * p) { + int size = SIZE(p); + int len = 0; + while (size--) { + symbol b = *p++; + if (b >= 0xC0 || b < 0x80) ++len; + } + return len; +} + #if 0 extern void debug(struct SN_env * z, int number, int line_count) { int i; |