diff options
Diffstat (limited to 'contrib/snowball')
60 files changed, 13246 insertions, 0 deletions
diff --git a/contrib/snowball b/contrib/snowball deleted file mode 160000 -Subproject c381f4fa958b59d41b0c596f0cbfe3ed48831e9 diff --git a/contrib/snowball/.gitignore b/contrib/snowball/.gitignore new file mode 100644 index 000000000..2147da841 --- /dev/null +++ b/contrib/snowball/.gitignore @@ -0,0 +1,5 @@ +*.o +/libstemmer +/snowball +/src_c +/stemwords diff --git a/contrib/snowball/.travis.yml b/contrib/snowball/.travis.yml new file mode 100644 index 000000000..e576233dc --- /dev/null +++ b/contrib/snowball/.travis.yml @@ -0,0 +1,4 @@ +language: c +compiler: gcc +before_script: git clone https://github.com/snowballstem/snowball-data ../snowball-data  # destination must match STEMMING_DATA in GNUmakefile +script: make check diff --git a/contrib/snowball/AUTHORS b/contrib/snowball/AUTHORS new file mode 100644 index 000000000..60eae6f04 --- /dev/null +++ b/contrib/snowball/AUTHORS @@ -0,0 +1,27 @@ +Authors +======= + +Martin Porter +------------- + + - Designed the snowball language. + - Implemented the snowball to C compiler. + - Implemented the stemming algorithms in C. + - Wrote the documentation. + +Richard Boulton +--------------- + + - Implemented Java backend of the snowball compiler. + - Developed build system. + - Assisted with website maintenance. + + +Assistance from +--------------- + +Olivier Bornet - fixes to java packaging and build system. +Andreas Jung - useful bug reports on the libstemmer library. +Olly Betts - several patches, bug reports, and performance improvements. +Sebastiano Vigna and Oerd Cukalla - patches for the Java stemming algorithms. +Ralf Junker - fix a potential memory leak in sb_stemmer_new(). 
diff --git a/contrib/snowball/CMakeLists.txt b/contrib/snowball/CMakeLists.txt new file mode 100644 index 000000000..1dee79490 --- /dev/null +++ b/contrib/snowball/CMakeLists.txt @@ -0,0 +1,85 @@ +PROJECT(snowball C) + +cmake_minimum_required(VERSION 2.8) + +INCLUDE(CheckCCompilerFlag) +INCLUDE(FindPerl) + +# End of configuration +SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian + italian norwegian porter portuguese romanian + russian spanish swedish turkish) +SET(KOI8_ALGORITHMS russian) +SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian + norwegian porter portuguese spanish swedish) +SET(ISO_8859_2_ALGORITHMS hungarian romanian) +SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins) +SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS}) + +SET(COMPILER_SOURCES compiler/space.c + compiler/tokeniser.c + compiler/analyser.c + compiler/generator.c + compiler/driver.c + compiler/generator_java.c) + +SET(SNOWBALL_RUNTIME runtime/api.c + runtime/utilities.c) +SET(LIBSTEMMER_SOURCES libstemmer/libstemmer.c) +SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c) +#LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c +#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h +#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in + +SET(STEMWORDS_SOURCES examples/stemwords.c) +SET(MODULES_H "modules.h") +CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY) + +MACRO(gen_stem IN ENCODING) + FOREACH(_it ${IN}) + SET(_base "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/stem_${ENCODING}_${_it}") + SET(_header "${_base}.h") + SET(_source "${_base}.c") + STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}") + SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl") + IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input}) + 
ADD_CUSTOM_COMMAND(OUTPUT ${_source} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ${CMAKE_CURRENT_SOURCE_DIR}/runtime -u + DEPENDS snowball) + LIST(APPEND STEMMER_SOURCES ${_source}) + + ELSE() + IF(EXISTS "${_input}") + ADD_CUSTOM_COMMAND(OUTPUT ${_source} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball ${_input} -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ${CMAKE_CURRENT_SOURCE_DIR}/runtime -u + DEPENDS snowball) + LIST(APPEND STEMMER_SOURCES ${_source}) + ENDIF() + ENDIF() + ENDFOREACH() +ENDMACRO() + +INCLUDE_DIRECTORIES("include") +INCLUDE_DIRECTORIES("${CMAKE_CURRENT_BINARY_DIR}/libstemmer") + +ADD_EXECUTABLE(snowball ${COMPILER_SOURCES}) + +ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h + COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl -f ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h ${CMAKE_CURRENT_BINARY_DIR}/libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak) +ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h") + +SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c") +ADD_CUSTOM_TARGET(stemmer_deps ALL) +ADD_DEPENDENCIES(stemmer_deps modules) + +gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8") +gen_stem("${KOI8_ALGORITHMS}" "KOI8_R") +gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1") +gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2") + + +ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES}) +ADD_DEPENDENCIES(stemmer stemmer_deps) + +ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES}) +TARGET_LINK_LIBRARIES(stemwords stemmer) diff --git a/contrib/snowball/GNUmakefile b/contrib/snowball/GNUmakefile new file mode 100644 index 000000000..a30cafd89 --- /dev/null +++ b/contrib/snowball/GNUmakefile @@ -0,0 +1,300 @@ +# -*- makefile -*- + +c_src_dir = src_c 
+java_src_main_dir = java/org/tartarus/snowball +java_src_dir = $(java_src_main_dir)/ext + +libstemmer_algorithms = danish dutch english finnish french german hungarian \ + italian \ + norwegian porter portuguese romanian \ + russian spanish swedish turkish + +KOI8_R_algorithms = russian +ISO_8859_1_algorithms = danish dutch english finnish french german italian \ + norwegian porter portuguese spanish swedish +ISO_8859_2_algorithms = hungarian romanian + +other_algorithms = german2 kraaij_pohlmann lovins + +all_algorithms = $(libstemmer_algorithms) $(other_algorithms) + +COMPILER_SOURCES = compiler/space.c \ + compiler/tokeniser.c \ + compiler/analyser.c \ + compiler/generator.c \ + compiler/driver.c \ + compiler/generator_java.c +COMPILER_HEADERS = compiler/header.h \ + compiler/syswords.h \ + compiler/syswords2.h + +RUNTIME_SOURCES = runtime/api.c \ + runtime/utilities.c +RUNTIME_HEADERS = runtime/api.h \ + runtime/header.h + +JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ + java/org/tartarus/snowball/SnowballProgram.java \ + java/org/tartarus/snowball/SnowballStemmer.java \ + java/org/tartarus/snowball/TestApp.java + +LIBSTEMMER_SOURCES = libstemmer/libstemmer.c +LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c +LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h +LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in + +STEMWORDS_SOURCES = examples/stemwords.c + +ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl) +C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ + $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ + $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \ + $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c) +C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ + $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \ + 
$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \ + $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h) +C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) +C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) +JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) + +COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) +RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) +LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o) +LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o) +STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o) +C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o) +C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o) +JAVA_CLASSES = $(JAVA_SOURCES:.java=.class) +JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class) + +CFLAGS=-Iinclude -O2 +CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations + +all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) + +clean: + rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \ + $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \ + libstemmer.o stemwords \ + libstemmer/modules.h \ + libstemmer/modules_utf8.h \ + snowball.splint \ + $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \ + $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \ + $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \ + libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \ + libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c + rm -rf dist + rmdir $(c_src_dir) || true + +snowball: $(COMPILER_OBJECTS) + $(CC) -o $@ $^ + +$(COMPILER_OBJECTS): $(COMPILER_HEADERS) + +libstemmer/libstemmer.c: libstemmer/libstemmer_c.in + sed 's/@MODULES_H@/modules.h/' $^ >$@ + +libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in + sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@ + +libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt + libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak + 
+libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt + libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8 + +libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS) + +libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS) + $(AR) -cru $@ $^ + +stemwords: $(STEMWORDS_OBJECTS) libstemmer.o + $(CC) -o $@ $^ + +algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl + cp $^ $@ + +$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball + @mkdir -p $(c_src_dir) + @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ + o="$(c_src_dir)/stem_UTF_8_$${l}"; \ + echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \ + ./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u + +$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball + @mkdir -p $(c_src_dir) + @l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \ + o="$(c_src_dir)/stem_KOI8_R_$${l}"; \ + echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \ + ./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime + +$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball + @mkdir -p $(c_src_dir) + @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \ + o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \ + echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \ + ./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime + +$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball + @mkdir -p $(c_src_dir) + @l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \ + o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \ + echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \ + ./snowball $< -o $${o} 
-eprefix $${l}_ISO_8859_2_ -r ../runtime + +$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h + $(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< + +$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball + @mkdir -p $(java_src_dir) + @l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ + o="$(java_src_dir)/$${l}Stemmer"; \ + echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \ + ./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer + +splint: snowball.splint +snowball.splint: $(COMPILER_SOURCES) + splint $^ >$@ -weak + +# Make a full source distribution +dist: dist_snowball dist_libstemmer_c dist_libstemmer_java + +# Make a distribution of all the sources involved in snowball +dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \ + $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ + $(LIBSTEMMER_SOURCES) \ + $(LIBSTEMMER_UTF8_SOURCES) \ + $(LIBSTEMMER_HEADERS) \ + $(LIBSTEMMER_EXTRA) \ + $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \ + GNUmakefile README doc/TODO libstemmer/mkmodules.pl + destname=snowball_code; \ + dest=dist/$${destname}; \ + rm -rf $${dest} && \ + rm -f $${dest}.tgz && \ + for file in $^; do \ + dir=`dirname $$file` && \ + mkdir -p $${dest}/$${dir} && \ + cp -a $${file} $${dest}/$${dir} || exit 1 ; \ + done && \ + (cd dist && tar zcf $${destname}.tgz $${destname}) && \ + rm -rf $${dest} + +# Make a distribution of all the sources required to compile the C library. 
+dist_libstemmer_c: \ + $(RUNTIME_SOURCES) \ + $(RUNTIME_HEADERS) \ + $(LIBSTEMMER_SOURCES) \ + $(LIBSTEMMER_UTF8_SOURCES) \ + $(LIBSTEMMER_HEADERS) \ + $(LIBSTEMMER_EXTRA) \ + $(C_LIB_SOURCES) \ + $(C_LIB_HEADERS) \ + libstemmer/mkinc.mak \ + libstemmer/mkinc_utf8.mak + destname=libstemmer_c; \ + dest=dist/$${destname}; \ + rm -rf $${dest} && \ + rm -f $${dest}.tgz && \ + mkdir -p $${dest} && \ + cp -a doc/libstemmer_c_README $${dest}/README && \ + mkdir -p $${dest}/examples && \ + cp -a examples/stemwords.c $${dest}/examples && \ + mkdir -p $${dest}/$(c_src_dir) && \ + cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \ + mkdir -p $${dest}/runtime && \ + cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \ + mkdir -p $${dest}/libstemmer && \ + cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \ + mkdir -p $${dest}/include && \ + mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \ + (cd $${dest} && \ + echo "README" >> MANIFEST && \ + ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \ + ls runtime/*.c runtime/*.h >> MANIFEST && \ + ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \ + ls include/*.h >> MANIFEST) && \ + cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \ + echo 'include mkinc.mak' >> $${dest}/Makefile && \ + echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \ + echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \ + echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \ + echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \ + echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \ + echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \ + echo 'clean:' >> $${dest}/Makefile && \ + echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \ + (cd dist && tar zcf $${destname}.tgz $${destname}) && \ + rm -rf $${dest} + +# Make a distribution of all the 
sources required to compile the Java library. +dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ + $(LIBSTEMMER_EXTRA) \ + $(JAVA_SOURCES) + destname=libstemmer_java; \ + dest=dist/$${destname}; \ + rm -rf $${dest} && \ + rm -f $${dest}.tgz && \ + mkdir -p $${dest} && \ + cp -a doc/libstemmer_java_README $${dest}/README && \ + mkdir -p $${dest}/$(java_src_dir) && \ + cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \ + mkdir -p $${dest}/$(java_src_main_dir) && \ + cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \ + (cd $${dest} && \ + echo "README" >> MANIFEST && \ + ls $(java_src_dir)/*.java >> MANIFEST && \ + ls $(java_src_main_dir)/*.java >> MANIFEST) && \ + (cd dist && tar zcf $${destname}.tgz $${destname}) && \ + rm -rf $${dest} + +check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r + +check_utf8: $(libstemmer_algorithms:%=check_utf8_%) + +check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%) + +check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%) + +check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%) + +# Where the data files are located - assumed their repo is checked out as +# a sibling to this one. 
+STEMMING_DATA = ../snowball-data + +check_utf8_%: $(STEMMING_DATA)/% stemwords + @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8" + @./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt + @diff -u $</output.txt tmp.txt + @if [ -e $</diffs.txt ] ; \ + then \ + ./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \ + diff -u $</diffs.txt tmp.txt; \ + fi + @rm tmp.txt + +check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords # fixtures are stored as UTF-8; iconv transcodes them for this encoding + @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1" + @iconv -f UTF-8 -t ISO-8859-1 '$</voc.txt' | \ + ./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt + @iconv -f UTF-8 -t ISO-8859-1 '$</output.txt' | \ + diff -u - tmp.txt + @rm tmp.txt + +check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords # fixtures are stored as UTF-8; iconv transcodes them for this encoding + @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2" + @iconv -f UTF-8 -t ISO-8859-2 '$</voc.txt' | \ + ./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt + @iconv -f UTF-8 -t ISO-8859-2 '$</output.txt' | \ + diff -u - tmp.txt + @rm tmp.txt + +check_koi8r_%: $(STEMMING_DATA)/% stemwords # fixtures are stored as UTF-8; iconv transcodes them for this encoding + @echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R" + @iconv -f UTF-8 -t KOI8-R '$</voc.txt' | \ + ./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt + @iconv -f UTF-8 -t KOI8-R '$</output.txt' | \ + diff -u - tmp.txt + @rm tmp.txt diff --git a/contrib/snowball/README b/contrib/snowball/README new file mode 100644 index 000000000..afb51b340 --- /dev/null +++ b/contrib/snowball/README @@ -0,0 +1,5 @@ +This contains the source code for the snowball compiler and the stemming +algorithms on the website. + +See http://snowball.tartarus.org/ for more details. 
+ diff --git a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..0a8190a08 --- /dev/null +++ b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl @@ -0,0 +1,91 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix + undouble +) + +externals ( stem ) + +strings ( ch ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef ae hex 'E6' +stringdef ao hex 'E5' +stringdef o/ hex 'F8' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'abcdfghjklmnoprtvyz{ao}' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' + 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' + 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' + 'erets' 'et' 'eret' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'gd' // significant in the call from other_suffix + 'dt' 'gt' 'kt' + ) + ) + next] delete + ) + + define other_suffix as ( + do ( ['st'] 'ig' delete ) + setlimit tomark p1 for ([substring]) + among( + 'ig' 'lig' 'elig' 'els' + (delete do consonant_pair) + 'l{o/}st' + (<-'l{o/}s') + ) + ) + define undouble as ( + setlimit tomark p1 for ([non-v] ->ch) + ch + delete + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + do undouble + ) +) diff --git a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..1131a1cb7 --- /dev/null +++ 
b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,91 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix + undouble +) + +externals ( stem ) + +strings ( ch ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef ae hex '91' +stringdef ao hex '86' +stringdef o/ hex '9B' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'abcdfghjklmnoprtvyz{ao}' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' + 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' + 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' + 'erets' 'et' 'eret' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'gd' // significant in the call from other_suffix + 'dt' 'gt' 'kt' + ) + ) + next] delete + ) + + define other_suffix as ( + do ( ['st'] 'ig' delete ) + setlimit tomark p1 for ([substring]) + among( + 'ig' 'lig' 'elig' 'els' + (delete do consonant_pair) + 'l{o/}st' + (<-'l{o/}s') + ) + ) + define undouble as ( + setlimit tomark p1 for ([non-v] ->ch) + ch + delete + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + do undouble + ) +) diff --git a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..f7609f766 --- /dev/null +++ b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl @@ -0,0 +1,164 @@ +routines ( + prelude postlude + e_ending + en_ending + mark_regions + R1 R2 + undouble + standard_suffix +) + +externals ( stem ) + +booleans ( e_found ) + +integers ( p1 
p2 ) + +groupings ( v v_I v_j ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef e" hex 'EB' +stringdef i" hex 'EF' +stringdef o" hex 'F6' +stringdef u" hex 'FC' + +stringdef a' hex 'E1' +stringdef e' hex 'E9' +stringdef i' hex 'ED' +stringdef o' hex 'F3' +stringdef u' hex 'FA' + +stringdef e` hex 'E8' + +define v 'aeiouy{e`}' +define v_I v + 'I' +define v_j v + 'j' + +define prelude as ( + test repeat ( + [substring] among( + '{a"}' '{a'}' + (<- 'a') + '{e"}' '{e'}' + (<- 'e') + '{i"}' '{i'}' + (<- 'i') + '{o"}' '{o'}' + (<- 'o') + '{u"}' '{u'}' + (<- 'u') + '' (next) + ) //or next + ) + try(['y'] <- 'Y') + repeat goto ( + v [('i'] v <- 'I') or + ('y'] <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + gopast v gopast non-v setmark p1 + try($p1 < 3 $p1 = 3) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'I' (<- 'i') + '' (next) + ) //or next + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define undouble as ( + test among('kk' 'dd' 'tt') [next] delete + ) + + define e_ending as ( + unset e_found + ['e'] R1 test non-v delete + set e_found + undouble + ) + + define en_ending as ( + R1 non-v and not 'gem' delete + undouble + ) + + define standard_suffix as ( + do ( + [substring] among( + 'heden' + ( R1 <- 'heid' + ) + 'en' 'ene' + ( en_ending + ) + 's' 'se' + ( R1 non-v_j delete + ) + ) + ) + do e_ending + + do ( ['heid'] R2 not 'c' delete + ['en'] en_ending + ) + + do ( + [substring] among( + 'end' 'ing' + ( R2 delete + (['ig'] R2 not 'e' delete) or undouble + ) + 'ig' + ( R2 not 'e' delete + ) + 'lijk' + ( R2 delete e_ending + ) + 'baar' + ( R2 delete + ) + 'bar' + ( R2 e_found delete + ) + ) + ) + do ( + non-v_I + test ( + among ('aa' 'ee' 'oo' 'uu') + non-v + ) + [next] delete + ) + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards + do standard_suffix 
+ do postlude +) diff --git a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..15b8718d1 --- /dev/null +++ b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,164 @@ +routines ( + prelude postlude + e_ending + en_ending + mark_regions + R1 R2 + undouble + standard_suffix +) + +externals ( stem ) + +booleans ( e_found ) + +integers ( p1 p2 ) + +groupings ( v v_I v_j ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a" hex '84' +stringdef e" hex '89' +stringdef i" hex '8B' +stringdef o" hex '94' +stringdef u" hex '81' + +stringdef a' hex 'A0' +stringdef e' hex '82' +stringdef i' hex 'A1' +stringdef o' hex 'A2' +stringdef u' hex 'A3' + +stringdef e` hex '8A' + +define v 'aeiouy{e`}' +define v_I v + 'I' +define v_j v + 'j' + +define prelude as ( + test repeat ( + [substring] among( + '{a"}' '{a'}' + (<- 'a') + '{e"}' '{e'}' + (<- 'e') + '{i"}' '{i'}' + (<- 'i') + '{o"}' '{o'}' + (<- 'o') + '{u"}' '{u'}' + (<- 'u') + '' (next) + ) //or next + ) + try(['y'] <- 'Y') + repeat goto ( + v [('i'] v <- 'I') or + ('y'] <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + gopast v gopast non-v setmark p1 + try($p1 < 3 $p1 = 3) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'I' (<- 'i') + '' (next) + ) //or next + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define undouble as ( + test among('kk' 'dd' 'tt') [next] delete + ) + + define e_ending as ( + unset e_found + ['e'] R1 test non-v delete + set e_found + undouble + ) + + define en_ending as ( + R1 non-v and not 'gem' delete + undouble + ) + + define standard_suffix as ( + do ( + [substring] among( + 'heden' + ( R1 <- 'heid' + ) + 'en' 'ene' + ( en_ending + ) + 's' 'se' + ( R1 non-v_j delete + ) + ) + ) + do e_ending + + do ( 
['heid'] R2 not 'c' delete + ['en'] en_ending + ) + + do ( + [substring] among( + 'end' 'ing' + ( R2 delete + (['ig'] R2 not 'e' delete) or undouble + ) + 'ig' + ( R2 not 'e' delete + ) + 'lijk' + ( R2 delete e_ending + ) + 'baar' + ( R2 delete + ) + 'bar' + ( R2 e_found delete + ) + ) + ) + do ( + non-v_I + test ( + among ('aa' 'ee' 'oo' 'uu') + non-v + ) + [next] delete + ) + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..fe18d7a91 --- /dev/null +++ b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl @@ -0,0 +1,229 @@ +integers ( p1 p2 ) +booleans ( Y_found ) + +routines ( + prelude postlude + mark_regions + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 + exception1 + exception2 +) + +externals ( stem ) + +groupings ( v v_WXY valid_LI ) + +stringescapes {} + +define v 'aeiouy' +define v_WXY v + 'wxY' + +define valid_LI 'cdeghkmnrt' + +define prelude as ( + unset Y_found + do ( ['{'}'] delete) + do ( ['y'] <-'Y' set Y_found) + do repeat(goto (v ['y']) <-'Y' set Y_found) +) + +define mark_regions as ( + $p1 = limit + $p2 = limit + do( + among ( + 'gener' + 'commun' // added May 2005 + 'arsen' // added Nov 2006 (arsenic/arsenal) + // ... extensions possible here ... 
+ ) or (gopast v gopast non-v) + setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define shortv as ( + ( non-v_WXY v non-v ) + or + ( non-v v atlimit ) + ) + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define Step_1a as ( + try ( + [substring] among ( + '{'}' '{'}s' '{'}s{'}' + (delete) + ) + ) + [substring] among ( + 'sses' (<-'ss') + 'ied' 'ies' + ((hop 2 <-'i') or <-'ie') + 's' (next gopast v delete) + 'us' 'ss' + ) + ) + + define Step_1b as ( + [substring] among ( + 'eed' 'eedly' + (R1 <-'ee') + 'ed' 'edly' 'ing' 'ingly' + ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) + + define Step_1c as ( + ['y' or 'Y'] + non-v not atlimit + <-'i' + ) + + define Step_2 as ( + [substring] R1 among ( + 'tional' (<-'tion') + 'enci' (<-'ence') + 'anci' (<-'ance') + 'abli' (<-'able') + 'entli' (<-'ent') + 'izer' 'ization' + (<-'ize') + 'ational' 'ation' 'ator' + (<-'ate') + 'alism' 'aliti' 'alli' + (<-'al') + 'fulness' (<-'ful') + 'ousli' 'ousness' + (<-'ous') + 'iveness' 'iviti' + (<-'ive') + 'biliti' 'bli' + (<-'ble') + 'ogi' ('l' <-'og') + 'fulli' (<-'ful') + 'lessli' (<-'less') + 'li' (valid_LI delete) + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'tional' (<- 'tion') + 'ational' (<- 'ate') + 'alize' (<-'al') + 'icate' 'iciti' 'ical' + (<-'ic') + 'ful' 'ness' + (delete) + 'ative' + (R2 delete) // 'R2' added Dec 2001 + ) + ) + + define Step_4 as ( + [substring] R2 among ( + 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' + 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' + (delete) + 'ion' ('s' or 't' delete) + ) + ) + + define Step_5 as ( + [substring] among ( + 'e' (R2 or (R1 not shortv) delete) + 'l' (R2 'l' delete) + ) + ) + + define exception2 as ( + + [substring] atlimit among( + 'inning' 'outing' 'canning' 
'herring' 'earring' + 'proceed' 'exceed' 'succeed' + + // ... extensions possible here ... + + ) + ) +) + +define exception1 as ( + + [substring] atlimit among( + + /* special changes: */ + + 'skis' (<-'ski') + 'skies' (<-'sky') + 'dying' (<-'die') + 'lying' (<-'lie') + 'tying' (<-'tie') + + /* special -LY cases */ + + 'idly' (<-'idl') + 'gently' (<-'gentl') + 'ugly' (<-'ugli') + 'early' (<-'earli') + 'only' (<-'onli') + 'singly' (<-'singl') + + // ... extensions possible here ... + + /* invariant forms: */ + + 'sky' + 'news' + 'howe' + + 'atlas' 'cosmos' 'bias' 'andes' // not plural forms + + // ... extensions possible here ... + ) +) + +define postlude as (Y_found repeat(goto (['Y']) <-'y')) + +define stem as ( + + exception1 or + not hop 3 or ( + do prelude + do mark_regions + backwards ( + + do Step_1a + + exception2 or ( + + do Step_1b + do Step_1c + + do Step_2 + do Step_3 + do Step_4 + + do Step_5 + ) + ) + do postlude + ) +) diff --git a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..9ac74f292 --- /dev/null +++ b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl @@ -0,0 +1,196 @@ + +/* Finnish stemmer. + + Numbers in square brackets refer to the sections in + Fred Karlsson, Finnish: An Essential Grammar. 
Routledge, 1999 + ISBN 0-415-20705-3 + +*/ + +routines ( + mark_regions + R2 + particle_etc possessive + LONG VI + case_ending + i_plural + t_plural + other_endings + tidy +) + +externals ( stem ) + +integers ( p1 p2 ) +strings ( x ) +booleans ( ending_removed ) +groupings ( AEI V1 V2 particle_end ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef o" hex 'F6' + +define AEI 'a{a"}ei' +define V1 'aeiouy{a"}{o"}' +define V2 'aeiou{a"}{o"}' +define particle_end V1 + 'nt' + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + goto V1 gopast non-V1 setmark p1 + goto V1 gopast non-V1 setmark p2 +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define particle_etc as ( + setlimit tomark p1 for ([substring]) + among( + 'kin' + 'kaan' 'k{a"}{a"}n' + 'ko' 'k{o"}' + 'han' 'h{a"}n' + 'pa' 'p{a"}' // Particles [91] + (particle_end) + 'sti' // Adverb [87] + (R2) + ) + delete + ) + define possessive as ( // [36] + setlimit tomark p1 for ([substring]) + among( + 'si' + (not 'k' delete) // take 'ksi' as the Comitative case + 'ni' + (delete ['kse'] <- 'ksi') // kseni = ksi + ni + 'nsa' 'ns{a"}' + 'mme' + 'nne' + (delete) + /* Now for Vn possessives after case endings: [36] */ + 'an' + (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) + '{a"}n' + (among('t{a"}' 'ss{a"}' 'st{a"}' + 'll{a"}' 'lt{a"}' 'n{a"}') delete) + 'en' + (among('lle' 'ine') delete) + ) + ) + + define LONG as + among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') + + define VI as ('i' V2) + + define case_ending as ( + setlimit tomark p1 for ([substring]) + among( + 'han' ('a') //-. + 'hen' ('e') // | + 'hin' ('i') // | + 'hon' ('o') // | + 'h{a"}n' ('{a"}') // Illative [43] + 'h{o"}n' ('{o"}') // | + 'siin' VI // | + 'seen' LONG //-' + + 'den' VI + 'tten' VI // Genitive plurals [34] + () + 'n' // Genitive or Illative + ( try ( LONG // Illative + or 'ie' // Genitive + and next ] + ) + /* otherwise Genitive */ + ) + + 'a' '{a"}' //-. 
+ (V1 non-V1) // | + 'tta' 'tt{a"}' // Partitive [32] + ('e') // | + 'ta' 't{a"}' //-' + + 'ssa' 'ss{a"}' // Inessive [41] + 'sta' 'st{a"}' // Elative [42] + + 'lla' 'll{a"}' // Adessive [44] + 'lta' 'lt{a"}' // Ablative [51] + 'lle' // Allative [46] + 'na' 'n{a"}' // Essive [49] + 'ksi' // Translative[50] + 'ine' // Comitative [51] + + /* Abessive and Instructive are too rare for + inclusion [51] */ + + ) + delete + set ending_removed + ) + define other_endings as ( + setlimit tomark p2 for ([substring]) + among( + 'mpi' 'mpa' 'mp{a"}' + 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] + (not 'po') //-improves things + 'impi' 'impa' 'imp{a"}' + 'immi' 'imma' 'imm{a"}' // Superlative forms [86] + 'eja' 'ej{a"}' // indicates agent [93.1B] + ) + delete + ) + define i_plural as ( // [26] + setlimit tomark p1 for ([substring]) + among( + 'i' 'j' + ) + delete + ) + define t_plural as ( // [26] + setlimit tomark p1 for ( + ['t'] test V1 + delete + ) + setlimit tomark p2 for ([substring]) + among( + 'mma' (not 'po') //-mmat endings + 'imma' //-immat endings + ) + delete + ) + define tidy as ( + setlimit tomark p1 for ( + do ( LONG and ([next] delete ) ) // undouble vowel + do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i + do ( ['j'] 'o' or 'u' delete ) + do ( ['o'] 'j' delete ) + ) + goto non-V1 [next] -> x x delete // undouble consonant + ) +) + +define stem as ( + + do mark_regions + unset ending_removed + backwards ( + do particle_etc + do possessive + do case_ending + do other_endings + (ending_removed do i_plural) or do t_plural + do tidy + ) +) + diff --git a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..e972f227f --- /dev/null +++ b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl @@ -0,0 +1,248 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + i_verb_suffix + verb_suffix + residual_suffix + un_double + un_accent +) + 
+externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v keep_with_s ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a^ hex 'E2' // a-circumflex +stringdef a` hex 'E0' // a-grave +stringdef c, hex 'E7' // c-cedilla + +stringdef e" hex 'EB' // e-diaeresis (rare) +stringdef e' hex 'E9' // e-acute +stringdef e^ hex 'EA' // e-circumflex +stringdef e` hex 'E8' // e-grave +stringdef i" hex 'EF' // i-diaeresis +stringdef i^ hex 'EE' // i-circumflex +stringdef o^ hex 'F4' // o-circumflex +stringdef u^ hex 'FB' // u-circumflex +stringdef u` hex 'F9' // u-grave + +define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' + +define prelude as repeat goto ( + + ( v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') or + ('y' ] <- 'Y') + ) + or + ( ['y'] v <- 'Y' ) + or + ( 'q' ['u'] <- 'U' ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v v next ) + or + among ( // this exception list begun Nov 2006 + 'par' // paris, parie, pari + 'col' // colis + 'tap' // tapis + // extensions possible here + ) + or + ( next gopast v ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + 'Y' (<- 'y') + '' (next) + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' + 'ances' 'iqUes' 'ismes' 'ables' 'istes' + ( R2 delete ) + 'atrice' 'ateur' 'ation' + 'atrices' 'ateurs' 'ations' + ( R2 delete + try ( ['ic'] (R2 delete) or <-'iqU' ) + ) + 'logie' + 'logies' + ( R2 <- 'log' ) + 'usion' 'ution' + 'usions' 'utions' + ( R2 <- 'u' ) + 'ence' + 'ences' + ( R2 <- 'ent' ) + 'ement' + 'ements' + ( + RV delete + try ( + [substring] among( + 'iv' (R2 delete ['at'] R2 delete) + 'eus' ((R2 delete) or (R1<-'eux')) + 'abl' 'iqU' + (R2 
delete) + 'i{e`}r' 'I{e`}r' //) + (RV <-'i') //)--new 2 Sept 02 + ) + ) + ) + 'it{e'}' + 'it{e'}s' + ( + R2 delete + try ( + [substring] among( + 'abil' ((R2 delete) or <-'abl') + 'ic' ((R2 delete) or <-'iqU') + 'iv' (R2 delete) + ) + ) + ) + 'if' 'ive' + 'ifs' 'ives' + ( + R2 delete + try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) + ) + 'eaux' (<- 'eau') + 'aux' (R1 <- 'al') + 'euse' + 'euses'((R2 delete) or (R1<-'eux')) + + 'issement' + 'issements'(R1 non-v delete) // verbal + + // fail(...) below forces entry to verb_suffix. -ment typically + // follows the p.p., e.g 'confus{e'}ment'. + + 'amment' (RV fail(<- 'ant')) + 'emment' (RV fail(<- 'ent')) + 'ment' + 'ments' (test(v RV) fail(delete)) + // v is e,i,u,{e'},I or U + ) + ) + + define i_verb_suffix as setlimit tomark pV for ( + [substring] among ( + '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' + 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' + 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' + 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' + 'issez' 'issiez' 'issions' 'issons' 'it' + (non-v delete) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among ( + 'ions' + (R2 delete) + + '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' + 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' + 'erons' 'eront' 'ez' 'iez' + + // 'ons' //-best omitted + + (delete) + + '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' + 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' + 'assions' + (delete + try(['e'] delete) + ) + ) + ) + + define keep_with_s 'aiou{e`}s' + + define residual_suffix as ( + try(['s'] test non-keep_with_s delete) + setlimit tomark pV for ( + [substring] among( + 'ion' (R2 's' or 't' delete) + 'ier' 'i{e`}re' + 'Ier' 'I{e`}re' (<-'i') + 'e' (delete) + '{e"}' ('gu' delete) + ) + ) + ) + + define un_double as ( + test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete + ) + + define 
un_accent as ( + atleast 1 non-v + [ '{e'}' or '{e`}' ] <-'e' + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards ( + + do ( + ( + ( standard_suffix or + i_verb_suffix or + verb_suffix + ) + and + try( [ ('Y' ] <- 'i' ) or + ('{c,}'] <- 'c' ) + ) + ) or + residual_suffix + ) + + // try(['ent'] RV delete) // is best omitted + + do un_double + do un_accent + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..996eba1ab --- /dev/null +++ b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,239 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + i_verb_suffix + verb_suffix + residual_suffix + un_double + un_accent +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v keep_with_s ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a^ hex '83' // a-circumflex +stringdef a` hex '85' // a-grave +stringdef c, hex '87' // c-cedilla + +stringdef e" hex '89' // e-diaeresis (rare) +stringdef e' hex '82' // e-acute +stringdef e^ hex '88' // e-circumflex +stringdef e` hex '8A' // e-grave +stringdef i" hex '8B' // i-diaeresis +stringdef i^ hex '8C' // i-circumflex +stringdef o^ hex '93' // o-circumflex +stringdef u^ hex '96' // u-circumflex +stringdef u` hex '97' // u-grave + +define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' + +define prelude as repeat goto ( + + ( v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') or + ('y' ] <- 'Y') + ) + or + ( ['y'] v <- 'Y' ) + or + ( 'q' ['u'] <- 'U' ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v v next ) or ( next gopast v ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + 'Y' (<- 'y') + '' (next) + 
) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' + 'ances' 'iqUes' 'ismes' 'ables' 'istes' + ( R2 delete ) + 'atrice' 'ateur' 'ation' + 'atrices' 'ateurs' 'ations' + ( R2 delete + try ( ['ic'] (R2 delete) or <-'iqU' ) + ) + 'logie' + 'logies' + ( R2 <- 'log' ) + 'usion' 'ution' + 'usions' 'utions' + ( R2 <- 'u' ) + 'ence' + 'ences' + ( R2 <- 'ent' ) + 'ement' + 'ements' + ( + RV delete + try ( + [substring] among( + 'iv' (R2 delete ['at'] R2 delete) + 'eus' ((R2 delete) or (R1<-'eux')) + 'abl' 'iqU' + (R2 delete) + 'i{e`}r' 'I{e`}r' //) + (RV <-'i') //)--new 2 Sept 02 + ) + ) + ) + 'it{e'}' + 'it{e'}s' + ( + R2 delete + try ( + [substring] among( + 'abil' ((R2 delete) or <-'abl') + 'ic' ((R2 delete) or <-'iqU') + 'iv' (R2 delete) + ) + ) + ) + 'if' 'ive' + 'ifs' 'ives' + ( + R2 delete + try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) + ) + 'eaux' (<- 'eau') + 'aux' (R1 <- 'al') + 'euse' + 'euses'((R2 delete) or (R1<-'eux')) + + 'issement' + 'issements'(R1 non-v delete) // verbal + + // fail(...) below forces entry to verb_suffix. -ment typically + // follows the p.p., e.g 'confus{e'}ment'. 
+ + 'amment' (RV fail(<- 'ant')) + 'emment' (RV fail(<- 'ent')) + 'ment' + 'ments' (test(v RV) fail(delete)) + // v is e,i,u,{e'},I or U + ) + ) + + define i_verb_suffix as setlimit tomark pV for ( + [substring] among ( + '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' + 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' + 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' + 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' + 'issez' 'issiez' 'issions' 'issons' 'it' + (non-v delete) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among ( + 'ions' + (R2 delete) + + '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' + 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' + 'erons' 'eront' 'ez' 'iez' + + // 'ons' //-best omitted + + (delete) + + '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' + 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' + 'assions' + (delete + try(['e'] delete) + ) + ) + ) + + define keep_with_s 'aiou{e`}s' + + define residual_suffix as ( + try(['s'] test non-keep_with_s delete) + setlimit tomark pV for ( + [substring] among( + 'ion' (R2 's' or 't' delete) + 'ier' 'i{e`}re' + 'Ier' 'I{e`}re' (<-'i') + 'e' (delete) + '{e"}' ('gu' delete) + ) + ) + ) + + define un_double as ( + test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete + ) + + define un_accent as ( + atleast 1 non-v + [ '{e'}' or '{e`}' ] <-'e' + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards ( + + do ( + ( + ( standard_suffix or + i_verb_suffix or + verb_suffix + ) + and + try( [ ('Y' ] <- 'i' ) or + ('{c,}'] <- 'c' ) + ) + ) or + residual_suffix + ) + + // try(['ent'] RV delete) // is best omitted + + do un_double + do un_accent + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..7069daf0d --- /dev/null +++ 
b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl @@ -0,0 +1,139 @@ + +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef o" hex 'F6' +stringdef u" hex 'FC' +stringdef ss hex 'DF' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat ( + ( + ['{ss}'] <- 'ss' + ) or next + ) + + repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl new file mode 
100644 index 000000000..3effb3257 --- /dev/null +++ b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,139 @@ + +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a" hex '84' +stringdef o" hex '94' +stringdef u" hex '81' +stringdef ss hex 'E1' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat ( + ( + ['{ss}'] <- 'ss' + ) or next + ) + + repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl 
b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..ce6026a86 --- /dev/null +++ b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl @@ -0,0 +1,145 @@ + +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef o" hex 'F6' +stringdef u" hex 'FC' +stringdef ss hex 'DF' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) + + repeat ( + [substring] among( + '{ss}' (<- 'ss') + 'ae' (<- '{a"}') + 'oe' (<- '{o"}') + 'ue' (<- '{u"}') + 'qu' (hop 2) + '' (next) + ) + ) + +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do 
prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl new file mode 100644 index 000000000..d1a644931 --- /dev/null +++ b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl @@ -0,0 +1,241 @@ +/* +Hungarian Stemmer +Removes noun inflections +*/ + +routines ( + mark_regions + R1 + v_ending + case + case_special + case_other + plural + owned + sing_owner + plur_owner + instrum + factive + undouble + double +) + +externals ( stem ) + +integers ( p1 ) +groupings ( v ) + +stringescapes {} + +/* special characters (in ISO Latin 2) */ + +stringdef a' hex 'E1' //a-acute +stringdef e' hex 'E9' //e-acute +stringdef i' hex 'ED' //i-acute +stringdef o' hex 'F3' //o-acute +stringdef o" hex 'F6' //o-umlaut +stringdef oq hex 'F5' //o-double acute +stringdef u' hex 'FA' //u-acute +stringdef u" hex 'FC' //u-umlaut +stringdef uq hex 'FB' //u-double acute + +define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' + +define mark_regions as ( + + $p1 = limit + + (v goto non-v + among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next + setmark p1) + or + + (non-v gopast v setmark p1) +) + +backwardmode ( + + define R1 as $p1 <= cursor + + define v_ending as ( + [substring] R1 among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define double as ( + test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' + 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') + ) + + define undouble as ( + next [hop 1] delete + ) + + define instrum as( + [substring] R1 among( + 'al' (double) + 'el' (double) + ) + delete + undouble + ) + + + define case as ( + [substring] R1 among( + 'ban' 'ben' + 'ba' 'be' + 'ra' 're' + 'nak' 'nek' + 'val' 'vel' + 't{o'}l' 't{oq}l' + 'r{o'}l' 'r{oq}l' + 'b{o'}l' 'b{oq}l' + 'hoz' 'hez' 'h{o"}z' + 'n{a'}l' 'n{e'}l' + 'ig' + 'at' 'et' 'ot' '{o"}t' + '{e'}rt' + 'k{e'}pp' 'k{e'}ppen' + 'kor' + 
'ul' '{u"}l' + 'v{a'}' 'v{e'}' + 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' + 'k{e'}nt' + 'en' 'on' 'an' '{o"}n' + 'n' + 't' + ) + delete + v_ending + ) + + define case_special as( + [substring] R1 among( + '{e'}n' (<- 'e') + '{a'}n' (<- 'a') + '{a'}nk{e'}nt' (<- 'a') + ) + ) + + define case_other as( + [substring] R1 among( + 'astul' 'est{u"}l' (delete) + 'stul' 'st{u"}l' (delete) + '{a'}stul' (<- 'a') + '{e'}st{u"}l' (<- 'e') + ) + ) + + define factive as( + [substring] R1 among( + '{a'}' (double) + '{e'}' (double) + ) + delete + undouble + ) + + define plural as ( + [substring] R1 among( + '{a'}k' (<- 'a') + '{e'}k' (<- 'e') + '{o"}k' (delete) + 'ak' (delete) + 'ok' (delete) + 'ek' (delete) + 'k' (delete) + ) + ) + + define owned as ( + [substring] R1 among ( + 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) + '{e'}k{e'}' (<- 'e') + '{a'}k{e'}' (<- 'a') + 'k{e'}' (delete) + '{e'}{e'}i' (<- 'e') + '{a'}{e'}i' (<- 'a') + '{e'}i' (delete) + '{e'}{e'}' (<- 'e') + '{e'}' (delete) + ) + ) + + define sing_owner as ( + [substring] R1 among( + '{u"}nk' 'unk' (delete) + '{a'}nk' (<- 'a') + '{e'}nk' (<- 'e') + 'nk' (delete) + '{a'}juk' (<- 'a') + '{e'}j{u"}k' (<- 'e') + 'juk' 'j{u"}k' (delete) + 'uk' '{u"}k' (delete) + 'em' 'om' 'am' (delete) + '{a'}m' (<- 'a') + '{e'}m' (<- 'e') + 'm' (delete) + 'od' 'ed' 'ad' '{o"}d' (delete) + '{a'}d' (<- 'a') + '{e'}d' (<- 'e') + 'd' (delete) + 'ja' 'je' (delete) + 'a' 'e' 'o' (delete) + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define plur_owner as ( + [substring] R1 among( + 'jaim' 'jeim' (delete) + '{a'}im' (<- 'a') + '{e'}im' (<- 'e') + 'aim' 'eim' (delete) + 'im' (delete) + 'jaid' 'jeid' (delete) + '{a'}id' (<- 'a') + '{e'}id' (<- 'e') + 'aid' 'eid' (delete) + 'id' (delete) + 'jai' 'jei' (delete) + '{a'}i' (<- 'a') + '{e'}i' (<- 'e') + 'ai' 'ei' (delete) + 'i' (delete) + 'jaink' 'jeink' (delete) + 'eink' 'aink' (delete) + '{a'}ink' (<- 'a') + '{e'}ink' (<- 'e') + 'ink' + 'jaitok' 'jeitek' (delete) + 'aitok' 'eitek' (delete) + 
'{a'}itok' (<- 'a') + '{e'}itek' (<- 'e') + 'itek' (delete) + 'jeik' 'jaik' (delete) + 'aik' 'eik' (delete) + '{a'}ik' (<- 'a') + '{e'}ik' (<- 'e') + 'ik' (delete) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do instrum + do case + do case_special + do case_other + do factive + do owned + do sing_owner + do plur_owner + do plural + ) +) diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl new file mode 100644 index 000000000..c812e055c --- /dev/null +++ b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl @@ -0,0 +1,241 @@ +/* +Hungarian Stemmer +Removes noun inflections +*/ + +routines ( + mark_regions + R1 + v_ending + case + case_special + case_other + plural + owned + sing_owner + plur_owner + instrum + factive + undouble + double +) + +externals ( stem ) + +integers ( p1 ) +groupings ( v ) + +stringescapes {} + +/* special characters (in Unicode) */ + +stringdef a' hex 'E1' //a-acute +stringdef e' hex 'E9' //e-acute +stringdef i' hex 'ED' //i-acute +stringdef o' hex 'F3' //o-acute +stringdef o" hex 'F6' //o-umlaut +stringdef oq hex '151' //o-double acute +stringdef u' hex 'FA' //u-acute +stringdef u" hex 'FC' //u-umlaut +stringdef uq hex '171' //u-double acute + +define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' + +define mark_regions as ( + + $p1 = limit + + (v goto non-v + among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next + setmark p1) + or + + (non-v gopast v setmark p1) +) + +backwardmode ( + + define R1 as $p1 <= cursor + + define v_ending as ( + [substring] R1 among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define double as ( + test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' + 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') + ) + + define undouble as ( + next [hop 1] delete + ) + + define instrum as( + [substring] R1 among( + 'al' (double) + 'el' (double) + ) + delete + undouble + ) + + + define case as ( + 
[substring] R1 among( + 'ban' 'ben' + 'ba' 'be' + 'ra' 're' + 'nak' 'nek' + 'val' 'vel' + 't{o'}l' 't{oq}l' + 'r{o'}l' 'r{oq}l' + 'b{o'}l' 'b{oq}l' + 'hoz' 'hez' 'h{o"}z' + 'n{a'}l' 'n{e'}l' + 'ig' + 'at' 'et' 'ot' '{o"}t' + '{e'}rt' + 'k{e'}pp' 'k{e'}ppen' + 'kor' + 'ul' '{u"}l' + 'v{a'}' 'v{e'}' + 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' + 'k{e'}nt' + 'en' 'on' 'an' '{o"}n' + 'n' + 't' + ) + delete + v_ending + ) + + define case_special as( + [substring] R1 among( + '{e'}n' (<- 'e') + '{a'}n' (<- 'a') + '{a'}nk{e'}nt' (<- 'a') + ) + ) + + define case_other as( + [substring] R1 among( + 'astul' 'est{u"}l' (delete) + 'stul' 'st{u"}l' (delete) + '{a'}stul' (<- 'a') + '{e'}st{u"}l' (<- 'e') + ) + ) + + define factive as( + [substring] R1 among( + '{a'}' (double) + '{e'}' (double) + ) + delete + undouble + ) + + define plural as ( + [substring] R1 among( + '{a'}k' (<- 'a') + '{e'}k' (<- 'e') + '{o"}k' (delete) + 'ak' (delete) + 'ok' (delete) + 'ek' (delete) + 'k' (delete) + ) + ) + + define owned as ( + [substring] R1 among ( + 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) + '{e'}k{e'}' (<- 'e') + '{a'}k{e'}' (<- 'a') + 'k{e'}' (delete) + '{e'}{e'}i' (<- 'e') + '{a'}{e'}i' (<- 'a') + '{e'}i' (delete) + '{e'}{e'}' (<- 'e') + '{e'}' (delete) + ) + ) + + define sing_owner as ( + [substring] R1 among( + '{u"}nk' 'unk' (delete) + '{a'}nk' (<- 'a') + '{e'}nk' (<- 'e') + 'nk' (delete) + '{a'}juk' (<- 'a') + '{e'}j{u"}k' (<- 'e') + 'juk' 'j{u"}k' (delete) + 'uk' '{u"}k' (delete) + 'em' 'om' 'am' (delete) + '{a'}m' (<- 'a') + '{e'}m' (<- 'e') + 'm' (delete) + 'od' 'ed' 'ad' '{o"}d' (delete) + '{a'}d' (<- 'a') + '{e'}d' (<- 'e') + 'd' (delete) + 'ja' 'je' (delete) + 'a' 'e' 'o' (delete) + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define plur_owner as ( + [substring] R1 among( + 'jaim' 'jeim' (delete) + '{a'}im' (<- 'a') + '{e'}im' (<- 'e') + 'aim' 'eim' (delete) + 'im' (delete) + 'jaid' 'jeid' (delete) + '{a'}id' (<- 'a') + '{e'}id' (<- 'e') + 'aid' 'eid' (delete) + 'id' 
(delete) + 'jai' 'jei' (delete) + '{a'}i' (<- 'a') + '{e'}i' (<- 'e') + 'ai' 'ei' (delete) + 'i' (delete) + 'jaink' 'jeink' (delete) + 'eink' 'aink' (delete) + '{a'}ink' (<- 'a') + '{e'}ink' (<- 'e') + 'ink' + 'jaitok' 'jeitek' (delete) + 'aitok' 'eitek' (delete) + '{a'}itok' (<- 'a') + '{e'}itek' (<- 'e') + 'itek' (delete) + 'jeik' 'jaik' (delete) + 'aik' 'eik' (delete) + '{a'}ik' (<- 'a') + '{e'}ik' (<- 'e') + 'ik' (delete) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do instrum + do case + do case_special + do case_other + do factive + do owned + do sing_owner + do plur_owner + do plural + ) +) diff --git a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..8d25cf64f --- /dev/null +++ b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl @@ -0,0 +1,195 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v AEIO CG ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a' hex 'E1' +stringdef a` hex 'E0' +stringdef e' hex 'E9' +stringdef e` hex 'E8' +stringdef i' hex 'ED' +stringdef i` hex 'EC' +stringdef o' hex 'F3' +stringdef o` hex 'F2' +stringdef u' hex 'FA' +stringdef u` hex 'F9' + +define v 'aeiou{a`}{e`}{i`}{o`}{u`}' + +define prelude as ( + test repeat ( + [substring] among( + '{a'}' (<- '{a`}') + '{e'}' (<- '{e`}') + '{i'}' (<- '{i`}') + '{o'}' (<- '{o`}') + '{u'}' (<- '{u`}') + 'qu' (<- 'qU') + '' (next) + ) + ) + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define 
postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'ci' 'gli' 'la' 'le' 'li' 'lo' + 'mi' 'ne' 'si' 'ti' 'vi' + // the compound forms are: + 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' + 'mela' 'mele' 'meli' 'melo' 'mene' + 'tela' 'tele' 'teli' 'telo' 'tene' + 'cela' 'cele' 'celi' 'celo' 'cene' + 'vela' 'vele' 'veli' 'velo' 'vene' + ) + among( (RV) + 'ando' 'endo' (delete) + 'ar' 'er' 'ir' (<- 'e') + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' + 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' + 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' + 'atrice' 'atrici' + 'ante' 'anti' // Note 1 + ( R2 delete ) + 'azione' 'azioni' 'atore' 'atori' + ( R2 delete + try ( ['ic'] R2 delete ) + ) + 'logia' 'logie' + ( R2 <- 'log' ) + 'uzione' 'uzioni' 'usione' 'usioni' + ( R2 <- 'u' ) + 'enza' 'enze' + ( R2 <- 'ente' ) + 'amento' 'amenti' 'imento' 'imenti' + ( RV delete ) + 'amente' ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' ( ['at'] R2 delete ) + 'os' 'ic' 'abil' + ) + ) + ) + 'it{a`}' ( + R2 delete + try ( + [substring] among( + 'abil' 'ic' 'iv' (R2 delete) + ) + ) + ) + 'ivo' 'ivi' 'iva' 'ive' ( + R2 delete + try ( ['at'] R2 delete ['ic'] R2 delete ) + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' + 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' + 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' + 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' + 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' + 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' + 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 
'irebbero' 'irei' + 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' + 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' + 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' + 'ono' 'uta' 'ute' 'uti' 'uto' + + 'ar' 'ir' // but 'er' is problematical + (delete) + ) + ) + + define AEIO 'aeio{a`}{e`}{i`}{o`}' + define CG 'cg' + + define vowel_suffix as ( + try ( + [AEIO] RV delete + ['i'] RV delete + ) + try ( + ['h'] CG RV delete + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do attached_pronoun + do (standard_suffix or verb_suffix) + do vowel_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ + diff --git a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..b43295c09 --- /dev/null +++ b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,195 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v AEIO CG ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a' hex 'A0' +stringdef a` hex '85' +stringdef e' hex '82' +stringdef e` hex '8A' +stringdef i' hex 'A1' +stringdef i` hex '8D' +stringdef o' hex 'A2' +stringdef o` hex '95' +stringdef u' hex 'A3' +stringdef u` hex '97' + +define v 'aeiou{a`}{e`}{i`}{o`}{u`}' + +define prelude as ( + test repeat ( + [substring] among( + '{a'}' (<- '{a`}') + '{e'}' (<- '{e`}') + '{i'}' (<- '{i`}') + '{o'}' (<- '{o`}') + '{u'}' (<- '{u`}') + 'qu' (<- 'qU') + '' (next) + ) + ) + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v 
gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'ci' 'gli' 'la' 'le' 'li' 'lo' + 'mi' 'ne' 'si' 'ti' 'vi' + // the compound forms are: + 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' + 'mela' 'mele' 'meli' 'melo' 'mene' + 'tela' 'tele' 'teli' 'telo' 'tene' + 'cela' 'cele' 'celi' 'celo' 'cene' + 'vela' 'vele' 'veli' 'velo' 'vene' + ) + among( (RV) + 'ando' 'endo' (delete) + 'ar' 'er' 'ir' (<- 'e') + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' + 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' + 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' + 'atrice' 'atrici' + 'ante' 'anti' // Note 1 + ( R2 delete ) + 'azione' 'azioni' 'atore' 'atori' + ( R2 delete + try ( ['ic'] R2 delete ) + ) + 'logia' 'logie' + ( R2 <- 'log' ) + 'uzione' 'uzioni' 'usione' 'usioni' + ( R2 <- 'u' ) + 'enza' 'enze' + ( R2 <- 'ente' ) + 'amento' 'amenti' 'imento' 'imenti' + ( RV delete ) + 'amente' ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' ( ['at'] R2 delete ) + 'os' 'ic' 'abil' + ) + ) + ) + 'it{a`}' ( + R2 delete + try ( + [substring] among( + 'abil' 'ic' 'iv' (R2 delete) + ) + ) + ) + 'ivo' 'ivi' 'iva' 'ive' ( + R2 delete + try ( ['at'] R2 delete ['ic'] R2 delete ) + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' + 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' + 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' + 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' + 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' + 'eva' 'evamo' 'evano' 'evate' 'evi' 
'evo' 'Yamo' 'iamo' 'immo' + 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' + 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' + 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' + 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' + 'ono' 'uta' 'ute' 'uti' 'uto' + + 'ar' 'ir' // but 'er' is problematical + (delete) + ) + ) + + define AEIO 'aeio{a`}{e`}{i`}{o`}' + define CG 'cg' + + define vowel_suffix as ( + try ( + [AEIO] RV delete + ['i'] RV delete + ) + try ( + ['h'] CG RV delete + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do attached_pronoun + do (standard_suffix or verb_suffix) + do vowel_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ + diff --git a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..cd79d1280 --- /dev/null +++ b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl @@ -0,0 +1,245 @@ +strings ( ch ) +integers ( x p1 p2 ) +booleans ( Y_found stemmed GE_removed ) + +routines ( + + R1 R2 + C V VX + lengthen_V + Step_1 Step_2 Step_3 Step_4 Step_7 + Step_6 Step_1c + Lose_prefix + Lose_infix + measure +) + +externals ( stem ) + +groupings ( v v_WX AOU AIOU ) + +stringescapes {} + +stringdef ' hex '27' // yuk + +define v 'aeiouy' +define v_WX v + 'wx' +define AOU 'aou' +define AIOU 'aiou' + +backwardmode ( + + define R1 as (setmark x $x >= p1) + define R2 as (setmark x $x >= p2) + + define V as test (v or 'ij') + define VX as test (next v or 'ij') + define C as test (not 'ij' non-v) + + define lengthen_V as do ( + non-v_WX [ (AOU] test (non-v or atlimit)) or + ('e'] test (non-v or atlimit + not AIOU + not (next AIOU non-v))) + ->ch insert ch + ) + + define Step_1 as + ( + [among ( (]) + + '{'}s' (delete) + 's' (R1 not ('t' R1) C delete) + 'ies' (R1 <-'ie') + 'es' + (('ar' R1 C ] delete lengthen_V) or + ('er' R1 C ] delete) or 
+ (R1 C <-'e')) + + 'aus' (R1 V <-'au') + 'en' (('hed' R1 ] <-'heid') or + ('nd' delete) or + ('d' R1 C ] delete) or + ('i' or 'j' V delete) or + (R1 C delete lengthen_V)) + 'nde' (<-'nd') + ) + ) + + define Step_2 as + ( + [among ( (]) + 'je' (('{'}t' ] delete) or + ('et' ] R1 C delete) or + ('rnt' ] <-'rn') or + ('t' ] R1 VX delete) or + ('ink' ] <-'ing') or + ('mp' ] <-'m') or + ('{'}' ] R1 delete) or + (] R1 C delete)) + 'ge' (R1 <-'g') + 'lijke'(R1 <-'lijk') + 'ische'(R1 <-'isch') + 'de' (R1 C delete) + 'te' (R1 <-'t') + 'se' (R1 <-'s') + 're' (R1 <-'r') + 'le' (R1 delete attach 'l' lengthen_V) + 'ene' (R1 C delete attach 'en' lengthen_V) + 'ieve' (R1 C <-'ief') + ) + ) + + define Step_3 as + ( + [among ( (]) + 'atie' (R1 <-'eer') + 'iteit' (R1 delete lengthen_V) + 'heid' + 'sel' + 'ster' (R1 delete) + 'rder' (<-'r') + 'ing' + 'isme' + 'erij' (R1 delete lengthen_V) + 'arij' (R1 C <-'aar') + 'fie' (R2 delete attach 'f' lengthen_V) + 'gie' (R2 delete attach 'g' lengthen_V) + 'tst' (R1 C <-'t') + 'dst' (R1 C <-'d') + ) + ) + + define Step_4 as + ( + ( [among ( (]) + 'ioneel' (R1 <-'ie') + 'atief' (R1 <-'eer') + 'baar' (R1 delete) + 'naar' (R1 V <-'n') + 'laar' (R1 V <-'l') + 'raar' (R1 V <-'r') + 'tant' (R1 <-'teer') + 'lijker' + 'lijkst' (R1 <-'lijk') + 'achtig' + 'achtiger' + 'achtigst'(R1 delete) + 'eriger' + 'erigst' + 'erig' + 'end' (R1 C delete lengthen_V) + ) + ) + or + ( [among ( (]) + 'iger' + 'igst' + 'ig' (R1 C delete lengthen_V) + ) + ) + ) + + define Step_7 as + ( + [among ( (]) + 'kt' (<-'k') + 'ft' (<-'f') + 'pt' (<-'p') + ) + ) + + define Step_6 as + ( + [among ( (]) + 'bb' (<-'b') + 'cc' (<-'c') + 'dd' (<-'d') + 'ff' (<-'f') + 'gg' (<-'g') + 'hh' (<-'h') + 'jj' (<-'j') + 'kk' (<-'k') + 'll' (<-'l') + 'mm' (<-'m') + 'nn' (<-'n') + 'pp' (<-'p') + 'qq' (<-'q') + 'rr' (<-'r') + 'ss' (<-'s') + 'tt' (<-'t') + 'vv' (<-'v') + 'ww' (<-'w') + 'xx' (<-'x') + 'zz' (<-'z') + 'v' (<-'f') + 'z' (<-'s') + ) + ) + + define Step_1c as + ( + [among ( (] R1 C) + 'd' 
(not ('n' R1) delete) + 't' (not ('h' R1) delete) + ) + ) +) + +define Lose_prefix as ( + ['ge'] test hop 3 (goto v goto non-v) + set GE_removed + delete +) + +define Lose_infix as ( + next + gopast (['ge']) test hop 3 (goto v goto non-v) + set GE_removed + delete +) + +define measure as ( + do ( + tolimit + setmark p1 + setmark p2 + ) + do( + repeat non-v atleast 1 ('ij' or v) non-v setmark p1 + repeat non-v atleast 1 ('ij' or v) non-v setmark p2 + ) + +) +define stem as ( + + unset Y_found + unset stemmed + do ( ['y'] <-'Y' set Y_found ) + do repeat(goto (v ['y'])<-'Y' set Y_found ) + + measure + + backwards ( + do (Step_1 set stemmed ) + do (Step_2 set stemmed ) + do (Step_3 set stemmed ) + do (Step_4 set stemmed ) + ) + unset GE_removed + do (Lose_prefix and measure) + backwards ( + do (GE_removed Step_1c) + ) + unset GE_removed + do (Lose_infix and measure) + backwards ( + do (GE_removed Step_1c) + ) + backwards ( + do (Step_7 set stemmed ) + do (stemmed or GE_removed Step_6) + ) + do(Y_found repeat(goto (['Y']) <-'y')) +) + diff --git a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..3f69f1572 --- /dev/null +++ b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl @@ -0,0 +1,208 @@ + +stringescapes {} + +routines ( + A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC + + endings + + undouble respell +) + +externals ( stem ) + +backwardmode ( + + /* Lovins' conditions A, B ... CC, as given in her Appendix B, where + a test for a two letter prefix ('test hop 2') is implicitly + assumed. Note that 'e' next 'u' corresponds to her u*e because + Snowball is scanning backwards. 
*/ + + define A as ( hop 2 ) + define B as ( hop 3 ) + define C as ( hop 4 ) + define D as ( hop 5 ) + define E as ( test hop 2 not 'e' ) + define F as ( test hop 3 not 'e' ) + define G as ( test hop 3 'f' ) + define H as ( test hop 2 't' or 'll' ) + define I as ( test hop 2 not 'o' not 'e' ) + define J as ( test hop 2 not 'a' not 'e' ) + define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) + define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) + define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) + define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) + define O as ( test hop 2 'l' or 'i' ) + define P as ( test hop 2 not 'c' ) + define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) + define R as ( test hop 2 'n' or 'r' ) + define S as ( test hop 2 'dr' or ('t' not 't') ) + define T as ( test hop 2 's' or ('t' not 'o') ) + define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) + define V as ( test hop 2 'c' ) + define W as ( test hop 2 not 's' not 'u' ) + define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) + define Y as ( test hop 2 'in' ) + define Z as ( test hop 2 not 'f' ) + define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' + 'es' 't' ) ) + define BB as ( test hop 3 not 'met' not 'ryst' ) + define CC as ( test hop 2 'l' ) + + + /* The system of endings, as given in Appendix A. 
*/ + + define endings as ( + [substring] among( + 'alistically' B 'arizability' A 'izationally' B + + 'antialness' A 'arisations' A 'arizations' A 'entialness' A + + 'allically' C 'antaneous' A 'antiality' A 'arisation' A + 'arization' A 'ationally' B 'ativeness' A 'eableness' E + 'entations' A 'entiality' A 'entialize' A 'entiation' A + 'ionalness' A 'istically' A 'itousness' A 'izability' A + 'izational' A + + 'ableness' A 'arizable' A 'entation' A 'entially' A + 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A + 'ionality' A 'ionalize' A 'iousness' A 'izations' A + 'lessness' A + + 'ability' A 'aically' A 'alistic' B 'alities' A + 'ariness' E 'aristic' A 'arizing' A 'ateness' A + 'atingly' A 'ational' B 'atively' A 'ativism' A + 'elihood' E 'encible' A 'entally' A 'entials' A + 'entiate' A 'entness' A 'fulness' A 'ibility' A + 'icalism' A 'icalist' A 'icality' A 'icalize' A + 'ication' G 'icianry' A 'ination' A 'ingness' A + 'ionally' A 'isation' A 'ishness' A 'istical' A + 'iteness' A 'iveness' A 'ivistic' A 'ivities' A + 'ization' F 'izement' A 'oidally' A 'ousness' A + + 'aceous' A 'acious' B 'action' G 'alness' A + 'ancial' A 'ancies' A 'ancing' B 'ariser' A + 'arized' A 'arizer' A 'atable' A 'ations' B + 'atives' A 'eature' Z 'efully' A 'encies' A + 'encing' A 'ential' A 'enting' C 'entist' A + 'eously' A 'ialist' A 'iality' A 'ialize' A + 'ically' A 'icance' A 'icians' A 'icists' A + 'ifully' A 'ionals' A 'ionate' D 'ioning' A + 'ionist' A 'iously' A 'istics' A 'izable' E + 'lessly' A 'nesses' A 'oidism' A + + 'acies' A 'acity' A 'aging' B 'aical' A + 'alist' A 'alism' B 'ality' A 'alize' A + 'allic'BB 'anced' B 'ances' B 'antic' C + 'arial' A 'aries' A 'arily' A 'arity' B + 'arize' A 'aroid' A 'ately' A 'ating' I + 'ation' B 'ative' A 'ators' A 'atory' A + 'ature' E 'early' Y 'ehood' A 'eless' A + 'elity' A 'ement' A 'enced' A 'ences' A + 'eness' E 'ening' E 'ental' A 'ented' C + 'ently' A 'fully' A 'ially' A 'icant' A + 'ician' A 'icide' A 'icism' A 
'icist' A + 'icity' A 'idine' I 'iedly' A 'ihood' A + 'inate' A 'iness' A 'ingly' B 'inism' J + 'inity'CC 'ional' A 'ioned' A 'ished' A + 'istic' A 'ities' A 'itous' A 'ively' A + 'ivity' A 'izers' F 'izing' F 'oidal' A + 'oides' A 'otide' A 'ously' A + + 'able' A 'ably' A 'ages' B 'ally' B + 'ance' B 'ancy' B 'ants' B 'aric' A + 'arly' K 'ated' I 'ates' A 'atic' B + 'ator' A 'ealy' Y 'edly' E 'eful' A + 'eity' A 'ence' A 'ency' A 'ened' E + 'enly' E 'eous' A 'hood' A 'ials' A + 'ians' A 'ible' A 'ibly' A 'ical' A + 'ides' L 'iers' A 'iful' A 'ines' M + 'ings' N 'ions' B 'ious' A 'isms' B + 'ists' A 'itic' H 'ized' F 'izer' F + 'less' A 'lily' A 'ness' A 'ogen' A + 'ward' A 'wise' A 'ying' B 'yish' A + + 'acy' A 'age' B 'aic' A 'als'BB + 'ant' B 'ars' O 'ary' F 'ata' A + 'ate' A 'eal' Y 'ear' Y 'ely' E + 'ene' E 'ent' C 'ery' E 'ese' A + 'ful' A 'ial' A 'ian' A 'ics' A + 'ide' L 'ied' A 'ier' A 'ies' P + 'ily' A 'ine' M 'ing' N 'ion' Q + 'ish' C 'ism' B 'ist' A 'ite'AA + 'ity' A 'ium' A 'ive' A 'ize' F + 'oid' A 'one' R 'ous' A + + 'ae' A 'al'BB 'ar' X 'as' B + 'ed' E 'en' F 'es' E 'ia' A + 'ic' A 'is' A 'ly' B 'on' S + 'or' T 'um' U 'us' V 'yl' R + '{'}s' A 's{'}' A + + 'a' A 'e' A 'i' A 'o' A + 's' W 'y' B + + (delete) + ) + ) + + /* Undoubling is rule 1 of appendix C. */ + + define undouble as ( + test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' + 'tt') + [next] delete + ) + + /* The other appendix C rules can be done together. 
*/ + + define respell as ( + [substring] among ( + 'iev' (<-'ief') + 'uct' (<-'uc') + 'umpt' (<-'um') + 'rpt' (<-'rb') + 'urs' (<-'ur') + 'istr' (<-'ister') + 'metr' (<-'meter') + 'olv' (<-'olut') + 'ul' (not 'a' not 'i' not 'o' <-'l') + 'bex' (<-'bic') + 'dex' (<-'dic') + 'pex' (<-'pic') + 'tex' (<-'tic') + 'ax' (<-'ac') + 'ex' (<-'ec') + 'ix' (<-'ic') + 'lux' (<-'luc') + 'uad' (<-'uas') + 'vad' (<-'vas') + 'cid' (<-'cis') + 'lid' (<-'lis') + 'erid' (<-'eris') + 'pand' (<-'pans') + 'end' (not 's' <-'ens') + 'ond' (<-'ons') + 'lud' (<-'lus') + 'rud' (<-'rus') + 'her' (not 'p' not 't' <-'hes') + 'mit' (<-'mis') + 'ent' (not 'm' <-'ens') + /* 'ent' was 'end' in the 1968 paper - a typo. */ + 'ert' (<-'ers') + 'et' (not 'n' <-'es') + 'yt' (<-'ys') + 'yz' (<-'ys') + ) + ) +) + +define stem as ( + + backwards ( + do endings + do undouble + do respell + ) +) + diff --git a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..94a071653 --- /dev/null +++ b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl @@ -0,0 +1,80 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef ae hex 'E6' +stringdef ao hex 'E5' +stringdef o/ hex 'F8' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'bcdfghjlmnoprtvyz' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' + 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' + 'hetens' 'ers' 'ets' 'et' 'het' 'ast' + (delete) + 's' + (s_ending or ('k' non-v) delete) + 'erte' 'ert' + (<-'er') + ) + ) + + define 
consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'dt' 'vt' + ) + ) + next] delete + ) + + define other_suffix as ( + setlimit tomark p1 for ([substring]) + among( + 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' + 'hetslov' + (delete) + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..f57483354 --- /dev/null +++ b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,80 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef ae hex '91' +stringdef ao hex '86' +stringdef o/ hex '9B' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'bcdfghjlmnoprtvyz' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' + 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' + 'hetens' 'ers' 'ets' 'et' 'het' 'ast' + (delete) + 's' + (s_ending or ('k' non-v) delete) + 'erte' 'ert' + (<-'er') + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'dt' 'vt' + ) + ) + next] delete + ) + + define other_suffix as ( + setlimit tomark p1 for ([substring]) + among( + 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' + 'hetslov' + (delete) + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git 
a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..9533b7932 --- /dev/null +++ b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl @@ -0,0 +1,139 @@ +integers ( p1 p2 ) +booleans ( Y_found ) + +routines ( + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b +) + +externals ( stem ) + +groupings ( v v_WXY ) + +define v 'aeiouy' +define v_WXY v + 'wxY' + +backwardmode ( + + define shortv as ( non-v_WXY v non-v ) + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define Step_1a as ( + [substring] among ( + 'sses' (<-'ss') + 'ies' (<-'i') + 'ss' () + 's' (delete) + ) + ) + + define Step_1b as ( + [substring] among ( + 'eed' (R1 <-'ee') + 'ed' + 'ing' ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) + + define Step_1c as ( + ['y' or 'Y'] + gopast v + <-'i' + ) + + define Step_2 as ( + [substring] R1 among ( + 'tional' (<-'tion') + 'enci' (<-'ence') + 'anci' (<-'ance') + 'abli' (<-'able') + 'entli' (<-'ent') + 'eli' (<-'e') + 'izer' 'ization' + (<-'ize') + 'ational' 'ation' 'ator' + (<-'ate') + 'alli' (<-'al') + 'alism' 'aliti' + (<-'al') + 'fulness' (<-'ful') + 'ousli' 'ousness' + (<-'ous') + 'iveness' 'iviti' + (<-'ive') + 'biliti' (<-'ble') + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'alize' (<-'al') + 'icate' 'iciti' 'ical' + (<-'ic') + 'ative' 'ful' 'ness' + (delete) + ) + ) + + define Step_4 as ( + [substring] R2 among ( + 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' + 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' + (delete) + 'ion' ('s' or 't' delete) + ) + ) + + define Step_5a as ( + ['e'] + R2 or (R1 not shortv) + delete + ) + + define Step_5b as ( + ['l'] + R2 'l' + delete + ) +) + +define stem 
as ( + + unset Y_found + do ( ['y'] <-'Y' set Y_found) + do repeat(goto (v ['y']) <-'Y' set Y_found) + + $p1 = limit + $p2 = limit + do( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) + + backwards ( + do Step_1a + do Step_1b + do Step_1c + do Step_2 + do Step_3 + do Step_4 + do Step_5a + do Step_5b + ) + + do(Y_found repeat(goto (['Y']) <-'y')) + +) diff --git a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..3e7da08d4 --- /dev/null +++ b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl @@ -0,0 +1,218 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + verb_suffix + residual_suffix + residual_form +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a' hex 'E1' // a-acute +stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico +stringdef e' hex 'E9' // e-acute +stringdef e^ hex 'EA' // e-circumflex +stringdef i' hex 'ED' // i-acute +stringdef o^ hex 'F4' // o-circumflex +stringdef o' hex 'F3' // o-acute +stringdef u' hex 'FA' // u-acute +stringdef c, hex 'E7' // c-cedilla + +stringdef a~ hex 'E3' // a-tilde +stringdef o~ hex 'F5' // o-tilde + + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' + +define prelude as repeat ( + [substring] among( + '{a~}' (<- 'a~') + '{o~}' (<- 'o~') + '' (next) + ) //or next +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + 'a~' (<- '{a~}') + 'o~' (<- '{o~}') + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= 
cursor + + define standard_suffix as ( + [substring] among( + + 'eza' 'ezas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + '{a'}vel' + '{i'}vel' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amento' 'amentos' + 'imento' 'imentos' + + 'adora' 'ador' 'a{c,}a~o' + 'adoras' 'adores' 'a{c,}o~es' // no -ic test + 'ante' 'antes' '{a^}ncia' // Note 1 + ( + R2 delete + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + '{e^}ncia' '{e^}ncias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'avel' + '{i'}vel' (R2 delete) + ) + ) + ) + 'idade' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + 'ira' 'iras' + ( + RV 'e' // -eira -eiras usually non-verbal + <- 'ir' + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' + 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' + 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' + 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' + 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' + 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' + 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' + 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' + 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' + 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' + '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' + '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' + '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' + 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' 
+ 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' + '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' + + 'ira' 'iras' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + ) + ) + + define residual_form as ( + [substring] among( + 'e' '{e'}' '{e^}' + ( RV delete [('u'] test 'g') or + ('i'] test 'c') RV delete ) + '{c,}' (<-'c') + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do ( + ( ( standard_suffix or verb_suffix ) + and do ( ['i'] test 'c' RV delete ) + ) + or residual_suffix + ) + do residual_form + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..4d6c85214 --- /dev/null +++ b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,218 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + verb_suffix + residual_suffix + residual_form +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a' hex 'A0' // a-acute +stringdef a^ hex '83' // a-circumflex e.g. 
'bota^nico +stringdef e' hex '82' // e-acute +stringdef e^ hex '88' // e-circumflex +stringdef i' hex 'A1' // i-acute +stringdef o^ hex '93' // o-circumflex +stringdef o' hex 'A2' // o-acute +stringdef u' hex 'A3' // u-acute +stringdef c, hex '87' // c-cedilla + +stringdef a~ hex 'C6' // a-tilde +stringdef o~ hex 'E4' // o-tilde + + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' + +define prelude as repeat ( + [substring] among( + '{a~}' (<- 'a~') + '{o~}' (<- 'o~') + '' (next) + ) //or next +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + 'a~' (<- '{a~}') + 'o~' (<- '{o~}') + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'eza' 'ezas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + '{a'}vel' + '{i'}vel' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amento' 'amentos' + 'imento' 'imentos' + + 'adora' 'ador' 'a{c,}a~o' + 'adoras' 'adores' 'a{c,}o~es' // no -ic test + 'ante' 'antes' '{a^}ncia' // Note 1 + ( + R2 delete + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + '{e^}ncia' '{e^}ncias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'avel' + '{i'}vel' (R2 delete) + ) + ) + ) + 'idade' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + 'ira' 
'iras' + ( + RV 'e' // -eira -eiras usually non-verbal + <- 'ir' + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' + 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' + 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' + 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' + 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' + 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' + 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' + 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' + 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' + 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' + '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' + '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' + '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' + 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' + 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' + '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' + + 'ira' 'iras' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + ) + ) + + define residual_form as ( + [substring] among( + 'e' '{e'}' '{e^}' + ( RV delete [('u'] test 'g') or + ('i'] test 'c') RV delete ) + '{c,}' (<-'c') + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do ( + ( ( standard_suffix or verb_suffix ) + and do ( ['i'] test 'c' RV delete ) + ) + or residual_suffix + ) + do residual_form + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl new file mode 100644 index 000000000..48a148321 --- /dev/null +++ b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl @@ -0,0 +1,236 @@ + +routines ( + 
prelude postlude mark_regions + RV R1 R2 + step_0 + standard_suffix combo_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +booleans ( standard_suffix_removed ) + +stringescapes {} + +/* special characters */ + +stringdef a^ hex 'E2' // a circumflex +stringdef i^ hex 'EE' // i circumflex +stringdef a+ hex 'E3' // a breve +stringdef s, hex 'BA' // s cedilla +stringdef t, hex 'FE' // t cedilla + +define v 'aeiou{a^}{i^}{a+}' + +define prelude as ( + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define step_0 as ( + [substring] R1 among( + 'ul' 'ului' + ( delete ) + 'aua' + ( <-'a' ) + 'ea' 'ele' 'elor' + ( <-'e' ) + 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' + ( <-'i') + 'ile' + ( not 'ab' <- 'i' ) + 'atei' + ( <- 'at' ) + 'a{t,}ie' 'a{t,}ia' + ( <- 'a{t,}i' ) + ) + ) + + define combo_suffix as test ( + [substring] R1 ( + among( + /* 'IST'. 
alternative: include the following + 'alism' 'alisme' + 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( + <- 'al' + ) + */ + 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( + <- 'abil' + ) + 'ibilitate' ( + <- 'ibil' + ) + 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( + <- 'iv' + ) + 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' + 'icator' 'icatori' + 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' + 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( + <- 'ic' + ) + 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' + 'atoare' 'ator' 'atori' + '{a+}toare' '{a+}tor' '{a+}tori' ( + <- 'at' + ) + 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' + 'itoare' 'itor' 'itori' ( + <- 'it' + ) + ) + set standard_suffix_removed + ) + ) + + define standard_suffix as ( + unset standard_suffix_removed + repeat combo_suffix + [substring] R2 ( + among( + + // past participle is treated here, rather than + // as a verb ending: + 'at' 'ata' 'at{a+}' 'ati' 'ate' + 'ut' 'uta' 'ut{a+}' 'uti' 'ute' + 'it' 'ita' 'it{a+}' 'iti' 'ite' + + 'ic' 'ica' 'ice' 'ici' 'ic{a+}' + 'abil' 'abila' 'abile' 'abili' 'abil{a+}' + 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' + 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' + 'ant' 'anta' 'ante' 'anti' 'ant{a+}' + 'ator' 'atori' + 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' + 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( + delete + ) + 'iune' 'iuni' ( + '{t,}'] <- 't' + ) + 'ism' 'isme' + 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( + <- 'ist' + /* 'IST'. 
alternative: remove with <- '' */ + ) + ) + set standard_suffix_removed + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + // 'long' infinitive: + 'are' 'ere' 'ire' '{a^}re' + + // gerund: + 'ind' '{a^}nd' + 'indu' '{a^}ndu' + + 'eze' + 'easc{a+}' + // present: + 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' + 'e{s,}te' + '{a+}sc' '{a+}{s,}ti' + '{a+}{s,}te' + + // imperfect: + 'am' 'ai' 'au' + 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' + 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' + + // past: // (not 'ii') + 'ui' + 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' + 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' + 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' + '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' + + // pluferfect: + 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' + 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' + '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' + '{a^}ser{a+}' + 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' + + ( non-v or 'u' delete ) + + // present: + '{a+}m' 'a{t,}i' + 'em' 'e{t,}i' + 'im' 'i{t,}i' + '{a^}m' '{a^}{t,}i' + + // past: + 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' + 'sei' 'se' + + // pluperfect: + 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' + (delete) + ) + ) + + define vowel_suffix as ( + [substring] RV among ( + 'a' 'e' 'i' 'ie' '{a+}' ( delete ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do step_0 + do standard_suffix + do ( standard_suffix_removed or verb_suffix ) + do vowel_suffix + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl new file mode 100644 index 000000000..09aec6429 --- /dev/null +++ b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl @@ -0,0 +1,236 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + step_0 + standard_suffix combo_suffix + verb_suffix + vowel_suffix +) + 
+externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +booleans ( standard_suffix_removed ) + +stringescapes {} + +/* special characters */ + +stringdef a^ hex '0E2' // a circumflex +stringdef i^ hex '0EE' // i circumflex +stringdef a+ hex '103' // a breve +stringdef s, hex '15F' // s cedilla +stringdef t, hex '163' // t cedilla + +define v 'aeiou{a^}{i^}{a+}' + +define prelude as ( + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define step_0 as ( + [substring] R1 among( + 'ul' 'ului' + ( delete ) + 'aua' + ( <-'a' ) + 'ea' 'ele' 'elor' + ( <-'e' ) + 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' + ( <-'i') + 'ile' + ( not 'ab' <- 'i' ) + 'atei' + ( <- 'at' ) + 'a{t,}ie' 'a{t,}ia' + ( <- 'a{t,}i' ) + ) + ) + + define combo_suffix as test ( + [substring] R1 ( + among( + /* 'IST'. 
alternative: include the following + 'alism' 'alisme' + 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( + <- 'al' + ) + */ + 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( + <- 'abil' + ) + 'ibilitate' ( + <- 'ibil' + ) + 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( + <- 'iv' + ) + 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' + 'icator' 'icatori' + 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' + 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( + <- 'ic' + ) + 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' + 'atoare' 'ator' 'atori' + '{a+}toare' '{a+}tor' '{a+}tori' ( + <- 'at' + ) + 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' + 'itoare' 'itor' 'itori' ( + <- 'it' + ) + ) + set standard_suffix_removed + ) + ) + + define standard_suffix as ( + unset standard_suffix_removed + repeat combo_suffix + [substring] R2 ( + among( + + // past participle is treated here, rather than + // as a verb ending: + 'at' 'ata' 'at{a+}' 'ati' 'ate' + 'ut' 'uta' 'ut{a+}' 'uti' 'ute' + 'it' 'ita' 'it{a+}' 'iti' 'ite' + + 'ic' 'ica' 'ice' 'ici' 'ic{a+}' + 'abil' 'abila' 'abile' 'abili' 'abil{a+}' + 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' + 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' + 'ant' 'anta' 'ante' 'anti' 'ant{a+}' + 'ator' 'atori' + 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' + 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( + delete + ) + 'iune' 'iuni' ( + '{t,}'] <- 't' + ) + 'ism' 'isme' + 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( + <- 'ist' + /* 'IST'. 
alternative: remove with <- '' */ + ) + ) + set standard_suffix_removed + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + // 'long' infinitive: + 'are' 'ere' 'ire' '{a^}re' + + // gerund: + 'ind' '{a^}nd' + 'indu' '{a^}ndu' + + 'eze' + 'easc{a+}' + // present: + 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' + 'e{s,}te' + '{a+}sc' '{a+}{s,}ti' + '{a+}{s,}te' + + // imperfect: + 'am' 'ai' 'au' + 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' + 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' + + // past: // (not 'ii') + 'ui' + 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' + 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' + 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' + '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' + + // pluperfect: + 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' + 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' + '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' + '{a^}ser{a+}' + 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' + + ( non-v or 'u' delete ) + + // present: + '{a+}m' 'a{t,}i' + 'em' 'e{t,}i' + 'im' 'i{t,}i' + '{a^}m' '{a^}{t,}i' + + // past: + 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' + 'sei' 'se' + + // pluperfect: + 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' + (delete) + ) + ) + + define vowel_suffix as ( + [substring] RV among ( + 'a' 'e' 'i' 'ie' '{a+}' ( delete ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do step_0 + do standard_suffix + do ( standard_suffix_removed or verb_suffix ) + do vowel_suffix + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl new file mode 100644 index 000000000..cdacb19c4 --- /dev/null +++ b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl @@ -0,0 +1,217 @@ +stringescapes {} + +/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented + in Latin characters following the 
conventions of the standard Library + of Congress transliteration: */ + +stringdef a hex 'C1' +stringdef b hex 'C2' +stringdef v hex 'D7' +stringdef g hex 'C7' +stringdef d hex 'C4' +stringdef e hex 'C5' +stringdef zh hex 'D6' +stringdef z hex 'DA' +stringdef i hex 'C9' +stringdef i` hex 'CA' +stringdef k hex 'CB' +stringdef l hex 'CC' +stringdef m hex 'CD' +stringdef n hex 'CE' +stringdef o hex 'CF' +stringdef p hex 'D0' +stringdef r hex 'D2' +stringdef s hex 'D3' +stringdef t hex 'D4' +stringdef u hex 'D5' +stringdef f hex 'C6' +stringdef kh hex 'C8' +stringdef ts hex 'C3' +stringdef ch hex 'DE' +stringdef sh hex 'DB' +stringdef shch hex 'DD' +stringdef " hex 'DF' +stringdef y hex 'D9' +stringdef ' hex 'D8' +stringdef e` hex 'DC' +stringdef iu hex 'C0' +stringdef ia hex 'D1' + +routines ( mark_regions R2 + perfective_gerund + adjective + adjectival + reflexive + verb + noun + derivational + tidy_up +) + +externals ( stem ) + +integers ( pV p2 ) + +groupings ( v ) + +define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' + +define mark_regions as ( + + $pV = limit + $p2 = limit + do ( + gopast v setmark pV gopast non-v + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define perfective_gerund as ( + [substring] among ( + '{v}' + '{v}{sh}{i}' + '{v}{sh}{i}{s}{'}' + ('{a}' or '{ia}' delete) + '{i}{v}' + '{i}{v}{sh}{i}' + '{i}{v}{sh}{i}{s}{'}' + '{y}{v}' + '{y}{v}{sh}{i}' + '{y}{v}{sh}{i}{s}{'}' + (delete) + ) + ) + + define adjective as ( + [substring] among ( + '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' + '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' + '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' + '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' + '{ia}{ia}' + // and - + '{o}{iu}' // - which is somewhat archaic + '{e}{iu}' // - soft form of {o}{iu} + (delete) + ) + ) + + define adjectival as ( + adjective + + /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. 
+ nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of + errors. Removing im, uem, enn creates too many errors. + */ + + try ( + [substring] among ( + '{e}{m}' // present passive participle + '{n}{n}' // adjective from past passive participle + '{v}{sh}' // past active participle + '{iu}{shch}' '{shch}' // present active participle + ('{a}' or '{ia}' delete) + + //but not '{i}{m}' '{u}{e}{m}' // present passive participle + //or '{e}{n}{n}' // adjective from past passive participle + + '{i}{v}{sh}' '{y}{v}{sh}'// past active participle + '{u}{iu}{shch}' // present active participle + (delete) + ) + ) + + ) + + define reflexive as ( + [substring] among ( + '{s}{ia}' + '{s}{'}' + (delete) + ) + ) + + define verb as ( + [substring] among ( + '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' + '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' + '{n}{y}' '{t}{'}' '{e}{sh}{'}' + + '{n}{n}{o}' + ('{a}' or '{ia}' delete) + + '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' + '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' + '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' + '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' + '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' + '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' + (delete) + /* note the short passive participle tests: + '{n}{a}' '{n}' '{n}{o}' '{n}{y}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' + */ + ) + ) + + define noun as ( + [substring] among ( + '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' + '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' + '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' + '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' + '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' + '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' + (delete) + /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' + 
omitted - they only occur on 12 words. + */ + ) + ) + + define derivational as ( + [substring] R2 among ( + '{o}{s}{t}' + '{o}{s}{t}{'}' + (delete) + ) + ) + + define tidy_up as ( + [substring] among ( + + '{e}{i`}{sh}' + '{e}{i`}{sh}{e}' // superlative forms + (delete + ['{n}'] '{n}' delete + ) + '{n}' + ('{n}' delete) // e.g. -nno endings + '{'}' + (delete) // with some slight false conflations + ) + ) +) + +define stem as ( + + do mark_regions + backwards setlimit tomark pV for ( + do ( + perfective_gerund or + ( try reflexive + adjectival or verb or noun + ) + ) + try([ '{i}' ] delete) + // because noun ending -i{iu} is being treated as verb ending -{iu} + + do derivational + do tidy_up + ) +) diff --git a/contrib/snowball/algorithms/russian/stem_Unicode.sbl b/contrib/snowball/algorithms/russian/stem_Unicode.sbl new file mode 100644 index 000000000..9e1a93f93 --- /dev/null +++ b/contrib/snowball/algorithms/russian/stem_Unicode.sbl @@ -0,0 +1,215 @@ +stringescapes {} + +/* the 32 Cyrillic letters in Unicode */ + +stringdef a hex '430' +stringdef b hex '431' +stringdef v hex '432' +stringdef g hex '433' +stringdef d hex '434' +stringdef e hex '435' +stringdef zh hex '436' +stringdef z hex '437' +stringdef i hex '438' +stringdef i` hex '439' +stringdef k hex '43A' +stringdef l hex '43B' +stringdef m hex '43C' +stringdef n hex '43D' +stringdef o hex '43E' +stringdef p hex '43F' +stringdef r hex '440' +stringdef s hex '441' +stringdef t hex '442' +stringdef u hex '443' +stringdef f hex '444' +stringdef kh hex '445' +stringdef ts hex '446' +stringdef ch hex '447' +stringdef sh hex '448' +stringdef shch hex '449' +stringdef " hex '44A' +stringdef y hex '44B' +stringdef ' hex '44C' +stringdef e` hex '44D' +stringdef iu hex '44E' +stringdef ia hex '44F' + +routines ( mark_regions R2 + perfective_gerund + adjective + adjectival + reflexive + verb + noun + derivational + tidy_up +) + +externals ( stem ) + +integers ( pV p2 ) + +groupings ( v ) + +define v 
'{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' + +define mark_regions as ( + + $pV = limit + $p2 = limit + do ( + gopast v setmark pV gopast non-v + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define perfective_gerund as ( + [substring] among ( + '{v}' + '{v}{sh}{i}' + '{v}{sh}{i}{s}{'}' + ('{a}' or '{ia}' delete) + '{i}{v}' + '{i}{v}{sh}{i}' + '{i}{v}{sh}{i}{s}{'}' + '{y}{v}' + '{y}{v}{sh}{i}' + '{y}{v}{sh}{i}{s}{'}' + (delete) + ) + ) + + define adjective as ( + [substring] among ( + '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' + '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' + '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' + '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' + '{ia}{ia}' + // and - + '{o}{iu}' // - which is somewhat archaic + '{e}{iu}' // - soft form of {o}{iu} + (delete) + ) + ) + + define adjectival as ( + adjective + + /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. + nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of + errors. Removing im, uem, enn creates too many errors. 
+ */ + + try ( + [substring] among ( + '{e}{m}' // present passive participle + '{n}{n}' // adjective from past passive participle + '{v}{sh}' // past active participle + '{iu}{shch}' '{shch}' // present active participle + ('{a}' or '{ia}' delete) + + //but not '{i}{m}' '{u}{e}{m}' // present passive participle + //or '{e}{n}{n}' // adjective from past passive participle + + '{i}{v}{sh}' '{y}{v}{sh}'// past active participle + '{u}{iu}{shch}' // present active participle + (delete) + ) + ) + + ) + + define reflexive as ( + [substring] among ( + '{s}{ia}' + '{s}{'}' + (delete) + ) + ) + + define verb as ( + [substring] among ( + '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' + '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' + '{n}{y}' '{t}{'}' '{e}{sh}{'}' + + '{n}{n}{o}' + ('{a}' or '{ia}' delete) + + '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' + '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' + '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' + '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' + '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' + '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' + (delete) + /* note the short passive participle tests: + '{n}{a}' '{n}' '{n}{o}' '{n}{y}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' + */ + ) + ) + + define noun as ( + [substring] among ( + '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' + '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' + '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' + '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' + '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' + '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' + (delete) + /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' + omitted - they only occur on 12 words. 
+ */ + ) + ) + + define derivational as ( + [substring] R2 among ( + '{o}{s}{t}' + '{o}{s}{t}{'}' + (delete) + ) + ) + + define tidy_up as ( + [substring] among ( + + '{e}{i`}{sh}' + '{e}{i`}{sh}{e}' // superlative forms + (delete + ['{n}'] '{n}' delete + ) + '{n}' + ('{n}' delete) // e.g. -nno endings + '{'}' + (delete) // with some slight false conflations + ) + ) +) + +define stem as ( + + do mark_regions + backwards setlimit tomark pV for ( + do ( + perfective_gerund or + ( try reflexive + adjectival or verb or noun + ) + ) + try([ '{i}' ] delete) + // because noun ending -i{iu} is being treated as verb ending -{iu} + + do derivational + do tidy_up + ) +) diff --git a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..9dee289cc --- /dev/null +++ b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl @@ -0,0 +1,230 @@ +routines ( + postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + y_verb_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a' hex 'E1' // a-acute +stringdef e' hex 'E9' // e-acute +stringdef i' hex 'ED' // i-acute +stringdef o' hex 'F3' // o-acute +stringdef u' hex 'FA' // u-acute +stringdef u" hex 'FC' // u-diaeresis +stringdef n~ hex 'F1' // n-tilde + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + '{i'}' (<- 'i') + '{o'}' (<- 'o') + '{u'}' (<- 'u') + // and possibly {u"}->u here, or in prelude + '' (next) + ) //or next +) + 
+backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' + 'las' 'les' 'los' 'nos' + ) + substring RV among( + 'i{e'}ndo' (] <- 'iendo') + '{a'}ndo' (] <- 'ando') + '{a'}r' (] <- 'ar') + '{e'}r' (] <- 'er') + '{i'}r' (] <- 'ir') + 'ando' + 'iendo' + 'ar' 'er' 'ir' + (delete) + 'yendo' ('u' delete) + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anzas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + 'able' 'ables' + 'ible' 'ibles' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amiento' 'amientos' + 'imiento' 'imientos' + ( + R2 delete + ) + 'adora' 'ador' 'aci{o'}n' + 'adoras' 'adores' 'aciones' + 'ante' 'antes' 'ancia' 'ancias'// Note 1 + ( + R2 delete + try ( ['ic'] R2 delete ) + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + 'encia' 'encias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'able' + 'ible' (R2 delete) + ) + ) + ) + 'idad' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + ) + ) + + define y_verb_suffix as ( + setlimit tomark pV for ([substring]) among( + 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' + 'yas' 'yes' 'yais' 'yamos' + ('u' delete) + ) + ) + + define verb_suffix as ( + setlimit tomark pV for ([substring]) among( + + 'en' 'es' '{e'}is' 'emos' + (try ('u' test 'g') ] delete) + + 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' + 'ar{e'}' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 
'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' + + 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' + 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + 'e' '{e'}' + ( RV delete try( ['u'] test 'g' RV delete ) ) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + y_verb_suffix or + verb_suffix + ) + do residual_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..db7a46201 --- /dev/null +++ b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,230 @@ +routines ( + postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + y_verb_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a' hex 'A0' // a-acute +stringdef e' hex '82' // e-acute +stringdef i' hex 'A1' // i-acute +stringdef o' hex 'A2' // o-acute +stringdef u' hex 'A3' // u-acute +stringdef u" hex '81' // u-diaeresis +stringdef n~ hex 'A4' // n-tilde + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + 
( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + '{i'}' (<- 'i') + '{o'}' (<- 'o') + '{u'}' (<- 'u') + // and possibly {u"}->u here, or in prelude + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' + 'las' 'les' 'los' 'nos' + ) + substring RV among( + 'i{e'}ndo' (] <- 'iendo') + '{a'}ndo' (] <- 'ando') + '{a'}r' (] <- 'ar') + '{e'}r' (] <- 'er') + '{i'}r' (] <- 'ir') + 'ando' + 'iendo' + 'ar' 'er' 'ir' + (delete) + 'yendo' ('u' delete) + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anzas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + 'able' 'ables' + 'ible' 'ibles' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amiento' 'amientos' + 'imiento' 'imientos' + ( + R2 delete + ) + 'adora' 'ador' 'aci{o'}n' + 'adoras' 'adores' 'aciones' + 'ante' 'antes' 'ancia' 'ancias'// Note 1 + ( + R2 delete + try ( ['ic'] R2 delete ) + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + 'encia' 'encias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'able' + 'ible' (R2 delete) + ) + ) + ) + 'idad' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + ) + ) + + define y_verb_suffix as ( + setlimit tomark pV for ([substring]) among( + 'ya' 'ye' 'yan' 'yen' 
'yeron' 'yendo' 'yo' 'y{o'}' + 'yas' 'yes' 'yais' 'yamos' + ('u' delete) + ) + ) + + define verb_suffix as ( + setlimit tomark pV for ([substring]) among( + + 'en' 'es' '{e'}is' 'emos' + (try ('u' test 'g') ] delete) + + 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' + 'ar{e'}' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' + + 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' + 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + 'e' '{e'}' + ( RV delete try( ['u'] test 'g' RV delete ) ) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + y_verb_suffix or + verb_suffix + ) + do residual_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..03ce1e22f --- /dev/null +++ b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl @@ -0,0 +1,72 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef ao hex 
'E5' +stringdef o" hex 'F6' + +define v 'aeiouy{a"}{ao}{o"}' + +define s_ending 'bcdfghjklmnoprtvy' + +define mark_regions as ( + + $p1 = limit + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' + 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' + 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' + 'hetens' 'erns' 'at' 'andet' 'het' 'ast' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as setlimit tomark p1 for ( + among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') + and ([next] delete) + ) + + define other_suffix as setlimit tomark p1 for ( + [substring] among( + 'lig' 'ig' 'els' (delete) + 'l{o"}st' (<-'l{o"}s') + 'fullt' (<-'full') + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..1631f401a --- /dev/null +++ b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,72 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a" hex '84' +stringdef ao hex '86' +stringdef o" hex '94' + +define v 'aeiouy{a"}{ao}{o"}' + +define s_ending 'bcdfghjklmnoprtvy' + +define mark_regions as ( + + $p1 = limit + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' + 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 
'ar' 'er' 'heter' + 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' + 'hetens' 'erns' 'at' 'andet' 'het' 'ast' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as setlimit tomark p1 for ( + among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') + and ([next] delete) + ) + + define other_suffix as setlimit tomark p1 for ( + [substring] among( + 'lig' 'ig' 'els' (delete) + 'l{o"}st' (<-'l{o"}s') + 'fullt' (<-'full') + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl new file mode 100644 index 000000000..16c02a51f --- /dev/null +++ b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl @@ -0,0 +1,477 @@ +/* Stemmer for Turkish + * author: Evren (Kapusuz) Çilden + * email: evren.kapusuz at gmail.com + * version: 1.0 (15.01.2007) + + + * stems nominal verb suffixes + * stems nominal inflections + * more than one syllable word check + * (y,n,s,U) context check + * vowel harmony check + * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) + + * The stemming algorithm is based on the paper "An Affix Stripping + * Morphological Analyzer for Turkish" by Gülşen Eryiğit and + * Eşref Adalı (Proceedings of the IASTED International Conference + * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004, + * Innsbruck, Austria) + + * Turkish is an agglutinative language and has a very rich morphological + * structure. In Turkish, you can form many different words from a single stem + * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means + * "You had been the doctor of him". The stem of the word is "doktor" and it + * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about + * the append order of suffixes can be clearly described as FSMs. 
+ * The paper referenced above defines some FSMs for right to left + * morphological analysis. I generated a method for constructing snowball + * expressions from right to left FSMs for stemming suffixes. +*/ + +routines ( + append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings + check_vowel_harmony // tests vowel harmony for suffixes + is_reserved_word // tests whether current string is a reserved word ('ad','soyad') + mark_cAsInA // nominal verb suffix + mark_DA // noun suffix + mark_DAn // noun suffix + mark_DUr // nominal verb suffix + mark_ki // noun suffix + mark_lAr // noun suffix, nominal verb suffix + mark_lArI // noun suffix + mark_nA // noun suffix + mark_ncA // noun suffix + mark_ndA // noun suffix + mark_ndAn // noun suffix + mark_nU // noun suffix + mark_nUn // noun suffix + mark_nUz // nominal verb suffix + mark_sU // noun suffix + mark_sUn // nominal verb suffix + mark_sUnUz // nominal verb suffix + mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, + mark_yA // noun suffix + mark_ylA // noun suffix + mark_yU // noun suffix + mark_yUm // nominal verb suffix + mark_yUz // nominal verb suffix + mark_yDU // nominal verb suffix + mark_yken // nominal verb suffix + mark_ymUs_ // nominal verb suffix + mark_ysA // nominal verb suffix + + mark_suffix_with_optional_y_consonant + mark_suffix_with_optional_U_vowel + mark_suffix_with_optional_n_consonant + mark_suffix_with_optional_s_consonant + + more_than_one_syllable_word + + post_process_last_consonants + postlude + + stem_nominal_verb_suffixes + stem_noun_suffixes + stem_suffix_chain_before_ki +) + +/* Special characters in Unicode Latin-1 and Latin Extended-A */ +stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA +stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE +stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT +stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS +stringdef s. 
hex '015F' // LATIN SMALL LETTER S WITH CEDILLA +stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS + +stringescapes { } + +integers ( strlen ) // length of a string + +booleans ( continue_stemming_noun_suffixes ) + +groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) + +define vowel 'ae{i'}io{o"}u{u"}' +define U '{i'}iu{u"}' + +// the vowel grouping definitions below are used for checking vowel harmony +define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' +define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' +define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' +define vowel4 'ei' // vowels that can end with suffixes containing 'i' +define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' +define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' + +externals ( stem ) + +backwardmode ( + // checks vowel harmony for possible suffixes, + // helps to detect whether the candidate for suffix applies to vowel harmony + // this rule is added to prevent over stemming + define check_vowel_harmony as ( + test + ( + (goto vowel) // if there is a vowel + ( + ('a' goto vowel1) or + ('e' goto vowel2) or + ('{i'}' goto vowel3) or + ('i' goto vowel4) or + ('o' goto vowel5) or + ('{o"}' goto vowel6) or + ('u' goto vowel5) or + ('{u"}' goto vowel6) + ) + ) + ) + + // if the last consonant before suffix is vowel and n then advance and delete + // if the last consonant before suffix is non vowel and n do nothing + // if the last consonant before suffix is not n then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_n_consonant as ( + ((test 'n') next (test vowel)) + or + ((not(test 'n')) test(next (test vowel))) + + ) + + // if the last consonant before suffix is vowel and s then advance and delete + // if the last consonant before suffix is non vowel and s do nothing + // if the last 
consonant before suffix is not s then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_s_consonant as ( + ((test 's') next (test vowel)) + or + ((not(test 's')) test(next (test vowel))) + ) + + // if the last consonant before suffix is vowel and y then advance and delete + // if the last consonant before suffix is non vowel and y do nothing + // if the last consonant before suffix is not y then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_y_consonant as ( + ((test 'y') next (test vowel)) + or + ((not(test 'y')) test(next (test vowel))) + ) + + define mark_suffix_with_optional_U_vowel as ( + ((test U) next (test non-vowel)) + or + ((not(test U)) test(next (test non-vowel))) + + ) + + define mark_possessives as ( + among ('m{i'}z' 'miz' 'muz' 'm{u"}z' + 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') + (mark_suffix_with_optional_U_vowel) + ) + + define mark_sU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_s_consonant) + ) + + define mark_lArI as ( + among ('leri' 'lar{i'}') + ) + + define mark_yU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nU as ( + check_vowel_harmony + among ('n{i'}' 'ni' 'nu' 'n{u"}') + ) + + define mark_nUn as ( + check_vowel_harmony + among ('{i'}n' 'in' 'un' '{u"}n') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yA as ( + check_vowel_harmony + among('a' 'e') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nA as ( + check_vowel_harmony + among('na' 'ne') + ) + + define mark_DA as ( + check_vowel_harmony + among('da' 'de' 'ta' 'te') + ) + + define mark_ndA as ( + check_vowel_harmony + among('nda' 'nde') + ) + + define mark_DAn as ( + check_vowel_harmony + among('dan' 'den' 'tan' 'ten') + ) + + define mark_ndAn as ( + check_vowel_harmony + among('ndan' 'nden') + ) + + define mark_ylA as ( + check_vowel_harmony + among('la' 'le') + 
(mark_suffix_with_optional_y_consonant) + ) + + define mark_ki as ( + 'ki' + ) + + define mark_ncA as ( + check_vowel_harmony + among('ca' 'ce') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yUm as ( + check_vowel_harmony + among ('{i'}m' 'im' 'um' '{u"}m') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUn as ( + check_vowel_harmony + among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) + ) + + define mark_yUz as ( + check_vowel_harmony + among ('{i'}z' 'iz' 'uz' '{u"}z') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUnUz as ( + among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') + ) + + define mark_lAr as ( + check_vowel_harmony + among ('ler' 'lar') + ) + + define mark_nUz as ( + check_vowel_harmony + among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') + ) + + define mark_DUr as ( + check_vowel_harmony + among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') + ) + + define mark_cAsInA as ( + among ('cas{i'}na' 'cesine') + ) + + define mark_yDU as ( + check_vowel_harmony + among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' + 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' + 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' + 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') + (mark_suffix_with_optional_y_consonant) + ) + + // does not fully obey vowel harmony + define mark_ysA as ( + among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ymUs_ as ( + check_vowel_harmony + among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_yken as ( + 'ken' (mark_suffix_with_optional_y_consonant) + ) + + define stem_nominal_verb_suffixes as ( + [ + set continue_stemming_noun_suffixes + (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) + or + (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) + or + ( + mark_lAr ] delete try([(mark_DUr or mark_yDU 
or mark_ysA or mark_ymUs_)) + unset continue_stemming_noun_suffixes + ) + or + (mark_nUz (mark_yDU or mark_ysA)) + or + ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) + or + (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) + ]delete + ) + + // stems noun suffix chains ending with -ki + define stem_suffix_chain_before_ki as ( + [ + mark_ki + ( + (mark_DA] delete try([ + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + + )) + or + (mark_nUn] delete try([ + (mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + or + (mark_ndA ( + (mark_lArI] delete) + or + ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) + or + (stem_suffix_chain_before_ki) + )) + ) + ) + + define stem_noun_suffixes as ( + ([mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + ([mark_ncA] delete + try( + ([mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + ([mark_lAr] delete stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndA or mark_nA) + ( + (mark_lArI] delete) + or + (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) + or + ( [mark_DAn] delete try ([ + ( + (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + ) + or + ([mark_nUn or mark_ylA] delete + try( + ([mark_lAr] delete stem_suffix_chain_before_ki) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + 
stem_suffix_chain_before_ki + ) + ) + or + ([mark_lArI] delete) + or + (stem_suffix_chain_before_ki) + or + ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + ) + + define post_process_last_consonants as ( + [substring] among ( + 'b' (<- 'p') + 'c' (<- '{c.}') + 'd' (<- 't') + '{g~}' (<- 'k') + ) + ) + + // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed + // like in 'kedim' -> 'ked' + // Turkish words don't usually end with 'd' or 'g' + // some very well known words are ignored (like 'ad' 'soyad' + // appends U to stems ending with d or g, decides which vowel to add + // based on the last vowel in the stem + define append_U_to_stems_ending_with_d_or_g as ( + test('d' or 'g') + (test((goto vowel) 'a' or '{i'}') <+ '{i'}') + or + (test((goto vowel) 'e' or 'i') <+ 'i') + or + (test((goto vowel) 'o' or 'u') <+ 'u') + or + (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') + ) + +) + +// Tests if there are more than one syllables +// In Turkish each vowel indicates a distinct syllable +define more_than_one_syllable_word as ( + test (atleast 2 (gopast vowel)) +) + +define is_reserved_word as ( + test(gopast 'ad' ($strlen = 2) ($strlen == limit)) + or + test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) +) + +define postlude as ( + not(is_reserved_word) + backwards ( + do append_U_to_stems_ending_with_d_or_g + do post_process_last_consonants + + ) +) + +define stem as ( + (more_than_one_syllable_word) + ( + backwards ( + do stem_nominal_verb_suffixes + continue_stemming_noun_suffixes + do stem_noun_suffixes + ) + + postlude + ) +) + + diff --git a/contrib/snowball/compiler/analyser.c b/contrib/snowball/compiler/analyser.c new file mode 100644 index 000000000..68c0d2d90 --- /dev/null +++ b/contrib/snowball/compiler/analyser.c @@ -0,0 +1,959 @@ + 
+#include <stdio.h> /* printf etc */ +#include <stdlib.h> /* exit */ +#include <string.h> /* memmove */ +#include "header.h" + +/* recursive usage: */ + +static void read_program_(struct analyser * a, int terminator); +static struct node * read_C(struct analyser * a); +static struct node * C_style(struct analyser * a, char * s, int token); + + +static void fault(int n) { fprintf(stderr, "fault %d\n", n); exit(1); } + +static void print_node_(struct node * p, int n, const char * s) { + + int i; + for (i = 0; i < n; i++) fputs(i == n - 1 ? s : " ", stdout); + printf("%s ", name_of_token(p->type)); + unless (p->name == 0) report_b(stdout, p->name->b); + unless (p->literalstring == 0) { + printf("'"); + report_b(stdout, p->literalstring); + printf("'"); + } + printf("\n"); + unless (p->AE == 0) print_node_(p->AE, n+1, "# "); + unless (p->left == 0) print_node_(p->left, n+1, " "); + unless (p->right == 0) print_node_(p->right, n, " "); + if (p->aux != 0) print_node_(p->aux, n+1, "@ "); +} + +extern void print_program(struct analyser * a) { + print_node_(a->program, 0, " "); +} + +static struct node * new_node(struct analyser * a, int type) { + NEW(node, p); + p->next = a->nodes; a->nodes = p; + p->left = 0; + p->right = 0; + p->aux = 0; + p->AE = 0; + p->name = 0; + p->literalstring = 0; + p->mode = a->mode; + p->line_number = a->tokeniser->line_number; + p->type = type; + return p; +} + +static const char * name_of_mode(int n) { + switch (n) { + default: fault(0); + case m_backward: return "string backward"; + case m_forward: return "string forward"; + /* case m_integer: return "integer"; */ + } +} + +static const char * name_of_type(int n) { + switch (n) { + default: fault(1); + case 's': return "string"; + case 'i': return "integer"; + case 'r': return "routine"; + case 'R': return "routine or grouping"; + case 'g': return "grouping"; + } +} + +static void count_error(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + if (t->error_count >= 20) { 
fprintf(stderr, "... etc\n"); exit(1); } + t->error_count++; +} + +static void error2(struct analyser * a, int n, int x) { + struct tokeniser * t = a->tokeniser; + count_error(a); + fprintf(stderr, "%s:%d: ", t->file, t->line_number); + if (n >= 30) report_b(stderr, t->b); + switch (n) { + case 0: + fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; + case 3: + fprintf(stderr, "in among(...), "); + case 1: + fprintf(stderr, "unexpected %s", name_of_token(t->token)); + if (t->token == c_number) fprintf(stderr, " %d", t->number); + if (t->token == c_name) { + fprintf(stderr, " "); + report_b(stderr, t->b); + } break; + case 2: + fprintf(stderr, "string omitted"); break; + + case 14: + fprintf(stderr, "unresolved substring on line %d", x); break; + case 15: + fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break; + case 16: + fprintf(stderr, "empty grouping"); break; + case 17: + fprintf(stderr, "backwards used when already in this mode"); break; + case 18: + fprintf(stderr, "empty among(...)"); break; + case 19: + fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break; + case 20: + fprintf(stderr, "substring preceded by another substring on line %d", x); break; + + case 30: + fprintf(stderr, " re-declared"); break; + case 31: + fprintf(stderr, " undeclared"); break; + case 32: + fprintf(stderr, " declared as %s mode; used as %s mode", + name_of_mode(a->mode), name_of_mode(x)); break; + case 33: + fprintf(stderr, " not of type %s", name_of_type(x)); break; + case 34: + fprintf(stderr, " not of type string or integer"); break; + case 35: + fprintf(stderr, " misplaced"); break; + case 36: + fprintf(stderr, " redefined"); break; + case 37: + fprintf(stderr, " mis-used as %s mode", + name_of_mode(x)); break; + default: + fprintf(stderr, " error %d", n); break; + + } + if (n <= 13 && t->previous_token > 0) + fprintf(stderr, " after %s", name_of_token(t->previous_token)); + fprintf(stderr, "\n"); +} + 
+static void error(struct analyser * a, int n) { error2(a, n, 0); } + +static void error3(struct analyser * a, struct node * p, symbol * b) { + count_error(a); + fprintf(stderr, "%s:%d: among(...) has repeated string '", a->tokeniser->file, p->line_number); + report_b(stderr, b); + fprintf(stderr, "'\n"); +} + +static void error4(struct analyser * a, struct name * q) { + count_error(a); + report_b(stderr, q->b); + fprintf(stderr, " undefined\n"); +} + +static void omission_error(struct analyser * a, int n) { + a->tokeniser->omission = n; + error(a, 0); +} + +static int check_token(struct analyser * a, int code) { + struct tokeniser * t = a->tokeniser; + if (t->token != code) { omission_error(a, code); return false; } + return true; +} + +static int get_token(struct analyser * a, int code) { + struct tokeniser * t = a->tokeniser; + read_token(t); + { + int x = check_token(a, code); + unless (x) t->token_held = true; + return x; + } +} + +static struct name * look_for_name(struct analyser * a) { + struct name * p = a->names; + symbol * q = a->tokeniser->b; + repeat { + if (p == 0) return 0; + { symbol * b = p->b; + int n = SIZE(b); + if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) { + p->referenced = true; + return p; + } + } + p = p->next; + } +} + +static struct name * find_name(struct analyser * a) { + struct name * p = look_for_name(a); + if (p == 0) error(a, 31); + return p; +} + +static void check_routine_mode(struct analyser * a, struct name * p, int mode) { + if (p->mode < 0) p->mode = mode; else + unless (p->mode == mode) error2(a, 37, mode); +} + +static void check_name_type(struct analyser * a, struct name * p, int type) { + switch (type) { + case 's': if (p->type == t_string) return; break; + case 'i': if (p->type == t_integer) return; break; + case 'b': if (p->type == t_boolean) return; break; + case 'R': if (p->type == t_grouping) return; + case 'r': if (p->type == t_routine || + p->type == t_external) return; break; + case 'g': if (p->type 
== t_grouping) return; break; + } + error2(a, 33, type); +} + +static void read_names(struct analyser * a, int type) { + struct tokeniser * t = a->tokeniser; + unless (get_token(a, c_bra)) return; + repeat { + if (read_token(t) != c_name) break; + if (look_for_name(a) != 0) error(a, 30); else { + NEW(name, p); + p->b = copy_b(t->b); + p->type = type; + p->mode = -1; /* routines, externals */ + p->count = a->name_count[type]; + p->referenced = false; + p->used = false; + p->grouping = 0; + p->definition = 0; + a->name_count[type] ++; + p->next = a->names; + a->names = p; + } + } + unless (check_token(a, c_ket)) t->token_held = true; +} + +static symbol * new_literalstring(struct analyser * a) { + NEW(literalstring, p); + p->b = copy_b(a->tokeniser->b); + p->next = a->literalstrings; + a->literalstrings = p; + return p->b; +} + +static int read_AE_test(struct analyser * a) { + + struct tokeniser * t = a->tokeniser; + switch (read_token(t)) { + case c_assign: return c_mathassign; + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: return t->token; + default: error(a, 1); t->token_held = true; return c_eq; + } +} + +static int binding(int t) { + switch (t) { + case c_plus: case c_minus: return 1; + case c_multiply: case c_divide: return 2; + default: return -2; + } +} + +static void name_to_node(struct analyser * a, struct node * p, int type) { + struct name * q = find_name(a); + unless (q == 0) { + check_name_type(a, q, type); + q->used = true; + } + p->name = q; +} + +static struct node * read_AE(struct analyser * a, int B) { + struct tokeniser * t = a->tokeniser; + struct node * p; + struct node * q; + switch (read_token(t)) { + case c_minus: /* monadic */ + p = new_node(a, c_neg); + p->right = read_AE(a, 100); + break; + case c_bra: + p = read_AE(a, 0); + get_token(a, c_ket); + break; + case c_name: + p = new_node(a, c_name); + name_to_node(a, p, 
'i'); + break; + case c_maxint: + case c_minint: + case c_cursor: + case c_limit: + case c_size: + p = new_node(a, t->token); + break; + case c_number: + p = new_node(a, c_number); + p->number = t->number; + break; + case c_sizeof: + p = C_style(a, "s", c_sizeof); + break; + default: + error(a, 1); + t->token_held = true; + return 0; + } + repeat { + int token = read_token(t); + int b = binding(token); + unless (binding(token) > B) { + t->token_held = true; + return p; + } + q = new_node(a, token); + q->left = p; + q->right = read_AE(a, b); + p = q; + } +} + +static struct node * read_C_connection(struct analyser * a, struct node * q, int op) { + struct tokeniser * t = a->tokeniser; + struct node * p = new_node(a, op); + struct node * p_end = q; + p->left = q; + repeat { + q = read_C(a); + p_end->right = q; p_end = q; + if (read_token(t) != op) { + t->token_held = true; + break; + } + } + return p; +} + +static struct node * read_C_list(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + struct node * p = new_node(a, c_bra); + struct node * p_end = 0; + repeat { + int token = read_token(t); + if (token == c_ket) return p; + if (token < 0) { omission_error(a, c_ket); return p; } + t->token_held = true; + { + struct node * q = read_C(a); + repeat { + token = read_token(t); + if (token != c_and && token != c_or) { + t->token_held = true; + break; + } + q = read_C_connection(a, q, token); + } + if (p_end == 0) p->left = q; else p_end->right = q; + p_end = q; + } + } +} + +static struct node * C_style(struct analyser * a, char * s, int token) { + int i; + struct node * p = new_node(a, token); + for (i = 0; s[i] != 0; i++) switch(s[i]) { + case 'C': + p->left = read_C(a); continue; + case 'D': + p->aux = read_C(a); continue; + case 'A': + p->AE = read_AE(a, 0); continue; + case 'f': + get_token(a, c_for); continue; + case 'S': + { + int str_token = read_token(a->tokeniser); + if (str_token == c_name) name_to_node(a, p, 's'); else + if (str_token == 
c_literalstring) p->literalstring = new_literalstring(a); + else error(a, 2); + } + continue; + case 'b': + case 's': + case 'i': + if (get_token(a, c_name)) name_to_node(a, p, s[i]); + continue; + } + return p; +} + +static struct node * read_literalstring(struct analyser * a) { + struct node * p = new_node(a, c_literalstring); + p->literalstring = new_literalstring(a); + return p; +} + +static void reverse_b(symbol * b) { + int i = 0; int j = SIZE(b) - 1; + until (i >= j) { + int ch1 = b[i]; int ch2 = b[j]; + b[i++] = ch2; b[j--] = ch1; + } +} + +static int compare_amongvec(const void *pv, const void *qv) { + const struct amongvec * p = (const struct amongvec*)pv; + const struct amongvec * q = (const struct amongvec*)qv; + symbol * b_p = p->b; int p_size = p->size; + symbol * b_q = q->b; int q_size = q->size; + int smaller_size = p_size < q_size ? p_size : q_size; + int i; + for (i = 0; i < smaller_size; i++) + if (b_p[i] != b_q[i]) return b_p[i] - b_q[i]; + return p_size - q_size; +} + +static void make_among(struct analyser * a, struct node * p, struct node * substring) { + + NEW(among, x); + NEWVEC(amongvec, v, p->number); + struct node * q = p->left; + struct amongvec * w0 = v; + struct amongvec * w1 = v; + int result = 1; + + int direction = substring != 0 ? substring->mode : p->mode; + int backward = direction == m_backward; + + if (a->amongs == 0) a->amongs = x; else a->amongs_end->next = x; + a->amongs_end = x; + x->next = 0; + x->b = v; + x->number = a->among_count++; + x->starter = 0; + + if (q->type == c_bra) { x->starter = q; q = q->right; } + + until (q == 0) { + if (q->type == c_literalstring) { + symbol * b = q->literalstring; + w1->b = b; /* pointer to case string */ + w1->p = 0; /* pointer to corresponding case expression */ + w1->size = SIZE(b); /* number of characters in string */ + w1->i = -1; /* index of longest substring */ + w1->result = -1; /* number of corresponding case expression */ + w1->function = q->left == 0 ? 
0 : q->left->name; + unless (w1->function == 0) + check_routine_mode(a, w1->function, direction); + w1++; + } + else + if (q->left == 0) /* empty command: () */ + w0 = w1; + else { + until (w0 == w1) { + w0->p = q; + w0->result = result; + w0++; + } + result++; + } + q = q->right; + } + unless (w1-v == p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); } + if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b); + qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec); + + /* the following loop is O(n squared) */ + for (w0 = w1 - 1; w0 >= v; w0--) { + symbol * b = w0->b; + int size = w0->size; + struct amongvec * w; + + for (w = w0 - 1; w >= v; w--) { + if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) { + w0->i = w - v; /* fill in index of longest substring */ + break; + } + } + } + if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b); + + for (w0 = v; w0 < w1 - 1; w0++) + if (w0->size == (w0 + 1)->size && + memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) error3(a, p, w0->b); + + x->literalstring_count = p->number; + x->command_count = result - 1; + p->among = x; + + x->substring = substring; + if (substring != 0) substring->among = x; + unless (x->command_count == 0 && x->starter == 0) a->amongvar_needed = true; +} + +static struct node * read_among(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + struct node * p = new_node(a, c_among); + struct node * p_end = 0; + int previous_token = -1; + struct node * substring = a->substring; + + a->substring = 0; + p->number = 0; /* counts the number of literals */ + unless (get_token(a, c_bra)) return p; + repeat { + struct node * q; + int token = read_token(t); + switch (token) { + case c_literalstring: + q = read_literalstring(a); + if (read_token(t) == c_name) { + struct node * r = new_node(a, c_name); + name_to_node(a, r, 'r'); + q->left = r; + } + else t->token_held = true; + p->number++; break; + case c_bra: + if (previous_token 
== c_bra) error(a, 19); + q = read_C_list(a); break; + default: + error(a, 3); + case c_ket: + if (p->number == 0) error(a, 18); + if (t->error_count == 0) make_among(a, p, substring); + return p; + } + previous_token = token; + if (p_end == 0) p->left = q; else p_end->right = q; + p_end = q; + } +} + +static struct node * read_substring(struct analyser * a) { + + struct node * p = new_node(a, c_substring); + if (a->substring != 0) error2(a, 20, a->substring->line_number); + a->substring = p; + return p; +} + +static void check_modifyable(struct analyser * a) { + unless (a->modifyable) error(a, 15); +} + +static struct node * read_C(struct analyser * a) { + struct tokeniser * t = a->tokeniser; + int token = read_token(t); + switch (token) { + case c_bra: + return read_C_list(a); + case c_backwards: + { + int mode = a->mode; + if (a->mode == m_backward) error(a, 17); else a->mode = m_backward; + { struct node * p = C_style(a, "C", token); + a->mode = mode; + return p; + } + } + case c_reverse: + { + int mode = a->mode; + int modifyable = a->modifyable; + a->modifyable = false; + a->mode = mode == m_forward ? 
m_backward : m_forward; + { + struct node * p = C_style(a, "C", token); + a->mode = mode; + a->modifyable = modifyable; + return p; + } + } + case c_not: + case c_try: + case c_fail: + case c_test: + case c_do: + case c_goto: + case c_gopast: + case c_repeat: + return C_style(a, "C", token); + case c_loop: + case c_atleast: + return C_style(a, "AC", token); + case c_setmark: + return C_style(a, "i", token); + case c_tomark: + case c_atmark: + case c_hop: + return C_style(a, "A", token); + case c_delete: + check_modifyable(a); + case c_next: + case c_tolimit: + case c_atlimit: + case c_leftslice: + case c_rightslice: + case c_true: + case c_false: + case c_debug: + return C_style(a, "", token); + case c_assignto: + case c_sliceto: + check_modifyable(a); + return C_style(a, "s", token); + case c_assign: + case c_insert: + case c_attach: + case c_slicefrom: + check_modifyable(a); + return C_style(a, "S", token); + case c_setlimit: + return C_style(a, "CfD", token); + case c_set: + case c_unset: + return C_style(a, "b", token); + case c_dollar: + get_token(a, c_name); + { + struct node * p; + struct name * q = find_name(a); + int mode = a->mode; + int modifyable = a->modifyable; + switch (q ? 
q->type : t_string) + /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */ + { + default: error(a, 34); + case t_string: + a->mode = m_forward; + a->modifyable = true; + p = new_node(a, c_dollar); + p->left = read_C(a); break; + case t_integer: + /* a->mode = m_integer; */ + p = new_node(a, read_AE_test(a)); + p->AE = read_AE(a, 0); break; + } + p->name = q; + a->mode = mode; + a->modifyable = modifyable; + return p; + } + case c_name: + { + struct name * q = find_name(a); + struct node * p = new_node(a, c_name); + unless (q == 0) { + q->used = true; + switch (q->type) { + case t_boolean: + p->type = c_booltest; break; + case t_integer: + error(a, 35); /* integer name misplaced */ + case t_string: + break; + case t_routine: + case t_external: + p->type = c_call; + check_routine_mode(a, q, a->mode); + break; + case t_grouping: + p->type = c_grouping; break; + } + } + p->name = q; + return p; + } + case c_non: + { + struct node * p = new_node(a, token); + read_token(t); + if (t->token == c_minus) read_token(t); + unless (check_token(a, c_name)) { omission_error(a, c_name); return p; } + name_to_node(a, p, 'g'); + return p; + } + case c_literalstring: + return read_literalstring(a); + case c_among: return read_among(a); + case c_substring: return read_substring(a); + default: error(a, 1); return 0; + } +} + +static int next_symbol(symbol * p, symbol * W, int utf8) { + if (utf8) { + int ch; + int j = get_utf8(p, & ch); + W[0] = ch; return j; + } else { + W[0] = p[0]; return 1; + } +} + +static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) { + int j = 0; + symbol W[1]; + int width; + if (style == c_plus) { + while (j < SIZE(q)) { + width = next_symbol(q + j, W, utf8); + p = add_to_b(p, 1, W); + j += width; + } + } else { + while (j < SIZE(q)) { + int i; + width = next_symbol(q + j, W, utf8); + for (i = 0; i < SIZE(p); i++) { + if (p[i] == W[0]) { + memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * sizeof(symbol)); + SIZE(p)--; + } + } + j += 
width; + } + } + return p; +} + +static void read_define_grouping(struct analyser * a, struct name * q) { + struct tokeniser * t = a->tokeniser; + int style = c_plus; + { + NEW(grouping, p); + if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p; + a->groupings_end = p; + q->grouping = p; + p->next = 0; + p->name = q; + p->number = q->count; + p->b = create_b(0); + repeat { + switch (read_token(t)) { + case c_name: + { + struct name * r = find_name(a); + unless (r == 0) { + check_name_type(a, r, 'g'); + p->b = alter_grouping(p->b, r->grouping->b, style, false); + } + } + break; + case c_literalstring: + p->b = alter_grouping(p->b, t->b, style, a->utf8); + break; + default: error(a, 1); return; + } + switch (read_token(t)) { + case c_plus: + case c_minus: style = t->token; break; + default: goto label0; + } + } + label0: + { + int i; + int max = 0; + int min = 1<<16; + for (i = 0; i < SIZE(p->b); i++) { + if (p->b[i] > max) max = p->b[i]; + if (p->b[i] < min) min = p->b[i]; + } + p->largest_ch = max; + p->smallest_ch = min; + if (min == 1<<16) error(a, 16); + } + t->token_held = true; return; + } +} + +static void read_define_routine(struct analyser * a, struct name * q) { + struct node * p = new_node(a, c_define); + a->amongvar_needed = false; + unless (q == 0) { + check_name_type(a, q, 'R'); + if (q->definition != 0) error(a, 36); + if (q->mode < 0) q->mode = a->mode; else + if (q->mode != a->mode) error2(a, 32, q->mode); + } + p->name = q; + if (a->program == 0) a->program = p; else a->program_end->right = p; + a->program_end = p; + get_token(a, c_as); + p->left = read_C(a); + unless (q == 0) q->definition = p->left; + + if (a->substring != 0) { + error2(a, 14, a->substring->line_number); + a->substring = 0; + } + p->amongvar_needed = a->amongvar_needed; +} + +static void read_define(struct analyser * a) { + unless (get_token(a, c_name)) return; + { + struct name * q = find_name(a); + if (q != 0 && q->type == t_grouping) 
read_define_grouping(a, q); + else read_define_routine(a, q); + } +} + +static void read_backwardmode(struct analyser * a) { + int mode = a->mode; + a->mode = m_backward; + if (get_token(a, c_bra)) { + read_program_(a, c_ket); + check_token(a, c_ket); + } + a->mode = mode; +} + +static void read_program_(struct analyser * a, int terminator) { + struct tokeniser * t = a->tokeniser; + repeat { + switch (read_token(t)) { + case c_strings: read_names(a, t_string); break; + case c_booleans: read_names(a, t_boolean); break; + case c_integers: read_names(a, t_integer); break; + case c_routines: read_names(a, t_routine); break; + case c_externals: read_names(a, t_external); break; + case c_groupings: read_names(a, t_grouping); break; + case c_define: read_define(a); break; + case c_backwardmode:read_backwardmode(a); break; + case c_ket: + if (terminator == c_ket) return; + default: + error(a, 1); break; + case -1: + unless (terminator < 0) omission_error(a, c_ket); + return; + } + } +} + +extern void read_program(struct analyser * a) { + read_program_(a, -1); + { + struct name * q = a->names; + until (q == 0) { + switch(q->type) { + case t_external: case t_routine: + if (q->used && q->definition == 0) error4(a, q); break; + case t_grouping: + if (q->used && q->grouping == 0) error4(a, q); break; + } + q = q->next; + } + } + + if (a->tokeniser->error_count == 0) { + struct name * q = a->names; + int warned = false; + until (q == 0) { + unless (q->referenced) { + unless (warned) { + fprintf(stderr, "Declared but not used:"); + warned = true; + } + fprintf(stderr, " "); report_b(stderr, q->b); + } + q = q->next; + } + if (warned) fprintf(stderr, "\n"); + + q = a->names; + warned = false; + until (q == 0) { + if (! 
/* Return non-zero if the NUL-terminated strings s1 and s2 are identical,
 * zero otherwise.
 *
 * Uses strcmp() directly instead of two strlen() passes followed by
 * memcmp(); this also avoids narrowing the size_t lengths to int.
 */
static int eq(const char * s1, const char * s2) {
    return strcmp(s1, s2) == 0;
}
check_lim(i, argc); + o->name = argv[i++]; + continue; + } +#ifndef DISABLE_JAVA + if (eq(s, "-j") || eq(s, "-java")) { + o->make_lang = LANG_JAVA; + o->widechars = true; + continue; + } +#endif + if (eq(s, "-c++")) { + o->make_lang = LANG_CPLUSPLUS; + continue; + } + if (eq(s, "-w") || eq(s, "-widechars")) { + o->widechars = true; + o->utf8 = false; + continue; + } + if (eq(s, "-s") || eq(s, "-syntax")) { + o->syntax_tree = true; + continue; + } + if (eq(s, "-ep") || eq(s, "-eprefix")) { + check_lim(i, argc); + o->externals_prefix = argv[i++]; + continue; + } + if (eq(s, "-vp") || eq(s, "-vprefix")) { + check_lim(i, argc); + o->variables_prefix = argv[i++]; + continue; + } + if (eq(s, "-i") || eq(s, "-include")) { + check_lim(i, argc); + + { + NEW(include, p); + symbol * b = add_s_to_b(0, argv[i++]); + b = add_s_to_b(b, "/"); + p->next = 0; p->b = b; + + if (o->includes == 0) o->includes = p; else + o->includes_end->next = p; + o->includes_end = p; + } + continue; + } + if (eq(s, "-r") || eq(s, "-runtime")) { + check_lim(i, argc); + o->runtime_path = argv[i++]; + continue; + } + if (eq(s, "-u") || eq(s, "-utf8")) { + o->utf8 = true; + o->widechars = false; + continue; + } +#ifndef DISABLE_JAVA + if (eq(s, "-p") || eq(s, "-parentclassname")) { + check_lim(i, argc); + o->parent_class_name = argv[i++]; + continue; + } + if (eq(s, "-P") || eq(s, "-Package")) { + check_lim(i, argc); + o->package = argv[i++]; + continue; + } + if (eq(s, "-S") || eq(s, "-stringclass")) { + check_lim(i, argc); + o->string_class = argv[i++]; + continue; + } + if (eq(s, "-a") || eq(s, "-amongclass")) { + check_lim(i, argc); + o->among_class = argv[i++]; + continue; + } +#endif + fprintf(stderr, "'%s' misplaced\n", s); + print_arglist(); + } + } +} + +extern int main(int argc, char * argv[]) { + + NEW(options, o); + if (argc == 1) print_arglist(); + read_options(o, argc, argv); + { + symbol * filename = add_s_to_b(0, argv[1]); + char * file; + symbol * u = get_input(filename, &file); + if (u 
== 0) { + fprintf(stderr, "Can't open input %s\n", argv[1]); + exit(1); + } + { + struct tokeniser * t = create_tokeniser(u, file); + struct analyser * a = create_analyser(t); + t->widechars = o->widechars; + t->includes = o->includes; + a->utf8 = t->utf8 = o->utf8; + read_program(a); + if (t->error_count > 0) exit(1); + if (o->syntax_tree) print_program(a); + close_tokeniser(t); + unless (o->syntax_tree) { + struct generator * g; + + char * s = o->output_file; + unless (s) { + fprintf(stderr, "Please include the -o option\n"); + print_arglist(); + exit(1); + } + if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".h"); + o->output_h = get_output(b); + b[SIZE(b) - 1] = 'c'; + if (o->make_lang == LANG_CPLUSPLUS) { + b = add_s_to_b(b, "c"); + } + o->output_c = get_output(b); + lose_b(b); + + g = create_generator_c(a, o); + generate_program_c(g); + close_generator_c(g); + fclose(o->output_c); + fclose(o->output_h); + } +#ifndef DISABLE_JAVA + if (o->make_lang == LANG_JAVA) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".java"); + o->output_java = get_output(b); + lose_b(b); + g = create_generator_java(a, o); + generate_program_java(g); + close_generator_java(g); + fclose(o->output_java); + } +#endif + } + close_analyser(a); + } + lose_b(u); + lose_b(filename); + } + { struct include * p = o->includes; + until (p == 0) + { struct include * q = p->next; + lose_b(p->b); FREE(p); p = q; + } + } + FREE(o); + unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count); + return 0; +} + diff --git a/contrib/snowball/compiler/generator.c b/contrib/snowball/compiler/generator.c new file mode 100644 index 000000000..7294b0471 --- /dev/null +++ b/contrib/snowball/compiler/generator.c @@ -0,0 +1,1465 @@ + +#include <limits.h> /* for INT_MAX */ +#include <stdio.h> /* for fprintf etc */ +#include <stdlib.h> /* for free etc */ +#include <string.h> /* for strlen */ +#include "header.h" + +/* 
Define this to get warning messages when optimisations can't be used. */ +/* #define OPTIMISATION_WARNINGS */ + +/* recursive use: */ + +static void generate(struct generator * g, struct node * p); + +enum special_labels { + + x_return = -1 + +}; + +static int new_label(struct generator * g) { + return g->next_label++; +} + +/* Output routines */ +static void output_str(FILE * outfile, struct str * str) { + + char * s = b_to_s(str_data(str)); + fprintf(outfile, "%s", s); + free(s); +} + +static void wch(struct generator * g, int ch) { + str_append_ch(g->outbuf, ch); /* character */ +} + +static void wnl(struct generator * g) { + str_append_ch(g->outbuf, '\n'); /* newline */ + g->line_count++; +} + +static void ws(struct generator * g, const char * s) { + str_append_string(g->outbuf, s); /* string */ +} + +static void wi(struct generator * g, int i) { + str_append_int(g->outbuf, i); /* integer */ +} + +static void wh_ch(struct generator * g, int i) { + str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */ +} + +static void wh(struct generator * g, int i) { + if (i >> 4) wh(g, i >> 4); + wh_ch(g, i); /* hex integer */ +} + +static void wi3(struct generator * g, int i) { + if (i < 100) wch(g, ' '); + if (i < 10) wch(g, ' '); + wi(g, i); /* integer (width 3) */ +} + +static void wvn(struct generator * g, struct name * p) { /* variable name */ + + int ch = "SBIrxg"[p->type]; + switch (p->type) { + case t_string: + case t_boolean: + case t_integer: + wch(g, ch); wch(g, '['); wi(g, p->count); wch(g, ']'); return; + case t_external: + ws(g, g->options->externals_prefix); break; + default: + wch(g, ch); wch(g, '_'); + } + str_append_b(g->outbuf, p->b); +} + +static void wv(struct generator * g, struct name * p) { /* reference to variable */ + if (p->type < t_routine) ws(g, "z->"); + wvn(g, p); +} + +static void wlitarray(struct generator * g, symbol * p) { /* write literal array */ + + ws(g, "{ "); + { + int i; + for (i = 0; i < SIZE(p); i++) { + int ch = 
p[i]; + if (32 <= ch && ch < 127) { + wch(g, '\''); + switch (ch) { + case '\'': + case '\\': wch(g, '\\'); + default: wch(g, ch); + } + wch(g, '\''); + } else { + wch(g, '0'); wch(g, 'x'); wh(g, ch); + } + if (i < SIZE(p) - 1) ws(g, ", "); + } + } + ws(g, " }"); +} + +static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */ + + if (SIZE(p) == 0) ws(g, "0"); else { + struct str * s = g->outbuf; + g->outbuf = g->declarations; + ws(g, "static const symbol s_"); wi(g, g->literalstring_count); ws(g, "[] = "); + wlitarray(g, p); + ws(g, ";\n"); + g->outbuf = s; + ws(g, "s_"); wi(g, g->literalstring_count); + g->literalstring_count++; + } +} + + +static void wm(struct generator * g) { /* margin */ + int i; + for (i = 0; i < g->margin; i++) ws(g, " "); +} + +static void wc(struct generator * g, struct node * p) { /* comment */ + + ws(g, " /* "); + switch (p->type) { + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + if (p->name) { + wch(g, '$'); + str_append_b(g->outbuf, p->name->b); + wch(g, ' '); + } + ws(g, name_of_token(p->type)); + ws(g, " <integer expression>"); + break; + default: + ws(g, name_of_token(p->type)); + if (p->name) { + wch(g, ' '); + str_append_b(g->outbuf, p->name->b); + } + } + ws(g, ", line "); wi(g, p->line_number); ws(g, " */"); + wnl(g); +} + +static void wms(struct generator * g, const char * s) { + wm(g); ws(g, s); } /* margin + string */ + +static void wbs(struct generator * g) { /* block start */ + wms(g, "{ "); + g->margin++; +} + +static void wbe(struct generator * g) { /* block end */ + + if (g->line_labelled == g->line_count) { wms(g, ";"); wnl(g); } + g->margin--; + wms(g, "}"); wnl(g); +} + +static void wk(struct generator * g, struct node * p) { /* keep c */ + ++g->keep_count; + if (p->mode == m_forward) { + ws(g, "int c"); wi(g, g->keep_count); ws(g, " = z->c;"); + 
} else { + ws(g, "int m"); wi(g, g->keep_count); ws(g, " = z->l - z->c; (void)m"); + wi(g, g->keep_count); ws(g, ";"); + } +} + +static void wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */ + if (p->mode == m_forward) { + ws(g, "z->c = c"); + } else { + ws(g, "z->c = z->l - m"); + } + wi(g, keep_token); ws(g, ";"); +} + +static void winc(struct generator * g, struct node * p) { /* increment c */ + ws(g, p->mode == m_forward ? "z->c++;" : + "z->c--;"); +} + +static void wsetl(struct generator * g, int n) { + + g->margin--; + wms(g, "lab"); wi(g, n); wch(g, ':'); wnl(g); + g->line_labelled = g->line_count; + g->margin++; +} + +static void wgotol(struct generator * g, int n) { + wms(g, "goto lab"); wi(g, n); wch(g, ';'); wnl(g); +} + +static void wf(struct generator * g) { /* fail */ + if (g->failure_string != 0) { ws(g, "{ "); ws(g, g->failure_string); wch(g, ' '); } + switch (g->failure_label) + { + case x_return: + ws(g, "return 0;"); + break; + default: + ws(g, "goto lab"); + wi(g, g->failure_label); + wch(g, ';'); + g->label_used = 1; + } + if (g->failure_string != 0) ws(g, " }"); +} + +static void wlim(struct generator * g, struct node * p) { /* if at limit fail */ + + ws(g, p->mode == m_forward ? "if (z->c >= z->l) " : + "if (z->c <= z->lb) "); + wf(g); +} + +static void wp(struct generator * g, const char * s, struct node * p) { /* formatted write */ + int i = 0; + int l = strlen(s); + until (i >= l) { + int ch = s[i++]; + if (ch != '~') wch(g, ch); else + switch(s[i++]) { + default: wch(g, s[i - 1]); continue; + case 'C': wc(g, p); continue; + case 'k': wk(g, p); continue; + case 'K': /* keep for c_test */ + ws(g, p->mode == m_forward ? "int c_test = z->c;" : + "int m_test = z->l - z->c;"); + continue; + case 'R': /* restore for c_test */ + ws(g, p->mode == m_forward ? 
"z->c = c_test;" : + "z->c = z->l - m_test;"); + continue; + case 'i': winc(g, p); continue; + case 'l': wlim(g, p); continue; + case 'f': wf(g); continue; + case 'M': wm(g); continue; + case 'N': wnl(g); continue; + case '{': wbs(g); continue; + case '}': wbe(g); continue; + case 'S': ws(g, g->S[s[i++] - '0']); continue; + case 'I': wi(g, g->I[s[i++] - '0']); continue; + case 'J': wi3(g, g->I[s[i++] - '0']); continue; + case 'V': wv(g, g->V[s[i++] - '0']); continue; + case 'W': wvn(g, g->V[s[i++] - '0']); continue; + case 'L': wlitref(g, g->L[s[i++] - '0']); continue; + case 'A': wlitarray(g, g->L[s[i++] - '0']); continue; + case '+': g->margin++; continue; + case '-': g->margin--; continue; + case '$': /* insert_s, insert_v etc */ + wch(g, p->literalstring == 0 ? 'v' : 's'); + continue; + case 'p': ws(g, g->options->externals_prefix); continue; + } + } +} + +static void w(struct generator * g, const char * s) { wp(g, s, 0); } + +static void generate_AE(struct generator * g, struct node * p) { + char * s; + switch (p->type) { + case c_name: + wv(g, p->name); break; + case c_number: + wi(g, p->number); break; + case c_maxint: + ws(g, "MAXINT"); break; + case c_minint: + ws(g, "MININT"); break; + case c_neg: + wch(g, '-'); generate_AE(g, p->right); break; + case c_multiply: + s = " * "; goto label0; + case c_plus: + s = " + "; goto label0; + case c_minus: + s = " - "; goto label0; + case c_divide: + s = " / "; + label0: + wch(g, '('); generate_AE(g, p->left); + ws(g, s); generate_AE(g, p->right); wch(g, ')'); break; + case c_sizeof: + g->V[0] = p->name; + w(g, "SIZE(~V0)"); break; + case c_cursor: + w(g, "z->c"); break; + case c_limit: + w(g, p->mode == m_forward ? "z->l" : "z->lb"); break; + case c_size: + w(g, "SIZE(z->p)"); break; + } +} + +/* K_needed() tests to see if we really need to keep c. Not true when the + the command does not touch the cursor. This and repeat_score() could be + elaborated almost indefinitely. 
+*/ + +static int K_needed(struct generator * g, struct node * p) { + until (p == 0) { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: + case c_true: + case c_false: + case c_debug: + break; + + case c_call: + if (K_needed(g, p->name->definition)) return true; + break; + + case c_bra: + if (K_needed(g, p->left)) return true; + break; + + default: return true; + } + p = p->right; + } + return false; +} + +static int repeat_score(struct generator * g, struct node * p) { + int score = 0; + until (p == 0) + { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: /* case c_not: must not be included here! 
*/ + case c_debug: + break; + + case c_call: + score += repeat_score(g, p->name->definition); + break; + + case c_bra: + score += repeat_score(g, p->left); + break; + + case c_name: + case c_literalstring: + case c_next: + case c_grouping: + case c_non: + case c_hop: + score = score + 1; break; + + default: score = 2; break; + } + p = p->right; + } + return score; +} + +/* tests if an expression requires cursor reinstatement in a repeat */ + +static int repeat_restore(struct generator * g, struct node * p) { + return repeat_score(g, p) >= 2; +} + +static void generate_bra(struct generator * g, struct node * p) { + p = p->left; + until (p == 0) { generate(g, p); p = p->right; } +} + +static void generate_and(struct generator * g, struct node * p) { + int keep_c = 0; + if (K_needed(g, p->left)) { + wp(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + wp(g, "~M~C", p); + } + p = p->left; + until (p == 0) { + generate(g, p); + if (keep_c && p->right != 0) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + p = p->right; + } + if (keep_c) w(g, "~}"); +} + +static void generate_or(struct generator * g, struct node * p) { + int keep_c = 0; + + int used = g->label_used; + int a0 = g->failure_label; + const char * a1 = g->failure_string; + + int out_lab = new_label(g); + + if (K_needed(g, p->left)) { + wp(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + wp(g, "~M~C", p); + } + p = p->left; + g->failure_string = 0; + until (p->right == 0) { + g->failure_label = new_label(g); + g->label_used = 0; + generate(g, p); + wgotol(g, out_lab); + if (g->label_used) + wsetl(g, g->failure_label); + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + p = p->right; + } + g->label_used = used; + g->failure_label = a0; + g->failure_string = a1; + + generate(g, p); + if (keep_c) w(g, "~}"); + wsetl(g, out_lab); +} + +static void generate_backwards(struct generator * g, struct node * p) { + + wp(g,"~Mz->lb = z->c; z->c = z->l;~C~N", p); + generate(g, 
p->left); + w(g, "~Mz->c = z->lb;~N"); +} + + +static void generate_not(struct generator * g, struct node * p) { + int keep_c = 0; + + int used = g->label_used; + int a0 = g->failure_label; + const char * a1 = g->failure_string; + + if (K_needed(g, p->left)) { + wp(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + wp(g, "~M~C", p); + } + + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_string = 0; + generate(g, p->left); + + { + int l = g->failure_label; + int u = g->label_used; + + g->label_used = used; + g->failure_label = a0; + g->failure_string = a1; + + w(g, "~M~f~N"); + if (u) + wsetl(g, l); + } + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}"); + } +} + + +static void generate_try(struct generator * g, struct node * p) { + int keep_c = K_needed(g, p->left); + + if (keep_c) { + if (p->mode == m_forward) { + wp(g, "~{int c_keep = z->c;~C", p); + g->failure_string = "z->c = c_keep;"; + } else { + wp(g, "~{int m_keep = z->l - z->c;/* (void) m_keep;*/~C", p); + g->failure_string = "z->c = z->l - m_keep;"; + } + } else { + wp(g, "~M~C", p); + g->failure_string = 0; + } + + g->failure_label = new_label(g); + g->label_used = 0; + generate(g, p->left); + + if (g->label_used) + wsetl(g, g->failure_label); + + if (keep_c) w(g, "~}"); +} + +static void generate_set(struct generator * g, struct node * p) { + g->V[0] = p->name; wp(g, "~M~V0 = 1;~C", p); +} + +static void generate_unset(struct generator * g, struct node * p) { + g->V[0] = p->name; wp(g, "~M~V0 = 0;~C", p); +} + +static void generate_fail(struct generator * g, struct node * p) { + generate(g, p->left); + wp(g, "~M~f~C", p); +} + +/* generate_test() also implements 'reverse' */ + +static void generate_test(struct generator * g, struct node * p) { + int keep_c = K_needed(g, p->left); + if (keep_c) wp(g, "~{~K~C", p); + else wp(g, "~M~C", p); + + generate(g, p->left); + + if (keep_c) wp(g, "~M~R~N" + "~}", p); +} + +static void generate_do(struct generator * g, struct 
node * p) { + int keep_c = 0; + if (K_needed(g, p->left)) { + wp(g, "~{~k~C", p); + keep_c = g->keep_count; + } else { + wp(g, "~M~C", p); + } + + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_string = 0; + generate(g, p->left); + + if (g->label_used) + wsetl(g, g->failure_label); + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); + w(g, "~N~}"); + } +} + +static void generate_next(struct generator * g, struct node * p) { + if (g->options->utf8) { + if (p->mode == m_forward) + w(g, "~{int ret = skip_utf8(z->p, z->c, 0, z->l, 1"); + else + w(g, "~{int ret = skip_utf8(z->p, z->c, z->lb, 0, -1"); + wp(g, ");~N" + "~Mif (ret < 0) ~f~N" + "~Mz->c = ret;~C" + "~}", p); + } else + wp(g, "~M~l~N" + "~M~i~C", p); +} + +static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { + + struct grouping * q = p->name->grouping; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->S[1] = complement ? "in" : "out"; + g->S[2] = g->options->utf8 ? "_U" : ""; + g->V[0] = p->name; + g->I[0] = q->smallest_ch; + g->I[1] = q->largest_ch; + if (is_goto) { + wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f /* goto */~C", p); + } else { + wp(g, "~{ /* gopast */~C" + "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" + "~Mif (ret < 0) ~f~N", p); + if (p->mode == m_forward) + w(g, "~Mz->c += ret;~N"); + else + w(g, "~Mz->c -= ret;~N"); + w(g, "~}"); + } +} + +static void generate_GO(struct generator * g, struct node * p, int style) { + int keep_c = 0; + + int used = g->label_used; + int a0 = g->failure_label; + const char * a1 = g->failure_string; + + if (p->left->type == c_grouping || p->left->type == c_non) { + /* Special case for "goto" or "gopast" when used on a grouping or an + * inverted grouping - the movement of c by the matching action is + * exactly what we want! */ +#ifdef OPTIMISATION_WARNINGS + printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? 
"non" : "grouping"); +#endif + generate_GO_grouping(g, p->left, style, p->left->type == c_non); + return; + } + + w(g, "~Mwhile(1) {"); wp(g, "~C~+", p); + + if (style == 1 || repeat_restore(g, p->left)) { + wp(g, "~M~k~N", p); + keep_c = g->keep_count; + } + + g->failure_label = new_label(g); + g->label_used = 0; + generate(g, p->left); + + if (style == 1) { + /* include for goto; omit for gopast */ + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + w(g, "~Mbreak;~N"); + if (g->label_used) + wsetl(g, g->failure_label); + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + + g->label_used = used; + g->failure_label = a0; + g->failure_string = a1; + +/* wp(g, "~M~l~N" + "~M~i~N", p); */ + generate_next(g, p); + w(g, "~}"); +} + +static void generate_loop(struct generator * g, struct node * p) { + w(g, "~{int i; for (i = "); generate_AE(g, p->AE); wp(g, "; i > 0; i--)~C" + "~{", p); + + generate(g, p->left); + + w(g, "~}" + "~}"); +} + +static void generate_repeat(struct generator * g, struct node * p, int atleast_case) { + int keep_c = 0; + wp(g, "~Mwhile(1) {~C~+", p); + + if (repeat_restore(g, p->left)) { + wp(g, "~M~k~N", p); + keep_c = g->keep_count; + } + + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_string = 0; + generate(g, p->left); + + if (atleast_case) w(g, "~Mi--;~N"); + + w(g, "~Mcontinue;~N"); + if (g->label_used) + wsetl(g, g->failure_label); + + if (keep_c) { + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } + + w(g, "~Mbreak;~N" + "~}"); +} + +static void generate_atleast(struct generator * g, struct node * p) { + w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~N"); + { + int used = g->label_used; + int a0 = g->failure_label; + const char * a1 = g->failure_string; + + generate_repeat(g, p, true); + + g->label_used = used; + g->failure_label = a0; + g->failure_string = a1; + } + w(g, "~Mif (i > 0) ~f~N" + "~}"); +} + +static void generate_setmark(struct generator * g, struct node * p) { + g->V[0] = 
p->name; + wp(g, "~M~V0 = z->c;~C", p); +} + +static void generate_tomark(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? ">" : "<"; + + w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); w(g, ") ~f~N"); + w(g, "~Mz->c = "); generate_AE(g, p->AE); wp(g, ";~C", p); +} + +static void generate_atmark(struct generator * g, struct node * p) { + + w(g, "~Mif (z->c != "); generate_AE(g, p->AE); wp(g, ") ~f~C", p); +} + +static void generate_hop(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "+" : "-"; + g->S[1] = p->mode == m_forward ? "0" : "z->lb"; + if (g->options->utf8) { + w(g, "~{int ret = skip_utf8(z->p, z->c, ~S1, z->l, ~S0 "); + generate_AE(g, p->AE); wp(g, ");~C", p); + w(g, "~Mif (ret < 0) ~f~N"); + } else { + w(g, "~{int ret = z->c ~S0 "); + generate_AE(g, p->AE); wp(g, ";~C", p); + w(g, "~Mif (~S1 > ret || ret > z->l) ~f~N"); + } + wp(g, "~Mz->c = ret;~C" + "~}", p); +} + +static void generate_delete(struct generator * g, struct node * p) { + wp(g, "~{int ret = slice_del(z);~C", p); + wp(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +static void generate_tolimit(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "" : "b"; + wp(g, "~Mz->c = z->l~S0;~C", p); +} + +static void generate_atlimit(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "" : "b"; + g->S[1] = p->mode == m_forward ? "<" : ">"; + wp(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p); +} + +static void generate_leftslice(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? "bra" : "ket"; + wp(g, "~Mz->~S0 = z->c;~C", p); +} + +static void generate_rightslice(struct generator * g, struct node * p) { + g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; + wp(g, "~Mz->~S0 = z->c;~C", p); +} + +static void generate_assignto(struct generator * g, struct node * p) { + g->V[0] = p->name; + wp(g, "~M~V0 = assign_to(z, ~V0);~C" + "~Mif (~V0 == 0) return -1;~C", p); +} + +static void generate_sliceto(struct generator * g, struct node * p) { + g->V[0] = p->name; + wp(g, "~M~V0 = slice_to(z, ~V0);~C" + "~Mif (~V0 == 0) return -1;~C", p); +} + +static void generate_data_address(struct generator * g, struct node * p) { + + symbol * b = p->literalstring; + if (b != 0) { + wi(g, SIZE(b)); w(g, ", "); + wlitref(g, b); + } else + wv(g, p->name); +} + +static void generate_insert(struct generator * g, struct node * p, int style) { + + int keep_c = style == c_attach; + if (p->mode == m_backward) keep_c = !keep_c; + wp(g, "~{", p); + if (keep_c) w(g, "int c_keep = z->c;~N~M"); + wp(g, "int ret = insert_~$(z, z->c, z->c, ", p); + generate_data_address(g, p); + wp(g, ");~C", p); + if (keep_c) w(g, "~Mz->c = c_keep;~N"); + wp(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +static void generate_assignfrom(struct generator * g, struct node * p) { + + int keep_c = p->mode == m_forward; /* like 'attach' */ + wp(g, "~{", p); + if (keep_c) wp(g, "int c_keep = z->c;~N" + "~Mret = insert_~$(z, z->c, z->l, ", p); + else wp(g, "ret = insert_~$(z, z->lb, z->c, ", p); + generate_data_address(g, p); + wp(g, ");~C", p); + if (keep_c) w(g, "~Mz->c = c_keep;~N"); + wp(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +/* bugs marked <======= fixed 22/7/02. Similar fixes required for Java */ + +static void generate_slicefrom(struct generator * g, struct node * p) { + +/* w(g, "~Mslice_from_s(z, "); <============= bug! 
should be: */ + wp(g, "~{int ret = slice_from_~$(z, ", p); + generate_data_address(g, p); + wp(g, ");~C", p); + wp(g, "~Mif (ret < 0) return ret;~N" + "~}", p); +} + +static void generate_setlimit(struct generator * g, struct node * p) { + int keep_c; + wp(g, "~{int mlimit;~C" + "~M~k~N" + , p); + keep_c = g->keep_count; + generate(g, p->left); + if (p->mode == m_forward) w(g, "~Mmlimit = z->l - z->c; z->l = z->c;~N"); + else w(g, "~Mmlimit = z->lb; z->lb = z->c;~N"); + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + g->failure_string = p->mode == m_forward ? "z->l += mlimit;" : + "z->lb = mlimit;"; + generate(g, p->aux); + wms(g, g->failure_string); + w(g, "~N" + "~}"); +} + +static void generate_dollar(struct generator * g, struct node * p) { + + int used = g->label_used; + int a0 = g->failure_label; + const char * a1 = g->failure_string; + g->failure_label = new_label(g); + g->label_used = 0; + g->failure_string = 0; + + g->V[0] = p->name; + wp(g, "~{struct SN_env env = * z;~C" + "~Mint failure = 1; /* assume failure */~N" + "~Mz->p = ~V0;~N" + "~Mz->lb = z->c = 0;~N" + "~Mz->l = SIZE(z->p);~N", p); + generate(g, p->left); + w(g, "~Mfailure = 0; /* mark success */~N"); + if (g->label_used) + wsetl(g, g->failure_label); + g->V[0] = p->name; /* necessary */ + + g->label_used = used; + g->failure_label = a0; + g->failure_string = a1; + + w(g, "~M~V0 = z->p;~N" + "~M* z = env;~N" + "~Mif (failure) ~f~N~}"); +} + +static void generate_integer_assign(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); wp(g, ";~C", p); +} + +static void generate_integer_test(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); wp(g, ")) ~f~C", p); +} + +static void generate_call(struct generator * g, struct node * p) { + + g->V[0] = p->name; + wp(g, "~{int ret = ~V0(z);~C" + "~Mif (ret == 0) ~f~N" + "~Mif (ret < 0) 
return ret;~N~}", p); +} + +static void generate_grouping(struct generator * g, struct node * p, int complement) { + + struct grouping * q = p->name->grouping; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->S[1] = complement ? "out" : "in"; + g->S[2] = g->options->utf8 ? "_U" : ""; + g->V[0] = p->name; + g->I[0] = q->smallest_ch; + g->I[1] = q->largest_ch; + wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~C", p); +} + +static void generate_namedstring(struct generator * g, struct node * p) { + + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->V[0] = p->name; + wp(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p); +} + +static void generate_literalstring(struct generator * g, struct node * p) { + symbol * b = p->literalstring; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->I[0] = SIZE(b); + g->L[0] = b; + + wp(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~C", p); +} + +static void generate_define(struct generator * g, struct node * p) { + struct name * q = p->name; + g->next_label = 0; + + g->S[0] = q->type == t_routine ? "static" : "extern"; + g->V[0] = q; + + w(g, "~N~S0 int ~V0(struct SN_env * z) {~N~+"); + if (p->amongvar_needed) w(g, "~Mint among_var;~N"); + g->failure_string = 0; + g->failure_label = x_return; + g->label_used = 0; + g->keep_count = 0; + generate(g, p->left); + w(g, "~Mreturn 1;~N~}"); +} + +static void generate_substring(struct generator * g, struct node * p) { + + struct among * x = p->among; + int block = -1; + unsigned int bitmap = 0; + struct amongvec * among_cases = x->b; + int c; + int empty_case = -1; + int n_cases = 0; + symbol cases[2]; + int shortest_size = INT_MAX; + + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + /* In forward mode with non-ASCII UTF-8 characters, the first character + * of the string will often be the same, so instead look at the last + * common character position. 
+ * + * In backward mode, we can't match if there are fewer characters before + * the current position than the minimum length. + */ + for (c = 0; c < x->literalstring_count; ++c) { + int size = among_cases[c].size; + if (size != 0 && size < shortest_size) { + shortest_size = size; + } + } + + for (c = 0; c < x->literalstring_count; ++c) { + symbol ch; + if (among_cases[c].size == 0) { + empty_case = c; + continue; + } + if (p->mode == m_forward) { + ch = among_cases[c].b[shortest_size - 1]; + } else { + ch = among_cases[c].b[among_cases[c].size - 1]; + } + if (n_cases == 0) { + block = ch >> 5; + } else if (ch >> 5 != block) { + block = -1; + if (n_cases > 2) break; + } + if (block == -1) { + if (ch == cases[0]) continue; + if (n_cases < 2) { + cases[n_cases++] = ch; + } else if (ch != cases[1]) { + ++n_cases; + break; + } + } else { + if ((bitmap & (1u << (ch & 0x1f))) == 0) { + bitmap |= 1u << (ch & 0x1f); + if (n_cases < 2) + cases[n_cases] = ch; + ++n_cases; + } + } + } + + if (block != -1 || n_cases <= 2) { + char buf[64]; + g->I[2] = block; + g->I[3] = bitmap; + g->I[4] = shortest_size - 1; + if (p->mode == m_forward) { + sprintf(buf, "z->p[z->c + %d]", shortest_size - 1); + g->S[1] = buf; + if (shortest_size == 1) { + wp(g, "~Mif (z->c >= z->l || ", p); + } else { + wp(g, "~Mif (z->c + ~I4 >= z->l || ", p); + } + } else { + g->S[1] = "z->p[z->c - 1]"; + if (shortest_size == 1) { + wp(g, "~Mif (z->c <= z->lb || ", p); + } else { + wp(g, "~Mif (z->c - ~I4 <= z->lb || ", p); + } + } + if (n_cases == 0) { + /* We get this for the degenerate case: among { '' } + * This doesn't seem to be a useful construct, but it is + * syntactically valid. 
+ */ + wp(g, "0", p); + } else if (n_cases == 1) { + g->I[4] = cases[0]; + wp(g, "~S1 != ~I4", p); + } else if (n_cases == 2) { + g->I[4] = cases[0]; + g->I[5] = cases[1]; + wp(g, "(~S1 != ~I4 && ~S1 != ~I5)", p); + } else { + wp(g, "~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); + } + ws(g, ") "); + if (empty_case != -1) { + /* If the among includes the empty string, it can never fail + * so not matching the bitmap means we match the empty string. + */ + g->I[4] = among_cases[empty_case].result; + wp(g, "among_var = ~I4; else~C", p); + } else { + wp(g, "~f~C", p); + } + } else { +#ifdef OPTIMISATION_WARNINGS + printf("Couldn't shortcut among %d\n", x->number); +#endif + } + + if (x->command_count == 0 && x->starter == 0) + wp(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f~C", p); + else + wp(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);~C" + "~Mif (!(among_var)) ~f~N", p); +} + +static void generate_among(struct generator * g, struct node * p) { + + struct among * x = p->among; + int case_number = 1; + + if (x->substring == 0) generate_substring(g, p); + if (x->command_count == 0 && x->starter == 0) return; + + unless (x->starter == 0) generate(g, x->starter); + + p = p->left; + if (p != 0 && p->type != c_literalstring) p = p->right; + w(g, "~Mswitch(among_var) {~N~+" + "~Mcase 0: ~f~N"); + + until (p == 0) { + if (p->type == c_bra && p->left != 0) { + g->I[0] = case_number++; + w(g, "~Mcase ~I0:~N~+"); generate(g, p); w(g, "~Mbreak;~N~-"); + } + p = p->right; + } + w(g, "~}"); +} + +static void generate_booltest(struct generator * g, struct node * p) { + + g->V[0] = p->name; + wp(g, "~Mif (!(~V0)) ~f~C", p); +} + +static void generate_false(struct generator * g, struct node * p) { + + wp(g, "~M~f~C", p); +} + +static void generate_debug(struct generator * g, struct node * p) { + + g->I[0] = g->debug_count++; + g->I[1] = p->line_number; + wp(g, "~Mdebug(z, ~I0, ~I1);~C", p); + +} + +static void generate(struct generator * g, struct node * p) { + + int used 
= g->label_used; + int a0 = g->failure_label; + const char * a1 = g->failure_string; + + switch (p->type) + { + case c_define: generate_define(g, p); break; + case c_bra: generate_bra(g, p); break; + case c_and: generate_and(g, p); break; + case c_or: generate_or(g, p); break; + case c_backwards: generate_backwards(g, p); break; + case c_not: generate_not(g, p); break; + case c_set: generate_set(g, p); break; + case c_unset: generate_unset(g, p); break; + case c_try: generate_try(g, p); break; + case c_fail: generate_fail(g, p); break; + case c_reverse: + case c_test: generate_test(g, p); break; + case c_do: generate_do(g, p); break; + case c_goto: generate_GO(g, p, 1); break; + case c_gopast: generate_GO(g, p, 0); break; + case c_repeat: generate_repeat(g, p, false); break; + case c_loop: generate_loop(g, p); break; + case c_atleast: generate_atleast(g, p); break; + case c_setmark: generate_setmark(g, p); break; + case c_tomark: generate_tomark(g, p); break; + case c_atmark: generate_atmark(g, p); break; + case c_hop: generate_hop(g, p); break; + case c_delete: generate_delete(g, p); break; + case c_next: generate_next(g, p); break; + case c_tolimit: generate_tolimit(g, p); break; + case c_atlimit: generate_atlimit(g, p); break; + case c_leftslice: generate_leftslice(g, p); break; + case c_rightslice: generate_rightslice(g, p); break; + case c_assignto: generate_assignto(g, p); break; + case c_sliceto: generate_sliceto(g, p); break; + case c_assign: generate_assignfrom(g, p); break; + case c_insert: + case c_attach: generate_insert(g, p, p->type); break; + case c_slicefrom: generate_slicefrom(g, p); break; + case c_setlimit: generate_setlimit(g, p); break; + case c_dollar: generate_dollar(g, p); break; + case c_mathassign: generate_integer_assign(g, p, "="); break; + case c_plusassign: generate_integer_assign(g, p, "+="); break; + case c_minusassign: generate_integer_assign(g, p, "-="); break; + case c_multiplyassign:generate_integer_assign(g, p, "*="); break; + 
case c_divideassign: generate_integer_assign(g, p, "/="); break; + case c_eq: generate_integer_test(g, p, "=="); break; + case c_ne: generate_integer_test(g, p, "!="); break; + case c_gr: generate_integer_test(g, p, ">"); break; + case c_ge: generate_integer_test(g, p, ">="); break; + case c_ls: generate_integer_test(g, p, "<"); break; + case c_le: generate_integer_test(g, p, "<="); break; + case c_call: generate_call(g, p); break; + case c_grouping: generate_grouping(g, p, false); break; + case c_non: generate_grouping(g, p, true); break; + case c_name: generate_namedstring(g, p); break; + case c_literalstring: generate_literalstring(g, p); break; + case c_among: generate_among(g, p); break; + case c_substring: generate_substring(g, p); break; + case c_booltest: generate_booltest(g, p); break; + case c_false: generate_false(g, p); break; + case c_true: break; + case c_debug: generate_debug(g, p); break; + default: fprintf(stderr, "%d encountered\n", p->type); + exit(1); + } + + if (g->failure_label != a0) + g->label_used = used; + g->failure_label = a0; + g->failure_string = a1; +} + +static void generate_start_comment(struct generator * g) { + + w(g, "~N/* This file was generated automatically by the Snowball to ANSI C compiler */~N"); +} + +static void generate_head(struct generator * g) { + + if (g->options->runtime_path == 0) { + w(g, "~N#include \"header.h\"~N~N"); + } else { + w(g, "~N#include \""); + ws(g, g->options->runtime_path); + if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/') + wch(g, '/'); + w(g, "header.h\"~N~N"); + } +} + +static void generate_routine_headers(struct generator * g) { + struct name * q = g->analyser->names; + until (q == 0) { + g->V[0] = q; + switch (q->type) { + case t_routine: + w(g, "static int ~W0(struct SN_env * z);~N"); + break; + case t_external: + w(g, + "#ifdef __cplusplus~N" + "extern \"C\" {~N" + "#endif~N" + "extern int ~W0(struct SN_env * z);~N" + "#ifdef __cplusplus~N" + "}~N" + "#endif~N" + 
); + break; + } + q = q->next; + } +} + +static void generate_among_table(struct generator * g, struct among * x) { + + struct amongvec * v = x->b; + + g->I[0] = x->number; + { + int i; + for (i = 0; i < x->literalstring_count; i++) + { + g->I[1] = i; + g->I[2] = v->size; + g->L[0] = v->b; + unless (v->size == 0) + w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N"); + v++; + } + } + + g->I[1] = x->literalstring_count; + w(g, "~N~Mstatic const struct among a_~I0[~I1] =~N{~N"); + + v = x->b; + { + int i; + for (i = 0; i < x->literalstring_count; i++) { + g->I[1] = i; + g->I[2] = v->size; + g->I[3] = v->i; + g->I[4] = v->result; + g->S[0] = i < x->literalstring_count - 1 ? "," : ""; + + w(g, "/*~J1 */ { ~I2, "); + if (v->size == 0) w(g, "0,"); + else w(g, "s_~I0_~I1,"); + w(g, " ~I3, ~I4, "); + if (v->function == 0) w(g, "0"); else + wvn(g, v->function); + w(g, "}~S0~N"); + v++; + } + } + w(g, "};~N~N"); +} + +static void generate_amongs(struct generator * g) { + struct among * x = g->analyser->amongs; + until (x == 0) { + generate_among_table(g, x); + x = x->next; + } +} + +static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } + +static void generate_grouping_table(struct generator * g, struct grouping * q) { + + int range = q->largest_ch - q->smallest_ch + 1; + int size = (range + 7)/ 8; /* assume 8 bits per symbol */ + symbol * b = q->b; + symbol * map = create_b(size); + int i; + for (i = 0; i < size; i++) map[i] = 0; + + for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); + + { + g->V[0] = q->name; + + w(g, "static const unsigned char ~V0[] = { "); + for (i = 0; i < size; i++) { + wi(g, map[i]); + if (i < size - 1) w(g, ", "); + } + w(g, " };~N~N"); + } + lose_b(map); +} + +static void generate_groupings(struct generator * g) { + struct grouping * q = g->analyser->groupings; + until (q == 0) { + generate_grouping_table(g, q); + q = q->next; + } +} + +static void generate_create(struct generator * g) { + + int * p = 
g->analyser->name_count; + g->I[0] = p[t_string]; + g->I[1] = p[t_integer]; + g->I[2] = p[t_boolean]; + w(g, "~N" + "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1, ~I2); }" + "~N"); +} + +static void generate_close(struct generator * g) { + + int * p = g->analyser->name_count; + g->I[0] = p[t_string]; + w(g, "~Nextern void ~pclose_env(struct SN_env * z) { SN_close_env(z, ~I0); }~N~N"); +} + +static void generate_create_and_close_templates(struct generator * g) { + w(g, "~N" + "extern struct SN_env * ~pcreate_env(void);~N" + "extern void ~pclose_env(struct SN_env * z);~N" + "~N"); +} + +static void generate_header_file(struct generator * g) { + + struct name * q = g->analyser->names; + char * vp = g->options->variables_prefix; + g->S[0] = vp; + + w(g, "~N" + "#ifdef __cplusplus~N" + "extern \"C\" {~N" + "#endif~N"); /* for C++ */ + + generate_create_and_close_templates(g); + until (q == 0) { + g->V[0] = q; + switch (q->type) + { + case t_external: + w(g, "extern int ~W0(struct SN_env * z);~N"); + break; + case t_string: g->S[1] = "S"; goto label0; + case t_integer: g->S[1] = "I"; goto label0; + case t_boolean: g->S[1] = "B"; + label0: + if (vp) { + g->I[0] = q->count; + w(g, "#define ~S0"); + str_append_b(g->outbuf, q->b); + w(g, " (~S1[~I0])~N"); + } + break; + } + q = q->next; + } + + w(g, "~N" + "#ifdef __cplusplus~N" + "}~N" + "#endif~N"); /* for C++ */ + + w(g, "~N"); +} + +extern void generate_program_c(struct generator * g) { + + g->outbuf = str_new(); + generate_start_comment(g); + generate_head(g); + generate_routine_headers(g); + w(g, "#ifdef __cplusplus~N" + "extern \"C\" {~N" + "#endif~N" + "~N"); + generate_create_and_close_templates(g); + w(g, "~N" + "#ifdef __cplusplus~N" + "}~N" + "#endif~N"); + generate_amongs(g); + generate_groupings(g); + g->declarations = g->outbuf; + g->outbuf = str_new(); + g->literalstring_count = 0; + { + struct node * p = g->analyser->program; + until (p == 0) { generate(g, p); p = p->right; } + } 
+ generate_create(g); + generate_close(g); + output_str(g->options->output_c, g->declarations); + str_delete(g->declarations); + output_str(g->options->output_c, g->outbuf); + str_clear(g->outbuf); + + generate_start_comment(g); + generate_header_file(g); + output_str(g->options->output_h, g->outbuf); + str_delete(g->outbuf); +} + +extern struct generator * create_generator_c(struct analyser * a, struct options * o) { + NEW(generator, g); + g->analyser = a; + g->options = o; + g->margin = 0; + g->debug_count = 0; + g->line_count = 0; + return g; +} + +extern void close_generator_c(struct generator * g) { + + FREE(g); +} + diff --git a/contrib/snowball/compiler/generator_java.c b/contrib/snowball/compiler/generator_java.c new file mode 100644 index 000000000..07a4b8e2e --- /dev/null +++ b/contrib/snowball/compiler/generator_java.c @@ -0,0 +1,1452 @@ + +#include <stdlib.h> /* for exit */ +#include <string.h> /* for strlen */ +#include <stdio.h> /* for fprintf etc */ +#include "header.h" + +/* prototypes */ + +static void generate(struct generator * g, struct node * p); +static void w(struct generator * g, const char * s); +static void writef(struct generator * g, const char * s, struct node * p); + + +enum special_labels { + x_return = -1 +}; + +static int new_label(struct generator * g) { + + return g->next_label++; +} + +static struct str * vars_newname(struct generator * g) { + + struct str * output; + g->var_number ++; + output = str_new(); + str_append_string(output, "v_"); + str_append_int(output, g->var_number); + return output; +} + +/* Output routines */ +static void output_str(FILE * outfile, struct str * str) { + + char * s = b_to_s(str_data(str)); + fprintf(outfile, "%s", s); + free(s); +} + +/* Write routines for simple entities */ + +static void write_char(struct generator * g, int ch) { + + str_append_ch(g->outbuf, ch); +} + +static void write_newline(struct generator * g) { + + str_append_string(g->outbuf, "\n"); +} + +static void write_string(struct 
generator * g, const char * s) { + + str_append_string(g->outbuf, s); +} + +static void write_b(struct generator * g, symbol * b) { + + str_append_b(g->outbuf, b); +} + +static void write_str(struct generator * g, struct str * str) { + + str_append(g->outbuf, str); +} + +static void write_int(struct generator * g, int i) { + + str_append_int(g->outbuf, i); +} + + +/* Write routines for items from the syntax tree */ + +static void write_varname(struct generator * g, struct name * p) { + + int ch = "SBIrxg"[p->type]; + if (p->type != t_external) + { + write_char(g, ch); + write_char(g, '_'); + } + str_append_b(g->outbuf, p->b); +} + +static void write_varref(struct generator * g, struct name * p) { + + /* In java, references look just the same */ + write_varname(g, p); +} + +static void write_hexdigit(struct generator * g, int n) { + + write_char(g, n < 10 ? n + '0' : n - 10 + 'A'); +} + +static void write_hex(struct generator * g, int ch) { + + write_string(g, "\\u"); + { + int i; + for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); + } +} + +static void write_literal_string(struct generator * g, symbol * p) { + + int i; + write_string(g, "\""); + for (i = 0; i < SIZE(p); i++) { + int ch = p[i]; + if (32 <= ch && ch <= 127) { + if (ch == '\"' || ch == '\\') write_string(g, "\\"); + write_char(g, ch); + } else { + write_hex(g, ch); + } + } + write_string(g, "\""); +} + +static void write_margin(struct generator * g) { + + int i; + for (i = 0; i < g->margin; i++) write_string(g, " "); +} + +/* Write a variable declaration. 
*/ +static void write_declare(struct generator * g, + char * declaration, + struct node * p) { + + struct str * temp = g->outbuf; + g->outbuf = g->declarations; + write_string(g, " "); + writef(g, declaration, p); + write_string(g, ";"); + write_newline(g); + g->outbuf = temp; +} + +static void write_comment(struct generator * g, struct node * p) { + + write_margin(g); + write_string(g, "// "); + write_string(g, name_of_token(p->type)); + if (p->name != 0) { + write_string(g, " "); + str_append_b(g->outbuf, p->name->b); + } + write_string(g, ", line "); + write_int(g, p->line_number); + write_newline(g); +} + +static void write_block_start(struct generator * g) { + + w(g, "~M{~+~N"); +} + +static void write_block_end(struct generator * g) /* block end */ { + + w(g, "~-~M}~N"); +} + +static void write_savecursor(struct generator * g, struct node * p, + struct str * savevar) { + + g->B[0] = str_data(savevar); + g->S[1] = ""; + if (p->mode != m_forward) g->S[1] = "limit - "; + write_declare(g, "int ~B0", p); + writef(g, "~M~B0 = ~S1cursor;~N" , p); +} + +static void restore_string(struct node * p, struct str * out, struct str * savevar) { + + str_clear(out); + str_append_string(out, "cursor = "); + if (p->mode != m_forward) str_append_string(out, "limit - "); + str_append(out, savevar); + str_append_string(out, ";"); +} + +static void write_restorecursor(struct generator * g, struct node * p, + struct str * savevar) { + + struct str * temp = str_new(); + write_margin(g); + restore_string(p, temp, savevar); + write_str(g, temp); + write_newline(g); + str_delete(temp); +} + +static void write_inc_cursor(struct generator * g, struct node * p) { + + write_margin(g); + write_string(g, p->mode == m_forward ? 
"cursor++;" : "cursor--;"); + write_newline(g); +} + +static void wsetlab_begin(struct generator * g, int n) { + + w(g, "~Mlab"); + write_int(g, n); + w(g, ": do {~+~N"); +} + +static void wsetlab_end(struct generator * g) { + + w(g, "~-~M} while (false);~N"); +} + +static void wgotol(struct generator * g, int n) { + + write_margin(g); + write_string(g, "break lab"); + write_int(g, n); + write_string(g, ";"); + write_newline(g); +} + +static void write_failure(struct generator * g) { + + if (str_len(g->failure_str) != 0) { + write_margin(g); + write_str(g, g->failure_str); + write_newline(g); + } + write_margin(g); + switch (g->failure_label) + { + case x_return: + write_string(g, "return false;"); + break; + default: + write_string(g, "break lab"); + write_int(g, g->failure_label); + write_string(g, ";"); + g->unreachable = true; + } + write_newline(g); +} + +static void write_failure_if(struct generator * g, char * s, struct node * p) { + + writef(g, "~Mif (", p); + writef(g, s, p); + writef(g, ")~N", p); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; +} + +/* if at limit fail */ +static void write_check_limit(struct generator * g, struct node * p) { + + if (p->mode == m_forward) { + write_failure_if(g, "cursor >= limit", p); + } else { + write_failure_if(g, "cursor <= limit_backward", p); + } +} + +/* Formatted write. 
*/ +static void writef(struct generator * g, const char * input, struct node * p) { + + int i = 0; + int l = strlen(input); + + while (i < l) { + int ch = input[i++]; + if (ch == '~') { + switch(input[i++]) { + default: write_char(g, input[i - 1]); continue; + case 'C': write_comment(g, p); continue; + case 'f': write_block_start(g); + write_failure(g); + g->unreachable = false; + write_block_end(g); + continue; + case 'M': write_margin(g); continue; + case 'N': write_newline(g); continue; + case '{': write_block_start(g); continue; + case '}': write_block_end(g); continue; + case 'S': write_string(g, g->S[input[i++] - '0']); continue; + case 'B': write_b(g, g->B[input[i++] - '0']); continue; + case 'I': write_int(g, g->I[input[i++] - '0']); continue; + case 'V': write_varref(g, g->V[input[i++] - '0']); continue; + case 'W': write_varname(g, g->V[input[i++] - '0']); continue; + case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; + case '+': g->margin++; continue; + case '-': g->margin--; continue; + case 'n': write_string(g, g->options->name); continue; + } + } else { + write_char(g, ch); + } + } +} + +static void w(struct generator * g, const char * s) { + writef(g, s, 0); +} + +static void generate_AE(struct generator * g, struct node * p) { + char * s; + switch (p->type) { + case c_name: + write_varref(g, p->name); break; + case c_number: + write_int(g, p->number); break; + case c_maxint: + write_string(g, "MAXINT"); break; + case c_minint: + write_string(g, "MININT"); break; + case c_neg: + write_string(g, "-"); generate_AE(g, p->right); break; + case c_multiply: + s = " * "; goto label0; + case c_plus: + s = " + "; goto label0; + case c_minus: + s = " - "; goto label0; + case c_divide: + s = " / "; + label0: + write_string(g, "("); generate_AE(g, p->left); + write_string(g, s); generate_AE(g, p->right); write_string(g, ")"); break; + case c_sizeof: + g->V[0] = p->name; + w(g, "(~V0.length())"); break; + case c_cursor: + w(g, "cursor"); break; 
+ case c_limit: + w(g, p->mode == m_forward ? "limit" : "limit_backward"); break; + case c_size: + w(g, "(current.length())"); break; + } +} + +/* K_needed() tests to see if we really need to keep c. Not true when the + the command does not touch the cursor. This and repeat_score() could be + elaborated almost indefinitely. +*/ + +static int K_needed(struct generator * g, struct node * p) { + + while (p != 0) { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: + case c_booltest: + case c_true: + case c_false: + case c_debug: + break; + + case c_call: + if (K_needed(g, p->name->definition)) return true; + break; + + case c_bra: + if (K_needed(g, p->left)) return true; + break; + + default: return true; + } + p = p->right; + } + return false; +} + +static int repeat_score(struct generator * g, struct node * p) { + + int score = 0; + while (p != 0) { + switch (p->type) { + case c_dollar: + case c_leftslice: + case c_rightslice: + case c_mathassign: + case c_plusassign: + case c_minusassign: + case c_multiplyassign: + case c_divideassign: + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + case c_sliceto: /* case c_not: must not be included here! 
*/ + case c_debug: + break; + + case c_call: + score += repeat_score(g, p->name->definition); + break; + + case c_bra: + score += repeat_score(g, p->left); + break; + + case c_name: + case c_literalstring: + case c_next: + case c_grouping: + case c_non: + case c_hop: + score = score + 1; + break; + + default: + score = 2; + break; + } + p = p->right; + } + return score; +} + +/* tests if an expression requires cursor reinstatement in a repeat */ + +static int repeat_restore(struct generator * g, struct node * p) { + + return repeat_score(g, p) >= 2; +} + +static void generate_bra(struct generator * g, struct node * p) { + + write_comment(g, p); + p = p->left; + while (p != 0) { + generate(g, p); + p = p->right; + } +} + +static void generate_and(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + write_comment(g, p); + + if (keep_c) write_savecursor(g, p, savevar); + + p = p->left; + while (p != 0) { + generate(g, p); + if (g->unreachable) break; + if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); + p = p->right; + } + str_delete(savevar); +} + +static void generate_or(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + int out_lab = new_label(g); + write_comment(g, p); + wsetlab_begin(g, out_lab); + + if (keep_c) write_savecursor(g, p, savevar); + + p = p->left; + str_clear(g->failure_str); + + if (p == 0) { + /* p should never be 0 after an or: there should be at least two + * sub nodes. 
*/ + fprintf(stderr, "Error: \"or\" node without children nodes."); + exit (1); + } + while (p->right != 0) { + g->failure_label = new_label(g); + wsetlab_begin(g, g->failure_label); + generate(g, p); + if (!g->unreachable) wgotol(g, out_lab); + wsetlab_end(g); + g->unreachable = false; + if (keep_c) write_restorecursor(g, p, savevar); + p = p->right; + } + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + + generate(g, p); + wsetlab_end(g); + str_delete(savevar); +} + +static void generate_backwards(struct generator * g, struct node * p) { + + write_comment(g, p); + writef(g,"~Mlimit_backward = cursor; cursor = limit;~N", p); + generate(g, p->left); + w(g, "~Mcursor = limit_backward;"); +} + + +static void generate_not(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + write_comment(g, p); + if (keep_c) { + write_block_start(g); + write_savecursor(g, p, savevar); + } + + g->failure_label = new_label(g); + str_clear(g->failure_str); + + wsetlab_begin(g, g->failure_label); + + generate(g, p->left); + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + + if (!g->unreachable) write_failure(g); + + wsetlab_end(g); + g->unreachable = false; + + if (keep_c) write_restorecursor(g, p, savevar); + if (keep_c) write_block_end(g); + str_delete(savevar); +} + + +static void generate_try(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + write_comment(g, p); + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + if (keep_c) restore_string(p, g->failure_str, savevar); + + wsetlab_begin(g, g->failure_label); + generate(g, p->left); + wsetlab_end(g); + g->unreachable = false; + + str_delete(savevar); +} + +static void generate_set(struct generator * g, struct node * p) { 
+ + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~M~V0 = true;~N", p); +} + +static void generate_unset(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~M~V0 = false;~N", p); +} + +static void generate_fail(struct generator * g, struct node * p) { + + write_comment(g, p); + generate(g, p->left); + if (!g->unreachable) write_failure(g); +} + +/* generate_test() also implements 'reverse' */ + +static void generate_test(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + + write_comment(g, p); + + if (keep_c) { + write_savecursor(g, p, savevar); + } + + generate(g, p->left); + + if (!g->unreachable) { + if (keep_c) { + write_restorecursor(g, p, savevar); + } + } + str_delete(savevar); +} + +static void generate_do(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + int keep_c = K_needed(g, p->left); + write_comment(g, p); + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + str_clear(g->failure_str); + + wsetlab_begin(g, g->failure_label); + generate(g, p->left); + wsetlab_end(g); + g->unreachable = false; + + if (keep_c) write_restorecursor(g, p, savevar); + str_delete(savevar); +} + +static void generate_GO(struct generator * g, struct node * p, int style) { + + int end_unreachable = false; + struct str * savevar = vars_newname(g); + int keep_c = style == 1 || repeat_restore(g, p->left); + + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + int golab = new_label(g); + g->I[0] = golab; + write_comment(g, p); + w(g, "~Mgolab~I0: while(true)~N"); + w(g, "~{"); + + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + wsetlab_begin(g, g->failure_label); + generate(g, p->left); + + if (g->unreachable) { + /* Cannot break out of this loop: therefore the code after the + * end of the loop is unreachable.*/ + 
end_unreachable = true; + } else { + /* include for goto; omit for gopast */ + if (style == 1) write_restorecursor(g, p, savevar); + g->I[0] = golab; + w(g, "~Mbreak golab~I0;~N"); + } + g->unreachable = false; + wsetlab_end(g); + if (keep_c) write_restorecursor(g, p, savevar); + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + + write_check_limit(g, p); + write_inc_cursor(g, p); + write_block_end(g); + str_delete(savevar); + g->unreachable = end_unreachable; +} + +static void generate_loop(struct generator * g, struct node * p) { + + struct str * loopvar = vars_newname(g); + write_comment(g, p); + g->B[0] = str_data(loopvar); + write_declare(g, "int ~B0", p); + w(g, "~Mfor (~B0 = "); + generate_AE(g, p->AE); + g->B[0] = str_data(loopvar); + writef(g, "; ~B0 > 0; ~B0--)~N", p); + writef(g, "~{", p); + + generate(g, p->left); + + w(g, "~}"); + str_delete(loopvar); + g->unreachable = false; +} + +static void generate_repeat(struct generator * g, struct node * p, struct str * loopvar) { + + struct str * savevar = vars_newname(g); + int keep_c = repeat_restore(g, p->left); + int replab = new_label(g); + g->I[0] = replab; + write_comment(g, p); + writef(g, "~Mreplab~I0: while(true)~N~{", p); + + if (keep_c) write_savecursor(g, p, savevar); + + g->failure_label = new_label(g); + str_clear(g->failure_str); + wsetlab_begin(g, g->failure_label); + generate(g, p->left); + + if (!g->unreachable) { + if (loopvar != 0) { + g->B[0] = str_data(loopvar); + w(g, "~M~B0--;~N"); + } + + g->I[0] = replab; + w(g, "~Mcontinue replab~I0;~N"); + } + + wsetlab_end(g); + g->unreachable = false; + + if (keep_c) write_restorecursor(g, p, savevar); + + g->I[0] = replab; + w(g, "~Mbreak replab~I0;~N~}"); + str_delete(savevar); +} + +static void generate_atleast(struct generator * g, struct node * p) { + + struct str * loopvar = vars_newname(g); + write_comment(g, p); + w(g, "~{"); + g->B[0] = str_data(loopvar); + w(g, "~Mint ~B0 = "); + generate_AE(g, p->AE); + 
w(g, ";~N"); + { + int a0 = g->failure_label; + struct str * a1 = str_copy(g->failure_str); + + generate_repeat(g, p, loopvar); + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; + } + g->B[0] = str_data(loopvar); + write_failure_if(g, "~B0 > 0", p); + w(g, "~}"); + str_delete(loopvar); +} + +static void generate_setmark(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~M~V0 = cursor;~N", p); +} + +static void generate_tomark(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? ">" : "<"; + + w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; + w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); +} + +static void generate_atmark(struct generator * g, struct node * p) { + + write_comment(g, p); + w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; +} + + +static void generate_hop(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "+" : "-"; + + w(g, "~{~Mint c = cursor ~S0 "); + generate_AE(g, p->AE); + w(g, ";~N"); + + g->S[0] = p->mode == m_forward ? "0" : "limit_backward"; + + write_failure_if(g, "~S0 > c || c > limit", p); + writef(g, "~Mcursor = c;~N", p); + writef(g, "~}", p); +} + +static void generate_delete(struct generator * g, struct node * p) { + + write_comment(g, p); + writef(g, "~Mslice_del();~N", p); +} + + +static void generate_next(struct generator * g, struct node * p) { + + write_comment(g, p); + write_check_limit(g, p); + write_inc_cursor(g, p); +} + +static void generate_tolimit(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? 
"limit" : "limit_backward"; + writef(g, "~Mcursor = ~S0;~N", p); +} + +static void generate_atlimit(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; + g->S[1] = p->mode == m_forward ? "<" : ">"; + write_failure_if(g, "cursor ~S1 ~S0", p); +} + +static void generate_leftslice(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "bra" : "ket"; + writef(g, "~M~S0 = cursor;~N", p); +} + +static void generate_rightslice(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "ket" : "bra"; + writef(g, "~M~S0 = cursor;~N", p); +} + +static void generate_assignto(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~M~V0 = assign_to(~V0);~N", p); +} + +static void generate_sliceto(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + writef(g, "~M~V0 = slice_to(~V0);~N", p); +} + +static void generate_address(struct generator * g, struct node * p) { + + symbol * b = p->literalstring; + if (b != 0) { + write_literal_string(g, b); + } else { + write_varref(g, p->name); + } +} + +static void generate_insert(struct generator * g, struct node * p, int style) { + + int keep_c = style == c_attach; + write_comment(g, p); + if (p->mode == m_backward) keep_c = !keep_c; + if (keep_c) w(g, "~{~Mint c = cursor;~N"); + writef(g, "~Minsert(cursor, cursor, ", p); + generate_address(g, p); + writef(g, ");~N", p); + if (keep_c) w(g, "~Mcursor = c;~N~}"); +} + +static void generate_assignfrom(struct generator * g, struct node * p) { + + int keep_c = p->mode == m_forward; /* like 'attach' */ + + write_comment(g, p); + if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); + if (p->mode == m_forward) { + writef(g, "~Minsert(cursor, limit, ", p); + } else { + writef(g, "~Minsert(limit_backward, cursor, ", p); + } + generate_address(g, p); + 
writef(g, ");~N", p); + if (keep_c) w(g, "~Mcursor = c;~N~}"); +} + + +static void generate_slicefrom(struct generator * g, struct node * p) { + + write_comment(g, p); + w(g, "~Mslice_from("); + generate_address(g, p); + writef(g, ");~N", p); +} + +static void generate_setlimit(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + struct str * varname = vars_newname(g); + write_comment(g, p); + write_savecursor(g, p, savevar); + generate(g, p->left); + + if (!g->unreachable) { + g->B[0] = str_data(varname); + write_declare(g, "int ~B0", p); + if (p->mode == m_forward) { + w(g, "~M~B0 = limit - cursor;~N"); + w(g, "~Mlimit = cursor;~N"); + } else { + w(g, "~M~B0 = limit_backward;~N"); + w(g, "~Mlimit_backward = cursor;~N"); + } + write_restorecursor(g, p, savevar); + + if (p->mode == m_forward) { + str_assign(g->failure_str, "limit += "); + str_append(g->failure_str, varname); + str_append_ch(g->failure_str, ';'); + } else { + str_assign(g->failure_str, "limit_backward = "); + str_append(g->failure_str, varname); + str_append_ch(g->failure_str, ';'); + } + generate(g, p->aux); + + if (!g->unreachable) { + write_margin(g); + write_str(g, g->failure_str); + write_newline(g); + } + } + str_delete(varname); + str_delete(savevar); +} + +/* dollar sets snowball up to operate on a string variable as if it were the + * current string */ +static void generate_dollar(struct generator * g, struct node * p) { + + struct str * savevar = vars_newname(g); + write_comment(g, p); + g->V[0] = p->name; + + str_assign(g->failure_str, "copy_from("); + str_append(g->failure_str, savevar); + str_append_string(g->failure_str, ");"); + g->B[0] = str_data(savevar); + writef(g, "~{~M~n ~B0 = this;~N" + "~Mcurrent = new StringBuffer(~V0.toString());~N" + "~Mcursor = 0;~N" + "~Mlimit = (current.length());~N", p); + generate(g, p->left); + if (!g->unreachable) { + write_margin(g); + write_str(g, g->failure_str); + write_newline(g); + } + w(g, "~}"); + 
str_delete(savevar); +} + +static void generate_integer_assign(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); +} + +static void generate_integer_test(struct generator * g, struct node * p, char * s) { + + g->V[0] = p->name; + g->S[0] = s; + w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); w(g, "))~N"); + write_block_start(g); + write_failure(g); + write_block_end(g); + g->unreachable = false; +} + +static void generate_call(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + write_failure_if(g, "!~V0()", p); +} + +static void generate_grouping(struct generator * g, struct node * p, int complement) { + + struct grouping * q = p->name->grouping; + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->S[1] = complement ? "out" : "in"; + g->V[0] = p->name; + g->I[0] = q->smallest_ch; + g->I[1] = q->largest_ch; + if (q->no_gaps) + write_failure_if(g, "!(~S1_range~S0(~I0, ~I1))", p); + else + write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p); +} + +static void generate_namedstring(struct generator * g, struct node * p) { + + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->V[0] = p->name; + write_failure_if(g, "!(eq_v~S0(~V0))", p); +} + +static void generate_literalstring(struct generator * g, struct node * p) { + + symbol * b = p->literalstring; + write_comment(g, p); + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->I[0] = SIZE(b); + g->L[0] = b; + write_failure_if(g, "!(eq_s~S0(~I0, ~L0))", p); +} + +static void generate_define(struct generator * g, struct node * p) { + + struct name * q = p->name; + + struct str * saved_output = g->outbuf; + struct str * saved_declarations = g->declarations; + + g->S[0] = q->type == t_routine ? 
"private" : "public"; + g->V[0] = q; + w(g, "~+~+~N~M~S0 boolean ~V0() {~+~N"); + + g->outbuf = str_new(); + g->declarations = str_new(); + + g->next_label = 0; + g->var_number = 0; + + if (p->amongvar_needed) write_declare(g, "int among_var", p); + str_clear(g->failure_str); + g->failure_label = x_return; + g->unreachable = false; + generate(g, p->left); + if (!g->unreachable) w(g, "~Mreturn true;~N"); + w(g, "~}~-~-"); + + str_append(saved_output, g->declarations); + str_append(saved_output, g->outbuf); + str_delete(g->declarations); + str_delete(g->outbuf); + g->declarations = saved_declarations; + g->outbuf = saved_output; +} + +static void generate_substring(struct generator * g, struct node * p) { + + struct among * x = p->among; + + write_comment(g, p); + + g->S[0] = p->mode == m_forward ? "" : "_b"; + g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + if (x->command_count == 0 && x->starter == 0) { + write_failure_if(g, "find_among~S0(a_~I0, ~I1) == 0", p); + } else { + writef(g, "~Mamong_var = find_among~S0(a_~I0, ~I1);~N", p); + write_failure_if(g, "among_var == 0", p); + } +} + +static void generate_among(struct generator * g, struct node * p) { + + struct among * x = p->among; + int case_number = 1; + + if (x->substring == 0) generate_substring(g, p); + if (x->command_count == 0 && x->starter == 0) return; + + if (x->starter != 0) generate(g, x->starter); + + p = p->left; + if (p != 0 && p->type != c_literalstring) p = p->right; + w(g, "~Mswitch(among_var) {~N~+"); + w(g, "~Mcase 0:~N~+"); + write_failure(g); + g->unreachable = false; + w(g, "~-"); + + while (p != 0) { + if (p->type == c_bra && p->left != 0) { + g->I[0] = case_number++; + w(g, "~Mcase ~I0:~N~+"); + generate(g, p); + if (!g->unreachable) w(g, "~Mbreak;~N"); + w(g, "~-"); + g->unreachable = false; + } + p = p->right; + } + write_block_end(g); +} + +static void generate_booltest(struct generator * g, struct node * p) { + + write_comment(g, p); + g->V[0] = p->name; + 
write_failure_if(g, "!(~V0)", p); +} + +static void generate_false(struct generator * g, struct node * p) { + + write_comment(g, p); + write_failure(g); +} + +static void generate_debug(struct generator * g, struct node * p) { + + write_comment(g, p); + g->I[0] = g->debug_count++; + g->I[1] = p->line_number; + writef(g, "~Mdebug(~I0, ~I1);~N", p); +} + +static void generate(struct generator * g, struct node * p) { + + int a0; + struct str * a1; + + if (g->unreachable) return; + + a0 = g->failure_label; + a1 = str_copy(g->failure_str); + + switch (p->type) + { + case c_define: generate_define(g, p); break; + case c_bra: generate_bra(g, p); break; + case c_and: generate_and(g, p); break; + case c_or: generate_or(g, p); break; + case c_backwards: generate_backwards(g, p); break; + case c_not: generate_not(g, p); break; + case c_set: generate_set(g, p); break; + case c_unset: generate_unset(g, p); break; + case c_try: generate_try(g, p); break; + case c_fail: generate_fail(g, p); break; + case c_reverse: + case c_test: generate_test(g, p); break; + case c_do: generate_do(g, p); break; + case c_goto: generate_GO(g, p, 1); break; + case c_gopast: generate_GO(g, p, 0); break; + case c_repeat: generate_repeat(g, p, 0); break; + case c_loop: generate_loop(g, p); break; + case c_atleast: generate_atleast(g, p); break; + case c_setmark: generate_setmark(g, p); break; + case c_tomark: generate_tomark(g, p); break; + case c_atmark: generate_atmark(g, p); break; + case c_hop: generate_hop(g, p); break; + case c_delete: generate_delete(g, p); break; + case c_next: generate_next(g, p); break; + case c_tolimit: generate_tolimit(g, p); break; + case c_atlimit: generate_atlimit(g, p); break; + case c_leftslice: generate_leftslice(g, p); break; + case c_rightslice: generate_rightslice(g, p); break; + case c_assignto: generate_assignto(g, p); break; + case c_sliceto: generate_sliceto(g, p); break; + case c_assign: generate_assignfrom(g, p); break; + case c_insert: + case c_attach: 
generate_insert(g, p, p->type); break; + case c_slicefrom: generate_slicefrom(g, p); break; + case c_setlimit: generate_setlimit(g, p); break; + case c_dollar: generate_dollar(g, p); break; + case c_mathassign: generate_integer_assign(g, p, "="); break; + case c_plusassign: generate_integer_assign(g, p, "+="); break; + case c_minusassign: generate_integer_assign(g, p, "-="); break; + case c_multiplyassign:generate_integer_assign(g, p, "*="); break; + case c_divideassign: generate_integer_assign(g, p, "/="); break; + case c_eq: generate_integer_test(g, p, "=="); break; + case c_ne: generate_integer_test(g, p, "!="); break; + case c_gr: generate_integer_test(g, p, ">"); break; + case c_ge: generate_integer_test(g, p, ">="); break; + case c_ls: generate_integer_test(g, p, "<"); break; + case c_le: generate_integer_test(g, p, "<="); break; + case c_call: generate_call(g, p); break; + case c_grouping: generate_grouping(g, p, false); break; + case c_non: generate_grouping(g, p, true); break; + case c_name: generate_namedstring(g, p); break; + case c_literalstring: generate_literalstring(g, p); break; + case c_among: generate_among(g, p); break; + case c_substring: generate_substring(g, p); break; + case c_booltest: generate_booltest(g, p); break; + case c_false: generate_false(g, p); break; + case c_true: break; + case c_debug: generate_debug(g, p); break; + default: fprintf(stderr, "%d encountered\n", p->type); + exit(1); + } + + g->failure_label = a0; + str_delete(g->failure_str); + g->failure_str = a1; +} + +static void generate_start_comment(struct generator * g) { + + w(g, "// This file was generated automatically by the Snowball to Java compiler~N"); + w(g, "~N"); +} + +static void generate_class_begin(struct generator * g) { + + w(g, "package " ); + w(g, g->options->package); + w(g, ";~N~N" ); + + w(g, "import "); + w(g, g->options->among_class ); + w(g, ";~N" + "~N" + " /**~N" + " * This class was automatically generated by a Snowball to Java compiler ~N" + " * 
It implements the stemming algorithm defined by a snowball script.~N" + " */~N" + "~N" + "public class ~n extends "); + + w(g, g->options->parent_class_name); + w(g, " {~N" + "~N" + "private static final long serialVersionUID = 1L;~N" + "~N" + "~+~+~Mprivate final static ~n methodObject = new ~n ();~N" + "~N"); +} + +static void generate_class_end(struct generator * g) { + + w(g, "~N}"); + w(g, "~N~N"); +} + +static void generate_equals(struct generator * g) { + + w(g, "~N" + "~Mpublic boolean equals( Object o ) {~N" + "~+~Mreturn o instanceof "); + w(g, g->options->name); + w(g, ";~N~-~M}~N" + "~N" + "~Mpublic int hashCode() {~N" + "~+~Mreturn "); + w(g, g->options->name); + w(g, ".class.getName().hashCode();~N" + "~-~M}~N"); + w(g, "~N~N"); +} + +static void generate_among_table(struct generator * g, struct among * x) { + + struct amongvec * v = x->b; + + g->I[0] = x->number; + g->I[1] = x->literalstring_count; + + w(g, "~+~+~Mprivate final static Among a_~I0[] = {~N~+"); + { + int i; + for (i = 0; i < x->literalstring_count; i++) { + g->I[0] = i; + g->I[1] = v->i; + g->I[2] = v->result; + g->L[0] = v->b; + g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; + + w(g, "~Mnew Among ( ~L0, ~I1, ~I2, \""); + if (v->function != 0) { + write_varname(g, v->function); + } + w(g, "\", methodObject )~S0~N"); + v++; + } + } + w(g, "~-~M};~-~-~N~N"); +} + +static void generate_amongs(struct generator * g) { + + struct among * x = g->analyser->amongs; + while (x != 0) { + generate_among_table(g, x); + x = x->next; + } +} + +static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } + +static int bit_is_set(symbol * b, int i) { return b[i/8] & 1 << i%8; } + +static void generate_grouping_table(struct generator * g, struct grouping * q) { + + int range = q->largest_ch - q->smallest_ch + 1; + int size = (range + 7)/ 8; /* assume 8 bits per symbol */ + symbol * b = q->b; + symbol * map = create_b(size); + int i; + for (i = 0; i < size; i++) map[i] = 0; + + /* Using unicode would require revision here */ + + for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); + + q->no_gaps = true; + for (i = 0; i < range; i++) unless (bit_is_set(map, i)) q->no_gaps = false; + + unless (q->no_gaps) { + g->V[0] = q->name; + + w(g, "~+~+~Mprivate static final char ~V0[] = {"); + for (i = 0; i < size; i++) { + write_int(g, map[i]); + if (i < size - 1) w(g, ", "); + } + w(g, " };~N~-~-~N"); + } + lose_b(map); +} + +static void generate_groupings(struct generator * g) { + struct grouping * q = g->analyser->groupings; + until (q == 0) { + generate_grouping_table(g, q); + q = q->next; + } +} + +static void generate_members(struct generator * g) { + + struct name * q = g->analyser->names; + until (q == 0) { + g->V[0] = q; + switch (q->type) { + case t_string: + w(g, " private "); + w(g, g->options->string_class ); + w(g, " ~W0 = new "); + w(g, g->options->string_class); + w(g, "();~N"); + break; + case t_integer: + w(g, " private int ~W0;~N"); + break; + case t_boolean: + w(g, " private boolean ~W0;~N"); + break; + } + q = q->next; + } + w(g, "~N"); +} + +static void generate_copyfrom(struct generator * g) { + + struct name * q; + 
w(g, "~+~+~Mprivate void copy_from(~n other) {~+~N"); + for (q = g->analyser->names; q != 0; q = q->next) { + g->V[0] = q; + switch (q->type) { + case t_string: + case t_integer: + case t_boolean: + w(g, "~M~W0 = other.~W0;~N"); + break; + } + } + w(g, "~Msuper.copy_from(other);~N"); + w(g, "~-~M}~-~-~N"); +} + +static void generate_methods(struct generator * g) { + + struct node * p = g->analyser->program; + while (p != 0) { + generate(g, p); + g->unreachable = false; + p = p->right; + } +} + +extern void generate_program_java(struct generator * g) { + + g->outbuf = str_new(); + g->failure_str = str_new(); + + generate_start_comment(g); + generate_class_begin(g); + + generate_amongs(g); + generate_groupings(g); + + generate_members(g); + generate_copyfrom(g); + generate_methods(g); + generate_equals(g); + + generate_class_end(g); + + output_str(g->options->output_java, g->outbuf); + str_delete(g->failure_str); + str_delete(g->outbuf); +} + +extern struct generator * create_generator_java(struct analyser * a, struct options * o) { + + NEW(generator, g); + g->analyser = a; + g->options = o; + g->margin = 0; + g->debug_count = 0; + g->unreachable = false; + return g; +} + +extern void close_generator_java(struct generator * g) { + + FREE(g); +} + diff --git a/contrib/snowball/compiler/header.h b/contrib/snowball/compiler/header.h new file mode 100644 index 000000000..9baf1d917 --- /dev/null +++ b/contrib/snowball/compiler/header.h @@ -0,0 +1,324 @@ + +typedef unsigned char byte; +typedef unsigned short symbol; + +#define true 1 +#define false 0 +#define repeat while(true) +#define unless(C) if(!(C)) +#define until(C) while(!(C)) + +#define MALLOC check_malloc +#define FREE check_free + +#define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) +#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * n) + +#define STARTSIZE 10 +#define SIZE(p) ((int *)(p))[-1] +#define CAPACITY(p) ((int *)(p))[-2] + +extern 
symbol * create_b(int n); +extern void report_b(FILE * out, symbol * p); +extern void lose_b(symbol * p); +extern symbol * increase_capacity(symbol * p, int n); +extern symbol * move_to_b(symbol * p, int n, symbol * q); +extern symbol * add_to_b(symbol * p, int n, symbol * q); +extern symbol * copy_b(symbol * p); +extern char * b_to_s(symbol * p); +extern symbol * add_s_to_b(symbol * p, const char * s); + +struct str; /* defined in space.c */ + +extern struct str * str_new(void); +extern void str_delete(struct str * str); +extern void str_append(struct str * str, struct str * add); +extern void str_append_ch(struct str * str, char add); +extern void str_append_b(struct str * str, symbol * q); +extern void str_append_string(struct str * str, const char * s); +extern void str_append_int(struct str * str, int i); +extern void str_clear(struct str * str); +extern void str_assign(struct str * str, char * s); +extern struct str * str_copy(struct str * old); +extern symbol * str_data(struct str * str); +extern int str_len(struct str * str); +extern int get_utf8(const symbol * p, int * slot); +extern int put_utf8(int ch, symbol * p); + +struct m_pair { + + struct m_pair * next; + symbol * name; + symbol * value; + +}; + +/* struct input must be a prefix of struct tokeniser. */ +struct input { + + struct input * next; + symbol * p; + int c; + char * file; + int line_number; + +}; + +struct include { + + struct include * next; + symbol * b; + +}; + +/* struct input must be a prefix of struct tokeniser. 
*/ +struct tokeniser { + + struct input * next; + symbol * p; + int c; + char * file; + int line_number; + symbol * b; + symbol * b2; + int number; + int m_start; + int m_end; + struct m_pair * m_pairs; + int get_depth; + int error_count; + int token; + int previous_token; + byte token_held; + byte widechars; + byte utf8; + + int omission; + struct include * includes; + +}; + +extern symbol * get_input(symbol * p, char ** p_file); +extern struct tokeniser * create_tokeniser(symbol * b, char * file); +extern int read_token(struct tokeniser * t); +extern const char * name_of_token(int code); +extern void close_tokeniser(struct tokeniser * t); + +enum token_codes { + +#include "syswords2.h" + + c_mathassign, + c_name, + c_number, + c_literalstring, + c_neg, + c_call, + c_grouping, + c_booltest +}; + +extern int space_count; +extern void * check_malloc(int n); +extern void check_free(void * p); + +struct node; + +struct name { + + struct name * next; + symbol * b; + int type; /* t_string etc */ + int mode; /* )_ for routines, externals */ + struct node * definition; /* ) */ + int count; /* 0, 1, 2 for each type */ + struct grouping * grouping; /* for grouping names */ + byte referenced; + byte used; + +}; + +struct literalstring { + + struct literalstring * next; + symbol * b; + +}; + +struct amongvec { + + symbol * b; /* the string giving the case */ + int size; /* - and its size */ + struct node * p; /* the corresponding command */ + int i; /* the amongvec index of the longest substring of b */ + int result; /* the numeric result for the case */ + struct name * function; + +}; + +struct among { + + struct among * next; + struct amongvec * b; /* pointer to the amongvec */ + int number; /* amongs are numbered 0, 1, 2 ... */ + int literalstring_count; /* in this among */ + int command_count; /* in this among */ + struct node * starter; /* i.e. among( (starter) 'string' ... ) */ + struct node * substring; /* i.e. substring ... among ( ... 
) */ +}; + +struct grouping { + + struct grouping * next; + int number; /* groupings are numbered 0, 1, 2 ... */ + symbol * b; /* the characters of this group */ + int largest_ch; /* character with max code */ + int smallest_ch; /* character with min code */ + byte no_gaps; /* not used in generator.c after 11/5/05 */ + struct name * name; /* so g->name->grouping == g */ +}; + +struct node { + + struct node * next; + struct node * left; + struct node * aux; /* used in setlimit */ + struct among * among; /* used in among */ + struct node * right; + int type; + int mode; + struct node * AE; + struct name * name; + symbol * literalstring; + int number; + int line_number; + int amongvar_needed; /* used in routine definitions */ +}; + +enum name_types { + + t_size = 6, + + t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4, + t_grouping = 5 + +/* If this list is extended, adjust wvn in generator.c */ +}; + +/* In name_count[i] below, remember that + type is + ----+---- + 0 | string + 1 | boolean + 2 | integer + 3 | routine + 4 | external + 5 | grouping +*/ + +struct analyser { + + struct tokeniser * tokeniser; + struct node * nodes; + struct name * names; + struct literalstring * literalstrings; + int mode; + byte modifyable; /* false inside reverse(...) 
*/ + struct node * program; + struct node * program_end; + int name_count[t_size]; /* name_count[i] counts the number of names of type i */ + struct among * amongs; + struct among * amongs_end; + int among_count; + int amongvar_needed; /* used in reading routine definitions */ + struct grouping * groupings; + struct grouping * groupings_end; + struct node * substring; /* pending 'substring' in current routine definition */ + byte utf8; +}; + +enum analyser_modes { + + m_forward = 0, m_backward /*, m_integer */ + +}; + +extern void print_program(struct analyser * a); +extern struct analyser * create_analyser(struct tokeniser * t); +extern void close_analyser(struct analyser * a); + +extern void read_program(struct analyser * a); + +struct generator { + + struct analyser * analyser; + struct options * options; + int unreachable; /* 0 if code can be reached, 1 if current code + * is unreachable. */ + int var_number; /* Number of next variable to use. */ + struct str * outbuf; /* temporary str to store output */ + struct str * declarations; /* str storing variable declarations */ + int next_label; + int margin; + + const char * failure_string; /* String to output in case of a failure. */ +#ifndef DISABLE_JAVA + struct str * failure_str; /* This is used by the java generator instead of failure_string */ +#endif + + int label_used; /* Keep track of whether the failure label is used. 
*/ + int failure_label; + int debug_count; + + const char * S[10]; /* strings */ + symbol * B[10]; /* blocks */ + int I[10]; /* integers */ + struct name * V[5]; /* variables */ + symbol * L[5]; /* literals, used in formatted write */ + + int line_count; /* counts number of lines output */ + int line_labelled; /* in ANSI C, will need extra ';' if it is a block end */ + int literalstring_count; + int keep_count; /* used to number keep/restore pairs to avoid compiler warnings + about shadowed variables */ +}; + +struct options { + + /* for the command line: */ + + char * output_file; + char * name; + FILE * output_c; + FILE * output_h; +#ifndef DISABLE_JAVA + FILE * output_java; +#endif + byte syntax_tree; + byte widechars; + enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang; + char * externals_prefix; + char * variables_prefix; + char * runtime_path; + char * parent_class_name; + char * package; + char * string_class; + char * among_class; + struct include * includes; + struct include * includes_end; + byte utf8; +}; + +/* Generator for C code. */ +extern struct generator * create_generator_c(struct analyser * a, struct options * o); +extern void close_generator_c(struct generator * g); + +extern void generate_program_c(struct generator * g); + +#ifndef DISABLE_JAVA +/* Generator for Java code. 
*/ +extern struct generator * create_generator_java(struct analyser * a, struct options * o); +extern void close_generator_java(struct generator * g); + +extern void generate_program_java(struct generator * g); +#endif diff --git a/contrib/snowball/compiler/space.c b/contrib/snowball/compiler/space.c new file mode 100644 index 000000000..cd5fd863d --- /dev/null +++ b/contrib/snowball/compiler/space.c @@ -0,0 +1,263 @@ + +#include <stdio.h> /* for printf */ +#include <stdlib.h> /* malloc, free */ +#include <string.h> /* memmove */ + +#include "header.h" + +#define HEAD 2*sizeof(int) +#define EXTENDER 40 + + +/* This modules provides a simple mechanism for arbitrary length writable + strings, called 'blocks'. They are 'symbol *' items rather than 'char *' + items however. + + The calls are: + + symbol * b = create_b(n); + - create an empty block b with room for n symbols + b = increase_capacity(b, n); + - increase the capacity of block b by n symbols (b may change) + b2 = copy_b(b) + - copy block b into b2 + lose_b(b); + - lose block b + b = move_to_b(b, n, p); + - set the data in b to be the n symbols at address p + b = add_to_b(b, n, p); + - add the n symbols at address p to the end of the data in b + SIZE(b) + - is the number of symbols in b + For example: + + symbol * b = create_b(0); + { int i; + char p[10]; + for (i = 0; i < 100; i++) { + sprintf(p, " %d", i); + add_s_to_b(b, p); + } + } + + and b contains " 0 1 2 ... 99" spaced out as symbols. +*/ + +/* For a block b, SIZE(b) is the number of symbols so far written into it, + CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b). + In fact blocks have 1 extra character over the promised capacity so + they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of + overwriting. 
+*/ + +extern symbol * create_b(int n) { + symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol))); + CAPACITY(p) = n; + SIZE(p) = 0; + return p; +} + +extern void report_b(FILE * out, symbol * p) { + int i; + for (i = 0; i < SIZE(p); i++) fprintf(out, "%c", p[i]); +} + +extern void lose_b(symbol * p) { + if (p == 0) return; + FREE((char *) p - HEAD); +} + +extern symbol * increase_capacity(symbol * p, int n) { + symbol * q = create_b(CAPACITY(p) + n + EXTENDER); + memmove(q, p, CAPACITY(p) * sizeof(symbol)); + SIZE(q) = SIZE(p); + lose_b(p); return q; +} + +extern symbol * move_to_b(symbol * p, int n, symbol * q) { + int x = n - CAPACITY(p); + if (x > 0) p = increase_capacity(p, x); + memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p; +} + +extern symbol * add_to_b(symbol * p, int n, symbol * q) { + int x = SIZE(p) + n - CAPACITY(p); + if (x > 0) p = increase_capacity(p, x); + memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; +} + +extern symbol * copy_b(symbol * p) { + int n = SIZE(p); + symbol * q = create_b(n); + move_to_b(q, n, p); + return q; +} + +int space_count = 0; + +extern void * check_malloc(int n) { + space_count++; + return malloc(n); +} + +extern void check_free(void * p) { + space_count--; + free(p); +} + +/* To convert a block to a zero terminated string: */ + +extern char * b_to_s(symbol * p) { + int n = SIZE(p); + char * s = (char *)malloc(n + 1); + { + int i; + for (i = 0; i < n; i++) { + if (p[i] > 255) { + printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); + exit(1); + } + s[i] = (char)p[i]; + } + } + s[n] = 0; + return s; +} + +/* To add a zero terminated string to a block. If p = 0 the + block is created. 
*/ + +extern symbol * add_s_to_b(symbol * p, const char * s) { + int n = strlen(s); + int k; + if (p == 0) p = create_b(n); + k = SIZE(p); + { + int x = k + n - CAPACITY(p); + if (x > 0) p = increase_capacity(p, x); + } + { + int i; + for (i = 0; i < n; i++) p[i + k] = s[i]; + } + SIZE(p) += n; + return p; +} + +/* The next section defines string handling capabilities in terms + of the lower level block handling capabilities of space.c */ +/* -------------------------------------------------------------*/ + +struct str { + symbol * data; +}; + +/* Create a new string. */ +extern struct str * str_new() { + + struct str * output = (struct str *) malloc(sizeof(struct str)); + output->data = create_b(0); + return output; +} + +/* Delete a string. */ +extern void str_delete(struct str * str) { + + lose_b(str->data); + free(str); +} + +/* Append a str to this str. */ +extern void str_append(struct str * str, struct str * add) { + + symbol * q = add->data; + str->data = add_to_b(str->data, SIZE(q), q); +} + +/* Append a character to this str. */ +extern void str_append_ch(struct str * str, char add) { + + symbol q[1]; + q[0] = add; + str->data = add_to_b(str->data, 1, q); +} + +/* Append a low level block to a str. */ +extern void str_append_b(struct str * str, symbol * q) { + + str->data = add_to_b(str->data, SIZE(q), q); +} + +/* Append a (char *, null teminated) string to a str. */ +extern void str_append_string(struct str * str, const char * s) { + + str->data = add_s_to_b(str->data, s); +} + +/* Append an integer to a str. */ +extern void str_append_int(struct str * str, int i) { + + char s[30]; + sprintf(s, "%d", i); + str_append_string(str, s); +} + +/* Clear a string */ +extern void str_clear(struct str * str) { + + SIZE(str->data) = 0; +} + +/* Set a string */ +extern void str_assign(struct str * str, char * s) { + + str_clear(str); + str_append_string(str, s); +} + +/* Copy a string. 
*/ +extern struct str * str_copy(struct str * old) { + + struct str * newstr = str_new(); + str_append(newstr, old); + return newstr; +} + +/* Get the data stored in this str. */ +extern symbol * str_data(struct str * str) { + + return str->data; +} + +/* Get the length of the str. */ +extern int str_len(struct str * str) { + + return SIZE(str->data); +} + +extern int get_utf8(const symbol * p, int * slot) { + int b0, b1; + b0 = *p++; + if (b0 < 0xC0) { /* 1100 0000 */ + * slot = b0; return 1; + } + b1 = *p++; + if (b0 < 0xE0) { /* 1110 0000 */ + * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; + } + * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3; +} + +extern int put_utf8(int ch, symbol * p) { + if (ch < 0x80) { + p[0] = ch; return 1; + } + if (ch < 0x800) { + p[0] = (ch >> 6) | 0xC0; + p[1] = (ch & 0x3F) | 0x80; return 2; + } + p[0] = (ch >> 12) | 0xE0; + p[1] = ((ch >> 6) & 0x3F) | 0x80; + p[2] = (ch & 0x3F) | 0x80; return 3; +} + diff --git a/contrib/snowball/compiler/syswords.h b/contrib/snowball/compiler/syswords.h new file mode 100644 index 000000000..917dd5751 --- /dev/null +++ b/contrib/snowball/compiler/syswords.h @@ -0,0 +1,84 @@ +static const struct system_word vocab[80+1] = { + { 0, (const byte *)"", 80+1}, + + { 1, (const byte *)"$", c_dollar }, + { 1, (const byte *)"(", c_bra }, + { 1, (const byte *)")", c_ket }, + { 1, (const byte *)"*", c_multiply }, + { 1, (const byte *)"+", c_plus }, + { 1, (const byte *)"-", c_minus }, + { 1, (const byte *)"/", c_divide }, + { 1, (const byte *)"<", c_ls }, + { 1, (const byte *)"=", c_assign }, + { 1, (const byte *)">", c_gr }, + { 1, (const byte *)"?", c_debug }, + { 1, (const byte *)"[", c_leftslice }, + { 1, (const byte *)"]", c_rightslice }, + { 2, (const byte *)"!=", c_ne }, + { 2, (const byte *)"*=", c_multiplyassign }, + { 2, (const byte *)"+=", c_plusassign }, + { 2, (const byte *)"-=", c_minusassign }, + { 2, (const byte *)"->", c_sliceto }, + { 2, (const byte *)"/*", c_comment2 
}, + { 2, (const byte *)"//", c_comment1 }, + { 2, (const byte *)"/=", c_divideassign }, + { 2, (const byte *)"<+", c_insert }, + { 2, (const byte *)"<-", c_slicefrom }, + { 2, (const byte *)"<=", c_le }, + { 2, (const byte *)"==", c_eq }, + { 2, (const byte *)"=>", c_assignto }, + { 2, (const byte *)">=", c_ge }, + { 2, (const byte *)"as", c_as }, + { 2, (const byte *)"do", c_do }, + { 2, (const byte *)"or", c_or }, + { 3, (const byte *)"and", c_and }, + { 3, (const byte *)"for", c_for }, + { 3, (const byte *)"get", c_get }, + { 3, (const byte *)"hex", c_hex }, + { 3, (const byte *)"hop", c_hop }, + { 3, (const byte *)"non", c_non }, + { 3, (const byte *)"not", c_not }, + { 3, (const byte *)"set", c_set }, + { 3, (const byte *)"try", c_try }, + { 4, (const byte *)"fail", c_fail }, + { 4, (const byte *)"goto", c_goto }, + { 4, (const byte *)"loop", c_loop }, + { 4, (const byte *)"next", c_next }, + { 4, (const byte *)"size", c_size }, + { 4, (const byte *)"test", c_test }, + { 4, (const byte *)"true", c_true }, + { 5, (const byte *)"among", c_among }, + { 5, (const byte *)"false", c_false }, + { 5, (const byte *)"limit", c_limit }, + { 5, (const byte *)"unset", c_unset }, + { 6, (const byte *)"atmark", c_atmark }, + { 6, (const byte *)"attach", c_attach }, + { 6, (const byte *)"cursor", c_cursor }, + { 6, (const byte *)"define", c_define }, + { 6, (const byte *)"delete", c_delete }, + { 6, (const byte *)"gopast", c_gopast }, + { 6, (const byte *)"insert", c_insert }, + { 6, (const byte *)"maxint", c_maxint }, + { 6, (const byte *)"minint", c_minint }, + { 6, (const byte *)"repeat", c_repeat }, + { 6, (const byte *)"sizeof", c_sizeof }, + { 6, (const byte *)"tomark", c_tomark }, + { 7, (const byte *)"atleast", c_atleast }, + { 7, (const byte *)"atlimit", c_atlimit }, + { 7, (const byte *)"decimal", c_decimal }, + { 7, (const byte *)"reverse", c_reverse }, + { 7, (const byte *)"setmark", c_setmark }, + { 7, (const byte *)"strings", c_strings }, + { 7, (const byte 
*)"tolimit", c_tolimit }, + { 8, (const byte *)"booleans", c_booleans }, + { 8, (const byte *)"integers", c_integers }, + { 8, (const byte *)"routines", c_routines }, + { 8, (const byte *)"setlimit", c_setlimit }, + { 9, (const byte *)"backwards", c_backwards }, + { 9, (const byte *)"externals", c_externals }, + { 9, (const byte *)"groupings", c_groupings }, + { 9, (const byte *)"stringdef", c_stringdef }, + { 9, (const byte *)"substring", c_substring }, + { 12, (const byte *)"backwardmode", c_backwardmode }, + { 13, (const byte *)"stringescapes", c_stringescapes } +}; diff --git a/contrib/snowball/compiler/syswords2.h b/contrib/snowball/compiler/syswords2.h new file mode 100644 index 000000000..eb8a91241 --- /dev/null +++ b/contrib/snowball/compiler/syswords2.h @@ -0,0 +1,13 @@ + c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast, + c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards, + c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug, + c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, + c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get, + c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert, + c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls, + c_maxint, c_minint, c_minus, c_minusassign, c_multiply, + c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus, + c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, + c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, + c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring, + c_test, c_tolimit, c_tomark, c_true, c_try, c_unset, diff --git a/contrib/snowball/compiler/tokeniser.c b/contrib/snowball/compiler/tokeniser.c new file mode 100644 index 000000000..3dae5f744 --- /dev/null +++ b/contrib/snowball/compiler/tokeniser.c @@ -0,0 +1,470 @@ + +#include <stdio.h> /* stderr etc */ +#include <stdlib.h> /* malloc free */ +#include <string.h> /* strlen */ +#include <ctype.h> /* isalpha etc */ +#include "header.h" + +struct 
system_word { + int s_size; /* size of system word */ + const byte * s; /* pointer to the system word */ + int code; /* its internal code */ +}; + + +/* ASCII collating assumed in syswords.c */ + +#include "syswords.h" + +static int smaller(int a, int b) { return a < b ? a : b; } + +extern symbol * get_input(symbol * p, char ** p_file) { + + char * s = b_to_s(p); + { + FILE * input = fopen(s, "r"); + if (input == 0) { free(s); return 0; } + *p_file = s; + { + symbol * u = create_b(STARTSIZE); + int size = 0; + repeat + { int ch = getc(input); + if (ch == EOF) break; + if (size >= CAPACITY(u)) u = increase_capacity(u, size/2); + u[size++] = ch; + } + fclose(input); + SIZE(u) = size; return u; + } + } +} + +static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) { + if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); } + fprintf(stderr, "%s:%d: ", t->file, t->line_number); + unless (s1 == 0) fprintf(stderr, "%s", s1); + unless (p == 0) { + int i; + for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); + } + unless (s2 == 0) fprintf(stderr, "%s", s2); + fprintf(stderr, "\n"); + t->error_count++; +} + +static void error1(struct tokeniser * t, char * s) { + error(t, s, 0,0, 0); +} + +static void error2(struct tokeniser * t, char * s) { + error(t, "unexpected end of text after ", 0,0, s); +} + +static int compare_words(int m, symbol * p, int n, const byte * q) { + unless (m == n) return m - n; + { + int i; for (i = 0; i < n; i++) { + int diff = p[i] - q[i]; + unless (diff == 0) return diff; + } + } + return 0; +} + +static int find_word(int n, symbol * p) { + int i = 0; int j = vocab->code; + repeat { + int k = i + (j - i)/2; + const struct system_word * w = vocab + k; + int diff = compare_words(n, p, w->s_size, w->s); + if (diff == 0) return w->code; + if (diff < 0) j = k; else i = k; + if (j - i == 1) break; + } + return -1; +} + +static int get_number(int n, symbol * p) { + int x = 0; + int i; for (i = 0; i < n; i++) x = 10*x + 
p[i] - '0'; + return x; +} + +static int eq_s(struct tokeniser * t, char * s) { + int l = strlen(s); + if (SIZE(t->p) - t->c < l) return false; + { + int i; + for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false; + } + t->c += l; return true; +} + +static int white_space(struct tokeniser * t, int ch) { + switch (ch) { + case '\n': t->line_number++; + case '\r': + case '\t': + case ' ': return true; + } + return false; +} + +static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) { + struct m_pair * q = t->m_pairs; + repeat { + if (q == 0) return 0; + { + symbol * name = q->name; + if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; + } + q = q->next; + } +} + +static int read_literal_string(struct tokeniser * t, int c) { + symbol * p = t->p; + int ch; + SIZE(t->b) = 0; + repeat { + if (c >= SIZE(p)) { error2(t, "'"); return c; } + ch = p[c]; + if (ch == '\n') { error1(t, "string not terminated"); return c; } + c++; + if (ch == t->m_start) { + int c0 = c; + int newlines = false; /* no newlines as yet */ + int black_found = false; /* no printing chars as yet */ + repeat { + if (c >= SIZE(p)) { error2(t, "'"); return c; } + ch = p[c]; c++; + if (ch == t->m_end) break; + unless (white_space(t, ch)) black_found = true; + if (ch == '\n') newlines = true; + if (newlines && black_found) { + error1(t, "string not terminated"); + return c; + } + } + unless (newlines) { + int n = c - c0 - 1; /* macro size */ + int firstch = p[c0]; + symbol * q = find_in_m(t, n, p + c0); + if (q == 0) { + if (n == 1 && (firstch == '\'' || firstch == t->m_start)) + t->b = add_to_b(t->b, 1, p + c0); + else + error(t, "string macro '", n, p + c0, "' undeclared"); + } else + t->b = add_to_b(t->b, SIZE(q), q); + } + } else { + if (ch == '\'') return c; + t->b = add_to_b(t->b, 1, p + c - 1); + } + } +} + +static int next_token(struct tokeniser * t) { + symbol * p = t->p; + int c = t->c; + int ch; + int code = -1; + repeat { + if (c >= SIZE(p)) { 
t->c = c; return -1; } + ch = p[c]; + if (white_space(t, ch)) { c++; continue; } + if (isalpha(ch)) { + int c0 = c; + while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; + code = find_word(c - c0, p + c0); + if (code < 0) { + t->b = move_to_b(t->b, c - c0, p + c0); + code = c_name; + } + } else + if (isdigit(ch)) { + int c0 = c; + while (c < SIZE(p) && isdigit(p[c])) c++; + t->number = get_number(c - c0, p + c0); + code = c_number; + } else + if (ch == '\'') { + c = read_literal_string(t, c + 1); + code = c_literalstring; + } else + { + int lim = smaller(2, SIZE(p) - c); + int i; + for (i = lim; i > 0; i--) { + code = find_word(i, p + c); + if (code >= 0) { c += i; break; } + } + } + if (code >= 0) { + t->c = c; + return code; + } + error(t, "'", 1, p + c, "' unknown"); + c++; + continue; + } +} + +static int next_char(struct tokeniser * t) { + if (t->c >= SIZE(t->p)) return -1; + return t->p[t->c++]; +} + +static int next_real_char(struct tokeniser * t) { + repeat { + int ch = next_char(t); + if (white_space(t, ch)) continue; + return ch; + } +} + +static void read_chars(struct tokeniser * t) { + int ch = next_real_char(t); + if (ch < 0) { error2(t, "stringdef"); return; } + { + int c0 = t->c-1; + repeat { + ch = next_char(t); + if (white_space(t, ch) || ch < 0) break; + } + t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0); + } +} + +static int decimal_to_num(int ch) { + if ('0' <= ch && ch <= '9') return ch - '0'; + return -1; +} + +static int hex_to_num(int ch) { + if ('0' <= ch && ch <= '9') return ch - '0'; + if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; + return -1; +} + +static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { + int c = 0; int d = 0; + repeat { + while (c < SIZE(p) && p[c] == ' ') c++; + if (c == SIZE(p)) break; + { + int number = 0; + repeat { + int ch = p[c]; + if (c == SIZE(p) || ch == ' ') break; + if (base == 10) { + ch = decimal_to_num(ch); + if (ch < 0) { + error1(t, "decimal string contains 
non-digits"); + return; + } + } else { + ch = hex_to_num(tolower(ch)); + if (ch < 0) { + error1(t, "hex string contains non-hex characters"); + return; + } + } + number = base * number + ch; + c++; + } + if (t->widechars || t->utf8) { + unless (0 <= number && number <= 0xffff) { + error1(t, "character values exceed 64K"); + return; + } + } else { + unless (0 <= number && number <= 0xff) { + error1(t, "character values exceed 256"); + return; + } + } + if (t->utf8) + d += put_utf8(number, p + d); + else + p[d++] = number; + } + } + SIZE(p) = d; +} + +extern int read_token(struct tokeniser * t) { + symbol * p = t->p; + int held = t->token_held; + t->token_held = false; + if (held) return t->token; + repeat { + int code = next_token(t); + switch (code) { + case c_comment1: /* slash-slash comment */ + while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; + continue; + case c_comment2: /* slash-star comment */ + repeat { + if (t->c >= SIZE(p)) { + error1(t, "/* comment not terminated"); + t->token = -1; + return -1; + } + if (p[t->c] == '\n') t->line_number++; + if (eq_s(t, "*/")) break; + t->c++; + } + continue; + case c_stringescapes: + { + int ch1 = next_real_char(t); + int ch2 = next_real_char(t); + if (ch2 < 0) + { error2(t, "stringescapes"); continue; } + if (ch1 == '\'') + { error1(t, "first stringescape cannot be '"); continue; } + t->m_start = ch1; + t->m_end = ch2; + } + continue; + case c_stringdef: + { + int base = 0; + read_chars(t); + code = read_token(t); + if (code == c_hex) { base = 16; code = read_token(t); } else + if (code == c_decimal) { base = 10; code = read_token(t); } + unless (code == c_literalstring) + { error1(t, "string omitted after stringdef"); continue; } + if (base > 0) convert_numeric_string(t, t->b, base); + { NEW(m_pair, q); + q->next = t->m_pairs; + q->name = copy_b(t->b2); + q->value = copy_b(t->b); + t->m_pairs = q; + } + } + continue; + case c_get: + code = read_token(t); + unless (code == c_literalstring) { + error1(t, "string 
omitted after get"); continue; + } + t->get_depth++; + if (t->get_depth > 10) { + fprintf(stderr, "get directives go 10 deep. Looping?\n"); + exit(1); + } + { + char * file; + NEW(input, q); + symbol * u = get_input(t->b, &file); + if (u == 0) { + struct include * r = t->includes; + until (r == 0) { + symbol * b = copy_b(r->b); + b = add_to_b(b, SIZE(t->b), t->b); + u = get_input(b, &file); + lose_b(b); + unless (u == 0) break; + r = r->next; + } + } + if (u == 0) { + error(t, "Can't get '", SIZE(t->b), t->b, "'"); + exit(1); + } + memmove(q, t, sizeof(struct input)); + t->next = q; + t->p = u; + t->c = 0; + t->file = file; + t->line_number = 1; + } + p = t->p; + continue; + case -1: + unless (t->next == 0) { + lose_b(p); + { + struct input * q = t->next; + memmove(t, q, sizeof(struct input)); p = t->p; + FREE(q); + } + t->get_depth--; + continue; + } + /* drop through */ + default: + t->previous_token = t->token; + t->token = code; + return code; + } + } +} + +extern const char * name_of_token(int code) { + int i; + for (i = 1; i < vocab->code; i++) + if ((vocab + i)->code == code) return (const char *)(vocab + i)->s; + switch (code) { + case c_mathassign: return "="; + case c_name: return "name"; + case c_number: return "number"; + case c_literalstring:return "literal"; + case c_neg: return "neg"; + case c_grouping: return "grouping"; + case c_call: return "call"; + case c_booltest: return "Boolean test"; + case -2: return "start of text"; + case -1: return "end of text"; + default: return "?"; + } +} + +extern struct tokeniser * create_tokeniser(symbol * p, char * file) { + NEW(tokeniser, t); + t->next = 0; + t->p = p; + t->c = 0; + t->file = file; + t->line_number = 1; + t->b = create_b(0); + t->b2 = create_b(0); + t->m_start = -1; + t->m_pairs = 0; + t->get_depth = 0; + t->error_count = 0; + t->token_held = false; + t->token = -2; + t->previous_token = -2; + return t; +} + +extern void close_tokeniser(struct tokeniser * t) { + lose_b(t->b); + lose_b(t->b2); + 
{ + struct m_pair * q = t->m_pairs; + until (q == 0) { + struct m_pair * q_next = q->next; + lose_b(q->name); + lose_b(q->value); + FREE(q); + q = q_next; + } + } + { + struct input * q = t->next; + until (q == 0) { + struct input * q_next = q->next; + FREE(q); + q = q_next; + } + } + free(t->file); + FREE(t); +} diff --git a/contrib/snowball/doc/TODO b/contrib/snowball/doc/TODO new file mode 100644 index 000000000..0cfa1b1a8 --- /dev/null +++ b/contrib/snowball/doc/TODO @@ -0,0 +1,15 @@ +Things to do: + + - Write documentation for how to use libstemmer (as opposed to how stemming + algorithms themselves work). + Currently, the documentation in the include/libstemmer.h header file is + pretty clear and comprehensive, but an overview document wouldn't go amiss. + +Things that would be nice to include at some point. + + - Add version numbers to each stemming algorithm, and allow the interface to + request a specific version of the stemming algorithms. Default to providing + the latest version of the algorithm. + - Make mkmodules.pl generate the build system, instead of being called from it. + This would allow it to generate the list of modules to be built, so that it's + not necessary to change things in more than one place to add a new algorithm. diff --git a/contrib/snowball/doc/libstemmer_c_README b/contrib/snowball/doc/libstemmer_c_README new file mode 100644 index 000000000..9d3af8bec --- /dev/null +++ b/contrib/snowball/doc/libstemmer_c_README @@ -0,0 +1,125 @@ +libstemmer_c +============ + +This document pertains to the C version of the libstemmer distribution, +available for download from: + +http://snowball.tartarus.org/dist/libstemmer_c.tgz + + +Compiling the library +===================== + +A simple makefile is provided for Unix style systems. On such systems, it +should be possible simply to run "make", and the file "libstemmer.o" +and the example program "stemwords" will be generated. 
+ +If this doesn't work on your system, you need to write your own build +system (or call the compiler directly). The files to compile are +all contained in the "libstemmer", "runtime" and "src_c" directories, +and the public header file is contained in the "include" directory. + +The library comes in two flavours; UTF-8 only, and UTF-8 plus other character +sets. To use the utf-8 only flavour, compile "libstemmer_utf8.c" instead of +"libstemmer.c". + +For convenience "mkinc.mak" is a makefile fragment listing the source files and +header files used to compile the standard version of the library. +"mkinc_utf8.mak" is a comparable makefile fragment listing just the source +files for the UTF-8 only version of the library. + + +Using the library +================= + +The library provides a simple C API. Essentially, a new stemmer can +be obtained by using "sb_stemmer_new". "sb_stemmer_stem" is then +used to stem a word, "sb_stemmer_length" returns the stemmed +length of the last word processed, and "sb_stemmer_delete" is +used to delete a stemmer. + +Creating a stemmer is a relatively expensive operation - the expected +usage pattern is that a new stemmer is created when needed, used +to stem many words, and deleted after some time. + +Stemmers are re-entrant, but not threadsafe. In other words, if +you wish to access the same stemmer object from multiple threads, +you must ensure that all access is protected by a mutex or similar +device. + +libstemmer does not currently incorporate any mechanism for caching the results +of stemming operations. Such caching can greatly increase the performance of a +stemmer under certain situations, so suitable patches will be considered for +inclusion. + +The standard libstemmer sources contain an algorithm for each of the supported +languages. The algorithm may be selected using the english name of the +language, or using the 2 or 3 letter ISO 639 language codes. 
In addition, +the traditional "Porter" stemming algorithm for english is included for +backwards compatibility purposes, but we recommend use of the "English" +stemmer in preference for new projects. + +(Some minor algorithms which are included only as curiosities in the snowball +website, such as the Lovins stemmer and the Kraaij Pohlmann stemmer, are not +included in the standard libstemmer sources. These are not really supported by +the snowball project, but it would be possible to compile a modified libstemmer +library containing these if desired.) + + +The stemwords example +===================== + +The stemwords example program allows you to run any of the stemmers +compiled into the libstemmer library on a sample vocabulary. For +details on how to use it, run it with the "-h" command line option. + + +Using the library in a larger system +==================================== + +If you are incorporating the library into the build system of a larger +program, I recommend copying the unpacked tarball without modification into +a subdirectory of the sources of your program. Future versions of the +library are intended to keep the same structure, so this will keep the +work required to move to a new version of the library to a minimum. + +As an additional convenience, the list of source and header files used +in the library is detailed in mkinc.mak - a file which is in a suitable +format for inclusion by a Makefile. By including this file in your build +system, you can link the snowball system into your program with a few +extra rules. + +Using the library in a system using GNU autotools +================================================= + +The libstemmer_c library can be integrated into a larger system which uses the +GNU autotool framework (and in particular, automake and autoconf) as follows: + +1) Unpack libstemmer_c.tgz in the top level project directory so that there is + a libstemmer_c subdirectory of the top level directory of the project. 
+ +2) Add a file "Makefile.am" to the unpacked libstemmer_c folder, containing: + +noinst_LTLIBRARIES = libstemmer.la +include $(srcdir)/mkinc.mak +noinst_HEADERS = $(snowball_headers) +libstemmer_la_SOURCES = $(snowball_sources) + +(You may also need to add other lines to this, for example, if you are using +compiler options which are not compatible with compiling the libstemmer +library.) + +3) Add libstemmer_c to the AC_CONFIG_FILES declaration in the project's + configure.ac file. + +4) Add to the top level makefile the following lines (or modify existing + assignments to these variables appropriately): + +AUTOMAKE_OPTIONS = subdir-objects +AM_CPPFLAGS = -I$(top_srcdir)/libstemmer_c/include +SUBDIRS=libstemmer_c +<name>_LIBADD = libstemmer_c/libstemmer.la + +(Where <name> is the name of the library or executable which links against +libstemmer.) + diff --git a/contrib/snowball/doc/libstemmer_java_README b/contrib/snowball/doc/libstemmer_java_README new file mode 100644 index 000000000..38b1af693 --- /dev/null +++ b/contrib/snowball/doc/libstemmer_java_README @@ -0,0 +1,40 @@ +libstemmer_java +=============== + +This document pertains to the Java version of the libstemmer distribution, +available for download from: + +http://snowball.tartarus.org/dist/libstemmer_java.tgz + + +Compiling the library +===================== + +Simply run the java compiler on all the java source files under the java +directory. For example, this can be done under unix by changing directory into +the java directory, and running: + + javac org/tartarus/snowball/*.java org/tartarus/snowball/ext/*.java + +This will compile the library and also an example program "TestApp" which +provides a command line interface to the library. + + +Using the library +================= + +There is currently no formal documentation on the use of the Java version +of the library. Additionally, its interface is not guaranteed to be +stable. 
+ +The best documentation of the library is the source of the TestApp example +program. + + +The TestApp example +=================== + +The TestApp example program allows you to run any of the stemmers +compiled into the libstemmer library on a sample vocabulary. For +details on how to use it, run it with no command line parameters. + diff --git a/contrib/snowball/examples/stemwords.c b/contrib/snowball/examples/stemwords.c new file mode 100644 index 000000000..995bf40dc --- /dev/null +++ b/contrib/snowball/examples/stemwords.c @@ -0,0 +1,209 @@ +/* This is a simple program which uses libstemmer to provide a command + * line interface for stemming using any of the algorithms provided. + */ + +#include <stdio.h> +#include <stdlib.h> /* for malloc, free */ +#include <string.h> /* for memmove */ +#include <ctype.h> /* for isupper, tolower */ + +#include "libstemmer.h" + +const char * progname; +static int pretty = 1; + +static void +stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) +{ +#define INC 10 + int lim = INC; + sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); + + while(1) { + int ch = getc(f_in); + if (ch == EOF) { + free(b); return; + } + { + int i = 0; + int inlen = 0; + while(1) { + if (ch == '\n' || ch == EOF) break; + if (i == lim) { + sb_symbol * newb; + newb = (sb_symbol *) + realloc(b, (lim + INC) * sizeof(sb_symbol)); + if (newb == 0) goto error; + b = newb; + lim = lim + INC; + } + /* Update count of utf-8 characters. 
*/ + if (ch < 0x80 || ch > 0xBF) inlen += 1; + /* force lower case: */ + if (isupper(ch)) ch = tolower(ch); + + b[i] = ch; + i++; + ch = getc(f_in); + } + + { + const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); + if (stemmed == NULL) + { + fprintf(stderr, "Out of memory"); + exit(1); + } + else + { + if (pretty == 1) { + fwrite(b, i, 1, f_out); + fputs(" -> ", f_out); + } else if (pretty == 2) { + fwrite(b, i, 1, f_out); + if (sb_stemmer_length(stemmer) > 0) { + int j; + if (inlen < 30) { + for (j = 30 - inlen; j > 0; j--) + fputs(" ", f_out); + } else { + fputs("\n", f_out); + for (j = 30; j > 0; j--) + fputs(" ", f_out); + } + } + } + + fputs((const char *)stemmed, f_out); + putc('\n', f_out); + } + } + } + } +error: + if (b != 0) free(b); + return; +} + +/** Display the command line syntax, and then exit. + * @param n The value to exit with. + */ +static void +usage(int n) +{ + printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n" + "\n" + "The input file consists of a list of words to be stemmed, one per\n" + "line. Words should be in lower case, but (for English) A-Z letters\n" + "are mapped to their a-z equivalents anyway. If omitted, stdin is\n" + "used.\n" + "\n" + "If -c is given, the argument is the character encoding of the input\n" + "and output files. 
If it is omitted, the UTF-8 encoding is used.\n" + "\n" + "If -p is given the output file consists of each word of the input\n" + "file followed by \"->\" followed by its stemmed equivalent.\n" + "If -p2 is given the output file is a two column layout containing\n" + "the input words in the first column and the stemmed equivalents in\n" + "the second column.\n" + "Otherwise, the output file consists of the stemmed words, one per\n" + "line.\n" + "\n" + "-h displays this help\n", + progname); + exit(n); +} + +int +main(int argc, char * argv[]) +{ + char * in = 0; + char * out = 0; + FILE * f_in; + FILE * f_out; + struct sb_stemmer * stemmer; + + char * language = "english"; + char * charenc = NULL; + + char * s; + int i = 1; + pretty = 0; + + progname = argv[0]; + + while(i < argc) { + s = argv[i++]; + if (s[0] == '-') { + if (strcmp(s, "-o") == 0) { + if (i >= argc) { + fprintf(stderr, "%s requires an argument\n", s); + exit(1); + } + out = argv[i++]; + } else if (strcmp(s, "-i") == 0) { + if (i >= argc) { + fprintf(stderr, "%s requires an argument\n", s); + exit(1); + } + in = argv[i++]; + } else if (strcmp(s, "-l") == 0) { + if (i >= argc) { + fprintf(stderr, "%s requires an argument\n", s); + exit(1); + } + language = argv[i++]; + } else if (strcmp(s, "-c") == 0) { + if (i >= argc) { + fprintf(stderr, "%s requires an argument\n", s); + exit(1); + } + charenc = argv[i++]; + } else if (strcmp(s, "-p2") == 0) { + pretty = 2; + } else if (strcmp(s, "-p") == 0) { + pretty = 1; + } else if (strcmp(s, "-h") == 0) { + usage(0); + } else { + fprintf(stderr, "option %s unknown\n", s); + usage(1); + } + } else { + fprintf(stderr, "unexpected parameter %s\n", s); + usage(1); + } + } + + /* prepare the files */ + f_in = (in == 0) ? stdin : fopen(in, "r"); + if (f_in == 0) { + fprintf(stderr, "file %s not found\n", in); + exit(1); + } + f_out = (out == 0) ? 
stdout : fopen(out, "w"); + if (f_out == 0) { + fprintf(stderr, "file %s cannot be opened\n", out); + exit(1); + } + + /* do the stemming process: */ + stemmer = sb_stemmer_new(language, charenc); + if (stemmer == 0) { + if (charenc == NULL) { + fprintf(stderr, "language `%s' not available for stemming\n", language); + exit(1); + } else { + fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); + exit(1); + } + } + stem_file(stemmer, f_in, f_out); + sb_stemmer_delete(stemmer); + + if (in != 0) (void) fclose(f_in); + if (out != 0) (void) fclose(f_out); + + return 0; +} + diff --git a/contrib/snowball/include/libstemmer.h b/contrib/snowball/include/libstemmer.h new file mode 100644 index 000000000..9d86b8581 --- /dev/null +++ b/contrib/snowball/include/libstemmer.h @@ -0,0 +1,79 @@ + +/* Make header file work when included from C++ */ +#ifdef __cplusplus +extern "C" { +#endif + +struct sb_stemmer; +typedef unsigned char sb_symbol; + +/* FIXME - should be able to get a version number for each stemming + * algorithm (which will be incremented each time the output changes). */ + +/** Returns an array of the names of the available stemming algorithms. + * Note that these are the canonical names - aliases (ie, other names for + * the same algorithm) will not be included in the list. + * The list is terminated with a null pointer. + * + * The list must not be modified in any way. + */ +const char ** sb_stemmer_list(void); + +/** Create a new stemmer object, using the specified algorithm, for the + * specified character encoding. + * + * All algorithms will usually be available in UTF-8, but may also be + * available in other character encodings. + * + * @param algorithm The algorithm name. This is either the english + * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the + * language. Note that case is significant in this parameter - the + * value should be supplied in lower case. 
+ * + * @param charenc The character encoding. NULL may be passed as + * this value, in which case UTF-8 encoding will be assumed. Otherwise, + * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1), + * "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that + * case is significant in this parameter. + * + * @return NULL if the specified algorithm is not recognised, or the + * algorithm is not available for the requested encoding. Otherwise, + * returns a pointer to a newly created stemmer for the requested algorithm. + * The returned pointer must be deleted by calling sb_stemmer_delete(). + * + * @note NULL will also be returned if an out of memory error occurs. + */ +struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); + +/** Delete a stemmer object. + * + * This frees all resources allocated for the stemmer. After calling + * this function, the supplied stemmer may no longer be used in any way. + * + * It is safe to pass a null pointer to this function - this will have + * no effect. + */ +void sb_stemmer_delete(struct sb_stemmer * stemmer); + +/** Stem a word. + * + * The return value is owned by the stemmer - it must not be freed or + * modified, and it will become invalid when the stemmer is called again, + * or if the stemmer is freed. + * + * The length of the return value can be obtained using sb_stemmer_length(). + * + * If an out-of-memory error occurs, this will return NULL. + */ +const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, + const sb_symbol * word, int size); + +/** Get the length of the result of the last stemmed word. + * This should not be called before sb_stemmer_stem() has been called. 
+ */ +int sb_stemmer_length(struct sb_stemmer * stemmer); + +#ifdef __cplusplus +} +#endif + diff --git a/contrib/snowball/java/org/tartarus/snowball/Among.java b/contrib/snowball/java/org/tartarus/snowball/Among.java new file mode 100644 index 000000000..5ed37b503 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/Among.java @@ -0,0 +1,31 @@ +package org.tartarus.snowball; + +import java.lang.reflect.Method; + +public class Among { + public Among (String s, int substring_i, int result, + String methodname, SnowballProgram methodobject) { + this.s_size = s.length(); + this.s = s.toCharArray(); + this.substring_i = substring_i; + this.result = result; + this.methodobject = methodobject; + if (methodname.length() == 0) { + this.method = null; + } else { + try { + this.method = methodobject.getClass(). + getDeclaredMethod(methodname, new Class[0]); + } catch (NoSuchMethodException e) { + throw new RuntimeException(e); + } + } + } + + public final int s_size; /* search string */ + public final char[] s; /* search string */ + public final int substring_i; /* index to longest matching substring */ + public final int result; /* result of the lookup */ + public final Method method; /* method to use if substring matches */ + public final SnowballProgram methodobject; /* object to invoke method on */ +}; diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java new file mode 100644 index 000000000..52d6baa78 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java @@ -0,0 +1,432 @@ + +package org.tartarus.snowball; +import java.lang.reflect.InvocationTargetException; + +public class SnowballProgram { + protected SnowballProgram() + { + current = new StringBuffer(); + setCurrent(""); + } + + /** + * Set the current string. 
+ */ + public void setCurrent(String value) + { + current.replace(0, current.length(), value); + cursor = 0; + limit = current.length(); + limit_backward = 0; + bra = cursor; + ket = limit; + } + + /** + * Get the current string. + */ + public String getCurrent() + { + String result = current.toString(); + // Make a new StringBuffer. If we reuse the old one, and a user of + // the library keeps a reference to the buffer returned (for example, + // by converting it to a String in a way which doesn't force a copy), + // the buffer size will not decrease, and we will risk wasting a large + // amount of memory. + // Thanks to Wolfram Esser for spotting this problem. + current = new StringBuffer(); + return result; + } + + // current string + protected StringBuffer current; + + protected int cursor; + protected int limit; + protected int limit_backward; + protected int bra; + protected int ket; + + protected void copy_from(SnowballProgram other) + { + current = other.current; + cursor = other.cursor; + limit = other.limit; + limit_backward = other.limit_backward; + bra = other.bra; + ket = other.ket; + } + + protected boolean in_grouping(char [] s, int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (ch > max || ch < min) return false; + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; + cursor++; + return true; + } + + protected boolean in_grouping_b(char [] s, int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if (ch > max || ch < min) return false; + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; + cursor--; + return true; + } + + protected boolean out_grouping(char [] s, int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (ch > max || ch < min) { + cursor++; + return true; + } + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { + cursor ++; + return true; + 
} + return false; + } + + protected boolean out_grouping_b(char [] s, int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if (ch > max || ch < min) { + cursor--; + return true; + } + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { + cursor--; + return true; + } + return false; + } + + protected boolean in_range(int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (ch > max || ch < min) return false; + cursor++; + return true; + } + + protected boolean in_range_b(int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if (ch > max || ch < min) return false; + cursor--; + return true; + } + + protected boolean out_range(int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (!(ch > max || ch < min)) return false; + cursor++; + return true; + } + + protected boolean out_range_b(int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if(!(ch > max || ch < min)) return false; + cursor--; + return true; + } + + protected boolean eq_s(int s_size, String s) + { + if (limit - cursor < s_size) return false; + int i; + for (i = 0; i != s_size; i++) { + if (current.charAt(cursor + i) != s.charAt(i)) return false; + } + cursor += s_size; + return true; + } + + protected boolean eq_s_b(int s_size, String s) + { + if (cursor - limit_backward < s_size) return false; + int i; + for (i = 0; i != s_size; i++) { + if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false; + } + cursor -= s_size; + return true; + } + + protected boolean eq_v(CharSequence s) + { + return eq_s(s.length(), s.toString()); + } + + protected boolean eq_v_b(CharSequence s) + { return eq_s_b(s.length(), s.toString()); + } + + protected int find_among(Among v[], int v_size) + { + int i = 0; + int j = v_size; + + int c = cursor; + int l 
= limit; + + int common_i = 0; + int common_j = 0; + + boolean first_key_inspected = false; + + while(true) { + int k = i + ((j - i) >> 1); + int diff = 0; + int common = common_i < common_j ? common_i : common_j; // smaller + Among w = v[k]; + int i2; + for (i2 = common; i2 < w.s_size; i2++) { + if (c + common == l) { + diff = -1; + break; + } + diff = current.charAt(c + common) - w.s[i2]; + if (diff != 0) break; + common++; + } + if (diff < 0) { + j = k; + common_j = common; + } else { + i = k; + common_i = common; + } + if (j - i <= 1) { + if (i > 0) break; // v->s has been inspected + if (j == i) break; // only one item in v + + // - but now we need to go round once more to get + // v->s inspected. This looks messy, but is actually + // the optimal approach. + + if (first_key_inspected) break; + first_key_inspected = true; + } + } + while(true) { + Among w = v[i]; + if (common_i >= w.s_size) { + cursor = c + w.s_size; + if (w.method == null) return w.result; + boolean res; + try { + Object resobj = w.method.invoke(w.methodobject, + new Object[0]); + res = resobj.toString().equals("true"); + } catch (InvocationTargetException e) { + res = false; + // FIXME - debug message + } catch (IllegalAccessException e) { + res = false; + // FIXME - debug message + } + cursor = c + w.s_size; + if (res) return w.result; + } + i = w.substring_i; + if (i < 0) return 0; + } + } + + // find_among_b is for backwards processing. Same comments apply + protected int find_among_b(Among v[], int v_size) + { + int i = 0; + int j = v_size; + + int c = cursor; + int lb = limit_backward; + + int common_i = 0; + int common_j = 0; + + boolean first_key_inspected = false; + + while(true) { + int k = i + ((j - i) >> 1); + int diff = 0; + int common = common_i < common_j ? 
common_i : common_j; + Among w = v[k]; + int i2; + for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) { + if (c - common == lb) { + diff = -1; + break; + } + diff = current.charAt(c - 1 - common) - w.s[i2]; + if (diff != 0) break; + common++; + } + if (diff < 0) { + j = k; + common_j = common; + } else { + i = k; + common_i = common; + } + if (j - i <= 1) { + if (i > 0) break; + if (j == i) break; + if (first_key_inspected) break; + first_key_inspected = true; + } + } + while(true) { + Among w = v[i]; + if (common_i >= w.s_size) { + cursor = c - w.s_size; + if (w.method == null) return w.result; + + boolean res; + try { + Object resobj = w.method.invoke(w.methodobject, + new Object[0]); + res = resobj.toString().equals("true"); + } catch (InvocationTargetException e) { + res = false; + // FIXME - debug message + } catch (IllegalAccessException e) { + res = false; + // FIXME - debug message + } + cursor = c - w.s_size; + if (res) return w.result; + } + i = w.substring_i; + if (i < 0) return 0; + } + } + + /* to replace chars between c_bra and c_ket in current by the + * chars in s. + */ + protected int replace_s(int c_bra, int c_ket, String s) + { + int adjustment = s.length() - (c_ket - c_bra); + current.replace(c_bra, c_ket, s); + limit += adjustment; + if (cursor >= c_ket) cursor += adjustment; + else if (cursor > c_bra) cursor = c_bra; + return adjustment; + } + + protected void slice_check() + { + if (bra < 0 || + bra > ket || + ket > limit || + limit > current.length()) // this line could be removed + { + System.err.println("faulty slice operation"); + // FIXME: report error somehow. 
+ /* + fprintf(stderr, "faulty slice operation:\n"); + debug(z, -1, 0); + exit(1); + */ + } + } + + protected void slice_from(String s) + { + slice_check(); + replace_s(bra, ket, s); + } + + protected void slice_from(CharSequence s) + { + slice_from(s.toString()); + } + + protected void slice_del() + { + slice_from(""); + } + + protected void insert(int c_bra, int c_ket, String s) + { + int adjustment = replace_s(c_bra, c_ket, s); + if (c_bra <= bra) bra += adjustment; + if (c_bra <= ket) ket += adjustment; + } + + protected void insert(int c_bra, int c_ket, CharSequence s) + { + insert(c_bra, c_ket, s.toString()); + } + + /* Copy the slice into the supplied StringBuffer */ + protected StringBuffer slice_to(StringBuffer s) + { + slice_check(); + int len = ket - bra; + s.replace(0, s.length(), current.substring(bra, ket)); + return s; + } + + /* Copy the slice into the supplied StringBuilder */ + protected StringBuilder slice_to(StringBuilder s) + { + slice_check(); + int len = ket - bra; + s.replace(0, s.length(), current.substring(bra, ket)); + return s; + } + + protected StringBuffer assign_to(StringBuffer s) + { + s.replace(0, s.length(), current.substring(0, limit)); + return s; + } + + protected StringBuilder assign_to(StringBuilder s) + { + s.replace(0, s.length(), current.substring(0, limit)); + return s; + } + +/* +extern void debug(struct SN_env * z, int number, int line_count) +{ int i; + int limit = SIZE(z->p); + //if (number >= 0) printf("%3d (line %4d): '", number, line_count); + if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); + for (i = 0; i <= limit; i++) + { if (z->lb == i) printf("{"); + if (z->bra == i) printf("["); + if (z->c == i) printf("|"); + if (z->ket == i) printf("]"); + if (z->l == i) printf("}"); + if (i < limit) + { int ch = z->p[i]; + if (ch == 0) ch = '#'; + printf("%c", ch); + } + } + printf("'\n"); +} +*/ + +}; diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java 
b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java new file mode 100644 index 000000000..960bd55f6 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java @@ -0,0 +1,7 @@ + +package org.tartarus.snowball; +import java.lang.reflect.InvocationTargetException; + +public abstract class SnowballStemmer extends SnowballProgram { + public abstract boolean stem(); +}; diff --git a/contrib/snowball/java/org/tartarus/snowball/TestApp.java b/contrib/snowball/java/org/tartarus/snowball/TestApp.java new file mode 100644 index 000000000..38803f673 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/TestApp.java @@ -0,0 +1,77 @@ + +package org.tartarus.snowball; + +import java.lang.reflect.Method; +import java.io.Reader; +import java.io.Writer; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.OutputStream; +import java.io.FileOutputStream; + +public class TestApp { + private static void usage() + { + System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]"); + } + + public static void main(String [] args) throws Throwable { + if (args.length < 2) { + usage(); + return; + } + + Class stemClass = Class.forName("org.tartarus.snowball.ext." 
+ + args[0] + "Stemmer"); + SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance(); + + Reader reader; + reader = new InputStreamReader(new FileInputStream(args[1])); + reader = new BufferedReader(reader); + + StringBuffer input = new StringBuffer(); + + OutputStream outstream; + + if (args.length > 2) { + if (args.length >= 4 && args[2].equals("-o")) { + outstream = new FileOutputStream(args[3]); + } else { + usage(); + return; + } + } else { + outstream = System.out; + } + Writer output = new OutputStreamWriter(outstream); + output = new BufferedWriter(output); + + int repeat = 1; + if (args.length > 4) { + repeat = Integer.parseInt(args[4]); + } + + Object [] emptyArgs = new Object[0]; + int character; + while ((character = reader.read()) != -1) { + char ch = (char) character; + if (Character.isWhitespace((char) ch)) { + if (input.length() > 0) { + stemmer.setCurrent(input.toString()); + for (int i = repeat; i != 0; i--) { + stemmer.stem(); + } + output.write(stemmer.getCurrent()); + output.write('\n'); + input.delete(0, input.length()); + } + } else { + input.append(Character.toLowerCase(ch)); + } + } + output.flush(); + } +} diff --git a/contrib/snowball/runtime/api.c b/contrib/snowball/runtime/api.c new file mode 100644 index 000000000..40039ef4a --- /dev/null +++ b/contrib/snowball/runtime/api.c @@ -0,0 +1,66 @@ + +#include <stdlib.h> /* for calloc, free */ +#include "header.h" + +extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) +{ + struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); + if (z == NULL) return NULL; + z->p = create_s(); + if (z->p == NULL) goto error; + if (S_size) + { + int i; + z->S = (symbol * *) calloc(S_size, sizeof(symbol *)); + if (z->S == NULL) goto error; + + for (i = 0; i < S_size; i++) + { + z->S[i] = create_s(); + if (z->S[i] == NULL) goto error; + } + } + + if (I_size) + { + z->I = (int *) calloc(I_size, sizeof(int)); + if (z->I == NULL) goto error; + } + + if (B_size) + { + 
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char)); + if (z->B == NULL) goto error; + } + + return z; +error: + SN_close_env(z, S_size); + return NULL; +} + +extern void SN_close_env(struct SN_env * z, int S_size) +{ + if (z == NULL) return; + if (S_size) + { + int i; + for (i = 0; i < S_size; i++) + { + lose_s(z->S[i]); + } + free(z->S); + } + free(z->I); + free(z->B); + if (z->p) lose_s(z->p); + free(z); +} + +extern int SN_set_current(struct SN_env * z, int size, const symbol * s) +{ + int err = replace_s(z, 0, z->l, size, s, NULL); + z->c = 0; + return err; +} + diff --git a/contrib/snowball/runtime/api.h b/contrib/snowball/runtime/api.h new file mode 100644 index 000000000..8b997f0c2 --- /dev/null +++ b/contrib/snowball/runtime/api.h @@ -0,0 +1,26 @@ + +typedef unsigned char symbol; + +/* Or replace 'char' above with 'short' for 16 bit characters. + + More precisely, replace 'char' with whatever type guarantees the + character width you need. Note however that sizeof(symbol) should divide + HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise + there is an alignment problem. In the unlikely event of a problem here, + consult Martin Porter. 
+ +*/ + +struct SN_env { + symbol * p; + int c; int l; int lb; int bra; int ket; + symbol * * S; + int * I; + unsigned char * B; +}; + +extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); +extern void SN_close_env(struct SN_env * z, int S_size); + +extern int SN_set_current(struct SN_env * z, int size, const symbol * s); + diff --git a/contrib/snowball/runtime/header.h b/contrib/snowball/runtime/header.h new file mode 100644 index 000000000..4d3078f50 --- /dev/null +++ b/contrib/snowball/runtime/header.h @@ -0,0 +1,58 @@ + +#include <limits.h> + +#include "api.h" + +#define MAXINT INT_MAX +#define MININT INT_MIN + +#define HEAD 2*sizeof(int) + +#define SIZE(p) ((int *)(p))[-1] +#define SET_SIZE(p, n) ((int *)(p))[-1] = n +#define CAPACITY(p) ((int *)(p))[-2] + +struct among +{ int s_size; /* number of chars in string */ + const symbol * s; /* search string */ + int substring_i;/* index to longest matching substring */ + int result; /* result of the lookup */ + int (* function)(struct SN_env *); +}; + +extern symbol * create_s(void); +extern void lose_s(symbol * p); + +extern int skip_utf8(const symbol * p, int c, int lb, int l, int n); + +extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); +extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); +extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); +extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); + +extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); +extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); +extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); +extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); + 
+extern int eq_s(struct SN_env * z, int s_size, const symbol * s); +extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); +extern int eq_v(struct SN_env * z, const symbol * p); +extern int eq_v_b(struct SN_env * z, const symbol * p); + +extern int find_among(struct SN_env * z, const struct among * v, int v_size); +extern int find_among_b(struct SN_env * z, const struct among * v, int v_size); + +extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); +extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s); +extern int slice_from_v(struct SN_env * z, const symbol * p); +extern int slice_del(struct SN_env * z); + +extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); +extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); + +extern symbol * slice_to(struct SN_env * z, symbol * p); +extern symbol * assign_to(struct SN_env * z, symbol * p); + +extern void debug(struct SN_env * z, int number, int line_count); + diff --git a/contrib/snowball/runtime/utilities.c b/contrib/snowball/runtime/utilities.c new file mode 100644 index 000000000..1840f0280 --- /dev/null +++ b/contrib/snowball/runtime/utilities.c @@ -0,0 +1,478 @@ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "header.h" + +#define unless(C) if(!(C)) + +#define CREATE_SIZE 1 + +extern symbol * create_s(void) { + symbol * p; + void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol)); + if (mem == NULL) return NULL; + p = (symbol *) (HEAD + (char *) mem); + CAPACITY(p) = CREATE_SIZE; + SET_SIZE(p, CREATE_SIZE); + return p; +} + +extern void lose_s(symbol * p) { + if (p == NULL) return; + free((char *) p - HEAD); +} + +/* + new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c + if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new + position, or -1 on failure. 
+ + -- used to implement hop and next in the utf8 case. +*/ + +extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { + int b; + if (n >= 0) { + for (; n > 0; n--) { + if (c >= l) return -1; + b = p[c++]; + if (b >= 0xC0) { /* 1100 0000 */ + while (c < l) { + b = p[c]; + if (b >= 0xC0 || b < 0x80) break; + /* break unless b is 10------ */ + c++; + } + } + } + } else { + for (; n < 0; n++) { + if (c <= lb) return -1; + b = p[--c]; + if (b >= 0x80) { /* 1000 0000 */ + while (c > lb) { + b = p[c]; + if (b >= 0xC0) break; /* 1100 0000 */ + c--; + } + } + } + } + return c; +} + +/* Code for character groupings: utf8 cases */ + +static int get_utf8(const symbol * p, int c, int l, int * slot) { + int b0, b1; + if (c >= l) return 0; + b0 = p[c++]; + if (b0 < 0xC0 || c == l) { /* 1100 0000 */ + * slot = b0; return 1; + } + b1 = p[c++]; + if (b0 < 0xE0 || c == l) { /* 1110 0000 */ + * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; + } + * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3; +} + +static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { + int b0, b1; + if (c <= lb) return 0; + b0 = p[--c]; + if (b0 < 0x80 || c == lb) { /* 1000 0000 */ + * slot = b0; return 1; + } + b1 = p[--c]; + if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */ + * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2; + } + * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3; +} + +extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + int w = get_utf8(z->p, z->c, z->l, & ch); + unless (w) return -1; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c += w; + } while (repeat); + return 0; +} + +extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + int w = get_b_utf8(z->p, z->c, z->lb, & ch); + unless (w) return -1; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] 
& (0X1 << (ch & 0X7))) == 0) + return w; + z->c -= w; + } while (repeat); + return 0; +} + +extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + int w = get_utf8(z->p, z->c, z->l, & ch); + unless (w) return -1; + unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c += w; + } while (repeat); + return 0; +} + +extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + int w = get_b_utf8(z->p, z->c, z->lb, & ch); + unless (w) return -1; + unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return w; + z->c -= w; + } while (repeat); + return 0; +} + +/* Code for character groupings: non-utf8 cases */ + +extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + if (z->c >= z->l) return -1; + ch = z->p[z->c]; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c++; + } while (repeat); + return 0; +} + +extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + if (z->c <= z->lb) return -1; + ch = z->p[z->c - 1]; + if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c--; + } while (repeat); + return 0; +} + +extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + if (z->c >= z->l) return -1; + ch = z->p[z->c]; + unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + return 1; + z->c++; + } while (repeat); + return 0; +} + +extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { + do { + int ch; + if (z->c <= z->lb) return -1; + ch = z->p[z->c - 1]; + unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) + 
return 1; + z->c--; + } while (repeat); + return 0; +} + +extern int eq_s(struct SN_env * z, int s_size, const symbol * s) { + if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0; + z->c += s_size; return 1; +} + +extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) { + if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0; + z->c -= s_size; return 1; +} + +extern int eq_v(struct SN_env * z, const symbol * p) { + return eq_s(z, SIZE(p), p); +} + +extern int eq_v_b(struct SN_env * z, const symbol * p) { + return eq_s_b(z, SIZE(p), p); +} + +extern int find_among(struct SN_env * z, const struct among * v, int v_size) { + + int i = 0; + int j = v_size; + + int c = z->c; int l = z->l; + symbol * q = z->p + c; + + const struct among * w; + + int common_i = 0; + int common_j = 0; + + int first_key_inspected = 0; + + while(1) { + int k = i + ((j - i) >> 1); + int diff = 0; + int common = common_i < common_j ? common_i : common_j; /* smaller */ + w = v + k; + { + int i2; for (i2 = common; i2 < w->s_size; i2++) { + if (c + common == l) { diff = -1; break; } + diff = q[common] - w->s[i2]; + if (diff != 0) break; + common++; + } + } + if (diff < 0) { j = k; common_j = common; } + else { i = k; common_i = common; } + if (j - i <= 1) { + if (i > 0) break; /* v->s has been inspected */ + if (j == i) break; /* only one item in v */ + + /* - but now we need to go round once more to get + v->s inspected. This looks messy, but is actually + the optimal approach. */ + + if (first_key_inspected) break; + first_key_inspected = 1; + } + } + while(1) { + w = v + i; + if (common_i >= w->s_size) { + z->c = c + w->s_size; + if (w->function == 0) return w->result; + { + int res = w->function(z); + z->c = c + w->s_size; + if (res) return w->result; + } + } + i = w->substring_i; + if (i < 0) return 0; + } +} + +/* find_among_b is for backwards processing. 
Same comments apply */ + +extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { + + int i = 0; + int j = v_size; + + int c = z->c; int lb = z->lb; + symbol * q = z->p + c - 1; + + const struct among * w; + + int common_i = 0; + int common_j = 0; + + int first_key_inspected = 0; + + while(1) { + int k = i + ((j - i) >> 1); + int diff = 0; + int common = common_i < common_j ? common_i : common_j; + w = v + k; + { + int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) { + if (c - common == lb) { diff = -1; break; } + diff = q[- common] - w->s[i2]; + if (diff != 0) break; + common++; + } + } + if (diff < 0) { j = k; common_j = common; } + else { i = k; common_i = common; } + if (j - i <= 1) { + if (i > 0) break; + if (j == i) break; + if (first_key_inspected) break; + first_key_inspected = 1; + } + } + while(1) { + w = v + i; + if (common_i >= w->s_size) { + z->c = c - w->s_size; + if (w->function == 0) return w->result; + { + int res = w->function(z); + z->c = c - w->s_size; + if (res) return w->result; + } + } + i = w->substring_i; + if (i < 0) return 0; + } +} + + +/* Increase the size of the buffer pointed to by p to at least n symbols. + * If insufficient memory, returns NULL and frees the old buffer. + */ +static symbol * increase_size(symbol * p, int n) { + symbol * q; + int new_size = n + 20; + void * mem = realloc((char *) p - HEAD, + HEAD + (new_size + 1) * sizeof(symbol)); + if (mem == NULL) { + lose_s(p); + return NULL; + } + q = (symbol *) (HEAD + (char *)mem); + CAPACITY(q) = new_size; + return q; +} + +/* to replace symbols between c_bra and c_ket in z->p by the + s_size symbols at s. + Returns 0 on success, -1 on error. + Also, frees z->p (and sets it to NULL) on error. 
+*/ +extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr) +{ + int adjustment; + int len; + if (z->p == NULL) { + z->p = create_s(); + if (z->p == NULL) return -1; + } + adjustment = s_size - (c_ket - c_bra); + len = SIZE(z->p); + if (adjustment != 0) { + if (adjustment + len > CAPACITY(z->p)) { + z->p = increase_size(z->p, adjustment + len); + if (z->p == NULL) return -1; + } + memmove(z->p + c_ket + adjustment, + z->p + c_ket, + (len - c_ket) * sizeof(symbol)); + SET_SIZE(z->p, adjustment + len); + z->l += adjustment; + if (z->c >= c_ket) + z->c += adjustment; + else + if (z->c > c_bra) + z->c = c_bra; + } + unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); + if (adjptr != NULL) + *adjptr = adjustment; + return 0; +} + +static int slice_check(struct SN_env * z) { + + if (z->bra < 0 || + z->bra > z->ket || + z->ket > z->l || + z->p == NULL || + z->l > SIZE(z->p)) /* this line could be removed */ + { +#if 0 + fprintf(stderr, "faulty slice operation:\n"); + debug(z, -1, 0); +#endif + return -1; + } + return 0; +} + +extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) { + if (slice_check(z)) return -1; + return replace_s(z, z->bra, z->ket, s_size, s, NULL); +} + +extern int slice_from_v(struct SN_env * z, const symbol * p) { + return slice_from_s(z, SIZE(p), p); +} + +extern int slice_del(struct SN_env * z) { + return slice_from_s(z, 0, 0); +} + +extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) { + int adjustment; + if (replace_s(z, bra, ket, s_size, s, &adjustment)) + return -1; + if (bra <= z->bra) z->bra += adjustment; + if (bra <= z->ket) z->ket += adjustment; + return 0; +} + +extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { + int adjustment; + if (replace_s(z, bra, ket, SIZE(p), p, &adjustment)) + return -1; + if (bra <= z->bra) z->bra += adjustment; + if (bra <= z->ket) z->ket += adjustment; + return 
0; +} + +extern symbol * slice_to(struct SN_env * z, symbol * p) { + if (slice_check(z)) { + lose_s(p); + return NULL; + } + { + int len = z->ket - z->bra; + if (CAPACITY(p) < len) { + p = increase_size(p, len); + if (p == NULL) + return NULL; + } + memmove(p, z->p + z->bra, len * sizeof(symbol)); + SET_SIZE(p, len); + } + return p; +} + +extern symbol * assign_to(struct SN_env * z, symbol * p) { + int len = z->l; + if (CAPACITY(p) < len) { + p = increase_size(p, len); + if (p == NULL) + return NULL; + } + memmove(p, z->p, len * sizeof(symbol)); + SET_SIZE(p, len); + return p; +} + +#if 0 +extern void debug(struct SN_env * z, int number, int line_count) { + int i; + int limit = SIZE(z->p); + /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/ + if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); + for (i = 0; i <= limit; i++) { + if (z->lb == i) printf("{"); + if (z->bra == i) printf("["); + if (z->c == i) printf("|"); + if (z->ket == i) printf("]"); + if (z->l == i) printf("}"); + if (i < limit) + { int ch = z->p[i]; + if (ch == 0) ch = '#'; + printf("%c", ch); + } + } + printf("'\n"); +} +#endif |