Browse Source

[Rework] Update snowball stemmer to 2.0 and remove all crap aside of UTF8

tags/2.4
Vsevolod Stakhov 4 years ago
parent
commit
b87995255f
70 changed files with 8903 additions and 6045 deletions
  1. 14
    21
      contrib/snowball/CMakeLists.txt
  2. 0
    300
      contrib/snowball/GNUmakefile
  3. 407
    0
      contrib/snowball/NEWS
  4. 561
    0
      contrib/snowball/algorithms/arabic.sbl
  5. 149
    0
      contrib/snowball/algorithms/basque.sbl
  6. 202
    0
      contrib/snowball/algorithms/catalan.sbl
  7. 8
    6
      contrib/snowball/algorithms/danish.sbl
  8. 0
    91
      contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
  9. 12
    12
      contrib/snowball/algorithms/dutch.sbl
  10. 0
    164
      contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
  11. 0
    0
      contrib/snowball/algorithms/english.sbl
  12. 8
    7
      contrib/snowball/algorithms/finnish.sbl
  13. 24
    18
      contrib/snowball/algorithms/french.sbl
  14. 0
    239
      contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
  15. 5
    5
      contrib/snowball/algorithms/german.sbl
  16. 0
    139
      contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
  17. 5
    5
      contrib/snowball/algorithms/german2.sbl
  18. 706
    0
      contrib/snowball/algorithms/greek.sbl
  19. 323
    0
      contrib/snowball/algorithms/hindi.sbl
  20. 11
    11
      contrib/snowball/algorithms/hungarian.sbl
  21. 0
    241
      contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
  22. 192
    0
      contrib/snowball/algorithms/indonesian.sbl
  23. 151
    0
      contrib/snowball/algorithms/irish.sbl
  24. 12
    12
      contrib/snowball/algorithms/italian.sbl
  25. 0
    195
      contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
  26. 13
    18
      contrib/snowball/algorithms/kraaij_pohlmann.sbl
  27. 373
    0
      contrib/snowball/algorithms/lithuanian.sbl
  28. 0
    0
      contrib/snowball/algorithms/lovins.sbl
  29. 92
    0
      contrib/snowball/algorithms/nepali.sbl
  30. 4
    4
      contrib/snowball/algorithms/norwegian.sbl
  31. 0
    80
      contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
  32. 0
    0
      contrib/snowball/algorithms/porter.sbl
  33. 15
    15
      contrib/snowball/algorithms/portuguese.sbl
  34. 0
    218
      contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
  35. 5
    5
      contrib/snowball/algorithms/romanian.sbl
  36. 0
    236
      contrib/snowball/algorithms/romanian/stem_Unicode.sbl
  37. 40
    36
      contrib/snowball/algorithms/russian.sbl
  38. 0
    215
      contrib/snowball/algorithms/russian/stem_Unicode.sbl
  39. 2378
    0
      contrib/snowball/algorithms/serbian.sbl
  40. 9
    9
      contrib/snowball/algorithms/spanish.sbl
  41. 0
    230
      contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
  42. 4
    4
      contrib/snowball/algorithms/swedish.sbl
  43. 0
    72
      contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
  44. 405
    0
      contrib/snowball/algorithms/tamil.sbl
  45. 97
    104
      contrib/snowball/algorithms/turkish.sbl
  46. 98
    0
      contrib/snowball/charsets/ISO-8859-2.sbl
  47. 74
    0
      contrib/snowball/charsets/KOI8-R.sbl
  48. 130
    0
      contrib/snowball/charsets/cp850.sbl
  49. 644
    222
      contrib/snowball/compiler/analyser.c
  50. 391
    75
      contrib/snowball/compiler/driver.c
  51. 709
    449
      contrib/snowball/compiler/generator.c
  52. 0
    1452
      contrib/snowball/compiler/generator_java.c
  53. 151
    64
      contrib/snowball/compiler/header.h
  54. 48
    24
      contrib/snowball/compiler/space.c
  55. 4
    2
      contrib/snowball/compiler/syswords.h
  56. 2
    2
      contrib/snowball/compiler/syswords2.h
  57. 252
    155
      contrib/snowball/compiler/tokeniser.c
  58. 0
    209
      contrib/snowball/examples/stemwords.c
  59. 4
    5
      contrib/snowball/include/libstemmer.h
  60. 0
    31
      contrib/snowball/java/org/tartarus/snowball/Among.java
  61. 0
    432
      contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java
  62. 0
    7
      contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java
  63. 0
    77
      contrib/snowball/java/org/tartarus/snowball/TestApp.java
  64. 8
    7
      contrib/snowball/libstemmer/libstemmer_c.in
  65. 18
    7
      contrib/snowball/libstemmer/mkmodules.pl
  66. 26
    18
      contrib/snowball/libstemmer/modules.txt
  67. 1
    9
      contrib/snowball/runtime/api.c
  68. 8
    2
      contrib/snowball/runtime/api.h
  69. 2
    1
      contrib/snowball/runtime/header.h
  70. 108
    83
      contrib/snowball/runtime/utilities.c

+ 14
- 21
contrib/snowball/CMakeLists.txt View File

@@ -1,20 +1,14 @@
# End of configuration
SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian
italian norwegian porter portuguese romanian
russian spanish swedish turkish)
SET(KOI8_ALGORITHMS russian)
SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian
norwegian porter portuguese spanish swedish)
SET(ISO_8859_2_ALGORITHMS hungarian romanian)
SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins)
SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS})
SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian
indonesian italian lithuanian nepali norwegian porter portuguese romanian
russian serbian spanish swedish tamil turkish)
SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS})

SET(COMPILER_SOURCES compiler/space.c
compiler/tokeniser.c
compiler/analyser.c
compiler/generator.c
compiler/driver.c
compiler/generator_java.c)
compiler/driver.c)

SET(SNOWBALL_RUNTIME runtime/api.c
runtime/utilities.c)
@@ -24,9 +18,15 @@ SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c)
#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in

SET(STEMWORDS_SOURCES examples/stemwords.c)
SET(MODULES_H "modules.h")
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY)
ADD_DEFINITIONS("-DDISABLE_JS")
ADD_DEFINITIONS("-DDISABLE_GO")
ADD_DEFINITIONS("-DDISABLE_JAVA")
ADD_DEFINITIONS("-DDISABLE_PYTHON")
ADD_DEFINITIONS("-DDISABLE_CSHARP")
ADD_DEFINITIONS("-DDISABLE_PASCAL")
ADD_DEFINITIONS("-DDISABLE_RUST")

MACRO(gen_stem IN ENCODING)
FOREACH(_it ${IN})
@@ -34,7 +34,7 @@ MACRO(gen_stem IN ENCODING)
SET(_header "${_base}.h")
SET(_source "${_base}.c")
STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}")
SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl")
SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl")
IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input})
ADD_CUSTOM_COMMAND(OUTPUT ${_source}
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u
@@ -57,7 +57,7 @@ INCLUDE_DIRECTORIES("include")
ADD_EXECUTABLE(snowball ${COMPILER_SOURCES})

ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h
COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt libstemmer/mkinc.mak)
COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak)
ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h")

SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c")
@@ -65,13 +65,6 @@ ADD_CUSTOM_TARGET(stemmer_deps ALL)
ADD_DEPENDENCIES(stemmer_deps modules)

gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8")
gen_stem("${KOI8_ALGORITHMS}" "KOI8_R")
gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1")
gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2")


ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES})
ADD_DEPENDENCIES(stemmer stemmer_deps)

ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES})
TARGET_LINK_LIBRARIES(stemwords stemmer)

+ 0
- 300
contrib/snowball/GNUmakefile View File

@@ -1,300 +0,0 @@
# -*- makefile -*-

c_src_dir = src_c
java_src_main_dir = java/org/tartarus/snowball
java_src_dir = $(java_src_main_dir)/ext

libstemmer_algorithms = danish dutch english finnish french german hungarian \
italian \
norwegian porter portuguese romanian \
russian spanish swedish turkish

KOI8_R_algorithms = russian
ISO_8859_1_algorithms = danish dutch english finnish french german italian \
norwegian porter portuguese spanish swedish
ISO_8859_2_algorithms = hungarian romanian

other_algorithms = german2 kraaij_pohlmann lovins

all_algorithms = $(libstemmer_algorithms) $(other_algorithms)

COMPILER_SOURCES = compiler/space.c \
compiler/tokeniser.c \
compiler/analyser.c \
compiler/generator.c \
compiler/driver.c \
compiler/generator_java.c
COMPILER_HEADERS = compiler/header.h \
compiler/syswords.h \
compiler/syswords2.h

RUNTIME_SOURCES = runtime/api.c \
runtime/utilities.c
RUNTIME_HEADERS = runtime/api.h \
runtime/header.h

JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
java/org/tartarus/snowball/SnowballProgram.java \
java/org/tartarus/snowball/SnowballStemmer.java \
java/org/tartarus/snowball/TestApp.java

LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in

STEMWORDS_SOURCES = examples/stemwords.c

ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)
C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \
$(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c)
C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \
$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \
$(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h)
C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)
JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)

COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o)
LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o)
STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o)
C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o)
C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)

CFLAGS=-Iinclude -O2
CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations

all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS)

clean:
rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
$(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \
libstemmer.o stemwords \
libstemmer/modules.h \
libstemmer/modules_utf8.h \
snowball.splint \
$(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
$(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
$(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
rm -rf dist
rmdir $(c_src_dir) || true

snowball: $(COMPILER_OBJECTS)
$(CC) -o $@ $^

$(COMPILER_OBJECTS): $(COMPILER_HEADERS)

libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
sed 's/@MODULES_H@/modules.h/' $^ >$@

libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@

libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt
libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak

libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt
libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8

libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)

libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
$(AR) -cru $@ $^

stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
$(CC) -o $@ $^

algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
cp $^ $@

$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball
@mkdir -p $(c_src_dir)
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
o="$(c_src_dir)/stem_UTF_8_$${l}"; \
echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u

$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball
@mkdir -p $(c_src_dir)
@l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \
o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime

$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball
@mkdir -p $(c_src_dir)
@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \
o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime

$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball
@mkdir -p $(c_src_dir)
@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \
o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime

$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $<

$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
@mkdir -p $(java_src_dir)
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
o="$(java_src_dir)/$${l}Stemmer"; \
echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer

splint: snowball.splint
snowball.splint: $(COMPILER_SOURCES)
splint $^ >$@ -weak

# Make a full source distribution
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java

# Make a distribution of all the sources involved in snowball
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
$(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
$(LIBSTEMMER_SOURCES) \
$(LIBSTEMMER_UTF8_SOURCES) \
$(LIBSTEMMER_HEADERS) \
$(LIBSTEMMER_EXTRA) \
$(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \
GNUmakefile README doc/TODO libstemmer/mkmodules.pl
destname=snowball_code; \
dest=dist/$${destname}; \
rm -rf $${dest} && \
rm -f $${dest}.tgz && \
for file in $^; do \
dir=`dirname $$file` && \
mkdir -p $${dest}/$${dir} && \
cp -a $${file} $${dest}/$${dir} || exit 1 ; \
done && \
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
rm -rf $${dest}

# Make a distribution of all the sources required to compile the C library.
dist_libstemmer_c: \
$(RUNTIME_SOURCES) \
$(RUNTIME_HEADERS) \
$(LIBSTEMMER_SOURCES) \
$(LIBSTEMMER_UTF8_SOURCES) \
$(LIBSTEMMER_HEADERS) \
$(LIBSTEMMER_EXTRA) \
$(C_LIB_SOURCES) \
$(C_LIB_HEADERS) \
libstemmer/mkinc.mak \
libstemmer/mkinc_utf8.mak
destname=libstemmer_c; \
dest=dist/$${destname}; \
rm -rf $${dest} && \
rm -f $${dest}.tgz && \
mkdir -p $${dest} && \
cp -a doc/libstemmer_c_README $${dest}/README && \
mkdir -p $${dest}/examples && \
cp -a examples/stemwords.c $${dest}/examples && \
mkdir -p $${dest}/$(c_src_dir) && \
cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
mkdir -p $${dest}/runtime && \
cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
mkdir -p $${dest}/libstemmer && \
cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
mkdir -p $${dest}/include && \
mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
(cd $${dest} && \
echo "README" >> MANIFEST && \
ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
ls runtime/*.c runtime/*.h >> MANIFEST && \
ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
ls include/*.h >> MANIFEST) && \
cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
echo 'include mkinc.mak' >> $${dest}/Makefile && \
echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \
echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \
echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \
echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \
echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \
echo 'clean:' >> $${dest}/Makefile && \
echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
rm -rf $${dest}

# Make a distribution of all the sources required to compile the Java library.
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
$(LIBSTEMMER_EXTRA) \
$(JAVA_SOURCES)
destname=libstemmer_java; \
dest=dist/$${destname}; \
rm -rf $${dest} && \
rm -f $${dest}.tgz && \
mkdir -p $${dest} && \
cp -a doc/libstemmer_java_README $${dest}/README && \
mkdir -p $${dest}/$(java_src_dir) && \
cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
mkdir -p $${dest}/$(java_src_main_dir) && \
cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
(cd $${dest} && \
echo "README" >> MANIFEST && \
ls $(java_src_dir)/*.java >> MANIFEST && \
ls $(java_src_main_dir)/*.java >> MANIFEST) && \
(cd dist && tar zcf $${destname}.tgz $${destname}) && \
rm -rf $${dest}

check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r

check_utf8: $(libstemmer_algorithms:%=check_utf8_%)

check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)

check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)

check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)

# Where the data files are located - assumed their repo is checked out as
# a sibling to this one.
STEMMING_DATA = ../snowball-data

check_utf8_%: $(STEMMING_DATA)/% stemwords
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
@./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt
@diff -u $</output.txt tmp.txt
@if [ -e $</diffs.txt ] ; \
then \
./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \
diff -u $</diffs.txt tmp.txt; \
fi
@rm tmp.txt

check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1"
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \
./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \
diff -u - tmp.txt
@rm tmp.txt

check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2"
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \
./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \
diff -u - tmp.txt
@rm tmp.txt

check_koi8r_%: $(STEMMING_DATA)/% stemwords
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R"
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \
./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \
diff -u - tmp.txt
@rm tmp.txt

+ 407
- 0
contrib/snowball/NEWS View File

@@ -0,0 +1,407 @@
Snowball 2.0.0 (2019-10-02)
===========================

C/C++
-----

* Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled
sequences of any length, but commands which look at the character value only
handled sequences up to length 3. Fixes #89.

* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`.

Java
----

* TestApp.java:

- Always use UTF-8 for I/O. Patch from David Corbett (#80).

- Allow reading input from stdin.

- Remove rather pointless "stem n times" feature.

- Only lower case ASCII to match stemwords.c.

- Stem empty lines too to match stemwords.c.

Code Quality Improvements
-------------------------

* Fix various warnings from newer compilers.

* Improve use of `const`.

* Share common functions between compiler backends rather than having multiple
copies of the same code.

* Assorted code clean-up.

* Initialise line_labelled member of struct generator to 0. Previously we were
invoking undefined behaviour, though in practice it'll be zero initialised on
most platforms.

New Code Generators
-------------------

* Add Python generator (#24). Originally written by Yoshiki Shibukawa, with
additional updates by Dmitry Shachnev.

* Add Javascript generator. Based on JSX generator (#26) written by Yoshiki
Shibukawa.

* Add Rust generator from Jakob Demler (#51).

* Add Go generator from Marty Schoch (#57).

* Add C# generator. Based on patch from Cesar Souza (#16, #17).

* Add Pascal generator. Based on Delphi backend from stemming.zip file on old
website (#75).

New Language Features
---------------------

* Add `len` and `lenof` to measure Unicode length. These are similar to `size`
and `sizeof` (respectively), but `size` and `sizeof` return the length in
bytes under `-utf8`, whereas these new commands give the same result whether
using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in
the length of the string). For compatibility with existing code which might
use these as variable or function names, they stop being treated as tokens if
declared to be a variable or function.

* New `{U+1234}` stringdef notation for Unicode codepoints.

* More versatile integer tests. Now you can compare any two arithmetic
expressions with a relational operator in parentheses after the `$`, so for
example `$(len > 3)` can now be used when previously a temporary variable was
required: `$tmp = len $tmp > 3`

Code generation improvements
----------------------------

* General:

+ Avoid unnecessarily saving and restoring of the cursor for more commands -
`atlimit`, `do`, `set` and `unset` all leave the cursor alone or always
restore its value, and for C `booltest` (which other languages already
handled).

+ Special case handling for `setlimit tomark AE`. All uses of setlimit in
the current stemmers we ship follow this pattern, and by special-casing we
can avoid having to save and restore the cursor (#74).

+ Merge duplicate actions in the same `among`. This reduces the size of the
switch/if-chain in the generated code which dispatch the among for many of
the stemmers.

+ Generate simpler code for `among`. We always check for a zero return value
when we call the among, so there's no point also checking for that in the
switch/if-chain. We can also avoid the switch/if-chain entirely when
there's only one possible outcome (besides the zero return).

+ Optimise code generated for `do <function call>`. This speeds up "make
check_python" by about 2%, and should speed up other interpreted languages
too (#110).

+ Generate more and better comments referencing snowball source.

+ Add homepage URL and compiler version as comments in generated files.

* C/C++:

+ Fix `size` and `sizeof` to not report one too high (reported by Assem
Chelli in #32).

+ If signal `f` from a function call would lead to return from the current
function then handle this and bailing out on an error together with a
simple `if (ret <= 0) return ret;`

+ Inline testing for a single character literals.

+ Avoiding generating `|| 0` in corner case - this can result in a compiler
warning when building the generated code.

+ Implement `insert_v()` in terms of `insert_s()`.

+ Add conditional `extern "C"` so `runtime/api.h` can be included from C++
code. Closes #90, reported by vvarma.

* Java:

+ Fix functions in `among` to work in Java. We seem to need to make the
methods called from among `public` instead of `private`, and to call them
on `this` instead of the `methodObject` (which is cleaner anyway). No
revision in version control seems to generate working code for this case,
but Richard says it definitely used to work - possibly older JVMs failed to
correctly enforce the access controls when methods were invoked by
reflection.

+ Code after handling `f` by returning from the current function is
unreachable too.

+ Previously we incorrectly decided that code after an `or` was
unreachable in certain cases. None of the current stemmers in the
distribution triggered this, but Martin Porter's snowball version
of the Schinke Latin stemmer does. Fixes #58, reported by Alexander
Myltsev.

+ The reachability logic was failing to consider reachability from
the final command in an `or`. Fixes #82, reported by David Corbett.

+ Fix `maxint` and `minint`. Patch from David Corbett in #31.

+ Fix `$` on strings. The previous generated code was just wrong. This
doesn't affect any of the included algorithms, but for example breaks
Martin Porter's snowball implementation of Schinke's Latin Stemmer.
Issue noted by Jakob Demler while working on the Rust backend in #51,
and reported in the Schinke's Latin Stemmer by Alexander Myltsev
in #58.

+ Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43.

+ Eliminate range-check implementation for groupings. This was removed from
the C generator 10 years earlier, isn't used for any of the existing
algorithms, and it doesn't seem likely it would be - the grouping would
have to consist entirely of a contiguous block of Unicode code-points.

+ Simplify code generated for `repeat` and `atleast`.

+ Eliminate unused return values and variables from runtime functions.

+ Only import the `among` and `SnowballProgram` classes if they're actually
used.

+ Only generate `copy_from()` method if it's used.

+ Merge runtime functions `eq_s` and `eq_v` functions.

+ Java arrays know their own length so stop storing it separately.

+ Escape char 127 (DEL) in generated Java code. It's unlikely that this
character would actually be used in a real stemmer, so this was more of a
theoretical bug.

+ Drop unused import of InvocationTargetException from SnowballStemmer.
Reported by GerritDeMeulder in #72.

+ Fix lint check issues in generated Java code. The stemmer classes are only
referenced in the example app via reflection, so add
@SuppressWarnings("unused") for them. The stemmer classes override
equals() and hashCode() methods from the standard java Object class, so
mark these with @Override. Both suggested by GerritDeMeulder in #72.

+ Declare Java variables at point of use in generated code. Putting all
declarations at the top of the function was adding unnecessary complexity
to the Java generator code for no benefit.

+ Improve formatting of generated code.

New stemming algorithms
-----------------------

* Add Tamil stemmer from Damodharan Rajalingam (#2, #3).

* Add Arabic stemmer from Assem Chelli (#32, #50).

* Add Irish stemmer from Jim O'Regan (#48).

* Add Nepali stemmer from Arthur Zakirov (#70).

* Add Indonesian stemmer from Olly Betts (#71).

* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review.

* Add Lithuanian stemmer from Dainius Jocas (#22, #76).

* Add Greek stemmer from Oleg Smirnov (#44).

* Add Catalan and Basque stemmers from Israel Olalla (#104).

Behavioural changes to existing algorithms
------------------------------------------

* Portuguese:

+ Replace incorrect Spanish suffixes by Portuguese suffixes (#1).

* French:

+ The MSDOS CP850 version of the French algorithm was missing changes present
in the ISO8859-1 and Unicode versions. There's now a single version of
each algorithm which was based on the Unicode version.

+ Recognize French suffixes even when they begin with diaereses. Patch from
David Corbett in #78.

* Russian:

+ We now normalise 'ё' to 'е' before stemming. The documentation has long
said "we assume ['ё'] is mapped into ['е']" but it's more convenient for
the stemmer to actually perform this normalisation. This change has no
effect if the caller is already normalising as we recommend. It's a change
in behaviour if they aren't, but 'ё' occurs rarely (there are currently no
instances in our test vocabulary) and this improves behaviour when it does
occur. Patch from Eugene Mirotin (#65, #68).

* Finnish:

+ Adjust the Finnish algorithm not to mangle numbers. This change also
means it tends to leave foreign words alone. Fixes #66.

* Danish:

+ Adjust Danish algorithm not to mangle alphanumeric codes. In particular
alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000,
space1999) are no longer mangled. See #81.

Optimisations to existing algorithms
------------------------------------

* Turkish:

+ Simplify uses of `test` in stemmer code.

+ Check for 'ad' or 'soyad' more efficiently, and without needing the
strlen variable. This speeds up "make check_utf8_turkish" by 11%
on x86 Linux.

* Kraaij-Pohlmann:

+ Eliminate variable x: `$p1 <= cursor` is simpler and a little more efficient
than `setmark x $x >= p1`.

Code clarity improvements to existing algorithms
------------------------------------------------

* Turkish:

+ Use , for cedilla to match the conventions used in other stemmers.

* Kraaij-Pohlmann:

+ Avoid cryptic `[among ( (])` ... `)` construct - instead use the same
`[substring] among (` ... `)` construct we do in other stemmers.

Compiler
--------

* Support conventional --help and --version options.

* Warn if -r or -ep used with backend other than C/C++.

* Warn if encoding command line options are specified when generating code in a
language with a fixed encoding.

* The default classname is now set based on the output filename, so `-n` is now
often no longer needed. Fixes #64.

* Avoid potential one byte buffer over-read when parsing snowball code.

* Avoid comparing with uninitialised array element during compilation.

* Improve `-syntax` output for `setlimit L for C`.

* Optimise away double negation so generators don't have to worry about
generating `--` (decrement operator in many languages). Fixes #52, reported
by David Corbett.

* Improved compiler error and warning messages:

- We now report FILE:LINE: before each diagnostic message.

- Improve warnings for unused declarations/definitions.

- Warn for variables which are used, but either never initialised
or never read.

- Flag non-ASCII literal strings. This is an error for wide Unicode, but
only a warning for single-byte and UTF-8 which work so long as the source
encoding matches the encoding used in the generated stemmer code.

- Improve error recovery after an undeclared `define`. We now sniff the
token after the identifier and if it is `as` we parse as a routine,
otherwise we parse as a grouping. Previously we always just assumed it was
a routine, which gave a confusing second error if it was a grouping.

- Improve error recovery after an unexpected token in `among`. Previously
we acted as if the unexpected token closed the `among` (this probably
wasn't intended but just a missing `break;` in a switch statement). Now we
issue an error and try the next token.

* Report error instead of silently truncating character values (e.g. `hex 123`
previously silently became byte 0x23 which is `#` rather than a
g-with-cedilla).

* Enlarge the initial input buffer size to 8192 bytes and double each time we
hit the end. Snowball programs are typically a few KB in size (with the
current largest we ship being the Greek stemmer at 27KB) so the previous
approach of starting with a 10 byte input buffer and increasing its size by
50% plus 40 bytes each time it filled was inefficient, needing up to 15
reallocations to load greek.sbl.

* Identify variables only used by one `routine`/`external`. This information
isn't yet used, but such variables which are also always written to before
being read can be emitted as local variables in most target languages.

* We now allow multiple source files on command line, and allow them to be
after (or even interspersed) with options to better match modern Unix
conventions. Support for multiple source files allows specifying a single
byte character set mapping via a source file of `stringdef`.

* Avoid infinite recursion in compiler when optimising a recursive snowball
function. Recursive functions aren't typical in snowball programs, but
the compiler shouldn't crash for any input, especially not a valid one.
We now simply limit on how deep the compiler will recurse and make the
pessimistic assumption in the unlikely event we hit this limit.

Build system:

* `make clean` in C libstemmer_c distribution now removes `examples/*.o`.
(#59)

* Fix all the places which previously had to have a list of stemmers to work
dynamically or be generated, so now only modules.txt needs updating to add
a new stemmer.

* Add check_java make target which runs tests for java.

* Support gzipped test data (the uncompressed arabic test data is too big for
github).

* GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball
invocations for Java - these are only meaningful when generating C code.

* Pass CFLAGS when linking which matches convention (e.g. automake does it) and
facilitates use of tools such as ASan. Fixes #84, reported by Thomas
Pointhuber.

* Add CI builds with -std=c90 to check compiler and generated code are C90
(#54)

libstemmer stuff:

* Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords.

* Add -O2 to CFLAGS.

* Make generated tables of encodings and modules const.

* Fix clang static analyzer memory leak warning (in practice this code path
can never actually be taken). Patch from Patrick O. Perry (#56)

Documentation:

* Added copyright and licensing details (#10).

* Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian
and romanian are available in ISO_8859_2.

* Remove documentation falsely claiming that libstemmer supports CP850
encoding.

* CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and
new language backends.

* Overhaul libstemmer_python_README. Most notably, replace the benchmark data
which was very out of date.

+ 561
- 0
contrib/snowball/algorithms/arabic.sbl View File

@@ -0,0 +1,561 @@
/*
* Authors:
* - Assem Chelli, < assem [dot] ch [at] gmail >
* - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
*
*/

stringescapes { }

/* the Arabic letters in Unicode */
// Hamza
stringdef o '{U+0621}' // Hamza
stringdef ao '{U+0623}' // Hamza above Alef
stringdef ao_ '{U+0625}' // Hamza below Alef
stringdef a~ '{U+0622}' // Alef madda
stringdef wo '{U+0624}' // Hamza above waw
stringdef yo '{U+0626}' // Hamza above yeh

// Letters
stringdef a '{U+0627}' // Alef
stringdef a_ '{U+0649}' // Alef Maksura
stringdef b '{U+0628}' // Beh
stringdef t_ '{U+0629}' // Teh_Marbuta
stringdef t '{U+062A}' // Teh
stringdef th '{U+062B}' // Theh
stringdef j '{U+062C}' // Jeem
stringdef h '{U+062D}' // Hah
stringdef x '{U+062E}' // Khah
stringdef d '{U+062F}' // Dal
stringdef dz '{U+0630}' // Thal
stringdef r '{U+0631}' // Reh
stringdef z '{U+0632}' // Zain
stringdef s '{U+0633}' // Seen
stringdef sh '{U+0634}' // Sheen
stringdef c '{U+0635}' // Sad
stringdef dh '{U+0636}' // Dad
stringdef tt '{U+0637}' // Tah
stringdef zh '{U+0638}' // Zah
stringdef i '{U+0639}' // Ain
stringdef gh '{U+063A}' // Ghain
stringdef f '{U+0641}' // Feh
stringdef q '{U+0642}' // Qaf
stringdef k '{U+0643}' // Kaf
stringdef l '{U+0644}' // Lam
stringdef m '{U+0645}' // Meem
stringdef n '{U+0646}' // Noon
stringdef e '{U+0647}' // Heh
stringdef w '{U+0648}' // Waw
stringdef y '{U+064A}' // Yeh

// Diacritics
stringdef aan '{U+064B}' // FatHatan
stringdef uun '{U+064C}' // Dammatan
stringdef iin '{U+064D}' // Kasratan
stringdef aa '{U+064E}' // FatHa
stringdef uu '{U+064F}' // Damma
stringdef ii '{U+0650}' // Kasra
stringdef oo '{U+0652}' // Sukun
stringdef ~ '{U+0651}' // Shadda

// Hindu–Arabic numerals
stringdef 0 '{U+0660}'
stringdef 1 '{U+0661}'
stringdef 2 '{U+0662}'
stringdef 3 '{U+0663}'
stringdef 4 '{U+0664}'
stringdef 5 '{U+0665}'
stringdef 6 '{U+0666}'
stringdef 7 '{U+0667}'
stringdef 8 '{U+0668}'
stringdef 9 '{U+0669}'


// Kasheeda
stringdef _ '{U+0640}' // Kasheeda, Tatweel

// Shaped forms
stringdef o1 '{U+FE80}' // HAMZA
stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE
stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE
stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW
stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW
stringdef yo1 '{U+FE8B}' // YEH_HAMZA
stringdef yo2 '{U+FE8C}' // YEH_HAMZA
stringdef yo3 '{U+FE89}' // YEH_HAMZA
stringdef yo4 '{U+FE8A}' // YEH_HAMZA
stringdef a~1 '{U+FE81}' // ALEF_MADDA
stringdef a~2 '{U+FE82}' // ALEF_MADDA
stringdef wo1 '{U+FE85}' // WAW_HAMZA
stringdef wo2 '{U+FE86}' // WAW_HAMZA
stringdef a1 '{U+FE8D}' // ALEF
stringdef a2 '{U+FE8E}' // ALEF
stringdef b1 '{U+FE8F}' // BEH
stringdef b2 '{U+FE90}' // BEH
stringdef b3 '{U+FE91}' // BEH
stringdef b4 '{U+FE92}' // BEH
stringdef t_1 '{U+FE93}' // TEH_MARBUTA
stringdef t_2 '{U+FE94}' // TEH_MARBUTA
stringdef t1 '{U+FE97}' // TEH
stringdef t2 '{U+FE98}' // TEH
stringdef t3 '{U+FE95}' // TEH
stringdef t4 '{U+FE96}' // TEH
stringdef th1 '{U+FE9B}' // THEH
stringdef th2 '{U+FE9C}' // THEH
stringdef th3 '{U+FE9A}' // THEH
stringdef th4 '{U+FE99}' // THEH
stringdef j1 '{U+FE9F}' // JEEM
stringdef j2 '{U+FEA0}' // JEEM
stringdef j3 '{U+FE9D}' // JEEM
stringdef j4 '{U+FE9E}' // JEEM
stringdef h1 '{U+FEA3}' // HAH
stringdef h2 '{U+FEA4}' // HAH
stringdef h3 '{U+FEA1}' // HAH
stringdef h4 '{U+FEA2}' // HAH
stringdef x1 '{U+FEA7}' // KHAH
stringdef x2 '{U+FEA8}' // KHAH
stringdef x3 '{U+FEA5}' // KHAH
stringdef x4 '{U+FEA6}' // KHAH
stringdef d1 '{U+FEA9}' // DAL
stringdef d2 '{U+FEAA}' // DAL
stringdef dz1 '{U+FEAB}' // THAL
stringdef dz2 '{U+FEAC}' // THAL
stringdef r1 '{U+FEAD}' // REH
stringdef r2 '{U+FEAE}' // REH
stringdef z1 '{U+FEAF}' // ZAIN
stringdef z2 '{U+FEB0}' // ZAIN
stringdef s1 '{U+FEB3}' // SEEN
stringdef s2 '{U+FEB4}' // SEEN
stringdef s3 '{U+FEB1}' // SEEN
stringdef s4 '{U+FEB2}' // SEEN
stringdef sh1 '{U+FEB7}' // SHEEN
stringdef sh2 '{U+FEB8}' // SHEEN
stringdef sh3 '{U+FEB5}' // SHEEN
stringdef sh4 '{U+FEB6}' // SHEEN
stringdef c1 '{U+FEBB}' // SAD
stringdef c2 '{U+FEBC}' // SAD
stringdef c3 '{U+FEB9}' // SAD
stringdef c4 '{U+FEBA}' // SAD
stringdef dh1 '{U+FEBF}' // DAD
stringdef dh2 '{U+FEC0}' // DAD
stringdef dh3 '{U+FEBD}' // DAD
stringdef dh4 '{U+FEBE}' // DAD
stringdef tt1 '{U+FEC3}' // TAH
stringdef tt2 '{U+FEC4}' // TAH
stringdef tt3 '{U+FEC1}' // TAH
stringdef tt4 '{U+FEC2}' // TAH
stringdef zh1 '{U+FEC7}' // ZAH
stringdef zh2 '{U+FEC8}' // ZAH
stringdef zh3 '{U+FEC5}' // ZAH
stringdef zh4 '{U+FEC6}' // ZAH
stringdef i1 '{U+FECB}' // AIN
stringdef i2 '{U+FECC}' // AIN
stringdef i3 '{U+FEC9}' // AIN
stringdef i4 '{U+FECA}' // AIN
stringdef gh1 '{U+FECF}' // GHAIN
stringdef gh2 '{U+FED0}' // GHAIN
stringdef gh3 '{U+FECD}' // GHAIN
stringdef gh4 '{U+FECE}' // GHAIN
stringdef f1 '{U+FED3}' // FEH
stringdef f2 '{U+FED4}' // FEH
stringdef f3 '{U+FED1}' // FEH
stringdef f4 '{U+FED2}' // FEH
stringdef q1 '{U+FED7}' // QAF
stringdef q2 '{U+FED8}' // QAF
stringdef q3 '{U+FED5}' // QAF
stringdef q4 '{U+FED6}' // QAF
stringdef k1 '{U+FEDB}' // KAF
stringdef k2 '{U+FEDC}' // KAF
stringdef k3 '{U+FED9}' // KAF
stringdef k4 '{U+FEDA}' // KAF
stringdef l1 '{U+FEDF}' // LAM
stringdef l2 '{U+FEE0}' // LAM
stringdef l3 '{U+FEDD}' // LAM
stringdef l4 '{U+FEDE}' // LAM
stringdef m1 '{U+FEE3}' // MEEM
stringdef m2 '{U+FEE4}' // MEEM
stringdef m3 '{U+FEE1}' // MEEM
stringdef m4 '{U+FEE2}' // MEEM
stringdef n1 '{U+FEE7}' // NOON
stringdef n2 '{U+FEE8}' // NOON
stringdef n3 '{U+FEE5}' // NOON
stringdef n4 '{U+FEE6}' // NOON
stringdef e1 '{U+FEEB}' // HEH
stringdef e2 '{U+FEEC}' // HEH
stringdef e3 '{U+FEE9}' // HEH
stringdef e4 '{U+FEEA}' // HEH
stringdef w1 '{U+FEED}' // WAW
stringdef w2 '{U+FEEE}' // WAW
stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA
stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA
stringdef y1 '{U+FEF3}' // YEH
stringdef y2 '{U+FEF4}' // YEH
stringdef y3 '{U+FEF1}' // YEH
stringdef y4 '{U+FEF2}' // YEH

// Ligatures Lam-Alef
stringdef la '{U+FEFB}' // LAM_ALEF
stringdef la2 '{U+FEFC}' // LAM_ALEF
stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE
stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE
stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW
stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW
stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE
stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE


booleans (
is_noun
is_verb
is_defined
)

routines (
Prefix_Step1
Prefix_Step2
Prefix_Step3a_Noun
Prefix_Step3b_Noun
Prefix_Step3_Verb
Prefix_Step4_Verb

Suffix_All_alef_maqsura
Suffix_Noun_Step1a
Suffix_Noun_Step1b
Suffix_Noun_Step2a
Suffix_Noun_Step2b
Suffix_Noun_Step2c1
Suffix_Noun_Step2c2
Suffix_Noun_Step3
Suffix_Verb_Step1
Suffix_Verb_Step2a
Suffix_Verb_Step2b
Suffix_Verb_Step2c

Normalize_post
Normalize_pre

Checks1
)

externals ( stem )

groupings ( )


// Normalizations
define Normalize_pre as (
do repeat (
(
[substring] among (
'{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization
'{_}' ( delete ) // strip kasheeda

// Hindu–Arabic numerals
'{0}' ( <- '0')
'{1}' ( <- '1')
'{2}' ( <- '2')
'{3}' ( <- '3')
'{4}' ( <- '4')
'{5}' ( <- '5')
'{6}' ( <- '6')
'{7}' ( <- '7')
'{8}' ( <- '8')
'{9}' ( <- '9')

// Shaped forms
'{o1}' ( <- '{o}' ) // HAMZA
'{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
'{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
'{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
'{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA
'{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA
'{a1}' '{a2}' ( <- '{a}' ) // ALEF
'{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
'{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
'{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
'{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
'{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM
'{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
'{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH
'{d1}' '{d2}' ( <- '{d}' ) // DAL
'{dz1}''{dz2}' ( <- '{dz}' ) // THAL
'{r1}' '{r2}'( <- '{r}' ) // REH
'{z1}' '{z2}' ( <- '{z}' ) // ZAIN
'{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN
'{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
'{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD
'{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD
'{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
'{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH
'{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN
'{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN
'{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
'{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
'{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF
'{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM
'{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
'{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON
'{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
'{w1}' '{w2}' ( <- '{w}' ) // WAW
'{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
'{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH

// Ligatures Lam-Alef
'{la}' '{la2}' (<- '{l}{a}')
'{lao}' '{lao2}' (<- '{l}{ao}')
'{lao_}' '{lao_2}' (<- '{l}{ao_}')
'{la~}' '{la~2}' (<- '{l}{a~}')

)
)
or
next
)
)

define Normalize_post as (

do (
// normalize last hamza
backwards (
[substring] among (
'{ao}''{ao_}' '{a~}' ( <- '{o}')
'{wo}' ( <- '{o}')
'{yo}' ( <- '{o}')
)
)
)

do repeat (
(
// normalize other hamza's
[substring] among (
'{ao}''{ao_}' '{a~}' ( <- '{a}')
'{wo}' ( <- '{w}')
'{yo}' ( <- '{y}')
)
)
or
next
)
)

// Checks
define Checks1 as (
[substring] among (
'{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined)
'{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined)
)
)


//prefixes
define Prefix_Step1 as (
[substring] among (
'{ao}{ao}' ($(len > 3) <- '{ao}' )
'{ao}{a~}' ($(len > 3) <- '{a~}' )
'{ao}{wo}' ($(len > 3) <- '{ao}' )
'{ao}{a}' ($(len > 3) <- '{a}' )
'{ao}{ao_}' ($(len > 3) <- '{ao_}' )
// '{ao}' ($(len > 3) delete) //rare case
)
)

define Prefix_Step2 as (
not '{f}{a}'
not '{w}{a}'
[substring] among (
'{f}' ($(len > 3) delete)
'{w}' ($(len > 3) delete)
)
)

define Prefix_Step3a_Noun as ( // it is noun and defined
[substring] among (
'{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete)
'{l}{l}' '{a}{l}' ($(len > 4) delete)
)
)

define Prefix_Step3b_Noun as ( // probably noun and defined
not '{b}{a}' // exception
[substring] among (
'{b}' ($(len > 3) delete)
// '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion
'{b}{b}' ($(len > 3) <- '{b}' )
'{k}{k}' ($(len > 3) <- '{k}' )
)

)

define Prefix_Step3_Verb as (
[substring] among (
//'{s}' ($(len > 4) delete)// BUG: cause confusion
'{s}{y}' ($(len > 4) <- '{y}' )
'{s}{t}' ($(len > 4) <- '{t}')
'{s}{n}' ($(len > 4) <- '{n}')
'{s}{ao}' ($(len > 4) <- '{ao}')
)
)

define Prefix_Step4_Verb as (
[substring] among (
'{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' )
)
)

// suffixes
backwardmode (

define Suffix_Noun_Step1a as (
[substring] among (
'{y}' '{k}' '{e}' ($(len >= 4) delete)
'{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete)
'{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete)
)
)
define Suffix_Noun_Step1b as (
[substring] among (
'{n}' ($(len > 5) delete)
)
)

define Suffix_Noun_Step2a as (
[substring] among (
'{a}' '{y}' '{w}' ($(len > 4) delete)
)
)

define Suffix_Noun_Step2b as (
[substring] among (
'{a}{t}' ($(len >= 5) delete)
)
)

define Suffix_Noun_Step2c1 as (
[substring] among (
'{t}' ($(len >= 4) delete)
)
)
define Suffix_Noun_Step2c2 as ( // feminine t_
[substring] among (
'{t_}' ($(len >= 4) delete)
)
)
define Suffix_Noun_Step3 as ( // ya' nisbiya
[substring] among (
'{y}' ($(len >= 3) delete)
)
)

define Suffix_Verb_Step1 as (
[substring] among (
'{e}' '{k}' ($(len >= 4) delete)
'{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete)
'{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete)
)
)
define Suffix_Verb_Step2a as (
[substring] among (
'{t}' ($(len >= 4) delete)
'{a}' '{n}' '{y}' ($(len >= 4) delete)
'{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past
'{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present
'{t}{m}{a}' ($(len >= 6) delete)
)
)

define Suffix_Verb_Step2b as (
[substring] among (
'{w}{a}' '{t}{m}' ($(len >= 5) delete)
)
)


define Suffix_Verb_Step2c as (
[substring] among (
'{w}' ($(len >= 4) delete)
'{t}{m}{w}' ($(len >= 6) delete)
)
)

define Suffix_All_alef_maqsura as (
[substring] among (
'{a_}' ( <- '{y}' ) // spell error
// '{a_}' ( delete ) // if noun > 3
// '{a_}' ( <- '{a}') // if verb
)
)
)

define stem as (
// set initial values
set is_noun
set is_verb
unset is_defined

// guess type and properties
do Checks1

// normalization pre-stemming
do Normalize_pre


backwards (

do (
//Suffixes for verbs
(
is_verb
(
(
(atleast 1 Suffix_Verb_Step1)
( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next)
)
or Suffix_Verb_Step2b
or Suffix_Verb_Step2a
)
)
//Suffixes for nouns
or (
is_noun
(

try (
Suffix_Noun_Step2c2
or (not is_defined Suffix_Noun_Step1a (
Suffix_Noun_Step2a
or Suffix_Noun_Step2b
or Suffix_Noun_Step2c1
or next))
or (Suffix_Noun_Step1b (
Suffix_Noun_Step2a
or Suffix_Noun_Step2b
or Suffix_Noun_Step2c1))
or (not is_defined Suffix_Noun_Step2a)
or (Suffix_Noun_Step2b)
)
Suffix_Noun_Step3
)

)

// Suffixes for alef maqsura
or Suffix_All_alef_maqsura
)
)

//Prefixes
do (
try Prefix_Step1
try Prefix_Step2
( Prefix_Step3a_Noun
or (is_noun Prefix_Step3b_Noun)
or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb)
)
)

// normalization post-stemming
do Normalize_post

)

+ 149
- 0
contrib/snowball/algorithms/basque.sbl View File

@@ -0,0 +1,149 @@
routines (
aditzak
izenak
adjetiboak
mark_regions
RV R2 R1
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

stringescapes {}

/* special characters */

stringdef n~ '{U+00F1}'

define v 'aeiou'

define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

backwardmode (

define RV as $pV <= cursor
define R2 as $p2 <= cursor
define R1 as $p1 <= cursor

define aditzak as (
[substring] among(
'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea'
'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza'
'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza'
'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez'
'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea'
'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena'
'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea'
'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari'
'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu'
'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako'
( RV delete )
'garri' 'garria' 'tza'
(R2 delete)
'atseden'
(<- 'atseden')
'arabera'
(<- 'arabera')
'baditu'
(<- 'baditu')

)
)

define izenak as (
[substring] among(
'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina'
'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea'
'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua'
'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di'
'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa'
'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia'
'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia'
'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua'
'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara'
'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge'
'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua'
'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia'
'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde'
'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea'
'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea'
'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia'
'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa'
'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa'
'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila'
'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa'
'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia'
'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena'
'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan'
'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek'
'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara'
'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket'
'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko'
'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera'
'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko'
( RV delete )
'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza'
( R2 delete )
'joka'
(<- 'jok')
'tzen' 'ten' 'en' 'tatu'
(R1 delete)
'trako'
(<- 'tra')
'minutuko'
(<- 'minutu')
'zehar'
(<- 'zehar')
'geldi'
(<- 'geldi')
'igaro'
(<- 'igaro')
'aurka'
(<- 'aurka')
)
)

define adjetiboak as (
[substring] among(
'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria'
'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik'
( RV delete )
'zlea'
(<- 'z')
)
)

)

define stem as (
do mark_regions
backwards (
repeat aditzak
repeat izenak
do adjetiboak
)

)

/*
Note 1: additions of 21 Jul 2010
*/

+ 202
- 0
contrib/snowball/algorithms/catalan.sbl View File

@@ -0,0 +1,202 @@
routines (
cleaning mark_regions
R1 R2
attached_pronoun
standard_suffix
verb_suffix
residual_suffix
)

externals ( stem )

integers ( p1 p2 )

groupings ( v )

stringescapes {}

/* special characters */

stringdef a' '{U+00E1}' // a-acute
stringdef a` '{U+00E0}' // a-grave
stringdef c, '{U+00E7}' // c-cedilla
stringdef e' '{U+00E9}' // e-acute
stringdef e` '{U+00E8}' // e-grave
stringdef i' '{U+00ED}' // i-acute
stringdef i` '{U+00EC}' // i-grave
stringdef i" '{U+00EF}' // i-diaeresis
stringdef o' '{U+00F3}' // o-acute
stringdef o` '{U+00F2}' // o-grave
stringdef u' '{U+00FA}' // u-acute
stringdef u" '{U+00FC}' // u-diaeresis
stringdef . '{U+00B7}' // - per l aggeminades

define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}'

define mark_regions as (

$p1 = limit
$p2 = limit // defaults

do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

define cleaning as repeat (
[substring] among(
'{a'}' (<- 'a')
'{a`}' (<- 'a')
'{e'}' (<- 'e')
'{e`}' (<- 'e')
'{i'}' (<- 'i')
'{i`}' (<- 'i')
'{o'}' (<- 'o')
'{o`}' (<- 'o')
'{u'}' (<- 'u')
'{u"}' (<- 'u')
'{i"}' (<- 'i')
'{.}' (<- '.')
'' (next)
)
)

backwardmode (

define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

define attached_pronoun as (
[substring] among (
'{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls'
'-ls' '-la' '-les' '-li'
'vos' 'se' 'nos' '-nos' '-us' 'us'
'{'}n' '{'}ns' '-n' '-ns'
'{'}m' '-me' '-m'
'-te' '{'}t'
'li' 'lo' 'los'
'me' 'sela' 'selo' 'selas' 'selos' 'le'
'la' 'las' 'les' 'ens' 'ho' 'hi'
(R1 delete)
)
)

define standard_suffix as (
[substring] among(
'ar' 'atge' 'formes' 'icte' 'ictes'
'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta'
'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls'
'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius'
'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste'
'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis'
'{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all'
'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu'
'{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar'
'itar' 'ables' 'adors' 'idores' 'idors'
'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es'
'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris'
'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament'
'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes'
'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies'
'{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles'
'assa' 'asses' 'assos'
'ent' 'ents'
'{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin'
'ims' 'ima' 'imes'
'isme' 'ista' 'ismes' 'istes'
'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius'
'oses' 'osos' 'ient' 'otes' 'ots'
(R1 delete)
'acions' 'ada' 'ades'
(R2 delete)
'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques'
(R2 <- 'log')
'ic' 'ica' 'ics' 'iques'
(R2 <- 'ic')
'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima'
(R1 <- 'c')
)
)

define verb_suffix as (
[substring] among(
'ador' 'adora' 'adors' 'adores' 're' 'ie'
'ent' 'ents' 'udes' 'ar{a`}' 'eren'
'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
'aria' 'arian' 'arien' 'aries' 'ar{a`}s'
'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara'
'ar{e'}' 'ar{e'}s'
'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
'er{e'}' 'er' 'erau' 'erass'
'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu'
'ia' 'ies' '{i'}em' '{i`}eu' 'ien'
'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats'
'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu'
'essen' 'esses' 'assen' 'asses' 'assim' 'assiu'
'{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem'
'{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren'
'ar{i'}em' 'ar{i'}eu'
'areu' 'aren' 'ant' '{i"}m' '{i"}u'
'{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es'
'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da'
'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its'
'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as'
'ieu' 'ii' 'io' 'i{a`}'
'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu'
'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques'
'{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien'
'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu'
'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis'
'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin'
'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen'
'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim'
'{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu'
'{i"}ra' '{i"}ren' '{i"}res'
'{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x'
'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis'
'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s'
(R1 delete)
'ando'
(R2 delete)
)
)

define residual_suffix as (
[substring] among(
'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu'
'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it'
(R1 delete)
'iqu'
(R1 <- 'ic')
)
)
)

define stem as (
do mark_regions
backwards (
do attached_pronoun
do ( standard_suffix or
verb_suffix
)
do residual_suffix
)
do cleaning
)

/*
First works 2010/07/19
First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0
Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos
Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0
*/

contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/danish.sbl View File

@@ -12,15 +12,17 @@ strings ( ch )

integers ( p1 x )

groupings ( v s_ending )
groupings ( c v s_ending )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef ae hex 'E6'
stringdef ao hex 'E5'
stringdef o/ hex 'F8'
stringdef ae '{U+00E6}'
stringdef ao '{U+00E5}'
stringdef o/ '{U+00F8}'

define c 'bcdfghjklmnpqrstvwxz'

define v 'aeiouy{ae}{ao}{o/}'

@@ -73,7 +75,7 @@ backwardmode (
)
)
define undouble as (
setlimit tomark p1 for ([non-v] ->ch)
setlimit tomark p1 for ([c] ->ch)
ch
delete
)

+ 0
- 91
contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl View File

@@ -1,91 +0,0 @@
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
undouble
)

externals ( stem )

strings ( ch )

integers ( p1 x )

groupings ( v s_ending )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef ae hex '91'
stringdef ao hex '86'
stringdef o/ hex '9B'

define v 'aeiouy{ae}{ao}{o/}'

define s_ending 'abcdfghjklmnoprtvyz{ao}'

define mark_regions as (

$p1 = limit

test ( hop 3 setmark x )
goto v gopast non-v setmark p1
try ( $p1 < x $p1 = x )
)

backwardmode (

define main_suffix as (
setlimit tomark p1 for ([substring])
among(

'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
'erets' 'et' 'eret'
(delete)
's'
(s_ending delete)
)
)

define consonant_pair as (
test (
setlimit tomark p1 for ([substring])
among(
'gd' // significant in the call from other_suffix
'dt' 'gt' 'kt'
)
)
next] delete
)

define other_suffix as (
do ( ['st'] 'ig' delete )
setlimit tomark p1 for ([substring])
among(
'ig' 'lig' 'elig' 'els'
(delete do consonant_pair)
'l{o/}st'
(<-'l{o/}s')
)
)
define undouble as (
setlimit tomark p1 for ([non-v] ->ch)
ch
delete
)
)

define stem as (

do mark_regions
backwards (
do main_suffix
do consonant_pair
do other_suffix
do undouble
)
)

contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl → contrib/snowball/algorithms/dutch.sbl View File

@@ -18,21 +18,21 @@ groupings ( v v_I v_j )

stringescapes {}

/* special characters (in MS-DOS Latin I) */
/* special characters */

stringdef a" hex '84'
stringdef e" hex '89'
stringdef i" hex '8B'
stringdef o" hex '94'
stringdef u" hex '81'
stringdef a" '{U+00E4}'
stringdef e" '{U+00EB}'
stringdef i" '{U+00EF}'
stringdef o" '{U+00F6}'
stringdef u" '{U+00FC}'

stringdef a' hex 'A0'
stringdef e' hex '82'
stringdef i' hex 'A1'
stringdef o' hex 'A2'
stringdef u' hex 'A3'
stringdef a' '{U+00E1}'
stringdef e' '{U+00E9}'
stringdef i' '{U+00ED}'
stringdef o' '{U+00F3}'
stringdef u' '{U+00FA}'

stringdef e` hex '8A'
stringdef e` '{U+00E8}'

define v 'aeiouy{e`}'
define v_I v + 'I'

+ 0
- 164
contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl View File

@@ -1,164 +0,0 @@
routines (
prelude postlude
e_ending
en_ending
mark_regions
R1 R2
undouble
standard_suffix
)

externals ( stem )

booleans ( e_found )

integers ( p1 p2 )

groupings ( v v_I v_j )

stringescapes {}

/* special characters (in ISO Latin I) */

stringdef a" hex 'E4'
stringdef e" hex 'EB'
stringdef i" hex 'EF'
stringdef o" hex 'F6'
stringdef u" hex 'FC'

stringdef a' hex 'E1'
stringdef e' hex 'E9'
stringdef i' hex 'ED'
stringdef o' hex 'F3'
stringdef u' hex 'FA'

stringdef e` hex 'E8'

define v 'aeiouy{e`}'
define v_I v + 'I'
define v_j v + 'j'

define prelude as (
test repeat (
[substring] among(
'{a"}' '{a'}'
(<- 'a')
'{e"}' '{e'}'
(<- 'e')
'{i"}' '{i'}'
(<- 'i')
'{o"}' '{o'}'
(<- 'o')
'{u"}' '{u'}'
(<- 'u')
'' (next)
) //or next
)
try(['y'] <- 'Y')
repeat goto (
v [('i'] v <- 'I') or
('y'] <- 'Y')
)
)

define mark_regions as (

$p1 = limit
$p2 = limit

gopast v gopast non-v setmark p1
try($p1 < 3 $p1 = 3) // at least 3
gopast v gopast non-v setmark p2

)

define postlude as repeat (

[substring] among(
'Y' (<- 'y')
'I' (<- 'i')
'' (next)
) //or next

)

backwardmode (

define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

define undouble as (
test among('kk' 'dd' 'tt') [next] delete
)

define e_ending as (
unset e_found
['e'] R1 test non-v delete
set e_found
undouble
)

define en_ending as (
R1 non-v and not 'gem' delete
undouble
)

define standard_suffix as (
do (
[substring] among(
'heden'
( R1 <- 'heid'
)
'en' 'ene'
( en_ending
)
's' 'se'
( R1 non-v_j delete
)
)
)
do e_ending

do ( ['heid'] R2 not 'c' delete
['en'] en_ending
)

do (
[substring] among(
'end' 'ing'
( R2 delete
(['ig'] R2 not 'e' delete) or undouble
)
'ig'
( R2 not 'e' delete
)
'lijk'
( R2 delete e_ending
)
'baar'
( R2 delete
)
'bar'
( R2 e_found delete
)
)
)
do (
non-v_I
test (
among ('aa' 'ee' 'oo' 'uu')
non-v
)
[next] delete
)
)
)

define stem as (

do prelude
do mark_regions
backwards
do standard_suffix
do postlude
)

contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/english.sbl View File


contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/finnish.sbl View File

@@ -24,16 +24,17 @@ externals ( stem )
integers ( p1 p2 )
strings ( x )
booleans ( ending_removed )
groupings ( AEI V1 V2 particle_end )
groupings ( AEI C V1 V2 particle_end )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef a" hex 'E4'
stringdef o" hex 'F6'
stringdef a" '{U+00E4}'
stringdef o" '{U+00F6}'

define AEI 'a{a"}ei'
define C 'bcdfghjklmnpqrstvwxz'
define V1 'aeiouy{a"}{o"}'
define V2 'aeiou{a"}{o"}'
define particle_end V1 + 'nt'
@@ -116,7 +117,7 @@ backwardmode (
)

'a' '{a"}' //-.
(V1 non-V1) // |
(V1 C) // |
'tta' 'tt{a"}' // Partitive [32]
('e') // |
'ta' 't{a"}' //-'
@@ -172,11 +173,11 @@ backwardmode (
define tidy as (
setlimit tomark p1 for (
do ( LONG and ([next] delete ) ) // undouble vowel
do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
do ( [AEI] C delete ) // remove trailing a, a", e, i
do ( ['j'] 'o' or 'u' delete )
do ( ['o'] 'j' delete )
)
goto non-V1 [next] -> x x delete // undouble consonant
goto non-V1 [C] -> x x delete // undouble consonant
)
)


contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/french.sbl View File

@@ -17,21 +17,21 @@ groupings ( v keep_with_s )

stringescapes {}

/* special characters (in ISO Latin I) */
stringdef a^ hex 'E2' // a-circumflex
stringdef a` hex 'E0' // a-grave
stringdef c, hex 'E7' // c-cedilla
stringdef e" hex 'EB' // e-diaeresis (rare)
stringdef e' hex 'E9' // e-acute
stringdef e^ hex 'EA' // e-circumflex
stringdef e` hex 'E8' // e-grave
stringdef i" hex 'EF' // i-diaeresis
stringdef i^ hex 'EE' // i-circumflex
stringdef o^ hex 'F4' // o-circumflex
stringdef u^ hex 'FB' // u-circumflex
stringdef u` hex 'F9' // u-grave
/* special characters */
stringdef a^ '{U+00E2}' // a-circumflex
stringdef a` '{U+00E0}' // a-grave
stringdef c, '{U+00E7}' // c-cedilla
stringdef e" '{U+00EB}' // e-diaeresis (rare)
stringdef e' '{U+00E9}' // e-acute
stringdef e^ '{U+00EA}' // e-circumflex
stringdef e` '{U+00E8}' // e-grave
stringdef i" '{U+00EF}' // i-diaeresis
stringdef i^ '{U+00EE}' // i-circumflex
stringdef o^ '{U+00F4}' // o-circumflex
stringdef u^ '{U+00FB}' // u-circumflex
stringdef u` '{U+00F9}' // u-grave

define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'

@@ -42,6 +42,10 @@ define prelude as repeat goto (
('y' ] <- 'Y')
)
or
( [ '{e"}' ] <- 'He' )
or
( [ '{i"}' ] <- 'Hi' )
or
( ['y'] v <- 'Y' )
or
( 'q' ['u'] <- 'U' )
@@ -78,6 +82,9 @@ define postlude as repeat (
'I' (<- 'i')
'U' (<- 'u')
'Y' (<- 'y')
'He' (<- '{e"}')
'Hi' (<- '{i"}')
'H' (delete)
'' (next)
)
)
@@ -167,7 +174,7 @@ backwardmode (
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
'issez' 'issiez' 'issions' 'issons' 'it'
(non-v delete)
(not 'H' non-v delete)
)
)

@@ -196,14 +203,13 @@ backwardmode (
define keep_with_s 'aiou{e`}s'

define residual_suffix as (
try(['s'] test non-keep_with_s delete)
try(['s'] test ('Hi' or non-keep_with_s) delete)
setlimit tomark pV for (
[substring] among(
'ion' (R2 's' or 't' delete)
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
'{e"}' ('gu' delete)
)
)
)

+ 0
- 239
contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl View File

@@ -1,239 +0,0 @@
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
i_verb_suffix
verb_suffix
residual_suffix
un_double
un_accent
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v keep_with_s )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef a^ hex '83' // a-circumflex
stringdef a` hex '85' // a-grave
stringdef c, hex '87' // c-cedilla

stringdef e" hex '89' // e-diaeresis (rare)
stringdef e' hex '82' // e-acute
stringdef e^ hex '88' // e-circumflex
stringdef e` hex '8A' // e-grave
stringdef i" hex '8B' // i-diaeresis
stringdef i^ hex '8C' // i-circumflex
stringdef o^ hex '93' // o-circumflex
stringdef u^ hex '96' // u-circumflex
stringdef u` hex '97' // u-grave

define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'

// prelude: scan left-to-right and mark vowel letters that act as consonants
// (u/i between vowels, y adjacent to a vowel, u after q) by upper-casing them
// to U/I/Y, so later vowel tests treat them as non-vowels. postlude undoes this.
define prelude as repeat goto (

( v [ ('u' ] v <- 'U') or
('i' ] v <- 'I') or
('y' ] <- 'Y')
)
or
( ['y'] v <- 'Y' )
or
( 'q' ['u'] <- 'U' )
)

// mark_regions: compute the standard Romance-stemmer regions.
// pV marks the start of RV (after the first vowel pattern), p1/p2 mark R1/R2
// (after the first / second non-vowel-following-a-vowel). All default to
// limit (empty region) when the word is too short for the pattern.
define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
( v v next ) or ( next gopast v )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

// postlude: reverse the prelude's markings, lower-casing the sentinel
// capitals I/U/Y back to their plain vowel forms.
define postlude as repeat (

[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'Y' (<- 'y')
'' (next)
)
)

backwardmode (

define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

// standard_suffix: strip the main French derivational suffixes, each guarded
// by the region (R1/R2/RV) in which removal is considered safe. Some endings
// are rewritten rather than deleted (e.g. -logie -> log, -usion -> u).
define standard_suffix as (
[substring] among(

'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
'ances' 'iqUes' 'ismes' 'ables' 'istes'
( R2 delete )
'atrice' 'ateur' 'ation'
'atrices' 'ateurs' 'ations'
( R2 delete
try ( ['ic'] (R2 delete) or <-'iqU' )
)
'logie'
'logies'
( R2 <- 'log' )
'usion' 'ution'
'usions' 'utions'
( R2 <- 'u' )
'ence'
'ences'
( R2 <- 'ent' )
'ement'
'ements'
(
RV delete
// after removing -ement(s), tidy up any secondary suffix left behind
try (
[substring] among(
'iv' (R2 delete ['at'] R2 delete)
'eus' ((R2 delete) or (R1<-'eux'))
'abl' 'iqU'
(R2 delete)
'i{e`}r' 'I{e`}r' //)
(RV <-'i') //)--new 2 Sept 02
)
)
)
'it{e'}'
'it{e'}s'
(
R2 delete
try (
[substring] among(
'abil' ((R2 delete) or <-'abl')
'ic' ((R2 delete) or <-'iqU')
'iv' (R2 delete)
)
)
)
'if' 'ive'
'ifs' 'ives'
(
R2 delete
try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
)
'eaux' (<- 'eau')
'aux' (R1 <- 'al')
'euse'
'euses'((R2 delete) or (R1<-'eux'))

'issement'
'issements'(R1 non-v delete) // verbal

// fail(...) below forces entry to verb_suffix. -ment typically
// follows the p.p., e.g. 'confus{e'}ment'.

'amment' (RV fail(<- 'ant'))
'emment' (RV fail(<- 'ent'))
'ment'
'ments' (test(v RV) fail(delete))
// v is e,i,u,{e'},I or U
)
)

// i_verb_suffix: strip -ir verb conjugation endings. The search is limited
// to the RV region (setlimit tomark pV), and the ending must be preceded
// by a non-vowel, so e.g. the 'i' of a true stem vowel is preserved.
define i_verb_suffix as setlimit tomark pV for (
[substring] among (
'{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
'issez' 'issiez' 'issions' 'issons' 'it'
(non-v delete)
)
)

// verb_suffix: strip -er verb conjugation endings (and -ions in R2),
// again limited to the RV region. For the imperfect/past endings in the
// last group a preceding 'e' is also removed when present.
define verb_suffix as setlimit tomark pV for (
[substring] among (
'ions'
(R2 delete)

'{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
'erons' 'eront' 'ez' 'iez'

// 'ons' //-best omitted

(delete)

'{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
'assions'
(delete
try(['e'] delete)
)
)
)

// keep_with_s: letters before which a final 's' is NOT removed.
define keep_with_s 'aiou{e`}s'

// residual_suffix: final clean-up pass — drop a trailing 's' (unless the
// previous letter is in keep_with_s), then remove/normalise leftover
// endings -ion, -ier/-iere, -e, and gue-diaeresis, within RV.
define residual_suffix as (
try(['s'] test non-keep_with_s delete)
setlimit tomark pV for (
[substring] among(
'ion' (R2 's' or 't' delete)
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
'{e"}' ('gu' delete)
)
)
)

// un_double: undouble a final consonant pair from the listed set
// (enn/onn/ett/ell/eill -> en/on/et/el/eil).
define un_double as (
test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
)

// un_accent: if the stem ends in at least one non-vowel followed by an
// accented e ({e'} or {e`}), replace that accented e with a plain 'e'.
define un_accent as (
atleast 1 non-v
[ '{e'}' or '{e`}' ] <-'e'
)
)

// stem: external entry point for the French stemmer.
// Pipeline: prelude (mark consonantal u/i/y), mark RV/R1/R2, then working
// backwards try the suffix strippers in priority order; when one of the
// main strippers fired, normalise a trailing Y or c-cedilla. Finish with
// undoubling, de-accenting, and the postlude to restore case markings.
define stem as (

do prelude
do mark_regions
backwards (

do (
(
( standard_suffix or
i_verb_suffix or
verb_suffix
)
and
try( [ ('Y' ] <- 'i' ) or
('{c,}'] <- 'c' )
)
) or
residual_suffix
)

// try(['ent'] RV delete) // is best omitted

do un_double
do un_accent
)
do postlude
)


contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/german.sbl View File

@@ -18,12 +18,12 @@ groupings ( v s_ending st_ending )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef a" hex 'E4'
stringdef o" hex 'F6'
stringdef u" hex 'FC'
stringdef ss hex 'DF'
stringdef a" '{U+00E4}'
stringdef o" '{U+00F6}'
stringdef u" '{U+00FC}'
stringdef ss '{U+00DF}'

define v 'aeiouy{a"}{o"}{u"}'


+ 0
- 139
contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl View File

@@ -1,139 +0,0 @@

/*
Extra rule for -nisse ending added 11 Dec 2009
*/

routines (
prelude postlude
mark_regions
R1 R2
standard_suffix
)

externals ( stem )

integers ( p1 p2 x )

groupings ( v s_ending st_ending )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef a" hex '84'
stringdef o" hex '94'
stringdef u" hex '81'
stringdef ss hex 'E1'

define v 'aeiouy{a"}{o"}{u"}'

define s_ending 'bdfghklmnrt'
define st_ending s_ending - 'r'

// prelude: expand eszett to 'ss', then upper-case u/y between vowels to
// U/Y so they are not treated as vowels by the region/suffix logic.
define prelude as (

test repeat (
(
['{ss}'] <- 'ss'
) or next
)

repeat goto (
v [('u'] v <- 'U') or
('y'] v <- 'Y')
)
)

// mark_regions: set R1/R2 as in the standard German algorithm, but force
// R1 to start at least 3 characters into the word (the hop-3 mark x).
define mark_regions as (

$p1 = limit
$p2 = limit

test(hop 3 setmark x)

gopast v gopast non-v setmark p1
try($p1 < x $p1 = x) // at least 3
gopast v gopast non-v setmark p2

)

// postlude: undo the prelude's Y/U markers and strip umlauts
// (a-umlaut -> a, o-umlaut -> o, u-umlaut -> u).
define postlude as repeat (

[substring] among(
'Y' (<- 'y')
'U' (<- 'u')
'{a"}' (<- 'a')
'{o"}' (<- 'o')
'{u"}' (<- 'u')
'' (next)
)

)

backwardmode (

define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

// standard_suffix: the three German suffix-stripping passes, each run with
// `do` so failure of one pass does not block the next:
//   1. noun endings -em/-ern/-er/-e/-en/-es/-s in R1 (with the 2009
//      -nisse extra rule: a removed plural 's' after 'nis' also drops),
//   2. verb/adjective endings -en/-er/-est/-st in R1,
//   3. derivational endings -end/-ung/-ig/-ik/-isch/-lich/-heit/-keit in R2,
//      with secondary clean-up of -ig/-er/-en/-lich left behind.
define standard_suffix as (
do (
[substring] R1 among(
'em' 'ern' 'er'
( delete
)
'e' 'en' 'es'
( delete
try (['s'] 'nis' delete)
)
's'
( s_ending delete
)
)
)
do (
[substring] R1 among(
'en' 'er' 'est'
( delete
)
'st'
( st_ending hop 3 delete
)
)
)
do (
[substring] R2 among(
'end' 'ung'
( delete
try (['ig'] not 'e' R2 delete)
)
'ig' 'ik' 'isch'
( not 'e' delete
)
'lich' 'heit'
( delete
try (
['er' or 'en'] R1 delete
)
)
'keit'
( delete
try (
[substring] R2 among(
'lich' 'ig'
( delete
)
)
)
)
)
)
)
)

// stem: external entry point for the German stemmer —
// prelude, region marking, backward suffix stripping, postlude.
define stem as (
do prelude
do mark_regions
backwards
do standard_suffix
do postlude
)

contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/german2.sbl View File

@@ -18,12 +18,12 @@ groupings ( v s_ending st_ending )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef a" hex 'E4'
stringdef o" hex 'F6'
stringdef u" hex 'FC'
stringdef ss hex 'DF'
stringdef a" '{U+00E4}'
stringdef o" '{U+00F6}'
stringdef u" '{U+00FC}'
stringdef ss '{U+00DF}'

define v 'aeiouy{a"}{o"}{u"}'


+ 706
- 0
contrib/snowball/algorithms/greek.sbl View File

@@ -0,0 +1,706 @@
// A stemmer for Modern Greek language, based on:
//
// Ntais, Georgios. Development of a Stemmer for the Greek
// Language. Diss. Royal Institute of Technology, 2006.
// https://sais.se/mthprize/2007/ntais2007.pdf
//
// Saroukos, Spyridon. Enhancing a Greek language stemmer.
// University of Tampere, 2008.
// https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf

stringescapes {}

stringdef a '{U+03B1}' // alpha
stringdef v '{U+03B2}' // beta
stringdef g '{U+03B3}' // gamma
stringdef d '{U+03B4}' // delta
stringdef e '{U+03B5}' // epsilon
stringdef z '{U+03B6}' // zeta
stringdef i '{U+03B7}' // eta
stringdef th '{U+03B8}' // theta
stringdef y '{U+03B9}' // iota
stringdef k '{U+03BA}' // kappa
stringdef l '{U+03BB}' // lamda
stringdef m '{U+03BC}' // mu
stringdef n '{U+03BD}' // nu
stringdef x '{U+03BE}' // xi
stringdef o '{U+03BF}' // omicron
stringdef p '{U+03C0}' // pi
stringdef r '{U+03C1}' // rho
stringdef ss '{U+03C2}' // sigma final
stringdef s '{U+03C3}' // sigma
stringdef t '{U+03C4}' // tau
stringdef u '{U+03C5}' // upsilon
stringdef f '{U+03C6}' // phi
stringdef ch '{U+03C7}' // chi
stringdef ps '{U+03C8}' // psi
stringdef oo '{U+03C9}' // omega

stringdef A '{U+0391}' // Alpha
stringdef V '{U+0392}' // Beta
stringdef G '{U+0393}' // Gamma
stringdef D '{U+0394}' // Delta
stringdef E '{U+0395}' // Epsilon
stringdef Z '{U+0396}' // Zeta
stringdef I '{U+0397}' // Eta
stringdef Th '{U+0398}' // Theta
stringdef Y '{U+0399}' // Iota
stringdef K '{U+039A}' // Kappa
stringdef L '{U+039B}' // Lamda
stringdef M '{U+039C}' // Mu
stringdef N '{U+039D}' // Nu
stringdef X '{U+039E}' // Xi
stringdef O '{U+039F}' // Omicron
stringdef P '{U+03A0}' // Pi
stringdef R '{U+03A1}' // Rho
stringdef S '{U+03A3}' // Sigma
stringdef T '{U+03A4}' // Tau
stringdef U '{U+03A5}' // Upsilon
stringdef F '{U+03A6}' // Phi
stringdef Ch '{U+03A7}' // Chi
stringdef Ps '{U+03A8}' // Psi
stringdef Oo '{U+03A9}' // Omega

stringdef Y: '{U+03AA}' // Iota with dialytika
stringdef U: '{U+03AB}' // Upsilon with dialytika

stringdef a' '{U+03AC}' // alpha with tonos
stringdef e' '{U+03AD}' // epsilon with tonos
stringdef i' '{U+03AE}' // eta with tonos
stringdef y' '{U+03AF}' // iota with tonos
stringdef o' '{U+03CC}' // omicron with tonos
stringdef u' '{U+03CD}' // upsilon with tonos
stringdef oo' '{U+03CE}' // omega with tonos

stringdef i:' '{U+0390}' // iota with dialytika and tonos
stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos

stringdef i: '{U+03CA}' // iota with dialytika
stringdef u: '{U+03CB}' // upsilon with dialytika

stringdef A' '{U+0386}' // Alpha with tonos
stringdef E' '{U+0388}' // Epsilon with tonos
stringdef I' '{U+0389}' // Eta with tonos
stringdef Y' '{U+038A}' // Iota with tonos
stringdef O' '{U+038C}' // Omicron with tonos
stringdef U' '{U+038E}' // Upsilon with tonos
stringdef OO' '{U+038F}' // Omega with tonos

externals ( stem )

booleans ( test1 )

groupings ( v v2 )

routines ( tolower has_min_length
steps1 steps2 steps3 steps4 steps5 steps6 steps7
steps8 steps9 steps10
step1 step2a step2b step2c step2d step3 step4
step5a step5b step5c step5d step5e step5f
step5g step5h step5i
step5j step5k step5l step5m
step6 step7 )

define v '{a}{e}{i}{y}{o}{u}{oo}'
define v2 '{a}{e}{i}{y}{o}{oo}'

backwardmode (
// has_min_length: only words of at least 3 characters are stemmed;
// shorter words make stem fail and are returned unchanged.
define has_min_length as (
$(len >= 3)
)

// tolower: running backwards over the word, fold every Greek letter to
// unaccented lower case — capitals, tonos/dialytika accented forms, and
// final sigma ({ss} -> {s}) all map to their plain lower-case letter.
// The '' (next) entry skips characters with no mapping.
define tolower as (
repeat (
[substring] among (
'{A}' (<- '{a}')
'{V}' (<- '{v}')
'{G}' (<- '{g}')
'{D}' (<- '{d}')
'{E}' (<- '{e}')
'{Z}' (<- '{z}')
'{I}' (<- '{i}')
'{Th}' (<- '{th}')
'{Y}' (<- '{y}')
'{K}' (<- '{k}')
'{L}' (<- '{l}')
'{M}' (<- '{m}')
'{N}' (<- '{n}')
'{X}' (<- '{x}')
'{O}' (<- '{o}')
'{P}' (<- '{p}')
'{R}' (<- '{r}')
'{S}' (<- '{s}')
'{T}' (<- '{t}')
'{U}' (<- '{u}')
'{F}' (<- '{f}')
'{Ch}' (<- '{ch}')
'{Ps}' (<- '{ps}')
'{Oo}' (<- '{oo}')
'{Y:}' (<- '{y}')
'{U:}' (<- '{u}')
'{a'}' (<- '{a}')
'{e'}' (<- '{e}')
'{i'}' (<- '{i}')
'{y'}' (<- '{y}')
'{o'}' (<- '{o}')
'{u'}' (<- '{u}')
'{oo'}' (<- '{oo}')
'{i:'}' (<- '{i}')
'{u:'}' (<- '{u}')
'{i:}' (<- '{i}')
'{u:}' (<- '{u}')
'{A'}' (<- '{a}')
'{E'}' (<- '{e}')
'{I'}' (<- '{i}')
'{Y'}' (<- '{y}')
'{O'}' (<- '{o}')
'{U'}' (<- '{u}')
'{OO'}' (<- '{oo}')
'{ss}' (<- '{s}') // final sigma -> medial sigma
'' (next)
)
)
)

define step1 as (
[substring] among (
'{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}')
'{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}')
'{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}')
'{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}')
'{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}')
'{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}')
'{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}')
'{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}')
'{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}')
'{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}')
'{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}')
)
unset test1
)

define steps1 as (
[substring] among (
'{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}'
'{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' (
delete
unset test1
([] substring atlimit among (
'{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}'
(<- '{y}')
)) or
([] substring atlimit among (
'{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}'
'{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}'
'{r}' '{p}{y}{p}{e}{r}{o}{r}'
(<- '{y}{z}')
))
)
)
)

define steps2 as (
[substring] among (
'{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' (
delete
unset test1
[] substring atlimit among (
'{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}')
)
)
)
)

define steps3 as (
[substring] among (
'{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' (
delete
unset test1
('{y}{s}{a}' atlimit <- '{y}{s}') or
([] substring atlimit among (
'{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}'
'{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}'
'{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}'
(<- '{y}')
)) or
([] substring atlimit among (
'{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}'
'{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}'
(<- '{y}{s}')
))
)
)
)

define steps4 as (
[substring] among (
'{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' (
delete
unset test1
[] substring atlimit among (
'{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}'
'{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}'
'{e}{k}{l}{e}' '{p}{e}'
(<- '{y}')
)
)
)
)

define steps5 as (
[substring] among (
'{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}'
'{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' (
delete
unset test1
([] substring atlimit among (
'{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}'
(<- '{y}')
)) or
([] substring atlimit among (
'{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}'
'{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}'
'{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}'
(<- '{y}{s}{t}')
))
)
)
)

define steps6 as (
[substring] among (
'{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' (
delete
unset test1
([] substring atlimit among (
'{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}'
(<- '{y}{s}{m}')
)) or
([] substring atlimit among (
'{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}'
(<- '{y}')
)) or
([substring] among (
'{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}')
'{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}')
'{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}')
'{e}{th}{n}{y}{k}' (<- '{e}{th}{n}')
'{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}')
'{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}')
'{t}{o}{p}{y}{k}' (<- '{t}{o}{p}')
'{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}')
'{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}')
'{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}')
))
)
)
)

// steps7: strip diminutive endings -araki/-arakia/-oudaki/-oudakia;
// for the exception stems listed (matched at the word start via atlimit)
// the ending is re-normalised to -arak instead of being dropped entirely.
define steps7 as (
[substring] among (
'{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' (
delete
unset test1
[] substring atlimit among (
'{s}' '{ch}'
(<- '{a}{r}{a}{k}')
)
)
)
)

define steps8 as (
[substring] among (
'{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' (
delete
unset test1
([] substring atlimit among (
'{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}'
'{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}'
'{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}'
'{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}'
(<- '{a}{k}')
)) or
([] substring atlimit among (
'{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{k}{o}{n}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}'
'{p}{a}{t}{e}{r}' '{p}' '{s}{k}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}'
(<- '{y}{t}{s}')
)) or
([] '{k}{o}{r}' <- '{y}{t}{s}')
)
)
)

define steps9 as (
[substring] among (
'{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' (
delete
unset test1
([] substring atlimit among (
'{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}')
)) or
([] substring among (
'{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}')
))
)
)
)

// steps10: strip -iskos/-iskou/-isko/-iske endings; for the listed
// exception stems (whole remaining word, via atlimit) restore -isk.
define steps10 as (
[substring] among (
'{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' (
delete
unset test1
[] substring atlimit among (
'{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}'
(<- '{y}{s}{k}')
)
)
)
)

// step2a: remove plural endings -ades/-adon; unless the remaining stem
// ends in one of the listed exception fragments, re-insert the '{a}{d}'
// stem so e.g. kinship nouns keep their -ad base.
define step2a as (
[substring] among (
'{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete)
)
not ([substring] among (
'{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}'
))
insert '{a}{d}'
)

// step2b: remove plural endings -edes/-edon; for stems ending in one of
// the listed fragments, restore the '{e}{d}' base.
define step2b as (
[substring] among (
'{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete)
)
[] substring among (
'{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}')
)
)

// step2c: remove plural endings -oudes/-oudon; for stems ending in one of
// the listed fragments, restore the '{o}{u}{d}' base.
define step2c as (
[substring] among (
'{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete)
)
[] substring among (
'{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}'
'{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}')
)
)

// step2d: remove endings -eos/-eon (clearing test1 so step6 may apply);
// when the whole remaining stem is one of the listed words, restore '{e}'.
define step2d as (
[substring] among (
'{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1)
)
[] substring atlimit among (
'{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}')
)
)

// step3: remove endings -ia/-iou/-ion (clearing test1); if the stem now
// ends in a vowel, append '{y}' back (vowel stems keep the iota).
define step3 as (
[substring] among (
'{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1)
)
([] v <- '{y}')
)

define step4 as (
[substring] among (
'{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1)
)
([] v <- '{y}{k}') or
[] substring atlimit among (
'{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}'
'{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}'
'{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}'
'{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}'
(<- '{y}{k}')
)
)

define step5a as (
do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}')
do (
[substring] among (
'{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1)
)
)
['{a}{m}{e}']
delete
unset test1
[] substring atlimit among (
'{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}'
(<- '{a}{m}')
)
)

define step5b as (
do (
[substring] among (
'{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}'
'{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' (
delete
unset test1
[] substring atlimit among (
'{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}')
)
)
)
)
['{a}{n}{e}']
delete
unset test1
([] v2 <- '{a}{n}') or
[] substring atlimit among (
'{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}'
'{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}'
'{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}'
'{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}'
'{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}'
'{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}'
'{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}'
'{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}'
'{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}'
'{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}'
'{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}'
'{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}'
'{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}'
'{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}'
'{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}'
'{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}'
(<- '{a}{n}')
)
)

define step5c as (
do (
[substring] among (
'{i}{s}{e}{t}{e}' (delete unset test1)
)
)
['{e}{t}{e}']
delete
unset test1
([] v2 <- '{e}{t}') or
([] substring among (
'{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}'
'{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}'
'{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}'
(<- '{e}{t}')
)) or
[] substring atlimit among (
'{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}'
'{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}'
'{th}{a}{r}{r}' '{th}'
(<- '{e}{t}')
)
)

// step5d: remove gerund endings -ontas/-ontas(omega); restore -ont for
// the whole-word stem 'arch', and -oont for stems containing 'kre'.
define step5d as (
[substring] among (
'{o}{n}{t}{a}{s}' '{oo}{n}{t}{a}{s}' (
delete
unset test1
([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or
([] '{k}{r}{e}' <- '{oo}{n}{t}')
)
)
)

// step5e: remove passive endings -omaste/-iomaste; restore -omast when
// the whole remaining stem is 'on'.
define step5e as (
[substring] among (
'{o}{m}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{e}' (
delete
unset test1
([] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}')
)
)
)

define step5f as (
do (
['{y}{e}{s}{t}{e}']
delete
unset test1
[] substring atlimit among (
'{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}')
)
)
['{e}{s}{t}{e}']
delete
unset test1
[] substring atlimit among (
'{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}'
(<- '{y}{e}{s}{t}')
)
)

define step5g as (
do (
[substring] among (
'{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1)
)
)
[substring] among (
'{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' (
delete
unset test1
([] substring among (
'{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}')
)) or
([] substring atlimit among (
'{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}')
))
)
)
)

define step5h as (
[substring] among (
'{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' (
delete
unset test1
([] substring among (
'{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}'
'{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}')

)) or
([] substring atlimit among (
'{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}'
'{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}'
'{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}'
(<- '{o}{u}{s}')
))
)
)
)

define step5i as (
[substring] among (
'{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' (
delete
unset test1
([] '{k}{o}{l}{l}' <- '{a}{g}') or (
not ([substring] among ('{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}'))
([] substring among (
'{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}'
(<- '{a}{g}')
)) or
([] substring atlimit among (
'{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}'
'{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}'
'{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}'
'{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}'
'{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}'
(<- '{a}{g}')
))
)
)
)
)

// step5j: remove endings -ise/-isou/-isa; restore -is for the listed
// whole-word exception stems (island names etc., matched via atlimit).
define step5j as (
[substring] among (
'{i}{s}{e}' '{i}{s}{o}{u}' '{i}{s}{a}' (delete unset test1)
)
[] substring atlimit among (
'{n}' '{ch}{e}{r}{s}{o}{n}' '{d}{oo}{d}{e}{k}{a}{n}' '{e}{r}{i}{m}{o}{n}' '{m}{e}{g}{a}{l}{o}{n}' '{e}{p}{t}{a}{n}' (<- '{i}{s}')
)
)

// step5k: remove ending -iste; restore -ist for the listed whole-word
// exception stems.
define step5k as (
[substring] among (
'{i}{s}{t}{e}' (delete unset test1)
)
[] substring atlimit among (
'{a}{s}{v}' '{s}{v}' '{a}{ch}{r}' '{ch}{r}' '{a}{p}{l}' '{a}{e}{y}{m}{n}' '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' '{p}{a}{l}{y}{m}{ps}'
(<- '{i}{s}{t}')
)
)

// step5l: remove endings -oune/-isoune/-ithoune; restore -oun for the
// listed whole-word exception stems.
define step5l as (
[substring] among (
'{o}{u}{n}{e}' '{i}{s}{o}{u}{n}{e}' '{i}{th}{o}{u}{n}{e}' (delete unset test1)
)
[] substring atlimit among (
'{n}' '{r}' '{s}{p}{y}' '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' '{k}{a}{k}{o}{m}{o}{u}{t}{s}' '{e}{x}{oo}{n}' (<- '{o}{u}{n}')
)
)

// step5m: remove endings -oume/-isoume/-ithoume; restore -oum for the
// listed whole-word exception stems.
define step5m as (
[substring] among (
'{o}{u}{m}{e}' '{i}{s}{o}{u}{m}{e}' '{i}{th}{o}{u}{m}{e}' (delete unset test1)
)
[] substring atlimit among (
'{p}{a}{r}{a}{s}{o}{u}{s}' '{f}' '{ch}' '{oo}{r}{y}{o}{p}{l}' '{a}{z}' '{a}{l}{l}{o}{s}{o}{u}{s}' '{a}{s}{o}{u}{s}'
(<- '{o}{u}{m}')
)
)

define step6 as (
do (
[substring] among (
'{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}')
)
)
test1
[substring] among (
'{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}'
'{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}'
'{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}'
'{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}'
'{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}'
'{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}'
'{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}'
'{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}'
'{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}'
'{oo}{n}' (delete)
)
)

// step7: strip comparative/superlative endings
// (-ester/-estat/-oter/-otat/-uter/-utat/-ooter/-ootat).
define step7 as (
[substring] among (
'{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' '{o}{t}{e}{r}' '{o}{t}{a}{t}' '{u}{t}{e}{r}' '{u}{t}{a}{t}' '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' (delete)
)
)
)

// stem: external entry point for the Greek stemmer. Everything runs
// backwards: case/accent folding first, then the length guard, then the
// exception steps (step1, steps1-steps10) and the general suffix steps.
// test1 starts set; steps that handle a word completely unset it so that
// the broad ending-removal in step6 is skipped for those words.
define stem as (
backwards (
do tolower
has_min_length
set test1
do step1
do steps1
do steps2
do steps3
do steps4
do steps5
do steps6
do steps7
do steps8
do steps9
do steps10
do step2a
do step2b
do step2c
do step2d
do step3
do step4
do step5a
do step5b
do step5c
do step5d
do step5e
do step5f
do step5g
do step5h
// NOTE(review): step5j runs before step5i here, out of the otherwise
// alphabetical sequence — looks intentional (ordering affects which
// suffix wins) but worth confirming against the upstream algorithm.
do step5j
do step5i
do step5k
do step5l
do step5m
do step6
do step7
)
)

+ 323
- 0
contrib/snowball/algorithms/hindi.sbl View File

@@ -0,0 +1,323 @@
// An implementation of "A Lightweight Stemmer for Hindi":
// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf

externals ( stem )

stringescapes {}

// The transliteration scheme used for our stringdefs matches that used in the
// paper, as documented in the appendix. It appears to match the WX notation
// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently
// uses 'z' for Anunasika whereas the paper uses Mh.
//
// We discriminate dependent vowels by adding a leading "_" to their stringdef
// names (mnemonic: the _ signifies removing the implicit a from the preceding
// character).

// Vowels and sonorants:
stringdef a '{U+0905}'
stringdef A '{U+0906}'
stringdef i '{U+0907}'
stringdef I '{U+0908}'
stringdef u '{U+0909}'
stringdef U '{U+090A}'
stringdef q '{U+090B}'
stringdef e '{U+090F}'
stringdef E '{U+0910}'
stringdef o '{U+0913}'
stringdef O '{U+0914}'

// Vowel signs:
stringdef _A '{U+093E}'
stringdef _i '{U+093F}'
stringdef _I '{U+0940}'
stringdef _u '{U+0941}'
stringdef _U '{U+0942}'
stringdef _q '{U+0943}'
stringdef _e '{U+0947}'
stringdef _E '{U+0948}'
stringdef _o '{U+094B}'
stringdef _O '{U+094C}'

// Diacritics:
stringdef M '{U+0902}'
stringdef H '{U+0903}'
stringdef Mh '{U+0901}'
stringdef Z '{U+093C}' // Nukta
stringdef virama '{U+094D}'

// Velar consonants:
stringdef k '{U+0915}'
stringdef K '{U+0916}'
stringdef g '{U+0917}'
stringdef G '{U+0918}'
stringdef f '{U+0919}'

// Palatal consonants:
stringdef c '{U+091A}'
stringdef C '{U+091B}'
stringdef j '{U+091C}'
stringdef J '{U+091D}'
stringdef F '{U+091E}'

// Retroflex consonants:
stringdef t '{U+091F}'
stringdef T '{U+0920}'
stringdef d '{U+0921}'
stringdef D '{U+0922}'
stringdef N '{U+0923}'

// Dental consonants:
stringdef w '{U+0924}'
stringdef W '{U+0925}'
stringdef x '{U+0926}'
stringdef X '{U+0927}'
stringdef n '{U+0928}'

// Labial consonants:
stringdef p '{U+092A}'
stringdef P '{U+092B}'
stringdef b '{U+092C}'
stringdef B '{U+092D}'
stringdef m '{U+092E}'

// Semi-vowels:
stringdef y '{U+092F}'
stringdef r '{U+0930}'
stringdef l '{U+0932}'
stringdef v '{U+0935}'

// Fricatives:
stringdef S '{U+0936}'
stringdef R '{U+0937}'
stringdef s '{U+0938}'
stringdef h '{U+0939}'

stringdef lY '{U+0933}'

// Precomposed characters - letters + nukta:
stringdef nZ '{U+0929}' // ≡ {n}{Z}
stringdef rZ '{U+0931}' // ≡ {r}{Z}
stringdef lYZ '{U+0934}' // ≡ {lY}{Z}
stringdef kZ '{U+0958}' // ≡ {k}{Z}
stringdef KZ '{U+0959}' // ≡ {K}{Z}
stringdef gZ '{U+095A}' // ≡ {g}{Z}
stringdef jZ '{U+095B}' // ≡ {j}{Z}
stringdef dZ '{U+095C}' // ≡ {d}{Z}
stringdef DZ '{U+095D}' // ≡ {D}{Z}
stringdef PZ '{U+095E}' // ≡ {P}{Z}
stringdef yZ '{U+095F}' // ≡ {y}{Z}

integers ( p )

groupings ( consonant )

routines ( CONSONANT )

// Grouping of all Devanagari consonants, including the nukta diacritic and
// the precomposed letter+nukta forms.  Used (via CONSONANT below) to test
// whether a suffix is preceded by a consonant carrying an implicit "a".
define consonant '{k}{K}{g}{G}{f}' +
'{c}{C}{j}{J}{F}' +
'{t}{T}{d}{D}{N}' +
'{w}{W}{x}{X}{n}' +
'{p}{P}{b}{B}{m}' +
'{y}{r}{l}{v}' +
'{S}{R}{s}{h}' +
'{lY}' +
'{Z}' + // Nukta
// Precomposed characters - letter and nukta:
'{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}'

// Backward-mode test: succeeds when the character before the cursor is in
// the consonant grouping above.
backwardmode ( define CONSONANT as ( consonant ) )

// Entry point: remove the longest matching suffix from the list derived
// from figure 3 of the paper, always leaving at least one character.
define stem as (
test ( next setmark p ) // p marks the position after the first character
backwards (
// We assume in this implementation that the whole word doesn't count
// as a valid suffix to remove, so we remove the longest suffix from
// the list which leaves at least one character. This change affects
// 47 words out of the 65,140 in the sample vocabulary from Hindi
// wikipedia.
setlimit tomark p for ([substring])
among (
// The list below is derived from figure 3 in the paper.
//
// We perform the stemming on the Devanagari characters rather than
// transliterating to Latin, so we have adapted the list below to
// reflect this by converting suffixes back to Devanagari as
// follows:
//
// * within the suffixes, "a" after a consonant is dropped since
// consonants have an implicit "a".
//
// * within the suffixes, a vowel other than "a" after a consonant
// is a dependent vowel (vowel sign); a vowel (including "a")
// after a non-consonant is an independent vowel.
//
// * to allow the vowel at the start of each suffix being dependent
// or independent, we include each suffix twice. For the
// dependent version, a leading "a" is dropped and we check that
// the suffix is preceded by a consonant (which will have an
// implicit "a").
//
// * we add '{a}', which is needed for the example given right at
// the end of section 5 to work (conflating BarawIya and
// BarawIyawA), and which 3.1 a.v strongly suggests should be in
// the list:
//
// Thus, the following suffix deletions (longest possible
// match) are required to reduce inflected forms of masculine
// nouns to a common stem:
// a A i [...]
//
// Adding '{a}' only affect 2 words out of the 65,140 in the
// sample vocabulary.
//
// * The transliterations of our stems would end with "a" when our
// stems end in a consonant, so we also include {virama} in the
// list of suffixes to remove (this affects 222 words from the
// sample vocabulary).
//
// We've also assumed that Mh in the suffix list always means {Mh}
// and never {M}{h}{virama}. Only one of the 65,140 words in the
// sample vocabulary stems differently due to this (and that word
// seems to be a typo).

'{virama}'

// Suffixes starting with an independent vowel:
'{a}'
'{A}'
'{i}'
'{I}'
'{u}'
'{U}'
'{e}'
'{o}'
'{e}{M}'
'{o}{M}'
'{A}{M}'
'{u}{A}{M}'
'{u}{e}{M}'
'{u}{o}{M}'
'{A}{e}{M}'
'{A}{o}{M}'
'{i}{y}{_A}{M}'
'{i}{y}{_o}{M}'
'{A}{i}{y}{_A}{M}'
'{A}{i}{y}{_o}{M}'
'{A}{Mh}'
'{i}{y}{_A}{Mh}'
'{A}{i}{y}{_A}{Mh}'
'{a}{w}{_A}{e}{M}'
'{a}{w}{_A}{o}{M}'
'{a}{n}{_A}{e}{M}'
'{a}{n}{_A}{o}{M}'
'{a}{w}{_A}'
'{a}{w}{_I}'
'{I}{M}'
'{a}{w}{_I}{M}'
'{a}{w}{_e}'
'{A}{w}{_A}'
'{A}{w}{_I}'
'{A}{w}{_I}{M}'
'{A}{w}{_e}'
'{a}{n}{_A}'
'{a}{n}{_I}'
'{a}{n}{_e}'
'{A}{n}{_A}'
'{A}{n}{_e}'
'{U}{M}{g}{_A}'
'{U}{M}{g}{_I}'
'{A}{U}{M}{g}{_A}'
'{A}{U}{M}{g}{_I}'
'{e}{M}{g}{_e}'
'{e}{M}{g}{_I}'
'{A}{e}{M}{g}{_e}'
'{A}{e}{M}{g}{_I}'
'{o}{g}{_e}'
'{o}{g}{_I}'
'{A}{o}{g}{_e}'
'{A}{o}{g}{_I}'
'{e}{g}{_A}'
'{e}{g}{_I}'
'{A}{e}{g}{_A}'
'{A}{e}{g}{_I}'
'{A}{y}{_A}'
'{A}{e}'
'{A}{I}'
'{A}{I}{M}'
'{i}{e}'
'{A}{o}'
'{A}{i}{e}'
'{a}{k}{r}'
'{A}{k}{r}'

// The same suffixes with the leading vowel as a dependent vowel sign:
'{_A}'
'{_i}'
'{_I}'
'{_u}'
'{_U}'
'{_e}'
'{_o}'
'{_e}{M}'
'{_o}{M}'
'{_A}{M}'
'{_u}{A}{M}'
'{_u}{e}{M}'
'{_u}{o}{M}'
'{_A}{e}{M}'
'{_A}{o}{M}'
'{_i}{y}{_A}{M}'
'{_i}{y}{_o}{M}'
'{_A}{i}{y}{_A}{M}'
'{_A}{i}{y}{_o}{M}'
'{_A}{Mh}'
'{_i}{y}{_A}{Mh}'
'{_A}{i}{y}{_A}{Mh}'
'{_I}{M}'
'{_A}{w}{_A}'
'{_A}{w}{_I}'
'{_A}{w}{_I}{M}'
'{_A}{w}{_e}'
'{_A}{n}{_A}'
'{_A}{n}{_e}'
'{_U}{M}{g}{_A}'
'{_U}{M}{g}{_I}'
'{_A}{U}{M}{g}{_A}'
'{_A}{U}{M}{g}{_I}'
'{_e}{M}{g}{_e}'
'{_e}{M}{g}{_I}'
'{_A}{e}{M}{g}{_e}'
'{_A}{e}{M}{g}{_I}'
'{_o}{g}{_e}'
'{_o}{g}{_I}'
'{_A}{o}{g}{_e}'
'{_A}{o}{g}{_I}'
'{_e}{g}{_A}'
'{_e}{g}{_I}'
'{_A}{e}{g}{_A}'
'{_A}{e}{g}{_I}'
'{_A}{y}{_A}'
'{_A}{e}'
'{_A}{I}'
'{_A}{I}{M}'
'{_i}{e}'
'{_A}{o}'
'{_A}{i}{e}'
'{_A}{k}{r}'

/* Suffixes with a leading implicit a: */
'{w}{_A}{e}{M}' CONSONANT
'{w}{_A}{o}{M}' CONSONANT
'{n}{_A}{e}{M}' CONSONANT
'{n}{_A}{o}{M}' CONSONANT
'{w}{_A}' CONSONANT
'{w}{_I}' CONSONANT
'{w}{_I}{M}' CONSONANT
'{w}{_e}' CONSONANT
'{n}{_A}' CONSONANT
'{n}{_I}' CONSONANT
'{n}{_e}' CONSONANT
'{k}{r}' CONSONANT
)
delete
)
)

contrib/snowball/algorithms/hungarian/stem_Unicode.sbl → contrib/snowball/algorithms/hungarian.sbl View File

@@ -27,17 +27,17 @@ groupings ( v )

stringescapes {}

/* special characters (in Unicode) */
stringdef a' hex 'E1' //a-acute
stringdef e' hex 'E9' //e-acute
stringdef i' hex 'ED' //i-acute
stringdef o' hex 'F3' //o-acute
stringdef o" hex 'F6' //o-umlaut
stringdef oq hex '151' //o-double acute
stringdef u' hex 'FA' //u-acute
stringdef u" hex 'FC' //u-umlaut
stringdef uq hex '171' //u-double acute
/* special characters */
stringdef a' '{U+00E1}' //a-acute
stringdef e' '{U+00E9}' //e-acute
stringdef i' '{U+00ED}' //i-acute
stringdef o' '{U+00F3}' //o-acute
stringdef o" '{U+00F6}' //o-umlaut
stringdef oq '{U+0151}' //o-double acute
stringdef u' '{U+00FA}' //u-acute
stringdef u" '{U+00FC}' //u-umlaut
stringdef uq '{U+0171}' //u-double acute

define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'


+ 0
- 241
contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl View File

@@ -1,241 +0,0 @@
/*
Hungarian Stemmer
Removes noun inflections
*/

routines (
mark_regions
R1
v_ending
case
case_special
case_other
plural
owned
sing_owner
plur_owner
instrum
factive
undouble
double
)

externals ( stem )

integers ( p1 )
groupings ( v )

stringescapes {}

/* special characters (in ISO Latin 2) */

stringdef a' hex 'E1' //a-acute
stringdef e' hex 'E9' //e-acute
stringdef i' hex 'ED' //i-acute
stringdef o' hex 'F3' //o-acute
stringdef o" hex 'F6' //o-umlaut
stringdef oq hex 'F5' //o-double acute
stringdef u' hex 'FA' //u-acute
stringdef u" hex 'FC' //u-umlaut
stringdef uq hex 'FB' //u-double acute

// Hungarian vowels, including the accented forms defined above.
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'

// Set p1, the start of region R1.  If the word begins with a vowel, R1
// starts after the first consonant (treating the listed digraphs as a
// single consonant); otherwise it starts after the first vowel.
define mark_regions as (

$p1 = limit

(v goto non-v
among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
setmark p1)
or

(non-v gopast v setmark p1)
)

backwardmode (

// True when the cursor lies inside region R1.
define R1 as $p1 <= cursor

// Normalise a trailing accented vowel left by suffix removal.
define v_ending as (
[substring] R1 among(
'{a'}' (<- 'a')
'{e'}' (<- 'e')
)
)

// Test (without moving the cursor) for a doubled consonant, including
// doubled digraphs such as 'ccs' and 'ssz'.
define double as (
test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
)

// Remove one half of a doubled consonant (skip one character, then
// delete the next one).
define undouble as (
next [hop 1] delete
)

// Remove the instrumental case ending ('val'/'vel' assimilated to a
// doubled consonant + 'al'/'el'), then undouble that consonant.
define instrum as(
[substring] R1 among(
'al' (double)
'el' (double)
)
delete
undouble
)


// Remove general case endings, then normalise any accented vowel left
// at the end of the stem.
define case as (
[substring] R1 among(
'ban' 'ben'
'ba' 'be'
'ra' 're'
'nak' 'nek'
'val' 'vel'
't{o'}l' 't{oq}l'
'r{o'}l' 'r{oq}l'
'b{o'}l' 'b{oq}l'
'hoz' 'hez' 'h{o"}z'
'n{a'}l' 'n{e'}l'
'ig'
'at' 'et' 'ot' '{o"}t'
'{e'}rt'
'k{e'}pp' 'k{e'}ppen'
'kor'
'ul' '{u"}l'
'v{a'}' 'v{e'}'
'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
'k{e'}nt'
'en' 'on' 'an' '{o"}n'
'n'
't'
)
delete
v_ending
)

// Case endings whose removal leaves an accented vowel that must be
// rewritten rather than deleted.
define case_special as(
[substring] R1 among(
'{e'}n' (<- 'e')
'{a'}n' (<- 'a')
'{a'}nk{e'}nt' (<- 'a')
)
)

// Remove the '-stul'/'-stül' ("together with") family of endings.
define case_other as(
[substring] R1 among(
'astul' 'est{u"}l' (delete)
'stul' 'st{u"}l' (delete)
'{a'}stul' (<- 'a')
'{e'}st{u"}l' (<- 'e')
)
)

// Remove the factive ending (accented final vowel after a doubled
// consonant), then undouble the consonant.
define factive as(
[substring] R1 among(
'{a'}' (double)
'{e'}' (double)
)
delete
undouble
)

// Remove plural endings.
define plural as (
[substring] R1 among(
'{a'}k' (<- 'a')
'{e'}k' (<- 'e')
'{o"}k' (delete)
'ak' (delete)
'ok' (delete)
'ek' (delete)
'k' (delete)
)
)

// Remove "owned" (possessed object) endings.
define owned as (
[substring] R1 among (
'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
'{e'}k{e'}' (<- 'e')
'{a'}k{e'}' (<- 'a')
'k{e'}' (delete)
'{e'}{e'}i' (<- 'e')
'{a'}{e'}i' (<- 'a')
'{e'}i' (delete)
'{e'}{e'}' (<- 'e')
'{e'}' (delete)
)
)

// Remove singular-owner possessive endings.
define sing_owner as (
[substring] R1 among(
'{u"}nk' 'unk' (delete)
'{a'}nk' (<- 'a')
'{e'}nk' (<- 'e')
'nk' (delete)
'{a'}juk' (<- 'a')
'{e'}j{u"}k' (<- 'e')
'juk' 'j{u"}k' (delete)
'uk' '{u"}k' (delete)
'em' 'om' 'am' (delete)
'{a'}m' (<- 'a')
'{e'}m' (<- 'e')
'm' (delete)
'od' 'ed' 'ad' '{o"}d' (delete)
'{a'}d' (<- 'a')
'{e'}d' (<- 'e')
'd' (delete)
'ja' 'je' (delete)
'a' 'e' 'o' (delete)
'{a'}' (<- 'a')
'{e'}' (<- 'e')
)
)

// Remove plural-owner possessive endings.
define plur_owner as (
[substring] R1 among(
'jaim' 'jeim' (delete)
'{a'}im' (<- 'a')
'{e'}im' (<- 'e')
'aim' 'eim' (delete)
'im' (delete)
'jaid' 'jeid' (delete)
'{a'}id' (<- 'a')
'{e'}id' (<- 'e')
'aid' 'eid' (delete)
'id' (delete)
'jai' 'jei' (delete)
'{a'}i' (<- 'a')
'{e'}i' (<- 'e')
'ai' 'ei' (delete)
'i' (delete)
'jaink' 'jeink' (delete)
'eink' 'aink' (delete)
'{a'}ink' (<- 'a')
'{e'}ink' (<- 'e')
'ink'
'jaitok' 'jeitek' (delete)
'aitok' 'eitek' (delete)
'{a'}itok' (<- 'a')
'{e'}itek' (<- 'e')
'itek' (delete)
'jeik' 'jaik' (delete)
'aik' 'eik' (delete)
'{a'}ik' (<- 'a')
'{e'}ik' (<- 'e')
'ik' (delete)
)
)
)

// Entry point: mark R1, then strip inflectional endings working from the
// end of the word — case endings first, then possessives, then the plural.
define stem as (
do mark_regions
backwards (
do instrum
do case
do case_special
do case_other
do factive
do owned
do sing_owner
do plur_owner
do plural
)
)

+ 192
- 0
contrib/snowball/algorithms/indonesian.sbl View File

@@ -0,0 +1,192 @@
// An implementation of the "Porter Stemmer for Bahasa Indonesia" from:
// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf

integers (
// The paper defines measure as the number of vowels in the word. We
// count this initially, then adjust the count each time we remove a
// prefix or suffix.
measure

// Numeric code for the type of prefix removed:
//
// 0 other/none
// 1 'di' or 'meng' or 'ter'
// 2 'per'
// 3 'ke' or 'peng'
// 4 'ber'
//
// Some of these have variant forms, so e.g. "meng" includes "men", "me",
// "meny", "mem".
//
// Note that the value of prefix is only used in remove_suffix (and
// routines it calls) so we don't need to worry about
// remove_second_order_prefix overwriting a value of prefix set by
// remove_first_order_prefix since remove_suffix gets called between
// the two.
prefix
)

groupings ( vowel )

routines (
remove_particle
remove_possessive_pronoun
remove_first_order_prefix
remove_second_order_prefix
remove_suffix
KER
SUFFIX_KAN_OK
SUFFIX_AN_OK
SUFFIX_I_OK
VOWEL
)

externals ( stem )

stringescapes {}

backwardmode (

// Remove an inflectional particle suffix; each removal decrements the
// syllable-count measure.
define remove_particle as (
[substring] among (
'kah' 'lah' 'pun' (delete $measure-=1)
)
)

// Remove a possessive pronoun suffix.
define remove_possessive_pronoun as (
[substring] among (
'ku' 'mu' 'nya' (delete $measure-=1)
)
)

// prefix not in {ke, peng, per}
define SUFFIX_KAN_OK as (
// On page 29, the example "kompas Q.31" says "Both Nazief and Porter
// stemmer converted the word peledakan (blast, explotion) to ledak (to
// blast, to explode)". However, the algorithm as described doesn't
// behave in this way - grammatically the prefix pe- occurs as a
// variation of both the first-order derivational prefix peng- and the
// second-order derivational prefix per-, but table 2.5 doesn't include
// "pe", only table 2.6 does, so "peledakan" is handled (incorrectly)
// as having prefix "per" not "peng", and so we remove derivational
// suffix "kan" rather than "an" to give stem leda. (Porter-style
// stemmers remove the longest suffix they can amongst those available,
// which this paper notes in the last paragraph on page 15).
//
// We resolve this by amending the condition on suffix "kan" to
// "prefix ∉ {ke, peng, per}", which seems to make the stemmer's
// behaviour match all the examples in the paper except for one:
// "perbaikan" is shown in table 3.4 as stemming to "bai", but with
// this change it now stems to "baik". The table notes that "baik" is
// the actual root so this deviation is an improvement. In a sample
// vocabulary derived from the most common words in id.wikipedia.org,
// this change only affects 0.12% of words (76 out of 64,587, including
// "peledakan" and "perbaikan").
$prefix != 3 and $prefix != 2
)

// prefix not in {di, meng, ter}
define SUFFIX_AN_OK as ( $prefix != 1 )

define SUFFIX_I_OK as (
// prefix not in {ke, peng, ber}
$prefix <= 2

// The rest of the condition from the paper is:
// V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i
//
// The meaning of this is unclear in several ways, and none of the
// examples given of the stemmer's behaviour in the paper help to
// resolve these issues.
//
// Notice that c₂ isn't actually used - the most obvious explanation
// seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁".
//
// Elsewhere the paper defines V... as meaning "the stem starts with
// a vowel" and K... as meaning "the stem starts with a consonant".
//
// In other places where it says X|Y... it seems the | binds more
// tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit
// odd as the first letter must be either a vowel or a consonant, so
// that really just means "ends cᵢcⱼ". However, nowhere in the paper
// uses or defines a notation such as ...X, which may explain this
// seemingly redundant way of specifying this.
//
// The conditions elsewhere on prefix removal (e.g. V...) are clearly
// on the stem left after the prefix is removed. None of the other
// rules for suffix removal have conditions on the stem, but for
// consistency with the prefix rules we might expect that the cᵢcⱼ
// test is on what's left *after* removing the "i" suffix.
//
// However, studying Indonesian wordlists and discussion with a native
// speaker leads us to conclude that the purpose of this check is to
// protect words of foreign origin (e.g. "televisi", "organisasi",
// "komunikasi") from stemming, and the common feature of these is
// that the word ends "-si", so we conclude that the condition here
// should be read as "word does not end -si", and this is what we
// have implemented.
not 's'
)

// Remove a derivational suffix, guarded by the prefix-compatibility
// conditions above.
define remove_suffix as (
[substring] among (
'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK
(delete $measure-=1)
)
)
)

// The Indonesian vowels.
define vowel 'aeiou'

// Test (forward mode): next character is a vowel.
define VOWEL as ( vowel )

// Test: a consonant followed by 'er' (used for the 'be-' prefix rule).
define KER as ( non-vowel 'er' )

// Remove a first-order derivational prefix, recording which prefix group
// was removed in $prefix and decrementing the measure.  'meny'/'peny'
// before a vowel restore the elided 's'; 'mem'/'pem' before a vowel
// restore the elided 'p'.
define remove_first_order_prefix as (
[substring] among (
'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1)
'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1)
'meny' VOWEL ($prefix=1 <-'s' $measure-=1)
'peny' VOWEL ($prefix=3 <-'s' $measure-=1)
'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete)
'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete)
)
)

// Remove a second-order derivational prefix ('per-', 'pe-', 'ber-', and
// the 'bel-'/'pel-' + "ajar" special cases).
define remove_second_order_prefix as (
// The paper has the condition on removal of prefix "bel" and "pel" as
// just "ajar" not "ajar..." but it seems that the latter must be what
// is intended so that e.g. "pelajaran" stems to "ajar" not "lajar".
// This change only affects a very small number of words (11 out of
// 64,587) and only for the better.
[substring] among (
'per' 'pe' (delete $prefix=2 $measure-=1)
'pelajar' (<-'ajar' $measure-=1)
'ber' (delete $prefix=4 $measure-=1)
'belajar' (<-'ajar' $prefix=4 $measure-=1)
'be' KER (delete $prefix=4 $measure-=1)
)
)

// Entry point.  $measure counts the vowels in the word; every removal
// step is gated on $measure > 2, so short words are left unstemmed.
define stem as (
$measure = 0
do ( repeat ( gopast vowel $measure+=1 ) )
$measure > 2
$prefix = 0
backwards (
do remove_particle
$measure > 2
do remove_possessive_pronoun
)
$measure > 2
// Try first-order prefix removal followed by suffix and second-order
// prefix removal; if no first-order prefix matched, fall back to
// second-order prefix then suffix removal.
test (
remove_first_order_prefix
do (
test ($measure > 2 backwards remove_suffix)
$measure > 2 remove_second_order_prefix
)
) or (
do remove_second_order_prefix
do ($measure > 2 backwards remove_suffix)
)
)

+ 151
- 0
contrib/snowball/algorithms/irish.sbl View File

@@ -0,0 +1,151 @@
routines (
R1 R2 RV
initial_morph
mark_regions
noun_sfx
deriv
verb_sfx
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

stringescapes {}

/* Accented characters */

stringdef a' '{U+00E1}' // a-acute
stringdef e' '{U+00E9}' // e-acute
stringdef i' '{U+00ED}' // i-acute
stringdef o' '{U+00F3}' // o-acute
stringdef u' '{U+00FA}' // u-acute

// Irish vowels, plain and acute-accented.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}'

// Set pV (position of the first vowel), p1 and p2 (the standard R1/R2
// region boundaries: after the first and second non-vowel that follows
// a vowel).
define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
gopast v setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

// Strip initial mutations (prefixed h-/n-/t-, contractions, eclipsis and
// lenition) from the start of the word, restoring the radical initial
// consonant where one was mutated.
define initial_morph as (
[substring] among (
'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic
(delete)

// verbs
'd{'}'
(delete)
'd{'}fh'
(<- 'f')
// other contractions
'm{'}' 'b{'}'
(delete)

'sh'
(<- 's')

// Eclipsis: the eclipsing consonant is dropped, keeping the radical.
'mb'
(<- 'b')
'gc'
(<- 'c')
'nd'
(<- 'd')
'bhf'
(<- 'f')
'ng'
(<- 'g')
'bp'
(<- 'p')
'ts'
(<- 's')
'dt'
(<- 't')

// Lenition
'bh'
(<- 'b')
'ch'
(<- 'c')
'dh'
(<- 'd')
'fh'
(<- 'f')
'gh'
(<- 'g')
'mh'
(<- 'm')
'ph'
(<- 'p')
'th'
(<- 't')
)
)

backwardmode (

// Region tests: cursor inside RV / R1 / R2.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

// Remove noun-forming suffixes.
define noun_sfx as (
[substring] among (
'amh' 'eamh' 'abh' 'eabh'
'aibh' 'ibh' 'aimh' 'imh'
'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta'
(R1 delete)
'ire' 'ir{i'}' 'aire' 'air{i'}'
(R2 delete)
)
)
// Remove or rewrite derivational suffixes.
define deriv as (
[substring] among (
'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta'
(R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl
'arcacht' 'arcachta{i'}' 'arcachta'
(<- 'arc') // monarcacht -> monarc
'gineach' 'gineas' 'ginis'
(<- 'gin')
'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}'
(<- 'graf')
'paite' 'patach' 'pataigh' 'patacha'
(<- 'paite')
'{o'}ideach' '{o'}ideacha' '{o'}idigh'
(<- '{o'}id')
)
)
// Remove verb inflection suffixes.
define verb_sfx as (
[substring] among (
'imid' 'aimid' '{i'}mid' 'a{i'}mid'
'faidh' 'fidh'
(RV delete)
'ain'
'eadh' 'adh'
'{a'}il'
'tear' 'tar'
(R1 delete)
)
)
)

// Entry point: undo initial mutations, mark regions, then strip noun,
// derivational and verb suffixes from the end of the word.
define stem as (
do initial_morph
do mark_regions
backwards (
do noun_sfx
do deriv
do verb_sfx
)
)

contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl → contrib/snowball/algorithms/italian.sbl View File

@@ -16,18 +16,18 @@ groupings ( v AEIO CG )

stringescapes {}

/* special characters (in MS-DOS Latin I) */
stringdef a' hex 'A0'
stringdef a` hex '85'
stringdef e' hex '82'
stringdef e` hex '8A'
stringdef i' hex 'A1'
stringdef i` hex '8D'
stringdef o' hex 'A2'
stringdef o` hex '95'
stringdef u' hex 'A3'
stringdef u` hex '97'
/* special characters */
stringdef a' '{U+00E1}'
stringdef a` '{U+00E0}'
stringdef e' '{U+00E9}'
stringdef e` '{U+00E8}'
stringdef i' '{U+00ED}'
stringdef i` '{U+00EC}'
stringdef o' '{U+00F3}'
stringdef o` '{U+00F2}'
stringdef u' '{U+00FA}'
stringdef u` '{U+00F9}'

define v 'aeiou{a`}{e`}{i`}{o`}{u`}'


+ 0
- 195
contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl View File

@@ -1,195 +0,0 @@

routines (
prelude postlude mark_regions
RV R1 R2
attached_pronoun
standard_suffix
verb_suffix
vowel_suffix
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v AEIO CG )

stringescapes {}

/* special characters (in ISO Latin I) */

stringdef a' hex 'E1'
stringdef a` hex 'E0'
stringdef e' hex 'E9'
stringdef e` hex 'E8'
stringdef i' hex 'ED'
stringdef i` hex 'EC'
stringdef o' hex 'F3'
stringdef o` hex 'F2'
stringdef u' hex 'FA'
stringdef u` hex 'F9'

// Italian vowels, plain and grave-accented.
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'

// Normalise the word before stemming: rewrite acute accents as grave,
// mark 'u' after 'q' and intervocalic 'u'/'i' as upper case so they are
// treated as consonants (postlude undoes the case change).
define prelude as (
test repeat (
[substring] among(
'{a'}' (<- '{a`}')
'{e'}' (<- '{e`}')
'{i'}' (<- '{i`}')
'{o'}' (<- '{o`}')
'{u'}' (<- '{u`}')
'qu' (<- 'qU')
'' (next)
)
)
repeat goto (
v [ ('u' ] v <- 'U') or
('i' ] v <- 'I')
)
)

// Set pV (start of the RV region) and p1/p2 (the standard R1/R2 region
// boundaries: after the first and second non-vowel following a vowel).
define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

// Undo the prelude's case marking: turn 'I'/'U' back into lower case.
define postlude as repeat (

[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'' (next)
)

)

backwardmode (

// Region tests: cursor inside RV / R1 / R2.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

// Remove an attached (enclitic) pronoun when it follows a gerund or
// infinitive ending inside RV; infinitive endings are rewritten to 'e'.
define attached_pronoun as (
[substring] among(
'ci' 'gli' 'la' 'le' 'li' 'lo'
'mi' 'ne' 'si' 'ti' 'vi'
// the compound forms are:
'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
'mela' 'mele' 'meli' 'melo' 'mene'
'tela' 'tele' 'teli' 'telo' 'tene'
'cela' 'cele' 'celi' 'celo' 'cene'
'vela' 'vele' 'veli' 'velo' 'vene'
)
among( (RV)
'ando' 'endo' (delete)
'ar' 'er' 'ir' (<- 'e')
)
)

// Remove standard (derivational) suffixes, with follow-up removal of
// embedded 'ic'/'at'/'iv'/'os'/'abil' where applicable.
define standard_suffix as (
[substring] among(

'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
'atrice' 'atrici'
'ante' 'anti' // Note 1
( R2 delete )
'azione' 'azioni' 'atore' 'atori'
( R2 delete
try ( ['ic'] R2 delete )
)
'logia' 'logie'
( R2 <- 'log' )
'uzione' 'uzioni' 'usione' 'usioni'
( R2 <- 'u' )
'enza' 'enze'
( R2 <- 'ente' )
'amento' 'amenti' 'imento' 'imenti'
( RV delete )
'amente' (
R1 delete
try (
[substring] R2 delete among(
'iv' ( ['at'] R2 delete )
'os' 'ic' 'abil'
)
)
)
'it{a`}' (
R2 delete
try (
[substring] among(
'abil' 'ic' 'iv' (R2 delete)
)
)
)
'ivo' 'ivi' 'iva' 'ive' (
R2 delete
try ( ['at'] R2 delete ['ic'] R2 delete )
)
)
)

// Remove verb inflection suffixes; the search is limited to RV.
define verb_suffix as setlimit tomark pV for (
[substring] among(
'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
'ono' 'uta' 'ute' 'uti' 'uto'

'ar' 'ir' // but 'er' is problematical
(delete)
)
)

define AEIO 'aeio{a`}{e`}{i`}{o`}'
define CG 'cg'

// Remove a final vowel (and a following 'i'), and a final 'h' after
// 'c'/'g', all within RV.
define vowel_suffix as (
try (
[AEIO] RV delete
['i'] RV delete
)
try (
['h'] CG RV delete
)
)
)

// Entry point: normalise, mark regions, strip pronouns then suffixes
// (standard suffixes first, falling back to verb suffixes), and restore
// the case changes made by the prelude.
define stem as (
do prelude
do mark_regions
backwards (
do attached_pronoun
do (standard_suffix or verb_suffix)
do vowel_suffix
)
do postlude
)

/*
Note 1: additions of 15 Jun 2005
*/


contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/kraaij_pohlmann.sbl View File

@@ -1,5 +1,5 @@
strings ( ch )
integers ( x p1 p2 )
integers ( p1 p2 )
booleans ( Y_found stemmed GE_removed )

routines (
@@ -20,8 +20,6 @@ groupings ( v v_WX AOU AIOU )

stringescapes {}

stringdef ' hex '27' // yuk

define v 'aeiouy'
define v_WX v + 'wx'
define AOU 'aou'
@@ -29,8 +27,8 @@ define AIOU 'aiou'

backwardmode (

define R1 as (setmark x $x >= p1)
define R2 as (setmark x $x >= p2)
define R1 as ($p1 <= cursor)
define R2 as ($p2 <= cursor)

define V as test (v or 'ij')
define VX as test (next v or 'ij')
@@ -46,7 +44,7 @@ backwardmode (

define Step_1 as
(
[among ( (])
[substring] among (

'{'}s' (delete)
's' (R1 not ('t' R1) C delete)
@@ -68,7 +66,7 @@ backwardmode (

define Step_2 as
(
[among ( (])
[substring] among (
'je' (('{'}t' ] delete) or
('et' ] R1 C delete) or
('rnt' ] <-'rn') or
@@ -92,7 +90,7 @@ backwardmode (

define Step_3 as
(
[among ( (])
[substring] among (
'atie' (R1 <-'eer')
'iteit' (R1 delete lengthen_V)
'heid'
@@ -112,7 +110,7 @@ backwardmode (

define Step_4 as
(
( [among ( (])
( [substring] among (
'ioneel' (R1 <-'ie')
'atief' (R1 <-'eer')
'baar' (R1 delete)
@@ -132,7 +130,7 @@ backwardmode (
)
)
or
( [among ( (])
( [substring] among (
'iger'
'igst'
'ig' (R1 C delete lengthen_V)
@@ -142,7 +140,7 @@ backwardmode (

define Step_7 as
(
[among ( (])
[substring] among (
'kt' (<-'k')
'ft' (<-'f')
'pt' (<-'p')
@@ -151,7 +149,7 @@ backwardmode (

define Step_6 as
(
[among ( (])
[substring] among (
'bb' (<-'b')
'cc' (<-'c')
'dd' (<-'d')
@@ -179,7 +177,7 @@ backwardmode (

define Step_1c as
(
[among ( (] R1 C)
[substring] among ( (R1 C)
'd' (not ('n' R1) delete)
't' (not ('h' R1) delete)
)
@@ -200,11 +198,8 @@ define Lose_infix as (
)

define measure as (
do (
tolimit
setmark p1
setmark p2
)
$p1 = limit
$p2 = limit
do(
repeat non-v atleast 1 ('ij' or v) non-v setmark p1
repeat non-v atleast 1 ('ij' or v) non-v setmark p2

+ 373
- 0
contrib/snowball/algorithms/lithuanian.sbl View File

@@ -0,0 +1,373 @@
externals ( stem )

// escape symbols for substituting lithuanian characters
stringescapes { }

/* Special characters in Unicode Latin Extended-A */
// ' nosine
stringdef a' '{U+0105}' // ą a + ogonek
stringdef e' '{U+0119}' // ę e + ogonek
stringdef i' '{U+012F}' // į i + ogonek
stringdef u' '{U+0173}' // ų u + ogonek

// . taskas
stringdef e. '{U+0117}' // ė e + dot

// - ilgoji
stringdef u- '{U+016B}' // ū u + macron

// * varnele
stringdef c* '{U+010D}' // č c + caron (haček)
stringdef s* '{U+0161}' // š s + caron (haček)
stringdef z* '{U+017E}' // ž z + caron (haček)

// [C](VC)^m[V|C]
// definitions of variables for
// p1 - position of m = 0
integers ( p1 )

// groupings
// v - lithuanian vowels
groupings ( v )

// v - all Lithuanian vowels, plain and accented
define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}'

// all Lithuanian stemmer routines: 4 steps
routines (
step2 R1 step1 fix_chdz fix_gd fix_conflicts
)

backwardmode (

define R1 as $p1 <= cursor
define step1 as (
setlimit tomark p1 for ([substring]) R1 among (
// Daiktavardžiai (Nouns)
// I linksniuotė (declension I)
'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys
'o' 'io' // vyro, kelio
'ui' 'iui' // vyrui, keliui
'{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį
'u' 'iu' // vyru, keliu
'e' 'yje' // vyre, kelyje
'y' 'au' 'i' // kely, brolau, broli,
'an' // nusižengiman

'ai' 'iai' // vyrai, keliai
'{u'}' 'i{u'}' // vyrų, kelių
'ams' 'am' // vyrams, vyram
'iams' 'iam' // broliams, broliam
'us' 'ius' // vyrus, brolius
'ais' 'iais' // vyrais, keliais
'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos
'uosna' 'iuosna' // vyruosna, keliuosna
'ysna' // žutysna

'asis' 'aisi' // sukimasis, sukimaisi
'osi' '{u'}si' // sukimosi, sukimųsi
'uisi' // sukimuisi
'{a'}si' // sukimąsi
'usi' // sukimusi
'esi' // sukimesi

'uo' // mėnuo


// II linksniuote (declension II)
'a' 'ia' // galva, vysnios
'os' 'ios' // galvos, vysnios
'oj' 'oje' 'ioje' // galvoje, vysnioje
'osna' 'iosna' // galvosna, vyšniosna
'om' 'oms' 'ioms' // galvoms, vysnioms
'omis' 'iomis' // galvomis, vysniomis
'ose' 'iose' // galvose, vysniose
'on' 'ion' // galvon, vyšnion


// III linksniuote (declension III)
'{e.}' // gervė
'{e.}s' // gervės
'ei' // gervei
'{e'}' // gervę
'{e.}j' '{e.}je' // gervėj, gervėje
'{e.}ms' // gervėms
'es' // gerves
'{e.}mis' // gervėmis
'{e.}se' // gervėse
'{e.}sna' // gervėsna
'{e.}n' // žydaitėn


// IV linksniuote (declension IV)
'aus' 'iaus' // sūnaus, skaičiaus
'umi' 'iumi' // sūnumi, skaičiumi
'uje' 'iuje' // sūnuje, skaičiuje
'iau' // skaičiau

'{u-}s' // sūnūs
'ums' // sūnums
'umis' // sūnumis
'un' 'iun' // sūnun, administratoriun


// V linksniuote (declension V)
'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers
'eniui' 'eriai' // vandeniui, eriai
'en{i'}' 'er{i'}' // vandenį, seserį
'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria
'enyje' 'eryje' // vandenyje, seseryje
'ie' 'enie' 'erie' // avie, vandenie, seserie

'enys' 'erys' // vandenys, seserys
// 'en{u'}' konfliktas su 'žandenų' 'antenų'
'er{u'}' // seserų
'ims' 'enims' 'erims' // avims, vandemins, seserims
'enis' // vandenis
'imis' // žebenkštimis
'enimis' // vandenimis
'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse


// Būdvardžiai (Adjectives)
// (i)a linksniuotė
'iem' 'iems' // geriem, geriems
'ame' 'iame' // naujame, mediniame


// Veiksmažodžiai (Verbs)
// Tiesioginė nuosaka (indicative mood)
// esamasis laikas (present tense)
// (i)a asmenuotė (declension (i)a)
'uosi' 'iuosi' // dirbuosi, traukiuosi
'iesi' // dirbiesi
'asi' 'iasi' // dirbasi, traukiasi
'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės
'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate
'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės

// i asmenuotė (declension i)
'isi' // tikisi
'im' // mylim
// 'ime' konfliktassu daiktavardžiu vietininku, pvz. 'gėrime'
'im{e.}s' // tikimės
'it' 'ite' // mylit, mylite, tikitės
// 'it{e.}s' konfliktas su priesaga ir dgs. vardininko galūne -ait-ės pvz. žydaitės

// o asmenuotė (declension o)
'ome' // mokome
'ot' 'ote' // mokot, mokote

// būtasis laikas
// o asmenuotė (declension o)
'{e.}jo' '{e.}josi' // tikėjo, tikėjosi
'ot{e.}s' // tikėjotės/bijotės

// ė asmenuotė (declension ė)
'eisi' // mokeisi
'{e.}si' // mokėsi
'{e.}m' '{e.}me' // mokėm, mokėme
'{e.}m{e.}s' // mokėmės
'{e.}t' '{e.}te' // mokėt, mokėte
'{e.}t{e.}s' // mokėtės

// būtasis dažninis laikas (frequentative past tense)
'ausi' // mokydavausi
'om{e.}s' // mokydavomės/bijomės


// būsimasis laikas (future tense)
'siu' 'siuosi' // dirbsiu, mokysiuosi
'si' 'siesi' // dirbsi, dirbsiesi
's' 'ysis' // dirbs, mokysis
'sim' 'sime' // dirbsim, dirbsime
'sit' 'site' // gersit, gersite

// tariamoji nuosaka (subjunctive mood)
'{c*}iau' '{c*}iausi' // dirbčiau
'tum' 'tumei' // dirbtum, dirbtumei
'tumeis' 'tumeisi' // mokytumeis, mokytumeisi
// 't{u'}' nes blogai batutų -> batų
't{u'}si' // mokytųsi
// 'tume' konfliktas su 'šventume'
'tum{e.}m' // dirbtumėm
'tum{e.}me' // dirbtumėme
'tum{e.}m{e.}s' // mokytumėmės
'tute' 'tum{e.}t' // dirbtute, dirbtumėt
'tum{e.}te' // dirbtumėte
'tum{e.}t{e.}s' // mokytumėtės

// liepiamoji nuosaka (imperative mood)
'k' 'ki' // dirbk, dirbki, mokykis
// 'kis' konfliktas viln-išk-is
// 'kime' konfliktas, nes pirkime
'kim{e.}s' // mokykimės

// bendratis (infinitive)
'uoti' 'iuoti' // meluoti, dygsniuoti
'auti' 'iauti' // draugauti, girtuokliauti
'oti' 'ioti' // dovanoti, meškerioti
'{e.}ti' // auklėti
'yti' // akyti
'inti' // auginti
'in{e.}ti' // blusinėti
'enti' // gyventi
'tel{e.}ti' // bumbtelėti
'ter{e.}ti' // bumbterėti

'ti' // skalbti
// 'tis' konfliktas, nes rytme-tis -> rytme

// dalyviai (participles)
'{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs
't{u'}s' // suktųs -> suk
'sim{e.}s' // suksimės
'sit{e.}s' // suksitės
'kite' // supkite
)

delete
)

// step2: repeatedly strip derivational suffixes (noun, adjective and
// participle formants) found within the p1 region; each matched suffix
// is deleted and matching restarts until no listed suffix remains.
define step2 as repeat (
setlimit tomark p1 for ([substring]) among (
// daiktavardziu priesagos (noun suffixes)

// budvardziu priesagos (adjective suffixes)
// 'in' // omitted: conflicts with 'augintinis' and 'akiniais' // lauk-in-is
'ing' // tvark-ing-as
'i{s*}k' // lenk-išk-as
'{e.}t' // dem-ėt-as
'ot' // garban-ot-as
'uot' 'iuot' // lang-uot-as, akin-iuot-as
// 'tin' omitted because of 'augintinis' // dirb-tin-is
// 'ut' omitted because of 'batutas', 'degutas' etc. // maž-ut-is
'yt' // maž-yt-is
'iuk' // maž-iuk-as
'iul' // maž-ul-is
'{e.}l' // maž-ėl-is
'yl' // maž-yl-is
'u{c*}iuk' // maž-učiuk-as
'uliuk' // maž-uliuk-as
'ut{e.}ait' // maž-utėlait-is
'ok' // did-ok-as
'iok' // višč-iok-as
'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as
'op' 'iop' // dvej-op-as, viener-iop-as
'ain' // apval-ain-as
'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias

// laisniai (degrees of comparison)
'esn' // did-esn-is
'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias

// ivardziuotiniai budvardziai (pronominal adjectives)
// vyriska gimine (masculine gender)
'ias' // žaliasis
'oj' 'ioj' // gerojo, žaliojo
'aj' 'iaj' // gerajam, žaliajam
'{a'}j' 'i{a'}j' // garąjį, žaliąjį
'uoj' 'iuoj' // geruoju, žaliuoju
'iej' // gerieji
'{u'}j' 'i{u'}j' // gerųjų, žaliųjų
'ies' // geriesiems
'uos' 'iuos' // geruosius, žaliuosius
'ais' 'iais' // geraisiais, žaliaisiais

// moteriska gimine (feminine gender)
'os' 'ios' // gerosios, žaliosios
'{a'}s' 'i{a'}s' // gerąsios, žaliąsias

// butasis dazninis laikas (frequentative past tense)
'dav' // ei-dav-o

// dalyviu priesagos (participle suffixes)
'ant' 'iant'
'int' // tur-int-is
'{e.}j' // tur-ėj-o
'{e'}' //
'{e.}j{e'}'
'{e'}s' // dirb-ęs-is

'siant' // dirb-siant

// pusdalyviai (half-participles)
'dam' // bėg-dam-as

'auj' // ūkinink-auj-a
'jam'
'iau'
'am' // baiminim-ams-i
)

delete
)

// fix_conflicts: run before any suffix stripping. Rewrites word endings
// whose surface form collides with an inflectional suffix removed by the
// later steps, restoring a canonical form so the wrong suffix is not
// stripped (each comment shows the intended word vs. the collision).
define fix_conflicts as (
[substring] among (
// 'lietuvaite' -> 'lietuvaitė'; would collide with 'myl-ite'
'aite' (<-'ait{e.}')
// 'lietuvaitės' -> 'lietuvaitė'; would collide with 'myl-itės'
'ait{e.}s' (<-'ait{e.}')

// 'ūs-uotės' -> 'ūs-uotė'; would collide with 'mokotės'
'uot{e.}s' (<-'uot{e.}')
// 'ūs-uote' -> 'ūs-uotė'; would collide with 'mokote'
'uote' (<-'uot{e.}')

// 'žerėjime' -> 'žėrėjimas'; would collide with 'žais-ime'
'{e.}jime' (<-'{e.}jimas')

// 'žvilgesiu' -> 'žvilgesys'; would collide with 'dirb-siu'
'esiu' (<-'esys')
// 'duobkasiu' -> 'duobkasys'; would collide with 'pakasiu'
'asius' (<-'asys')

// 'žioravime' -> 'žioravimas'; would collide with 'myl-ime'
'avime' (<-'avimas')
'ojime' (<-'ojimas')

// 'advokatės' -> 'advokatė'; would collide with 'dirb-atės'
'okat{e.}s' (<-'okat{e.}')
// 'advokate' -> 'advokatė'; would collide with 'dirb-ate'
'okate' (<-'okat{e.}')
)
)

// fix_chdz: normalise palatalised stem-final consonants back to their
// base forms: č -> t and dž -> d. Run both before and after step2 (see
// stem) so suffix removal operates on the base consonant.
define fix_chdz as (
[substring] among (
'{c*}' (<-'t')
'd{z*}' (<-'d')
)
)

// fix_gd: reduce a stem-final 'gd' cluster to 'g'.
// (The '{e.}k' -> '{e.}g' rule is intentionally disabled.)
define fix_gd as (
[substring] among (
'gd' (<-'g')
// '{e.}k' (<-'{e.}g')
)
)

)

// stem: entry point for the Lithuanian stemmer. Marks region p1 (after
// the first vowel/non-vowel sequence, optionally skipping a leading 'a'
// prefix), then works right-to-left: resolve known suffix collisions,
// strip inflectional endings (step1), strip derivational suffixes
// (step2), and normalise stem-final consonants between/after the steps.
define stem as (

$p1 = limit

do (
// skip the prefix 'a' in words longer than 6 letters, e.g. 'a-liejus'
try (test 'a' $(len > 6) hop 1)

gopast v gopast non-v setmark p1
)

backwards (
do fix_conflicts
do step1
do fix_chdz
do step2
do fix_chdz
do fix_gd
)

)

contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/lovins.sbl View File


+ 92
- 0
contrib/snowball/algorithms/nepali.sbl View File

@@ -0,0 +1,92 @@
/*
* Authors:
* - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group
* - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd.
* - Shreeya Singh Dhakal, Nepali NLP Group
*/

routines (
remove_category_1
check_category_2
remove_category_2
remove_category_3
)

stringescapes {}

stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU
stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA
stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I
stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II
stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E
stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA
stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA
stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA
stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA
stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA
stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA
stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA
stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA
stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA
stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA
stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA
stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA
stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA
stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA
stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA
stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA
stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA
stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA
stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA
stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I
stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II
stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U
stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU
stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E
stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI
stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O
stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU
stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA

externals ( stem )
backwardmode (
// remove_category_1: strip the outermost layer of Nepali suffixes
// (postposition-like endings). For the '{dlka}'-initial endings
// (ka + vowel sign), an empty command keeps the ending when it is
// preceded by letter E or vowel sign E; otherwise it is deleted.
define remove_category_1 as(
[substring] among (
'{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}'
'{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}'
'{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}'
(delete)
'{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete)
)
)

// check_category_2: test-only routine; succeeds when the word ends in
// candrabindu, anusvara, or vowel sign AI. It deletes nothing itself —
// remove_category_2 performs the conditional removal (see stem).
define check_category_2 as(
[substring] among(
'{dsc}' '{dsa}' '{dvsai}'
)
)

// remove_category_2: conditionally delete category-2 markers: a trailing
// candrabindu/anusvara is removed only when preceded by one of the listed
// syllables (ya+AU, cha+AU, na+AU, tha+E); a trailing vowel sign AI is
// removed only when preceded by ta + virama + ra.
define remove_category_2 as (
[substring] among(
'{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete)
'{dvsai}' ('{dlta}{dsv}{dlr}' delete)
)
)

// remove_category_3: delete one suffix from a large inventory of verbal
// and inflectional ending clusters. 'among' picks the longest matching
// entry; all entries share the single (delete) action.
define remove_category_3 as(
[substring] among(
'{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}'
(delete)
)
)

)

// stem: entry point for the Nepali stemmer. Working right-to-left,
// first remove one category-1 suffix, then repeatedly strip category-2
// endings (only when check_category_2 confirms one is present) followed
// by category-3 endings, until no further suffix can be removed.
define stem as (
backwards (
do remove_category_1
do (
repeat (do (check_category_2 and remove_category_2) remove_category_3)
)
)
)

contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/norwegian.sbl View File

@@ -13,11 +13,11 @@ groupings ( v s_ending )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef ae hex 'E6'
stringdef ao hex 'E5'
stringdef o/ hex 'F8'
stringdef ae '{U+00E6}'
stringdef ao '{U+00E5}'
stringdef o/ '{U+00F8}'

define v 'aeiouy{ae}{ao}{o/}'


+ 0
- 80
contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl View File

@@ -1,80 +0,0 @@
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
)

externals ( stem )

integers ( p1 x )

groupings ( v s_ending )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef ae hex '91'
stringdef ao hex '86'
stringdef o/ hex '9B'

define v 'aeiouy{ae}{ao}{o/}'

define s_ending 'bcdfghjlmnoprtvyz'

define mark_regions as (

$p1 = limit

test ( hop 3 setmark x )
goto v gopast non-v setmark p1
try ( $p1 < x $p1 = x )
)

backwardmode (

define main_suffix as (
setlimit tomark p1 for ([substring])
among(

'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
'hetens' 'ers' 'ets' 'et' 'het' 'ast'
(delete)
's'
(s_ending or ('k' non-v) delete)
'erte' 'ert'
(<-'er')
)
)

define consonant_pair as (
test (
setlimit tomark p1 for ([substring])
among(
'dt' 'vt'
)
)
next] delete
)

define other_suffix as (
setlimit tomark p1 for ([substring])
among(
'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
'hetslov'
(delete)
)
)
)

define stem as (

do mark_regions
backwards (
do main_suffix
do consonant_pair
do other_suffix
)
)

contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/porter.sbl View File


contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/portuguese.sbl View File

@@ -15,20 +15,20 @@ groupings ( v )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef a' hex 'E1' // a-acute
stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico
stringdef e' hex 'E9' // e-acute
stringdef e^ hex 'EA' // e-circumflex
stringdef i' hex 'ED' // i-acute
stringdef o^ hex 'F4' // o-circumflex
stringdef o' hex 'F3' // o-acute
stringdef u' hex 'FA' // u-acute
stringdef c, hex 'E7' // c-cedilla
stringdef a' '{U+00E1}' // a-acute
stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico
stringdef e' '{U+00E9}' // e-acute
stringdef e^ '{U+00EA}' // e-circumflex
stringdef i' '{U+00ED}' // i-acute
stringdef o^ '{U+00F4}' // o-circumflex
stringdef o' '{U+00F3}' // o-acute
stringdef u' '{U+00FA}' // u-acute
stringdef c, '{U+00E7}' // c-cedilla

stringdef a~ hex 'E3' // a-tilde
stringdef o~ hex 'F5' // o-tilde
stringdef a~ '{U+00E3}' // a-tilde
stringdef o~ '{U+00F5}' // o-tilde


define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
@@ -92,12 +92,12 @@ backwardmode (
(
R2 delete
)
'log{i'}a'
'log{i'}as'
'logia'
'logias'
(
R2 <- 'log'
)
'uci{o'}n' 'uciones'
'u{c,}a~o' 'u{c,}o~es'
(
R2 <- 'u'
)

+ 0
- 218
contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl View File

@@ -1,218 +0,0 @@
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
verb_suffix
residual_suffix
residual_form
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef a' hex 'A0' // a-acute
stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico
stringdef e' hex '82' // e-acute
stringdef e^ hex '88' // e-circumflex
stringdef i' hex 'A1' // i-acute
stringdef o^ hex '93' // o-circumflex
stringdef o' hex 'A2' // o-acute
stringdef u' hex 'A3' // u-acute
stringdef c, hex '87' // c-cedilla

stringdef a~ hex 'C6' // a-tilde
stringdef o~ hex 'E4' // o-tilde


define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'

define prelude as repeat (
[substring] among(
'{a~}' (<- 'a~')
'{o~}' (<- 'o~')
'' (next)
) //or next
)

define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

define postlude as repeat (
[substring] among(
'a~' (<- '{a~}')
'o~' (<- '{o~}')
'' (next)
) //or next
)

backwardmode (

define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

define standard_suffix as (
[substring] among(

'eza' 'ezas'
'ico' 'ica' 'icos' 'icas'
'ismo' 'ismos'
'{a'}vel'
'{i'}vel'
'ista' 'istas'
'oso' 'osa' 'osos' 'osas'
'amento' 'amentos'
'imento' 'imentos'

'adora' 'ador' 'a{c,}a~o'
'adoras' 'adores' 'a{c,}o~es' // no -ic test
'ante' 'antes' '{a^}ncia' // Note 1
(
R2 delete
)
'log{i'}a'
'log{i'}as'
(
R2 <- 'log'
)
'uci{o'}n' 'uciones'
(
R2 <- 'u'
)
'{e^}ncia' '{e^}ncias'
(
R2 <- 'ente'
)
'amente'
(
R1 delete
try (
[substring] R2 delete among(
'iv' (['at'] R2 delete)
'os'
'ic'
'ad'
)
)
)
'mente'
(
R2 delete
try (
[substring] among(
'ante' // Note 1
'avel'
'{i'}vel' (R2 delete)
)
)
)
'idade'
'idades'
(
R2 delete
try (
[substring] among(
'abil'
'ic'
'iv' (R2 delete)
)
)
)
'iva' 'ivo'
'ivas' 'ivos'
(
R2 delete
try (
['at'] R2 delete // but not a further ['ic'] R2 delete
)
)
'ira' 'iras'
(
RV 'e' // -eira -eiras usually non-verbal
<- 'ir'
)
)
)

define verb_suffix as setlimit tomark pV for (
[substring] among(
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'

'ira' 'iras'
(delete)
)
)

define residual_suffix as (
[substring] among(
'os'
'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
( RV delete )
)
)

define residual_form as (
[substring] among(
'e' '{e'}' '{e^}'
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{c,}' (<-'c')
)
)
)

define stem as (
do prelude
do mark_regions
backwards (
do (
( ( standard_suffix or verb_suffix )
and do ( ['i'] test 'c' RV delete )
)
or residual_suffix
)
do residual_form
)
do postlude
)

/*
Note 1: additions of 15 Jun 2005
*/

contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl → contrib/snowball/algorithms/romanian.sbl View File

@@ -20,11 +20,11 @@ stringescapes {}

/* special characters */

stringdef a^ hex 'E2' // a circumflex
stringdef i^ hex 'EE' // i circumflex
stringdef a+ hex 'E3' // a breve
stringdef s, hex 'BA' // s cedilla
stringdef t, hex 'FE' // t cedilla
stringdef a^ '{U+00E2}' // a circumflex
stringdef i^ '{U+00EE}' // i circumflex
stringdef a+ '{U+0103}' // a breve
stringdef s, '{U+015F}' // s cedilla
stringdef t, '{U+0163}' // t cedilla

define v 'aeiou{a^}{i^}{a+}'


+ 0
- 236
contrib/snowball/algorithms/romanian/stem_Unicode.sbl View File

@@ -1,236 +0,0 @@

routines (
prelude postlude mark_regions
RV R1 R2
step_0
standard_suffix combo_suffix
verb_suffix
vowel_suffix
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

booleans ( standard_suffix_removed )

stringescapes {}

/* special characters */

stringdef a^ hex '0E2' // a circumflex
stringdef i^ hex '0EE' // i circumflex
stringdef a+ hex '103' // a breve
stringdef s, hex '15F' // s cedilla
stringdef t, hex '163' // t cedilla

define v 'aeiou{a^}{i^}{a+}'

define prelude as (
repeat goto (
v [ ('u' ] v <- 'U') or
('i' ] v <- 'I')
)
)

define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

define postlude as repeat (

[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'' (next)
)

)

backwardmode (

define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

define step_0 as (
[substring] R1 among(
'ul' 'ului'
( delete )
'aua'
( <-'a' )
'ea' 'ele' 'elor'
( <-'e' )
'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
( <-'i')
'ile'
( not 'ab' <- 'i' )
'atei'
( <- 'at' )
'a{t,}ie' 'a{t,}ia'
( <- 'a{t,}i' )
)
)

define combo_suffix as test (
[substring] R1 (
among(
/* 'IST'. alternative: include the following
'alism' 'alisme'
'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
<- 'al'
)
*/
'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
<- 'abil'
)
'ibilitate' (
<- 'ibil'
)
'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
<- 'iv'
)
'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
'icator' 'icatori'
'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
<- 'ic'
)
'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
'atoare' 'ator' 'atori'
'{a+}toare' '{a+}tor' '{a+}tori' (
<- 'at'
)
'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
'itoare' 'itor' 'itori' (
<- 'it'
)
)
set standard_suffix_removed
)
)

define standard_suffix as (
unset standard_suffix_removed
repeat combo_suffix
[substring] R2 (
among(

// past participle is treated here, rather than
// as a verb ending:
'at' 'ata' 'at{a+}' 'ati' 'ate'
'ut' 'uta' 'ut{a+}' 'uti' 'ute'
'it' 'ita' 'it{a+}' 'iti' 'ite'

'ic' 'ica' 'ice' 'ici' 'ic{a+}'
'abil' 'abila' 'abile' 'abili' 'abil{a+}'
'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
'ant' 'anta' 'ante' 'anti' 'ant{a+}'
'ator' 'atori'
'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
delete
)
'iune' 'iuni' (
'{t,}'] <- 't'
)
'ism' 'isme'
'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
<- 'ist'
/* 'IST'. alternative: remove with <- '' */
)
)
set standard_suffix_removed
)
)

define verb_suffix as setlimit tomark pV for (
[substring] among(
// 'long' infinitive:
'are' 'ere' 'ire' '{a^}re'

// gerund:
'ind' '{a^}nd'
'indu' '{a^}ndu'

'eze'
'easc{a+}'
// present:
'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
'e{s,}te'
'{a+}sc' '{a+}{s,}ti'
'{a+}{s,}te'

// imperfect:
'am' 'ai' 'au'
'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
'iam' 'iai' 'ia' 'ia{t,}i' 'iau'

// past: // (not 'ii')
'ui'
'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
'{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'

// pluferfect:
'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
'{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
'{a^}ser{a+}'
'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'

( non-v or 'u' delete )

// present:
'{a+}m' 'a{t,}i'
'em' 'e{t,}i'
'im' 'i{t,}i'
'{a^}m' '{a^}{t,}i'

// past:
'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
'sei' 'se'

// pluperfect:
'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
(delete)
)
)

define vowel_suffix as (
[substring] RV among (
'a' 'e' 'i' 'ie' '{a+}' ( delete )
)
)
)

define stem as (
do prelude
do mark_regions
backwards (
do step_0
do standard_suffix
do ( standard_suffix_removed or verb_suffix )
do vowel_suffix
)
do postlude
)


contrib/snowball/algorithms/russian/stem_KOI8_R.sbl → contrib/snowball/algorithms/russian.sbl View File

@@ -1,41 +1,41 @@
stringescapes {}

/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented
in Latin characters following the conventions of the standard Library
of Congress transliteration: */
stringdef a hex 'C1'
stringdef b hex 'C2'
stringdef v hex 'D7'
stringdef g hex 'C7'
stringdef d hex 'C4'
stringdef e hex 'C5'
stringdef zh hex 'D6'
stringdef z hex 'DA'
stringdef i hex 'C9'
stringdef i` hex 'CA'
stringdef k hex 'CB'
stringdef l hex 'CC'
stringdef m hex 'CD'
stringdef n hex 'CE'
stringdef o hex 'CF'
stringdef p hex 'D0'
stringdef r hex 'D2'
stringdef s hex 'D3'
stringdef t hex 'D4'
stringdef u hex 'D5'
stringdef f hex 'C6'
stringdef kh hex 'C8'
stringdef ts hex 'C3'
stringdef ch hex 'DE'
stringdef sh hex 'DB'
stringdef shch hex 'DD'
stringdef " hex 'DF'
stringdef y hex 'D9'
stringdef ' hex 'D8'
stringdef e` hex 'DC'
stringdef iu hex 'C0'
stringdef ia hex 'D1'
/* the 33 Cyrillic letters represented in ASCII characters following the
* conventions of the standard Library of Congress transliteration: */
stringdef a '{U+0430}'
stringdef b '{U+0431}'
stringdef v '{U+0432}'
stringdef g '{U+0433}'
stringdef d '{U+0434}'
stringdef e '{U+0435}'
stringdef e" '{U+0451}'
stringdef zh '{U+0436}'
stringdef z '{U+0437}'
stringdef i '{U+0438}'
stringdef i` '{U+0439}'
stringdef k '{U+043A}'
stringdef l '{U+043B}'
stringdef m '{U+043C}'
stringdef n '{U+043D}'
stringdef o '{U+043E}'
stringdef p '{U+043F}'
stringdef r '{U+0440}'
stringdef s '{U+0441}'
stringdef t '{U+0442}'
stringdef u '{U+0443}'
stringdef f '{U+0444}'
stringdef kh '{U+0445}'
stringdef ts '{U+0446}'
stringdef ch '{U+0447}'
stringdef sh '{U+0448}'
stringdef shch '{U+0449}'
stringdef " '{U+044A}'
stringdef y '{U+044B}'
stringdef ' '{U+044C}'
stringdef e` '{U+044D}'
stringdef iu '{U+044E}'
stringdef ia '{U+044F}'

routines ( mark_regions R2
perfective_gerund
@@ -200,6 +200,10 @@ backwardmode (

define stem as (

// Normalise {e"} to {e}. The documentation has long suggested the user
// should do this before calling the stemmer - we now do it for them.
do repeat ( goto (['{e"}']) <- '{e}' )

do mark_regions
backwards setlimit tomark pV for (
do (

+ 0
- 215
contrib/snowball/algorithms/russian/stem_Unicode.sbl View File

@@ -1,215 +0,0 @@
stringescapes {}

/* the 32 Cyrillic letters in Unicode */

stringdef a hex '430'
stringdef b hex '431'
stringdef v hex '432'
stringdef g hex '433'
stringdef d hex '434'
stringdef e hex '435'
stringdef zh hex '436'
stringdef z hex '437'
stringdef i hex '438'
stringdef i` hex '439'
stringdef k hex '43A'
stringdef l hex '43B'
stringdef m hex '43C'
stringdef n hex '43D'
stringdef o hex '43E'
stringdef p hex '43F'
stringdef r hex '440'
stringdef s hex '441'
stringdef t hex '442'
stringdef u hex '443'
stringdef f hex '444'
stringdef kh hex '445'
stringdef ts hex '446'
stringdef ch hex '447'
stringdef sh hex '448'
stringdef shch hex '449'
stringdef " hex '44A'
stringdef y hex '44B'
stringdef ' hex '44C'
stringdef e` hex '44D'
stringdef iu hex '44E'
stringdef ia hex '44F'

routines ( mark_regions R2
perfective_gerund
adjective
adjectival
reflexive
verb
noun
derivational
tidy_up
)

externals ( stem )

integers ( pV p2 )

groupings ( v )

define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'

define mark_regions as (

$pV = limit
$p2 = limit
do (
gopast v setmark pV gopast non-v
gopast v gopast non-v setmark p2
)
)

backwardmode (

define R2 as $p2 <= cursor

define perfective_gerund as (
[substring] among (
'{v}'
'{v}{sh}{i}'
'{v}{sh}{i}{s}{'}'
('{a}' or '{ia}' delete)
'{i}{v}'
'{i}{v}{sh}{i}'
'{i}{v}{sh}{i}{s}{'}'
'{y}{v}'
'{y}{v}{sh}{i}'
'{y}{v}{sh}{i}{s}{'}'
(delete)
)
)

define adjective as (
[substring] among (
'{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
'{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
'{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
'{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
'{ia}{ia}'
// and -
'{o}{iu}' // - which is somewhat archaic
'{e}{iu}' // - soft form of {o}{iu}
(delete)
)
)

define adjectival as (
adjective

/* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
errors. Removing im, uem, enn creates too many errors.
*/

try (
[substring] among (
'{e}{m}' // present passive participle
'{n}{n}' // adjective from past passive participle
'{v}{sh}' // past active participle
'{iu}{shch}' '{shch}' // present active participle
('{a}' or '{ia}' delete)

//but not '{i}{m}' '{u}{e}{m}' // present passive participle
//or '{e}{n}{n}' // adjective from past passive participle

'{i}{v}{sh}' '{y}{v}{sh}'// past active participle
'{u}{iu}{shch}' // present active participle
(delete)
)
)

)

define reflexive as (
[substring] among (
'{s}{ia}'
'{s}{'}'
(delete)
)
)

define verb as (
[substring] among (
'{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
'{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
'{n}{y}' '{t}{'}' '{e}{sh}{'}'

'{n}{n}{o}'
('{a}' or '{ia}' delete)

'{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
'{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
'{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
'{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
'{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
'{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
(delete)
/* note the short passive participle tests:
'{n}{a}' '{n}' '{n}{o}' '{n}{y}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
*/
)
)

define noun as (
[substring] among (
'{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
'{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
'{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
'{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
'{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
(delete)
/* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}'
omitted - they only occur on 12 words.
*/
)
)

define derivational as (
[substring] R2 among (
'{o}{s}{t}'
'{o}{s}{t}{'}'
(delete)
)
)

define tidy_up as (
[substring] among (

'{e}{i`}{sh}'
'{e}{i`}{sh}{e}' // superlative forms
(delete
['{n}'] '{n}' delete
)
'{n}'
('{n}' delete) // e.g. -nno endings
'{'}'
(delete) // with some slight false conflations
)
)
)

define stem as (

do mark_regions
backwards setlimit tomark pV for (
do (
perfective_gerund or
( try reflexive
adjectival or verb or noun
)
)
try([ '{i}' ] delete)
// because noun ending -i{iu} is being treated as verb ending -{iu}

do derivational
do tidy_up
)
)

+ 2378
- 0
contrib/snowball/algorithms/serbian.sbl
File diff suppressed because it is too large
View File


contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/spanish.sbl View File

@@ -16,15 +16,15 @@ groupings ( v )

stringescapes {}

/* special characters (in ISO Latin I) */
stringdef a' hex 'E1' // a-acute
stringdef e' hex 'E9' // e-acute
stringdef i' hex 'ED' // i-acute
stringdef o' hex 'F3' // o-acute
stringdef u' hex 'FA' // u-acute
stringdef u" hex 'FC' // u-diaeresis
stringdef n~ hex 'F1' // n-tilde
/* special characters */
stringdef a' '{U+00E1}' // a-acute
stringdef e' '{U+00E9}' // e-acute
stringdef i' '{U+00ED}' // i-acute
stringdef o' '{U+00F3}' // o-acute
stringdef u' '{U+00FA}' // u-acute
stringdef u" '{U+00FC}' // u-diaeresis
stringdef n~ '{U+00F1}' // n-tilde

define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'


+ 0
- 230
contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl View File

@@ -1,230 +0,0 @@
routines (
postlude mark_regions
RV R1 R2
attached_pronoun
standard_suffix
y_verb_suffix
verb_suffix
residual_suffix
)

externals ( stem )

integers ( pV p1 p2 )

groupings ( v )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef a' hex 'A0' // a-acute
stringdef e' hex '82' // e-acute
stringdef i' hex 'A1' // i-acute
stringdef o' hex 'A2' // o-acute
stringdef u' hex 'A3' // u-acute
stringdef u" hex '81' // u-diaeresis
stringdef n~ hex 'A4' // n-tilde

define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}'

define mark_regions as (

$pV = limit
$p1 = limit
$p2 = limit // defaults

do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)

define postlude as repeat (
[substring] among(
'{a'}' (<- 'a')
'{e'}' (<- 'e')
'{i'}' (<- 'i')
'{o'}' (<- 'o')
'{u'}' (<- 'u')
// and possibly {u"}->u here, or in prelude
'' (next)
) //or next
)

backwardmode (

define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor

define attached_pronoun as (
[substring] among(
'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
'las' 'les' 'los' 'nos'
)
substring RV among(
'i{e'}ndo' (] <- 'iendo')
'{a'}ndo' (] <- 'ando')
'{a'}r' (] <- 'ar')
'{e'}r' (] <- 'er')
'{i'}r' (] <- 'ir')
'ando'
'iendo'
'ar' 'er' 'ir'
(delete)
'yendo' ('u' delete)
)
)

define standard_suffix as (
[substring] among(

'anza' 'anzas'
'ico' 'ica' 'icos' 'icas'
'ismo' 'ismos'
'able' 'ables'
'ible' 'ibles'
'ista' 'istas'
'oso' 'osa' 'osos' 'osas'
'amiento' 'amientos'
'imiento' 'imientos'
(
R2 delete
)
'adora' 'ador' 'aci{o'}n'
'adoras' 'adores' 'aciones'
'ante' 'antes' 'ancia' 'ancias'// Note 1
(
R2 delete
try ( ['ic'] R2 delete )
)
'log{i'}a'
'log{i'}as'
(
R2 <- 'log'
)
'uci{o'}n' 'uciones'
(
R2 <- 'u'
)
'encia' 'encias'
(
R2 <- 'ente'
)
'amente'
(
R1 delete
try (
[substring] R2 delete among(
'iv' (['at'] R2 delete)
'os'
'ic'
'ad'
)
)
)
'mente'
(
R2 delete
try (
[substring] among(
'ante' // Note 1
'able'
'ible' (R2 delete)
)
)
)
'idad'
'idades'
(
R2 delete
try (
[substring] among(
'abil'
'ic'
'iv' (R2 delete)
)
)
)
'iva' 'ivo'
'ivas' 'ivos'
(
R2 delete
try (
['at'] R2 delete // but not a further ['ic'] R2 delete
)
)
)
)

define y_verb_suffix as (
setlimit tomark pV for ([substring]) among(
'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
'yas' 'yes' 'yais' 'yamos'
('u' delete)
)
)

define verb_suffix as (
setlimit tomark pV for ([substring]) among(

'en' 'es' '{e'}is' 'emos'
(try ('u' test 'g') ] delete)

'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
'ar{e'}'
'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
'er{e'}'
'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
'ir{e'}'

'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
'{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
(delete)
)
)

define residual_suffix as (
[substring] among(
'os'
'a' 'o' '{a'}' '{i'}' '{o'}'
( RV delete )
'e' '{e'}'
( RV delete try( ['u'] test 'g' RV delete ) )
)
)
)

define stem as (
do mark_regions
backwards (
do attached_pronoun
do ( standard_suffix or
y_verb_suffix or
verb_suffix
)
do residual_suffix
)
do postlude
)

/*
Note 1: additions of 15 Jun 2005
*/

contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl → contrib/snowball/algorithms/swedish.sbl View File

@@ -13,11 +13,11 @@ groupings ( v s_ending )

stringescapes {}

/* special characters (in ISO Latin I) */
/* special characters */

stringdef a" hex 'E4'
stringdef ao hex 'E5'
stringdef o" hex 'F6'
stringdef a" '{U+00E4}'
stringdef ao '{U+00E5}'
stringdef o" '{U+00F6}'

define v 'aeiouy{a"}{ao}{o"}'


+ 0
- 72
contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl View File

@@ -1,72 +0,0 @@
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
)

externals ( stem )

integers ( p1 x )

groupings ( v s_ending )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef a" hex '84'
stringdef ao hex '86'
stringdef o" hex '94'

define v 'aeiouy{a"}{ao}{o"}'

define s_ending 'bcdfghjklmnoprtvy'

define mark_regions as (

$p1 = limit
test ( hop 3 setmark x )
goto v gopast non-v setmark p1
try ( $p1 < x $p1 = x )
)

backwardmode (

define main_suffix as (
setlimit tomark p1 for ([substring])
among(

'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
'hetens' 'erns' 'at' 'andet' 'het' 'ast'
(delete)
's'
(s_ending delete)
)
)

define consonant_pair as setlimit tomark p1 for (
among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt')
and ([next] delete)
)

define other_suffix as setlimit tomark p1 for (
[substring] among(
'lig' 'ig' 'els' (delete)
'l{o"}st' (<-'l{o"}s')
'fullt' (<-'full')
)
)
)

define stem as (

do mark_regions
backwards (
do main_suffix
do consonant_pair
do other_suffix
)
)

+ 405
- 0
contrib/snowball/algorithms/tamil.sbl View File

@@ -0,0 +1,405 @@
/*
* Affix stripping stemming algorithm for Tamil
* By Damodharan Rajalingam
*/

stringescapes {}

/* Aytham */
stringdef aytham '{U+0B83}'

/* Uyir - independent vowels */
stringdef a '{U+0B85}'
stringdef aa '{U+0B86}'
stringdef i '{U+0B87}'
stringdef ii '{U+0B88}'
stringdef u '{U+0B89}'
stringdef uu '{U+0B8A}'
stringdef e '{U+0B8E}'
stringdef ee '{U+0B8F}'
stringdef ai '{U+0B90}'
stringdef o '{U+0B92}'
stringdef oo '{U+0B93}'
stringdef au '{U+0B94}'

/* Consonants */
stringdef ka '{U+0B95}'
stringdef nga '{U+0B99}'
stringdef ca '{U+0B9A}'
stringdef ja '{U+0B9C}'
stringdef nya '{U+0B9E}'
stringdef tta '{U+0B9F}'
stringdef nna '{U+0BA3}'
stringdef ta '{U+0BA4}'
stringdef tha '{U+0BA4}'
stringdef na '{U+0BA8}'
stringdef nnna '{U+0BA9}'
stringdef pa '{U+0BAA}'
stringdef ma '{U+0BAE}'
stringdef ya '{U+0BAF}'
stringdef ra '{U+0BB0}'
stringdef rra '{U+0BB1}'
stringdef la '{U+0BB2}'
stringdef lla '{U+0BB3}'
stringdef llla '{U+0BB4}'
stringdef zha '{U+0BB4}'
stringdef va '{U+0BB5}'

/* Vatamozi - borrowed */
stringdef sha '{U+0BB6}'
stringdef ssa '{U+0BB7}'
stringdef sa '{U+0BB8}'
stringdef ha '{U+0BB9}'


/* Dependent vowel signs (kombu etc.) */
stringdef vs_aa '{U+0BBE}'
stringdef vs_i '{U+0BBF}'
stringdef vs_ii '{U+0BC0}'
stringdef vs_u '{U+0BC1}'
stringdef vs_uu '{U+0BC2}'
stringdef vs_e '{U+0BC6}'
stringdef vs_ee '{U+0BC7}'
stringdef vs_ai '{U+0BC8}'
stringdef vs_o '{U+0BCA}'
stringdef vs_oo '{U+0BCB}'
stringdef vs_au '{U+0BCC}'

/* Pulli */
stringdef pulli '{U+0BCD}'

/* AU length mark */
stringdef au_lmark '{U+0BD7}'


routines (
remove_plural_suffix
remove_question_suffixes
remove_question_prefixes
remove_pronoun_prefixes
remove_command_suffixes
remove_um
remove_vetrumai_urupukal
fix_va_start
fix_ending
fix_endings
remove_tense_suffix
remove_tense_suffixes
remove_common_word_endings
has_min_length
)

externals ( stem )

booleans (
found_a_match
found_vetrumai_urupu
)

// Signals t only for words longer than 4 characters; shorter words
// are left untouched by every routine that calls this as a guard.
define has_min_length as (
$(len > 4)
)

// After a prefix has been stripped the word may start with the glide
// consonant 'va' plus a dependent vowel sign; rewrite that to the
// corresponding independent vowel (e.g. va+vs_oo -> oo).
define fix_va_start as (
(try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or
(try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or
(try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or
(try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' )
)

// Apply fix_ending repeatedly until it no longer matches; always
// signals t (the repeat is wrapped in 'do').
define fix_endings as (
do repeat fix_ending
)

// Strip an interrogative prefix: 'e' followed by one of the listed
// consonants plus pulli (virama), then normalise a resulting
// va+vowel-sign start.
define remove_question_prefixes as (
[ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
do fix_va_start
)

// Gives signal t if an ending was fixed, signal f otherwise.
// Normalise a single word ending (sandhi clean-up).  The ordered
// alternatives are tried from most to least specific; gives signal t
// if an ending was fixed, signal f otherwise.  Only applies to words
// longer than 3 characters.
//
// Fix: the found_vetrumai_urupu rule had a stray unmatched ']' after
// '<- {ma}{pulli}' which is a syntax error (the slice is already
// closed by the preceding ']').
define fix_ending as (
$(len > 3)
backwards (
( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
or
( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
or
( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
or
( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
or
// ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
or
( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
or
// only applies after a vetrumai urupu (case ending) was removed
( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
or
( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
or
( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
or
( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' )
or
( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
or
( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
or
( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
or
( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
or
( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' )
or
( [ '{nga}{pulli}' ] delete )
or
( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
)
)

// Strip a demonstrative/pronoun prefix (a/i/u + consonant + pulli),
// setting found_a_match on success, then normalise a resulting
// va+vowel-sign start.
define remove_pronoun_prefixes as (
unset found_a_match
[ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete
(set found_a_match)
do fix_va_start
)

// Remove the plural marker ('kal' and its sandhi variants), restoring
// the stem-final consonant where the suffix caused gemination.
// Sets found_a_match when a plural ending was removed.
define remove_plural_suffix as (
unset found_a_match
backwards (
( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or
( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
( [ '{ka}{lla}{pulli}' ] delete )
(set found_a_match)
)
)

// Replace an interrogative suffix vowel sign (oo/ee/aa) with pulli;
// guarded by the minimum-length check, then tidy the new ending.
define remove_question_suffixes as (
has_min_length
unset found_a_match
backwards (
do (
[ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
(set found_a_match)
)
)
do fix_endings
)

// Remove an imperative ('command') suffix: 'pi' or 'vi'.
// Sets found_a_match on success.
define remove_command_suffixes as (
has_min_length
unset found_a_match
backwards (
[ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
(set found_a_match)
)
)

// Remove the clitic '-um' (vs_u + ma + pulli), replacing it with
// pulli, then run a single ending fix-up.
define remove_um as (
unset found_a_match
has_min_length
backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
(set found_a_match)
)
do fix_ending
)

// Remove common attached words (postposition-like forms).
define remove_common_word_endings as (
// These are not suffixes proper but whole words that get attached
// to other words; removing them helps stemming.
unset found_a_match
has_min_length
backwards (
// first group: replace the attached word with pulli
test ( [ '{vs_u}{tta}{nnna}{pulli}' or
'{vs_i}{la}{pulli}{la}{vs_ai}' or
'{vs_i}{tta}{ma}{pulli}' or
'{vs_i}{nnna}{pulli}{rra}{vs_i}' or
'{vs_aa}{ka}{vs_i}' or
'{vs_aa}{ka}{vs_i}{ya}' or
'{vs_e}{nnna}{pulli}{rra}{vs_u}' or
'{vs_u}{lla}{pulli}{lla}' or
'{vs_u}{tta}{vs_ai}{ya}' or
'{vs_u}{tta}{vs_ai}' or
'{vs_e}{nnna}{vs_u}{ma}{pulli}' or
('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
'{vs_e}{nnna}' or
'{vs_aa}{ka}{vs_i}' ] <- '{pulli}'
(set found_a_match)
)
or
// second group: delete the attached word entirely
test ( [ among('{pa}{tta}{vs_u}'
'{pa}{tta}{pulli}{tta}'
'{pa}{tta}{pulli}{tta}{vs_u}'
'{pa}{tta}{pulli}{tta}{ta}{vs_u}'
'{pa}{tta}{pulli}{tta}{nna}'
'{ka}{vs_u}{ra}{vs_i}{ya}'
'{pa}{rra}{pulli}{rra}{vs_i}'
'{va}{vs_i}{tta}{vs_u}'
'{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
'{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
'{pa}{tta}{vs_i}'
'{ta}{vs_aa}{nnna}'
'{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
] delete
(set found_a_match)
)
)
do fix_endings
)

// Remove vetrumai urupukal (Tamil case endings).  On success sets
// both found_a_match and found_vetrumai_urupu; the latter enables an
// extra rule in fix_ending.  The four alternatives are ordered from
// accusative through locative-style endings.
define remove_vetrumai_urupukal as (
unset found_a_match
unset found_vetrumai_urupu
has_min_length
backwards (
(
// accusative 'nnnai'
test ( ['{nnna}{vs_ai}'] delete )
or
// 'ai' endings, with context checks on the preceding consonant
test ([ ( '{vs_i}{nnna}{vs_ai}' or
'{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
] <- '{pulli}'
)
or
// sociative/instrumental/locative endings replaced by pulli
test ( [
'{vs_o}{tta}{vs_u}' or
'{vs_oo}{tta}{vs_u}' or
'{vs_i}{la}{pulli}' or
'{vs_i}{rra}{pulli}' or
('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
'{vs_i}{nnna}{pulli}{rra}{vs_u}' or
'{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
'{va}{vs_i}{tta}' or
($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
'{vs_aa}{la}{pulli}' or
'{vs_u}{tta}{vs_ai}' or
'{vs_aa}{ma}{la}{pulli}' or
('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
'{vs_u}{lla}{pulli}'
] <- '{pulli}'
)
or
// locational postposition-like endings deleted outright
test ( [
'{ka}{nna}{pulli}' or
'{ma}{vs_u}{nnna}{pulli}' or
'{ma}{vs_ee}{la}{pulli}' or
'{ma}{vs_ee}{rra}{pulli}' or
'{ka}{vs_ii}{llla}{pulli}' or
'{pa}{vs_i}{nnna}{pulli}' or
('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
] delete
)
or
// long ii shortened to i
test ([ '{vs_ii}' ] <- '{vs_i}')
)
(set found_a_match)
(set found_vetrumai_urupu)
// trailing 'in' left behind by a removed ending becomes pulli
do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
)
do fix_endings
)

// Repeatedly strip tense suffixes until remove_tense_suffix fails to
// set found_a_match (fixed point).
define remove_tense_suffixes as (
set found_a_match
repeat ( found_a_match (do remove_tense_suffix) )
)

// Remove one layer of verb tense/person suffixes, then strip a
// progressive-aspect infix ('kinr(a)' etc.) and tidy the ending.
// Sets found_a_match when anything was removed so the caller can
// iterate to a fixed point.
//
// Fix: the fourth alternative had unbalanced brackets —
// "( ([ 'ka_vu' or 'ta_vu' ) (test pulli) ] delete" — reconstructed
// as a bracketed slice over the parenthesised alternation, which is
// the only balanced reading consistent with the surrounding rules.
define remove_tense_suffix as (
unset found_a_match
has_min_length
backwards (
do (
test ( [among(
'{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
'{pa}{tta}{vs_u}'
)] delete
(set found_a_match)
)
or
test ( [
'{ma}{vs_aa}{ra}{pulli}' or
'{ma}{vs_i}{nnna}{pulli}' or
'{nnna}{nnna}{pulli}' or
'{nnna}{vs_aa}{nnna}{pulli}' or
'{nnna}{vs_aa}{lla}{pulli}' or
'{nnna}{vs_aa}{ra}{pulli}' or
('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
'{nnna}{lla}{pulli}' or
'{va}{lla}{pulli}' or
'{nnna}{ra}{pulli}' or
'{va}{ra}{pulli}' or
'{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
'{pa}{nnna}{pulli}' or
'{pa}{lla}{pulli}' or
'{pa}{ra}{pulli}' or
('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
'{vs_i}{rra}{pulli}{rra}{vs_u}' or
'{pa}{ma}{pulli}' or
'{nnna}{ma}{pulli}' or
'{ta}{vs_u}{ma}{pulli}' or
'{rra}{vs_u}{ma}{pulli}' or
'{ka}{vs_u}{ma}{pulli}' or
'{nnna}{vs_e}{nnna}{pulli}' or
'{nnna}{vs_ai}' or
'{va}{vs_ai}'
] delete
(set found_a_match)
)
or
test ( [
('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
'{vs_aa}{lla}{pulli}' or
'{vs_aa}{ra}{pulli}' or
'{vs_ee}{nnna}{pulli}' or
'{vs_aa}' or
'{vs_aa}{ma}{pulli}' or
'{vs_e}{ma}{pulli}' or
'{vs_ee}{ma}{pulli}' or
'{vs_oo}{ma}{pulli}' or
'{ka}{vs_u}{ma}{pulli}' or
'{ta}{vs_u}{ma}{pulli}' or
'{tta}{vs_u}{ma}{pulli}' or
'{rra}{vs_u}{ma}{pulli}' or
'{vs_aa}{ya}{pulli}' or
'{nnna}{vs_e}{nnna}{pulli}' or
'{nnna}{vs_i}{ra}{pulli}' or
'{vs_ii}{ra}{pulli}' or
'{vs_ii}{ya}{ra}{pulli}'
] <- '{pulli}'
(set found_a_match)
)
or
// 'ku'/'tu' endings removed only when followed by pulli
test ( [ ('{ka}{vs_u}' or '{ta}{vs_u}') (test '{pulli}') ] delete
(set found_a_match)
)
)
// strip present/progressive infix forms left at the end
do ([among(
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
'{ka}{vs_i}{nnna}{pulli}{rra}'
'{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
'{ka}{vs_i}{rra}'
'{ka}{vs_i}{rra}{pulli}'
)] delete
(set found_a_match)
)
)
do fix_endings
)

// Entry point: prefixes first, then suffix classes in a fixed order.
// Words of 4 characters or fewer are rejected by has_min_length and
// returned unstemmed (apart from the initial ending fix-up).
define stem as (
unset found_vetrumai_urupu
do fix_ending
has_min_length
do remove_question_prefixes
do remove_pronoun_prefixes
do remove_question_suffixes
do remove_um
do remove_common_word_endings
do remove_vetrumai_urupukal
do remove_plural_suffix
do remove_command_suffixes
do remove_tense_suffixes
)

contrib/snowball/algorithms/turkish/stem_Unicode.sbl → contrib/snowball/algorithms/turkish.sbl View File

@@ -2,7 +2,7 @@
* author: Evren (Kapusuz) Çilden
* email: evren.kapusuz at gmail.com
* version: 1.0 (15.01.2007)

* stems nominal verb suffixes
* stems nominal inflections
@@ -10,13 +10,13 @@
* (y,n,s,U) context check
* vowel harmony check
* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
* The stemming algorithm is based on the paper "An Affix Stripping
* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
* Eşref Adalı (Proceedings of the IAESTED International Conference
* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004,
* Innsbruck, Austria
* Turkish is an agglutinative language and has a very rich morphological
* structure. In Turkish, you can form many different words from a single stem
* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
@@ -59,14 +59,14 @@ routines (
mark_yken // nominal verb suffix
mark_ymUs_ // nominal verb suffix
mark_ysA // nominal verb suffix
mark_suffix_with_optional_y_consonant
mark_suffix_with_optional_U_vowel
mark_suffix_with_optional_n_consonant
mark_suffix_with_optional_s_consonant
more_than_one_syllable_word
post_process_last_consonants
postlude

@@ -75,34 +75,32 @@ routines (
stem_suffix_chain_before_ki
)

/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA
stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE
stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT
stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS
stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA
stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS

stringescapes { }
stringescapes { }

integers ( strlen ) // length of a string
/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA
stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE
stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT
stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS
stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA
stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS

booleans ( continue_stemming_noun_suffixes )

groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)

define vowel 'ae{i'}io{o"}u{u"}'
define vowel 'ae{i'}io{o"}u{u"}'
define U '{i'}iu{u"}'

// the vowel grouping definitions below are used for checking vowel harmony
define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a'
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i''
define vowel4 'ei' // vowels that can end with suffixes containing 'i'
define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'
define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a'
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e'
define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i''
define vowel4 'ei' // vowels that can end with suffixes containing 'i'
define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u'
define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"'

externals ( stem )
externals ( stem )

backwardmode (
// checks vowel harmony for possible suffixes,
@@ -124,165 +122,165 @@ backwardmode (
)
)
)
// if the last consonant before suffix is vowel and n then advance and delete
// if the last consonant before suffix is non vowel and n do nothing
// if the last consonant before suffix is not n then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_n_consonant as (
((test 'n') next (test vowel))
('n' (test vowel))
or
((not(test 'n')) test(next (test vowel)))
((not(test 'n')) test(next vowel))

)
// if the last consonant before suffix is vowel and s then advance and delete
// if the last consonant before suffix is non vowel and s do nothing
// if the last consonant before suffix is not s then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_s_consonant as (
((test 's') next (test vowel))
('s' (test vowel))
or
((not(test 's')) test(next (test vowel)))
((not(test 's')) test(next vowel))
)
// if the last consonant before suffix is vowel and y then advance and delete
// if the last consonant before suffix is non vowel and y do nothing
// if the last consonant before suffix is not y then only delete the suffix
// assumption: slice beginning is set correctly
define mark_suffix_with_optional_y_consonant as (
((test 'y') next (test vowel))
('y' (test vowel))
or
((not(test 'y')) test(next (test vowel)))
((not(test 'y')) test(next vowel))
)
define mark_suffix_with_optional_U_vowel as (
((test U) next (test non-vowel))
(U (test non-vowel))
or
((not(test U)) test(next (test non-vowel)))
((not(test U)) test(next non-vowel))

)
define mark_possessives as (
among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
(mark_suffix_with_optional_U_vowel)
)
define mark_sU as (
check_vowel_harmony
U
(mark_suffix_with_optional_s_consonant)
)
define mark_lArI as (
among ('leri' 'lar{i'}')
)
define mark_yU as (
check_vowel_harmony
U
(mark_suffix_with_optional_y_consonant)
(mark_suffix_with_optional_y_consonant)
)
define mark_nU as (
check_vowel_harmony
among ('n{i'}' 'ni' 'nu' 'n{u"}')
among ('n{i'}' 'ni' 'nu' 'n{u"}')
)
define mark_nUn as (
check_vowel_harmony
among ('{i'}n' 'in' 'un' '{u"}n')
among ('{i'}n' 'in' 'un' '{u"}n')
(mark_suffix_with_optional_n_consonant)
)
define mark_yA as (
check_vowel_harmony
among('a' 'e')
(mark_suffix_with_optional_y_consonant)
)
define mark_nA as (
check_vowel_harmony
among('na' 'ne')
)
define mark_DA as (
check_vowel_harmony
among('da' 'de' 'ta' 'te')
)
define mark_ndA as (
check_vowel_harmony
among('nda' 'nde')
)
define mark_DAn as (
check_vowel_harmony
among('dan' 'den' 'tan' 'ten')
)
define mark_ndAn as (
check_vowel_harmony
among('ndan' 'nden')
)
define mark_ylA as (
check_vowel_harmony
among('la' 'le')
(mark_suffix_with_optional_y_consonant)
)
define mark_ki as (
'ki'
)
define mark_ncA as (
check_vowel_harmony
among('ca' 'ce')
among('ca' 'ce')
(mark_suffix_with_optional_n_consonant)
)
define mark_yUm as (
check_vowel_harmony
among ('{i'}m' 'im' 'um' '{u"}m')
(mark_suffix_with_optional_y_consonant)
)
define mark_sUn as (
check_vowel_harmony
among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
)
define mark_yUz as (
check_vowel_harmony
among ('{i'}z' 'iz' 'uz' '{u"}z')
(mark_suffix_with_optional_y_consonant)
)
define mark_sUnUz as (
among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
)
define mark_lAr as (
check_vowel_harmony
among ('ler' 'lar')
)
define mark_nUz as (
check_vowel_harmony
among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
)
define mark_DUr as (
check_vowel_harmony
among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
)
define mark_cAsInA as (
among ('cas{i'}na' 'cesine')
)
define mark_yDU as (
check_vowel_harmony
among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
@@ -292,24 +290,24 @@ backwardmode (
(mark_suffix_with_optional_y_consonant)
)

// does not fully obey vowel harmony
// does not fully obey vowel harmony
define mark_ysA as (
among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
(mark_suffix_with_optional_y_consonant)
)
define mark_ymUs_ as (
check_vowel_harmony
among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}')
(mark_suffix_with_optional_y_consonant)
)
define mark_yken as (
'ken' (mark_suffix_with_optional_y_consonant)
)
define stem_nominal_verb_suffixes as (
[
[
set continue_stemming_noun_suffixes
(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
or
@@ -327,7 +325,7 @@ backwardmode (
(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
]delete
)
// stems noun suffix chains ending with -ki
define stem_suffix_chain_before_ki as (
[
@@ -337,7 +335,7 @@ backwardmode (
(mark_lAr] delete try(stem_suffix_chain_before_ki))
or
(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
))
or
(mark_nUn] delete try([
@@ -348,7 +346,7 @@ backwardmode (
(stem_suffix_chain_before_ki)
))
or
(mark_ndA (
(mark_ndA (
(mark_lArI] delete)
or
((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
@@ -357,7 +355,7 @@ backwardmode (
))
)
)
define stem_noun_suffixes as (
([mark_lAr] delete try(stem_suffix_chain_before_ki))
or
@@ -373,24 +371,24 @@ backwardmode (
or
([(mark_ndA or mark_nA)
(
(mark_lArI] delete)
or
(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
)
(mark_lArI] delete)
or
(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
)
)
or
([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
or
( [mark_DAn] delete try ([
(
(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(mark_lAr] delete try(stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
))
(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
or
(mark_lAr] delete try(stem_suffix_chain_before_ki))
or
(stem_suffix_chain_before_ki)
))
)
or
([mark_nUn or mark_ylA] delete
@@ -404,18 +402,18 @@ backwardmode (
)
or
([mark_lArI] delete)
or
or
(stem_suffix_chain_before_ki)
or
([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
or
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
)
define post_process_last_consonants as (
define post_process_last_consonants as (
[substring] among (
'b' (<- 'p')
'c' (<- '{c.}')
'c' (<- '{c,}')
'd' (<- 't')
'{g~}' (<- 'k')
)
@@ -424,7 +422,7 @@ backwardmode (
// after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed
// like in 'kedim' -> 'ked'
// Turkish words don't usually end with 'd' or 'g'
// some very well known words are ignored (like 'ad' 'soyad'
// some very well known words are ignored (like 'ad' 'soyad'
// appends U to stems ending with d or g, decides which vowel to add
// based on the last vowel in the stem
define append_U_to_stems_ending_with_d_or_g as (
@@ -437,7 +435,10 @@ backwardmode (
or
(test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
)

define is_reserved_word as (
'ad' try 'soy' atlimit
)
)

// Tests if there are more than one syllables
@@ -446,18 +447,12 @@ define more_than_one_syllable_word as (
test (atleast 2 (gopast vowel))
)

define is_reserved_word as (
test(gopast 'ad' ($strlen = 2) ($strlen == limit))
or
test(gopast 'soyad' ($strlen = 5) ($strlen == limit))
)

define postlude as (
not(is_reserved_word)
backwards (
not(is_reserved_word)
do append_U_to_stems_ending_with_d_or_g
do post_process_last_consonants

)
)

@@ -469,9 +464,7 @@ define stem as (
continue_stemming_noun_suffixes
do stem_noun_suffixes
)
postlude
)
)



+ 98
- 0
contrib/snowball/charsets/ISO-8859-2.sbl View File

@@ -0,0 +1,98 @@
// ISO-8859-2 character mappings.

stringdef U+00A0 hex 'A0'
stringdef U+0104 hex 'A1'
stringdef U+02D8 hex 'A2'
stringdef U+0141 hex 'A3'
stringdef U+00A4 hex 'A4'
stringdef U+013D hex 'A5'
stringdef U+015A hex 'A6'
stringdef U+00A7 hex 'A7'
stringdef U+00A8 hex 'A8'
stringdef U+0160 hex 'A9'
stringdef U+015E hex 'AA'
stringdef U+0164 hex 'AB'
stringdef U+0179 hex 'AC'
stringdef U+00AD hex 'AD'
stringdef U+017D hex 'AE'
stringdef U+017B hex 'AF'
stringdef U+00B0 hex 'B0'
stringdef U+0105 hex 'B1'
stringdef U+02DB hex 'B2'
stringdef U+0142 hex 'B3'
stringdef U+00B4 hex 'B4'
stringdef U+013E hex 'B5'
stringdef U+015B hex 'B6'
stringdef U+02C7 hex 'B7'
stringdef U+00B8 hex 'B8'
stringdef U+0161 hex 'B9'
stringdef U+015F hex 'BA'
stringdef U+0165 hex 'BB'
stringdef U+017A hex 'BC'
stringdef U+02DD hex 'BD'
stringdef U+017E hex 'BE'
stringdef U+017C hex 'BF'
stringdef U+0154 hex 'C0'
stringdef U+00C1 hex 'C1'
stringdef U+00C2 hex 'C2'
stringdef U+0102 hex 'C3'
stringdef U+00C4 hex 'C4'
stringdef U+0139 hex 'C5'
stringdef U+0106 hex 'C6'
stringdef U+00C7 hex 'C7'
stringdef U+010C hex 'C8'
stringdef U+00C9 hex 'C9'
stringdef U+0118 hex 'CA'
stringdef U+00CB hex 'CB'
stringdef U+011A hex 'CC'
stringdef U+00CD hex 'CD'
stringdef U+00CE hex 'CE'
stringdef U+010E hex 'CF'
stringdef U+0110 hex 'D0'
stringdef U+0143 hex 'D1'
stringdef U+0147 hex 'D2'
stringdef U+00D3 hex 'D3'
stringdef U+00D4 hex 'D4'
stringdef U+0150 hex 'D5'
stringdef U+00D6 hex 'D6'
stringdef U+00D7 hex 'D7'
stringdef U+0158 hex 'D8'
stringdef U+016E hex 'D9'
stringdef U+00DA hex 'DA'
stringdef U+0170 hex 'DB'
stringdef U+00DC hex 'DC'
stringdef U+00DD hex 'DD'
stringdef U+0162 hex 'DE'
stringdef U+00DF hex 'DF'
stringdef U+0155 hex 'E0'
stringdef U+00E1 hex 'E1'
stringdef U+00E2 hex 'E2'
stringdef U+0103 hex 'E3'
stringdef U+00E4 hex 'E4'
stringdef U+013A hex 'E5'
stringdef U+0107 hex 'E6'
stringdef U+00E7 hex 'E7'
stringdef U+010D hex 'E8'
stringdef U+00E9 hex 'E9'
stringdef U+0119 hex 'EA'
stringdef U+00EB hex 'EB'
stringdef U+011B hex 'EC'
stringdef U+00ED hex 'ED'
stringdef U+00EE hex 'EE'
stringdef U+010F hex 'EF'
stringdef U+0111 hex 'F0'
stringdef U+0144 hex 'F1'
stringdef U+0148 hex 'F2'
stringdef U+00F3 hex 'F3'
stringdef U+00F4 hex 'F4'
stringdef U+0151 hex 'F5'
stringdef U+00F6 hex 'F6'
stringdef U+00F7 hex 'F7'
stringdef U+0159 hex 'F8'
stringdef U+016F hex 'F9'
stringdef U+00FA hex 'FA'
stringdef U+0171 hex 'FB'
stringdef U+00FC hex 'FC'
stringdef U+00FD hex 'FD'
stringdef U+0163 hex 'FE'
stringdef U+02D9 hex 'FF'

+ 74
- 0
contrib/snowball/charsets/KOI8-R.sbl View File

@@ -0,0 +1,74 @@
// KOI8-R character mappings.

stringdef U+00A0 hex '9A'
stringdef U+00A9 hex 'BF'
stringdef U+00B0 hex '9C'
stringdef U+00B2 hex '9D'
stringdef U+00B7 hex '9E'
stringdef U+00F7 hex '9F'
stringdef U+0401 hex 'B3'
stringdef U+0410 hex 'E1'
stringdef U+0411 hex 'E2'
stringdef U+0412 hex 'F7'
stringdef U+0413 hex 'E7'
stringdef U+0414 hex 'E4'
stringdef U+0415 hex 'E5'
stringdef U+0416 hex 'F6'
stringdef U+0417 hex 'FA'
stringdef U+0418 hex 'E9'
stringdef U+0419 hex 'EA'
stringdef U+041A hex 'EB'
stringdef U+041B hex 'EC'
stringdef U+041C hex 'ED'
stringdef U+041D hex 'EE'
stringdef U+041E hex 'EF'
stringdef U+041F hex 'F0'
stringdef U+0420 hex 'F2'
stringdef U+0421 hex 'F3'
stringdef U+0422 hex 'F4'
stringdef U+0423 hex 'F5'
stringdef U+0424 hex 'E6'
stringdef U+0425 hex 'E8'
stringdef U+0426 hex 'E3'
stringdef U+0427 hex 'FE'
stringdef U+0428 hex 'FB'
stringdef U+0429 hex 'FD'
stringdef U+042A hex 'FF'
stringdef U+042B hex 'F9'
stringdef U+042C hex 'F8'
stringdef U+042D hex 'FC'
stringdef U+042E hex 'E0'
stringdef U+042F hex 'F1'
stringdef U+0430 hex 'C1'
stringdef U+0431 hex 'C2'
stringdef U+0432 hex 'D7'
stringdef U+0433 hex 'C7'
stringdef U+0434 hex 'C4'
stringdef U+0435 hex 'C5'
stringdef U+0436 hex 'D6'
stringdef U+0437 hex 'DA'
stringdef U+0438 hex 'C9'
stringdef U+0439 hex 'CA'
stringdef U+043A hex 'CB'
stringdef U+043B hex 'CC'
stringdef U+043C hex 'CD'
stringdef U+043D hex 'CE'
stringdef U+043E hex 'CF'
stringdef U+043F hex 'D0'
stringdef U+0440 hex 'D2'
stringdef U+0441 hex 'D3'
stringdef U+0442 hex 'D4'
stringdef U+0443 hex 'D5'
stringdef U+0444 hex 'C6'
stringdef U+0445 hex 'C8'
stringdef U+0446 hex 'C3'
stringdef U+0447 hex 'DE'
stringdef U+0448 hex 'DB'
stringdef U+0449 hex 'DD'
stringdef U+044A hex 'DF'
stringdef U+044B hex 'D9'
stringdef U+044C hex 'D8'
stringdef U+044D hex 'DC'
stringdef U+044E hex 'C0'
stringdef U+044F hex 'D1'
stringdef U+0451 hex 'A3'

+ 130
- 0
contrib/snowball/charsets/cp850.sbl View File

@@ -0,0 +1,130 @@
// Code page 850 (MSDOS Latin 1) character mappings.

stringdef U+00A0 hex 'FF'
stringdef U+00A1 hex 'AD'
stringdef U+00A2 hex 'BD'
stringdef U+00A3 hex '9C'
stringdef U+00A4 hex 'CF'
stringdef U+00A5 hex 'BE'
stringdef U+00A6 hex 'DD'
stringdef U+00A7 hex 'F5'
stringdef U+00A8 hex 'F9'
stringdef U+00A9 hex 'B8'
stringdef U+00AA hex 'A6'
stringdef U+00AB hex 'AE'
stringdef U+00AC hex 'AA'
stringdef U+00AD hex 'F0'
stringdef U+00AE hex 'A9'
stringdef U+00AF hex 'EE'
stringdef U+00B0 hex 'F8'
stringdef U+00B1 hex 'F1'
stringdef U+00B2 hex 'FD'
stringdef U+00B3 hex 'FC'
stringdef U+00B4 hex 'EF'
stringdef U+00B5 hex 'E6'
stringdef U+00B6 hex 'F4'
stringdef U+00B7 hex 'FA'
stringdef U+00B8 hex 'F7'
stringdef U+00B9 hex 'FB'
stringdef U+00BA hex 'A7'
stringdef U+00BB hex 'AF'
stringdef U+00BC hex 'AC'
stringdef U+00BD hex 'AB'
stringdef U+00BE hex 'F3'
stringdef U+00BF hex 'A8'
stringdef U+00C0 hex 'B7'
stringdef U+00C1 hex 'B5'
stringdef U+00C2 hex 'B6'
stringdef U+00C3 hex 'C7'
stringdef U+00C4 hex '8E'
stringdef U+00C5 hex '8F'
stringdef U+00C6 hex '92'
stringdef U+00C7 hex '80'
stringdef U+00C8 hex 'D4'
stringdef U+00C9 hex '90'
stringdef U+00CA hex 'D2'
stringdef U+00CB hex 'D3'
stringdef U+00CC hex 'DE'
stringdef U+00CD hex 'D6'
stringdef U+00CE hex 'D7'
stringdef U+00CF hex 'D8'
stringdef U+00D0 hex 'D1'
stringdef U+00D1 hex 'A5'
stringdef U+00D2 hex 'E3'
stringdef U+00D3 hex 'E0'
stringdef U+00D4 hex 'E2'
stringdef U+00D5 hex 'E5'
stringdef U+00D6 hex '99'
stringdef U+00D7 hex '9E'
stringdef U+00D8 hex '9D'
stringdef U+00D9 hex 'EB'
stringdef U+00DA hex 'E9'
stringdef U+00DB hex 'EA'
stringdef U+00DC hex '9A'
stringdef U+00DD hex 'ED'
stringdef U+00DE hex 'E8'
stringdef U+00DF hex 'E1'
stringdef U+00E0 hex '85'
stringdef U+00E1 hex 'A0'
stringdef U+00E2 hex '83'
stringdef U+00E3 hex 'C6'
stringdef U+00E4 hex '84'
stringdef U+00E5 hex '86'
stringdef U+00E6 hex '91'
stringdef U+00E7 hex '87'
stringdef U+00E8 hex '8A'
stringdef U+00E9 hex '82'
stringdef U+00EA hex '88'
stringdef U+00EB hex '89'
stringdef U+00EC hex '8D'
stringdef U+00ED hex 'A1'
stringdef U+00EE hex '8C'
stringdef U+00EF hex '8B'
stringdef U+00F0 hex 'D0'
stringdef U+00F1 hex 'A4'
stringdef U+00F2 hex '95'
stringdef U+00F3 hex 'A2'
stringdef U+00F4 hex '93'
stringdef U+00F5 hex 'E4'
stringdef U+00F6 hex '94'
stringdef U+00F7 hex 'F6'
stringdef U+00F8 hex '9B'
stringdef U+00F9 hex '97'
stringdef U+00FA hex 'A3'
stringdef U+00FB hex '96'
stringdef U+00FC hex '81'
stringdef U+00FD hex 'EC'
stringdef U+00FE hex 'E7'
stringdef U+00FF hex '98'
stringdef U+0131 hex 'D5'
stringdef U+0192 hex '9F'
stringdef U+2017 hex 'F2'
stringdef U+2500 hex 'C4'
stringdef U+2502 hex 'B3'
stringdef U+250C hex 'DA'
stringdef U+2510 hex 'BF'
stringdef U+2514 hex 'C0'
stringdef U+2518 hex 'D9'
stringdef U+251C hex 'C3'
stringdef U+2524 hex 'B4'
stringdef U+252C hex 'C2'
stringdef U+2534 hex 'C1'
stringdef U+253C hex 'C5'
stringdef U+2550 hex 'CD'
stringdef U+2551 hex 'BA'
stringdef U+2554 hex 'C9'
stringdef U+2557 hex 'BB'
stringdef U+255A hex 'C8'
stringdef U+255D hex 'BC'
stringdef U+2560 hex 'CC'
stringdef U+2563 hex 'B9'
stringdef U+2566 hex 'CB'
stringdef U+2569 hex 'CA'
stringdef U+256C hex 'CE'
stringdef U+2580 hex 'DF'
stringdef U+2584 hex 'DC'
stringdef U+2588 hex 'DB'
stringdef U+2591 hex 'B0'
stringdef U+2592 hex 'B1'
stringdef U+2593 hex 'B2'
stringdef U+25A0 hex 'FE'

+ 644
- 222
contrib/snowball/compiler/analyser.c
File diff suppressed because it is too large
View File


+ 391
- 75
contrib/snowball/compiler/driver.c View File

@@ -1,48 +1,86 @@
#include <ctype.h> /* for toupper etc */
#include <stdio.h> /* for fprintf etc */
#include <stdlib.h> /* for free etc */
#include <string.h> /* for strlen */
#include <string.h> /* for strcmp */
#include "header.h"

#define DEFAULT_PACKAGE "org.tartarus.snowball.ext"
#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among"
#define DEFAULT_STRING_CLASS "java.lang.StringBuilder"
#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext"
#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among"
#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder"

#define DEFAULT_GO_PACKAGE "snowball"
#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go"

#define DEFAULT_CS_NAMESPACE "Snowball"
#define DEFAULT_CS_BASE_CLASS "Stemmer"
#define DEFAULT_CS_AMONG_CLASS "Among"
#define DEFAULT_CS_STRING_CLASS "StringBuilder"

#define DEFAULT_JS_BASE_CLASS "BaseStemmer"

#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer"

static int eq(const char * s1, const char * s2) {
int s1_len = strlen(s1);
int s2_len = strlen(s2);
return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0;
return strcmp(s1, s2) == 0;
}

static void print_arglist(void) {
fprintf(stderr, "Usage: snowball <file> [options]\n\n"
"options are: [-o[utput] file]\n"
" [-s[yntax]]\n"
static void print_arglist(int exit_code) {
FILE * f = exit_code ? stderr : stdout;
fprintf(f, "Usage: snowball SOURCE_FILE... [OPTIONS]\n\n"
"Supported options:\n"
" -o[utput] file\n"
" -s[yntax]\n"
" -comments\n"
#ifndef DISABLE_JAVA
" [-j[ava]]\n"
" -j[ava]\n"
#endif
" [-c++]\n"
" [-w[idechars]]\n"
" [-u[tf8]]\n"
" [-n[ame] class name]\n"
" [-ep[refix] string]\n"
" [-vp[refix] string]\n"
" [-i[nclude] directory]\n"
" [-r[untime] path to runtime headers]\n"
#ifndef DISABLE_JAVA
" [-p[arentclassname] fully qualified parent class name]\n"
" [-P[ackage] package name for stemmers]\n"
" [-S[tringclass] StringBuffer-compatible class]\n"
" [-a[mongclass] fully qualified name of the Among class]\n"
#ifndef DISABLE_CSHARP
" -cs[harp]\n"
#endif
" -c++\n"
#ifndef DISABLE_PASCAL
" -pascal\n"
#endif
#ifndef DISABLE_PYTHON
" -py[thon]\n"
#endif
#ifndef DISABLE_JS
" -js\n"
#endif
#ifndef DISABLE_RUST
" -rust\n"
#endif
#ifndef DISABLE_GO
" -go\n"
#endif
" -w[idechars]\n"
" -u[tf8]\n"
" -n[ame] class name\n"
" -ep[refix] string\n"
" -vp[refix] string\n"
" -i[nclude] directory\n"
" -r[untime] path to runtime headers\n"
" -p[arentclassname] fully qualified parent class name\n"
#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
" -P[ackage] package name for stemmers\n"
" -S[tringclass] StringBuffer-compatible class\n"
" -a[mongclass] fully qualified name of the Among class\n"
#endif
#ifndef DISABLE_GO
" -gop[ackage] Go package name for stemmers\n"
" -gor[untime] Go snowball runtime package\n"
#endif
" --help display this help and exit\n"
" --version output version information and exit\n"
);
exit(1);
exit(exit_code);
}

static void check_lim(int i, int argc) {
if (i >= argc) {
fprintf(stderr, "argument list is one short\n");
print_arglist();
print_arglist(1);
}
}

@@ -57,35 +95,47 @@ static FILE * get_output(symbol * b) {
return output;
}

static void read_options(struct options * o, int argc, char * argv[]) {
static int read_options(struct options * o, int argc, char * argv[]) {
char * s;
int i = 2;
int i = 1;
int new_argc = 1;
/* Note down the last option used to specify an explicit encoding so
* we can warn we ignored it for languages with a fixed encoding.
*/
const char * encoding_opt = NULL;

/* set defaults: */

o->output_file = 0;
o->syntax_tree = false;
o->externals_prefix = "";
o->comments = false;
o->externals_prefix = NULL;
o->variables_prefix = 0;
o->runtime_path = 0;
o->parent_class_name = DEFAULT_BASE_CLASS;
o->string_class = DEFAULT_STRING_CLASS;
o->among_class = DEFAULT_AMONG_CLASS;
o->package = DEFAULT_PACKAGE;
o->name = "";
o->parent_class_name = NULL;
o->string_class = NULL;
o->among_class = NULL;
o->package = NULL;
o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME;
o->name = NULL;
o->make_lang = LANG_C;
o->widechars = false;
o->includes = 0;
o->includes_end = 0;
o->utf8 = false;
o->encoding = ENC_SINGLEBYTE;

/* read options: */

repeat {
if (i >= argc) break;
while (i < argc) {
s = argv[i++];
{ if (eq(s, "-o") || eq(s, "-output")) {
check_lim(i, argc);
if (s[0] != '-') {
/* Non-option argument - shuffle down. */
argv[new_argc++] = s;
continue;
}

{
if (eq(s, "-o") || eq(s, "-output")) {
check_lim(i, argc);
o->output_file = argv[i++];
continue;
}
@@ -94,10 +144,33 @@ static void read_options(struct options * o, int argc, char * argv[]) {
o->name = argv[i++];
continue;
}
#ifndef DISABLE_JS
if (eq(s, "-js")) {
o->make_lang = LANG_JAVASCRIPT;
continue;
}
#endif
#ifndef DISABLE_RUST
if (eq(s, "-rust")) {
o->make_lang = LANG_RUST;
continue;
}
#endif
#ifndef DISABLE_GO
if (eq(s, "-go")) {
o->make_lang = LANG_GO;
continue;
}
#endif
#ifndef DISABLE_JAVA
if (eq(s, "-j") || eq(s, "-java")) {
o->make_lang = LANG_JAVA;
o->widechars = true;
continue;
}
#endif
#ifndef DISABLE_CSHARP
if (eq(s, "-cs") || eq(s, "-csharp")) {
o->make_lang = LANG_CSHARP;
continue;
}
#endif
@@ -105,15 +178,31 @@ static void read_options(struct options * o, int argc, char * argv[]) {
o->make_lang = LANG_CPLUSPLUS;
continue;
}
#ifndef DISABLE_PASCAL
if (eq(s, "-pascal")) {
o->make_lang = LANG_PASCAL;
continue;
}
#endif
#ifndef DISABLE_PYTHON
if (eq(s, "-py") || eq(s, "-python")) {
o->make_lang = LANG_PYTHON;
continue;
}
#endif
if (eq(s, "-w") || eq(s, "-widechars")) {
o->widechars = true;
o->utf8 = false;
encoding_opt = s;
o->encoding = ENC_WIDECHARS;
continue;
}
if (eq(s, "-s") || eq(s, "-syntax")) {
o->syntax_tree = true;
continue;
}
if (eq(s, "-comments")) {
o->comments = true;
continue;
}
if (eq(s, "-ep") || eq(s, "-eprefix")) {
check_lim(i, argc);
o->externals_prefix = argv[i++];
@@ -145,16 +234,16 @@ static void read_options(struct options * o, int argc, char * argv[]) {
continue;
}
if (eq(s, "-u") || eq(s, "-utf8")) {
o->utf8 = true;
o->widechars = false;
encoding_opt = s;
o->encoding = ENC_UTF8;
continue;
}
#ifndef DISABLE_JAVA
if (eq(s, "-p") || eq(s, "-parentclassname")) {
check_lim(i, argc);
o->parent_class_name = argv[i++];
continue;
}
#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
if (eq(s, "-P") || eq(s, "-Package")) {
check_lim(i, argc);
o->package = argv[i++];
@@ -171,44 +260,216 @@ static void read_options(struct options * o, int argc, char * argv[]) {
continue;
}
#endif
#ifndef DISABLE_GO
if (eq(s, "-gop") || eq(s, "-gopackage")) {
check_lim(i, argc);
o->package = argv[i++];
continue;
}
if (eq(s, "-gor") || eq(s, "-goruntime")) {
check_lim(i, argc);
o->go_snowball_runtime = argv[i++];
continue;
}
#endif
if (eq(s, "--help")) {
print_arglist(0);
}

if (eq(s, "--version")) {
printf("Snowball compiler version " SNOWBALL_VERSION "\n");
exit(0);
}

fprintf(stderr, "'%s' misplaced\n", s);
print_arglist();
print_arglist(1);
}
}
if (new_argc == 1) {
fprintf(stderr, "no source files specified\n");
print_arglist(1);
}
argv[new_argc] = NULL;

/* Set language-dependent defaults. */
switch (o->make_lang) {
case LANG_C:
case LANG_CPLUSPLUS:
encoding_opt = NULL;
break;
case LANG_CSHARP:
o->encoding = ENC_WIDECHARS;
if (!o->parent_class_name)
o->parent_class_name = DEFAULT_CS_BASE_CLASS;
if (!o->string_class)
o->string_class = DEFAULT_CS_STRING_CLASS;
if (!o->among_class)
o->among_class = DEFAULT_CS_AMONG_CLASS;
if (!o->package)
o->package = DEFAULT_CS_NAMESPACE;
break;
case LANG_GO:
o->encoding = ENC_UTF8;
if (!o->package)
o->package = DEFAULT_GO_PACKAGE;
break;
case LANG_JAVA:
o->encoding = ENC_WIDECHARS;
if (!o->parent_class_name)
o->parent_class_name = DEFAULT_JAVA_BASE_CLASS;
if (!o->string_class)
o->string_class = DEFAULT_JAVA_STRING_CLASS;
if (!o->among_class)
o->among_class = DEFAULT_JAVA_AMONG_CLASS;
if (!o->package)
o->package = DEFAULT_JAVA_PACKAGE;
break;
case LANG_JAVASCRIPT:
o->encoding = ENC_WIDECHARS;
if (!o->parent_class_name)
o->parent_class_name = DEFAULT_JS_BASE_CLASS;
break;
case LANG_PYTHON:
o->encoding = ENC_WIDECHARS;
if (!o->parent_class_name)
o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS;
break;
case LANG_RUST:
o->encoding = ENC_UTF8;
break;
default:
break;
}

if (encoding_opt) {
fprintf(stderr, "warning: %s only meaningful for C and C++\n",
encoding_opt);
}

if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) {
if (o->runtime_path) {
fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n");
}
if (o->externals_prefix) {
fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n");
}
}
if (!o->externals_prefix) o->externals_prefix = "";

if (!o->name && o->output_file) {
/* Default class name to basename of output_file - this is the standard
* convention for at least Java and C#.
*/
const char * slash = strrchr(o->output_file, '/');
size_t len;
const char * leaf = (slash == NULL) ? o->output_file : slash + 1;

slash = strrchr(leaf, '\\');
if (slash != NULL) leaf = slash + 1;

{
const char * dot = strchr(leaf, '.');
len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf);
}

{
char * new_name = malloc(len + 1);
switch (o->make_lang) {
case LANG_CSHARP:
case LANG_PASCAL:
/* Upper case initial letter. */
memcpy(new_name, leaf, len);
new_name[0] = toupper(new_name[0]);
break;
case LANG_JAVASCRIPT:
case LANG_PYTHON: {
/* Upper case initial letter and change each
* underscore+letter or hyphen+letter to an upper case
* letter.
*/
size_t i, j = 0;
int uc_next = true;
for (i = 0; i != len; ++i) {
unsigned char ch = leaf[i];
if (ch == '_' || ch == '-') {
uc_next = true;
} else {
if (uc_next) {
new_name[j] = toupper(ch);
uc_next = false;
} else {
new_name[j] = ch;
}
++j;
}
}
len = j;
break;
}
default:
/* Just copy. */
memcpy(new_name, leaf, len);
break;
}
new_name[len] = '\0';
o->name = new_name;
}
}

return new_argc;
}

extern int main(int argc, char * argv[]) {

int i;
NEW(options, o);
if (argc == 1) print_arglist();
read_options(o, argc, argv);
argc = read_options(o, argc, argv);
{
symbol * filename = add_s_to_b(0, argv[1]);
char * file;
symbol * u = get_input(filename, &file);
char * file = argv[1];
symbol * u = get_input(file);
if (u == 0) {
fprintf(stderr, "Can't open input %s\n", argv[1]);
fprintf(stderr, "Can't open input %s\n", file);
exit(1);
}
{
struct tokeniser * t = create_tokeniser(u, file);
struct analyser * a = create_analyser(t);
t->widechars = o->widechars;
struct input ** next_input_ptr = &(t->next);
a->encoding = t->encoding = o->encoding;
t->includes = o->includes;
a->utf8 = t->utf8 = o->utf8;
/* If multiple source files are specified, set up the others to be
* read after the first in order, using the same mechanism as
* 'get' uses. */
for (i = 2; i != argc; ++i) {
NEW(input, q);
file = argv[i];
u = get_input(file);
if (u == 0) {
fprintf(stderr, "Can't open input %s\n", file);
exit(1);
}
q->p = u;
q->c = 0;
q->file = file;
q->file_needs_freeing = false;
q->line_number = 1;
*next_input_ptr = q;
next_input_ptr = &(q->next);
}
*next_input_ptr = NULL;
read_program(a);
if (t->error_count > 0) exit(1);
if (o->syntax_tree) print_program(a);
close_tokeniser(t);
unless (o->syntax_tree) {
if (!o->syntax_tree) {
struct generator * g;

char * s = o->output_file;
unless (s) {
const char * s = o->output_file;
if (!s) {
fprintf(stderr, "Please include the -o option\n");
print_arglist();
exit(1);
print_arglist(1);
}
g = create_generator(a, o);
if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".h");
@@ -217,41 +478,96 @@ extern int main(int argc, char * argv[]) {
if (o->make_lang == LANG_CPLUSPLUS) {
b = add_s_to_b(b, "c");
}
o->output_c = get_output(b);
o->output_src = get_output(b);
lose_b(b);

g = create_generator_c(a, o);
generate_program_c(g);
close_generator_c(g);
fclose(o->output_c);
fclose(o->output_src);
fclose(o->output_h);
}
#ifndef DISABLE_JAVA
if (o->make_lang == LANG_JAVA) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".java");
o->output_java = get_output(b);
o->output_src = get_output(b);
lose_b(b);
g = create_generator_java(a, o);
generate_program_java(g);
close_generator_java(g);
fclose(o->output_java);
fclose(o->output_src);
}
#endif
#ifndef DISABLE_PASCAL
if (o->make_lang == LANG_PASCAL) {
symbol *b = add_s_to_b(0, s);
b = add_s_to_b(b, ".pas");
o->output_src = get_output(b);
lose_b(b);
generate_program_pascal(g);
fclose(o->output_src);
}
#endif
#ifndef DISABLE_PYTHON
if (o->make_lang == LANG_PYTHON) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".py");
o->output_src = get_output(b);
lose_b(b);
generate_program_python(g);
fclose(o->output_src);
}
#endif
#ifndef DISABLE_JS
if (o->make_lang == LANG_JAVASCRIPT) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".js");
o->output_src = get_output(b);
lose_b(b);
generate_program_js(g);
fclose(o->output_src);
}
#endif
#ifndef DISABLE_CSHARP
if (o->make_lang == LANG_CSHARP) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".cs");
o->output_src = get_output(b);
lose_b(b);
generate_program_csharp(g);
fclose(o->output_src);
}
#endif
#ifndef DISABLE_RUST
if (o->make_lang == LANG_RUST) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".rs");
o->output_src = get_output(b);
lose_b(b);
generate_program_rust(g);
fclose(o->output_src);
}
#endif
#ifndef DISABLE_GO
if (o->make_lang == LANG_GO) {
symbol * b = add_s_to_b(0, s);
b = add_s_to_b(b, ".go");
o->output_src = get_output(b);
lose_b(b);
generate_program_go(g);
fclose(o->output_src);
}
#endif
close_generator(g);
}
close_analyser(a);
}
lose_b(u);
lose_b(filename);
}
{ struct include * p = o->includes;
until (p == 0)
{ struct include * q = p->next;
while (p) {
struct include * q = p->next;
lose_b(p->b); FREE(p); p = q;
}
}
FREE(o);
unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count);
if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count);
return 0;
}


+ 709
- 449
contrib/snowball/compiler/generator.c
File diff suppressed because it is too large
View File


+ 0
- 1452
contrib/snowball/compiler/generator_java.c
File diff suppressed because it is too large
View File


+ 151
- 64
contrib/snowball/compiler/header.h View File

@@ -1,49 +1,56 @@
#include <stdio.h>

#define SNOWBALL_VERSION "2.0.0"

typedef unsigned char byte;
typedef unsigned short symbol;

#define true 1
#define false 0
#define repeat while(true)
#define unless(C) if(!(C))
#define until(C) while(!(C))

#define MALLOC check_malloc
#define FREE check_free

#define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type))
#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * n)
#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n))

#define STARTSIZE 10
#define SIZE(p) ((int *)(p))[-1]
#define CAPACITY(p) ((int *)(p))[-2]

extern symbol * create_b(int n);
extern void report_b(FILE * out, symbol * p);
extern void report_b(FILE * out, const symbol * p);
extern void lose_b(symbol * p);
extern symbol * increase_capacity(symbol * p, int n);
extern symbol * move_to_b(symbol * p, int n, symbol * q);
extern symbol * add_to_b(symbol * p, int n, symbol * q);
extern symbol * copy_b(symbol * p);
extern char * b_to_s(symbol * p);
extern symbol * move_to_b(symbol * p, int n, const symbol * q);
extern symbol * add_to_b(symbol * p, int n, const symbol * q);
extern symbol * copy_b(const symbol * p);
extern char * b_to_s(const symbol * p);
extern symbol * add_s_to_b(symbol * p, const char * s);

#define MOVE_TO_B(B, LIT) \
move_to_b(B, sizeof(LIT) / sizeof(LIT[0]), LIT)

struct str; /* defined in space.c */

extern struct str * str_new(void);
extern void str_delete(struct str * str);
extern void str_append(struct str * str, struct str * add);
extern void str_append(struct str * str, const struct str * add);
extern void str_append_ch(struct str * str, char add);
extern void str_append_b(struct str * str, symbol * q);
extern void str_append_b(struct str * str, const symbol * q);
extern void str_append_b_tail(struct str * str, const symbol * q, int skip);
extern void str_append_string(struct str * str, const char * s);
extern void str_append_int(struct str * str, int i);
extern void str_clear(struct str * str);
extern void str_assign(struct str * str, char * s);
extern struct str * str_copy(struct str * old);
extern symbol * str_data(struct str * str);
extern int str_len(struct str * str);
extern void str_assign(struct str * str, const char * s);
extern struct str * str_copy(const struct str * old);
extern symbol * str_data(const struct str * str);
extern int str_len(const struct str * str);
extern int str_back(const struct str *str);
extern int get_utf8(const symbol * p, int * slot);
extern int put_utf8(int ch, symbol * p);
extern void output_str(FILE * outfile, struct str * str);

typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc;

struct m_pair {

@@ -60,6 +67,7 @@ struct input {
symbol * p;
int c;
char * file;
int file_needs_freeing;
int line_number;

};
@@ -71,6 +79,28 @@ struct include {

};

enum token_codes {

#include "syswords2.h"

c_mathassign,
c_name,
c_number,
c_literalstring,
c_neg,
c_call,
c_grouping,
c_booltest,

NUM_TOKEN_CODES
};

enum uplus_modes {
UPLUS_NONE,
UPLUS_DEFINED,
UPLUS_UNICODE
};

/* struct input must be a prefix of struct tokeniser. */
struct tokeniser {

@@ -78,6 +108,7 @@ struct tokeniser {
symbol * p;
int c;
char * file;
int file_needs_freeing;
int line_number;
symbol * b;
symbol * b2;
@@ -90,34 +121,28 @@ struct tokeniser {
int token;
int previous_token;
byte token_held;
byte widechars;
byte utf8;
enc encoding;

int omission;
struct include * includes;

/* Mode in which U+ has been used:
* UPLUS_NONE - not used yet
* UPLUS_DEFINED - stringdef U+xxxx ....
* UPLUS_UNICODE - {U+xxxx} used with implicit meaning
*/
int uplusmode;

char token_disabled[NUM_TOKEN_CODES];
};

extern symbol * get_input(symbol * p, char ** p_file);
extern symbol * get_input(const char * filename);
extern struct tokeniser * create_tokeniser(symbol * b, char * file);
extern int read_token(struct tokeniser * t);
extern const char * name_of_token(int code);
extern void disable_token(struct tokeniser * t, int code);
extern void close_tokeniser(struct tokeniser * t);

enum token_codes {

#include "syswords2.h"

c_mathassign,
c_name,
c_number,
c_literalstring,
c_neg,
c_call,
c_grouping,
c_booltest
};

extern int space_count;
extern void * check_malloc(int n);
extern void check_free(void * p);
@@ -134,7 +159,13 @@ struct name {
int count; /* 0, 1, 2 for each type */
struct grouping * grouping; /* for grouping names */
byte referenced;
byte used;
byte used_in_among; /* Function used in among? */
byte value_used; /* (For variables) is its value ever used? */
byte initialised; /* (For variables) is it ever initialised? */
byte used_in_definition; /* (grouping) used in grouping definition? */
struct node * used; /* First use, or NULL if not used */
struct name * local_to; /* Local to one routine/external */
int declaration_line_number;/* Line number of declaration */

};

@@ -149,9 +180,10 @@ struct amongvec {

symbol * b; /* the string giving the case */
int size; /* - and its size */
struct node * p; /* the corresponding command */
struct node * action; /* the corresponding action */
int i; /* the amongvec index of the longest substring of b */
int result; /* the numeric result for the case */
int line_number; /* for diagnostics and stable sorting */
struct name * function;

};
@@ -163,19 +195,22 @@ struct among {
int number; /* amongs are numbered 0, 1, 2 ... */
int literalstring_count; /* in this among */
int command_count; /* in this among */
int nocommand_count; /* number of "no command" entries in this among */
int function_count; /* in this among */
int amongvar_needed; /* do we need to set among_var? */
struct node * starter; /* i.e. among( (starter) 'string' ... ) */
struct node * substring; /* i.e. substring ... among ( ... ) */
struct node ** commands; /* array with command_count entries */
};

struct grouping {

struct grouping * next;
int number; /* groupings are numbered 0, 1, 2 ... */
symbol * b; /* the characters of this group */
int largest_ch; /* character with max code */
int smallest_ch; /* character with min code */
byte no_gaps; /* not used in generator.c after 11/5/05 */
struct name * name; /* so g->name->grouping == g */
int line_number;
};

struct node {
@@ -234,7 +269,8 @@ struct analyser {
struct grouping * groupings;
struct grouping * groupings_end;
struct node * substring; /* pending 'substring' in current routine definition */
byte utf8;
enc encoding;
byte int_limits_used; /* are maxint or minint used? */
};

enum analyser_modes {
@@ -259,16 +295,23 @@ struct generator {
struct str * outbuf; /* temporary str to store output */
struct str * declarations; /* str storing variable declarations */
int next_label;
#ifndef DISABLE_PYTHON
int max_label;
#endif
int margin;

const char * failure_string; /* String to output in case of a failure. */
#ifndef DISABLE_JAVA
struct str * failure_str; /* This is used by the java generator instead of failure_string */
/* if > 0, keep_count to restore in case of a failure;
* if < 0, the negated keep_count for the limit to restore in case of
* failure. */
int failure_keep_count;
#if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP)
struct str * failure_str; /* This is used by some generators instead of failure_keep_count */
#endif

int label_used; /* Keep track of whether the failure label is used. */
int failure_label;
int debug_count;
int copy_from_count; /* count of calls to copy_from() */

const char * S[10]; /* strings */
symbol * B[10]; /* blocks */
@@ -277,48 +320,92 @@ struct generator {
symbol * L[5]; /* literals, used in formatted write */

int line_count; /* counts number of lines output */
int line_labelled; /* in ANSI C, will need extra ';' if it is a block end */
int line_labelled; /* in ISO C, will need extra ';' if it is a block end */
int literalstring_count;
int keep_count; /* used to number keep/restore pairs to avoid compiler warnings
about shadowed variables */
};

/* Special values for failure_label in struct generator. */
enum special_labels {
x_return = -1
};

struct options {

/* for the command line: */

char * output_file;
char * name;
FILE * output_c;
const char * output_file;
const char * name;
FILE * output_src;
FILE * output_h;
#ifndef DISABLE_JAVA
FILE * output_java;
#endif
byte syntax_tree;
byte widechars;
enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang;
char * externals_prefix;
char * variables_prefix;
char * runtime_path;
char * parent_class_name;
char * package;
char * string_class;
char * among_class;
byte comments;
enc encoding;
enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang;
const char * externals_prefix;
const char * variables_prefix;
const char * runtime_path;
const char * parent_class_name;
const char * package;
const char * go_snowball_runtime;
const char * string_class;
const char * among_class;
struct include * includes;
struct include * includes_end;
byte utf8;
};

/* Generator for C code. */
extern struct generator * create_generator_c(struct analyser * a, struct options * o);
extern void close_generator_c(struct generator * g);
/* Generator functions common to several backends. */

extern struct generator * create_generator(struct analyser * a, struct options * o);
extern void close_generator(struct generator * g);

extern void write_char(struct generator * g, int ch);
extern void write_newline(struct generator * g);
extern void write_string(struct generator * g, const char * s);
extern void write_int(struct generator * g, int i);
extern void write_b(struct generator * g, symbol * b);
extern void write_str(struct generator * g, struct str * str);

extern void write_comment_content(struct generator * g, struct node * p);
extern void write_generated_comment_content(struct generator * g);
extern void write_start_comment(struct generator * g,
const char * comment_start,
const char * comment_end);

extern int K_needed(struct generator * g, struct node * p);
extern int repeat_restore(struct generator * g, struct node * p);

/* Generator for C code. */
extern void generate_program_c(struct generator * g);

#ifndef DISABLE_JAVA
/* Generator for Java code. */
extern struct generator * create_generator_java(struct analyser * a, struct options * o);
extern void close_generator_java(struct generator * g);

extern void generate_program_java(struct generator * g);
#endif

#ifndef DISABLE_CSHARP
/* Generator for C# code. */
extern void generate_program_csharp(struct generator * g);
#endif

#ifndef DISABLE_PASCAL
extern void generate_program_pascal(struct generator * g);
#endif

#ifndef DISABLE_PYTHON
/* Generator for Python code. */
extern void generate_program_python(struct generator * g);
#endif

#ifndef DISABLE_JS
extern void generate_program_js(struct generator * g);
#endif

#ifndef DISABLE_RUST
extern void generate_program_rust(struct generator * g);
#endif

#ifndef DISABLE_GO
extern void generate_program_go(struct generator * g);
#endif

+ 48
- 24
contrib/snowball/compiler/space.c View File

@@ -57,9 +57,19 @@ extern symbol * create_b(int n) {
return p;
}

extern void report_b(FILE * out, symbol * p) {
extern void report_b(FILE * out, const symbol * p) {
int i;
for (i = 0; i < SIZE(p); i++) fprintf(out, "%c", p[i]);
for (i = 0; i < SIZE(p); i++) {
if (p[i] > 255) {
printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
exit(1);
}
putc(p[i], out);
}
}

extern void output_str(FILE * outfile, struct str * str) {
report_b(outfile, str_data(str));
}

extern void lose_b(symbol * p) {
@@ -74,19 +84,19 @@ extern symbol * increase_capacity(symbol * p, int n) {
lose_b(p); return q;
}

extern symbol * move_to_b(symbol * p, int n, symbol * q) {
extern symbol * move_to_b(symbol * p, int n, const symbol * q) {
int x = n - CAPACITY(p);
if (x > 0) p = increase_capacity(p, x);
memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p;
}

extern symbol * add_to_b(symbol * p, int n, symbol * q) {
extern symbol * add_to_b(symbol * p, int n, const symbol * q) {
int x = SIZE(p) + n - CAPACITY(p);
if (x > 0) p = increase_capacity(p, x);
memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p;
}

extern symbol * copy_b(symbol * p) {
extern symbol * copy_b(const symbol * p) {
int n = SIZE(p);
symbol * q = create_b(n);
move_to_b(q, n, p);
@@ -97,7 +107,7 @@ int space_count = 0;

extern void * check_malloc(int n) {
space_count++;
return calloc(1, n);
return malloc(n);
}

extern void check_free(void * p) {
@@ -107,18 +117,18 @@ extern void check_free(void * p) {

/* To convert a block to a zero terminated string: */

extern char * b_to_s(symbol * p) {
extern char * b_to_s(const symbol * p) {
int n = SIZE(p);
char * s = (char *)calloc(1, n + 1);
char * s = (char *)malloc(n + 1);
{
int i;
for (i = 0; i < n; i++) {
if (p[i] > 255) {
printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
exit(1);
}
s[i] = (char)p[i];
}
if (p[i] > 255) {
printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]);
exit(1);
}
s[i] = (char)p[i];
}
}
s[n] = 0;
return s;
@@ -153,9 +163,9 @@ struct str {
};

/* Create a new string. */
extern struct str * str_new() {
extern struct str * str_new(void) {

struct str * output = (struct str *) calloc(1, sizeof(struct str));
struct str * output = (struct str *) malloc(sizeof(struct str));
output->data = create_b(0);
return output;
}
@@ -168,7 +178,7 @@ extern void str_delete(struct str * str) {
}

/* Append a str to this str. */
extern void str_append(struct str * str, struct str * add) {
extern void str_append(struct str * str, const struct str * add) {

symbol * q = add->data;
str->data = add_to_b(str->data, SIZE(q), q);
@@ -183,12 +193,19 @@ extern void str_append_ch(struct str * str, char add) {
}

/* Append a low level block to a str. */
extern void str_append_b(struct str * str, symbol * q) {
extern void str_append_b(struct str * str, const symbol * q) {

str->data = add_to_b(str->data, SIZE(q), q);
}

/* Append a (char *, null teminated) string to a str. */
/* Append the tail of a low level block to a str. */
extern void str_append_b_tail(struct str * str, const symbol * q, int skip) {
if (skip < 0 || skip >= SIZE(q)) return;

str->data = add_to_b(str->data, SIZE(q) - skip, q + skip);
}

/* Append a (char *, null terminated) string to a str. */
extern void str_append_string(struct str * str, const char * s) {

str->data = add_s_to_b(str->data, s);
@@ -209,14 +226,14 @@ extern void str_clear(struct str * str) {
}

/* Set a string */
extern void str_assign(struct str * str, char * s) {
extern void str_assign(struct str * str, const char * s) {

str_clear(str);
str_append_string(str, s);
}

/* Copy a string. */
extern struct str * str_copy(struct str * old) {
extern struct str * str_copy(const struct str * old) {

struct str * newstr = str_new();
str_append(newstr, old);
@@ -224,17 +241,25 @@ extern struct str * str_copy(struct str * old) {
}

/* Get the data stored in this str. */
extern symbol * str_data(struct str * str) {
extern symbol * str_data(const struct str * str) {

return str->data;
}

/* Get the length of the str. */
extern int str_len(struct str * str) {
extern int str_len(const struct str * str) {

return SIZE(str->data);
}

/* Get the last character of the str.
*
* Or -1 if the string is empty.
*/
extern int str_back(const struct str *str) {
return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1;
}

extern int get_utf8(const symbol * p, int * slot) {
int b0, b1;
b0 = *p++;
@@ -260,4 +285,3 @@ extern int put_utf8(int ch, symbol * p) {
p[1] = ((ch >> 6) & 0x3F) | 0x80;
p[2] = (ch & 0x3F) | 0x80; return 3;
}


+ 4
- 2
contrib/snowball/compiler/syswords.h View File

@@ -1,5 +1,5 @@
static const struct system_word vocab[80+1] = {
{ 0, (const byte *)"", 80+1},
static const struct system_word vocab[82+1] = {
{ 0, (const byte *)"", 82+1},

{ 1, (const byte *)"$", c_dollar },
{ 1, (const byte *)"(", c_bra },
@@ -36,6 +36,7 @@ static const struct system_word vocab[80+1] = {
{ 3, (const byte *)"get", c_get },
{ 3, (const byte *)"hex", c_hex },
{ 3, (const byte *)"hop", c_hop },
{ 3, (const byte *)"len", c_len },
{ 3, (const byte *)"non", c_non },
{ 3, (const byte *)"not", c_not },
{ 3, (const byte *)"set", c_set },
@@ -49,6 +50,7 @@ static const struct system_word vocab[80+1] = {
{ 4, (const byte *)"true", c_true },
{ 5, (const byte *)"among", c_among },
{ 5, (const byte *)"false", c_false },
{ 5, (const byte *)"lenof", c_lenof },
{ 5, (const byte *)"limit", c_limit },
{ 5, (const byte *)"unset", c_unset },
{ 6, (const byte *)"atmark", c_atmark },

+ 2
- 2
contrib/snowball/compiler/syswords2.h View File

@@ -4,8 +4,8 @@
c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do,
c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get,
c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert,
c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls,
c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop,
c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply,
c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus,
c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines,
c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom,

+ 252
- 155
contrib/snowball/compiler/tokeniser.c View File

@@ -16,57 +16,57 @@ struct system_word {

#include "syswords.h"

static int smaller(int a, int b) { return a < b ? a : b; }
#define INITIAL_INPUT_BUFFER_SIZE 8192

static int hex_to_num(int ch);

extern symbol * get_input(symbol * p, char ** p_file) {
static int smaller(int a, int b) { return a < b ? a : b; }

char * s = b_to_s(p);
extern symbol * get_input(const char * filename) {
FILE * input = fopen(filename, "r");
if (input == 0) { return 0; }
{
FILE * input = fopen(s, "r");
if (input == 0) { free(s); return 0; }
*p_file = s;
{
symbol * u = create_b(STARTSIZE);
int size = 0;
repeat
{ int ch = getc(input);
if (ch == EOF) break;
if (size >= CAPACITY(u)) u = increase_capacity(u, size/2);
u[size++] = ch;
}
fclose(input);
SIZE(u) = size; return u;
symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE);
int size = 0;
while (true) {
int ch = getc(input);
if (ch == EOF) break;
if (size >= CAPACITY(u)) u = increase_capacity(u, size);
u[size++] = ch;
}
fclose(input);
SIZE(u) = size;
return u;
}
}

static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) {
static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) {
if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); }
fprintf(stderr, "%s:%d: ", t->file, t->line_number);
unless (s1 == 0) fprintf(stderr, "%s", s1);
unless (p == 0) {
if (s1) fprintf(stderr, "%s", s1);
if (p) {
int i;
for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]);
}
unless (s2 == 0) fprintf(stderr, "%s", s2);
if (s2) fprintf(stderr, "%s", s2);
fprintf(stderr, "\n");
t->error_count++;
}

static void error1(struct tokeniser * t, char * s) {
static void error1(struct tokeniser * t, const char * s) {
error(t, s, 0,0, 0);
}

static void error2(struct tokeniser * t, char * s) {
static void error2(struct tokeniser * t, const char * s) {
error(t, "unexpected end of text after ", 0,0, s);
}

static int compare_words(int m, symbol * p, int n, const byte * q) {
unless (m == n) return m - n;
if (m != n) return m - n;
{
int i; for (i = 0; i < n; i++) {
int diff = p[i] - q[i];
unless (diff == 0) return diff;
if (diff) return diff;
}
}
return 0;
@@ -74,14 +74,13 @@ static int compare_words(int m, symbol * p, int n, const byte * q) {

static int find_word(int n, symbol * p) {
int i = 0; int j = vocab->code;
repeat {
do {
int k = i + (j - i)/2;
const struct system_word * w = vocab + k;
int diff = compare_words(n, p, w->s_size, w->s);
if (diff == 0) return w->code;
if (diff < 0) j = k; else i = k;
if (j - i == 1) break;
}
} while (j - i != 1);
return -1;
}

@@ -91,7 +90,7 @@ static int get_number(int n, symbol * p) {
return x;
}

static int eq_s(struct tokeniser * t, char * s) {
static int eq_s(struct tokeniser * t, const char * s) {
int l = strlen(s);
if (SIZE(t->p) - t->c < l) return false;
{
@@ -103,64 +102,141 @@ static int eq_s(struct tokeniser * t, char * s) {

static int white_space(struct tokeniser * t, int ch) {
switch (ch) {
case '\n': t->line_number++;
case '\n':
t->line_number++;
/* fall through */
case '\r':
case '\t':
case ' ': return true;
case ' ':
return true;
}
return false;
}

static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) {
struct m_pair * q = t->m_pairs;
repeat {
if (q == 0) return 0;
{
symbol * name = q->name;
if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
}
q = q->next;
struct m_pair * q;
for (q = t->m_pairs; q; q = q->next) {
symbol * name = q->name;
if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value;
}
return 0;
}

static int read_literal_string(struct tokeniser * t, int c) {
symbol * p = t->p;
int ch;
SIZE(t->b) = 0;
repeat {
while (true) {
if (c >= SIZE(p)) { error2(t, "'"); return c; }
ch = p[c];
if (ch == '\n') { error1(t, "string not terminated"); return c; }
c++;
if (ch == t->m_start) {
/* Inside insert characters. */
int c0 = c;
int newlines = false; /* no newlines as yet */
int black_found = false; /* no printing chars as yet */
repeat {
while (true) {
if (c >= SIZE(p)) { error2(t, "'"); return c; }
ch = p[c]; c++;
if (ch == t->m_end) break;
unless (white_space(t, ch)) black_found = true;
if (!white_space(t, ch)) black_found = true;
if (ch == '\n') newlines = true;
if (newlines && black_found) {
error1(t, "string not terminated");
return c;
}
}
unless (newlines) {
if (!newlines) {
int n = c - c0 - 1; /* macro size */
int firstch = p[c0];
symbol * q = find_in_m(t, n, p + c0);
if (q == 0) {
if (n == 1 && (firstch == '\'' || firstch == t->m_start))
t->b = add_to_b(t->b, 1, p + c0);
else
else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
int codepoint = 0;
int x;
if (t->uplusmode == UPLUS_DEFINED) {
/* See if found with xxxx upper-cased. */
symbol * uc = create_b(n);
int i;
for (i = 0; i != n; ++i) {
uc[i] = toupper(p[c0 + i]);
}
q = find_in_m(t, n, uc);
lose_b(uc);
if (q != 0) {
t->b = add_to_b(t->b, SIZE(q), q);
continue;
}
error1(t, "Some U+xxxx stringdefs seen but not this one");
} else {
t->uplusmode = UPLUS_UNICODE;
}
for (x = c0 + 2; x != c - 1; ++x) {
int hex = hex_to_num(p[x]);
if (hex < 0) {
error1(t, "Bad hex digit following U+");
break;
}
codepoint = (codepoint << 4) | hex;
}
if (t->encoding == ENC_UTF8) {
if (codepoint < 0 || codepoint > 0x01ffff) {
error1(t, "character values exceed 0x01ffff");
}
/* Ensure there's enough space for a max length
* UTF-8 sequence. */
if (CAPACITY(t->b) < SIZE(t->b) + 3) {
t->b = increase_capacity(t->b, 3);
}
SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
} else {
symbol sym;
if (t->encoding == ENC_SINGLEBYTE) {
/* Only ISO-8859-1 is handled this way - for
* other single-byte character sets you need
* stringdef all the U+xxxx codes you use
* like - e.g.:
*
* stringdef U+0171 hex 'FB'
*/
if (codepoint < 0 || codepoint > 0xff) {
error1(t, "character values exceed 256");
}
} else {
if (codepoint < 0 || codepoint > 0xffff) {
error1(t, "character values exceed 64K");
}
}
sym = codepoint;
t->b = add_to_b(t->b, 1, &sym);
}
} else
error(t, "string macro '", n, p + c0, "' undeclared");
} else
t->b = add_to_b(t->b, SIZE(q), q);
}
} else {
if (ch == '\'') return c;
if (ch < 0 || ch >= 0x80) {
if (t->encoding != ENC_WIDECHARS) {
/* We don't really want people using non-ASCII literal
* strings, but historically it's worked for single-byte
* and UTF-8 if the source encoding matches what the
* generated stemmer works in and it seems unfair to just
* suddenly make this a hard error.`
*/
fprintf(stderr,
"%s:%d: warning: Non-ASCII literal strings aren't "
"portable - use stringdef instead\n",
t->file, t->line_number);
} else {
error1(t, "Non-ASCII literal strings aren't "
"portable - use stringdef instead");
}
}
t->b = add_to_b(t->b, 1, p + c - 1);
}
}
@@ -171,7 +247,7 @@ static int next_token(struct tokeniser * t) {
int c = t->c;
int ch;
int code = -1;
repeat {
while (true) {
if (c >= SIZE(p)) { t->c = c; return -1; }
ch = p[c];
if (white_space(t, ch)) { c++; continue; }
@@ -179,7 +255,7 @@ static int next_token(struct tokeniser * t) {
int c0 = c;
while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++;
code = find_word(c - c0, p + c0);
if (code < 0) {
if (code < 0 || t->token_disabled[code]) {
t->b = move_to_b(t->b, c - c0, p + c0);
code = c_name;
}
@@ -218,10 +294,9 @@ static int next_char(struct tokeniser * t) {
}

static int next_real_char(struct tokeniser * t) {
repeat {
while (true) {
int ch = next_char(t);
if (white_space(t, ch)) continue;
return ch;
if (!white_space(t, ch)) return ch;
}
}

@@ -230,7 +305,7 @@ static void read_chars(struct tokeniser * t) {
if (ch < 0) { error2(t, "stringdef"); return; }
{
int c0 = t->c-1;
repeat {
while (true) {
ch = next_char(t);
if (white_space(t, ch) || ch < 0) break;
}
@@ -246,19 +321,20 @@ static int decimal_to_num(int ch) {
static int hex_to_num(int ch) {
if ('0' <= ch && ch <= '9') return ch - '0';
if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
return -1;
}

static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
int c = 0; int d = 0;
repeat {
while (true) {
while (c < SIZE(p) && p[c] == ' ') c++;
if (c == SIZE(p)) break;
{
int number = 0;
repeat {
while (c != SIZE(p)) {
int ch = p[c];
if (c == SIZE(p) || ch == ' ') break;
if (ch == ' ') break;
if (base == 10) {
ch = decimal_to_num(ch);
if (ch < 0) {
@@ -266,7 +342,7 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
return;
}
} else {
ch = hex_to_num(tolower(ch));
ch = hex_to_num(ch);
if (ch < 0) {
error1(t, "hex string contains non-hex characters");
return;
@@ -275,18 +351,18 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
number = base * number + ch;
c++;
}
if (t->widechars || t->utf8) {
unless (0 <= number && number <= 0xffff) {
error1(t, "character values exceed 64K");
if (t->encoding == ENC_SINGLEBYTE) {
if (number < 0 || number > 0xff) {
error1(t, "character values exceed 256");
return;
}
} else {
unless (0 <= number && number <= 0xff) {
error1(t, "character values exceed 256");
if (number < 0 || number > 0xffff) {
error1(t, "character values exceed 64K");
return;
}
}
if (t->utf8)
if (t->encoding == ENC_UTF8)
d += put_utf8(number, p + d);
else
p[d++] = number;
@@ -300,104 +376,118 @@ extern int read_token(struct tokeniser * t) {
int held = t->token_held;
t->token_held = false;
if (held) return t->token;
repeat {
while (true) {
int code = next_token(t);
switch (code) {
case c_comment1: /* slash-slash comment */
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
continue;
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++;
continue;
case c_comment2: /* slash-star comment */
repeat {
if (t->c >= SIZE(p)) {
error1(t, "/* comment not terminated");
t->token = -1;
return -1;
}
if (p[t->c] == '\n') t->line_number++;
if (eq_s(t, "*/")) break;
t->c++;
}
continue;
case c_stringescapes:
{
int ch1 = next_real_char(t);
int ch2 = next_real_char(t);
if (ch2 < 0)
{ error2(t, "stringescapes"); continue; }
if (ch1 == '\'')
{ error1(t, "first stringescape cannot be '"); continue; }
t->m_start = ch1;
t->m_end = ch2;
}
continue;
case c_stringdef:
{
int base = 0;
read_chars(t);
code = read_token(t);
if (code == c_hex) { base = 16; code = read_token(t); } else
if (code == c_decimal) { base = 10; code = read_token(t); }
unless (code == c_literalstring)
{ error1(t, "string omitted after stringdef"); continue; }
if (base > 0) convert_numeric_string(t, t->b, base);
{ NEW(m_pair, q);
q->next = t->m_pairs;
q->name = copy_b(t->b2);
q->value = copy_b(t->b);
t->m_pairs = q;
}
}
continue;
while (true) {
if (t->c >= SIZE(p)) {
error1(t, "/* comment not terminated");
t->token = -1;
return -1;
}
if (p[t->c] == '\n') t->line_number++;
if (eq_s(t, "*/")) break;
t->c++;
}
continue;
case c_stringescapes: {
int ch1 = next_real_char(t);
int ch2 = next_real_char(t);
if (ch2 < 0) {
error2(t, "stringescapes");
continue;
}
if (ch1 == '\'') {
error1(t, "first stringescape cannot be '");
continue;
}
t->m_start = ch1;
t->m_end = ch2;
continue;
}
case c_stringdef: {
int base = 0;
read_chars(t);
code = read_token(t);
if (code == c_hex) { base = 16; code = read_token(t); } else
if (code == c_decimal) { base = 10; code = read_token(t); }
if (code != c_literalstring) {
error1(t, "string omitted after stringdef");
continue;
}
if (base > 0) convert_numeric_string(t, t->b, base);
{ NEW(m_pair, q);
q->next = t->m_pairs;
q->name = copy_b(t->b2);
q->value = copy_b(t->b);
t->m_pairs = q;
if (t->uplusmode != UPLUS_DEFINED &&
(SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) {
if (t->uplusmode == UPLUS_UNICODE) {
error1(t, "U+xxxx already used with implicit meaning");
} else {
t->uplusmode = UPLUS_DEFINED;
}
}
}
continue;
}
case c_get:
code = read_token(t);
unless (code == c_literalstring) {
error1(t, "string omitted after get"); continue;
}
t->get_depth++;
if (t->get_depth > 10) {
fprintf(stderr, "get directives go 10 deep. Looping?\n");
exit(1);
}
{
char * file;
NEW(input, q);
symbol * u = get_input(t->b, &file);
if (u == 0) {
struct include * r = t->includes;
until (r == 0) {
symbol * b = copy_b(r->b);
b = add_to_b(b, SIZE(t->b), t->b);
u = get_input(b, &file);
lose_b(b);
unless (u == 0) break;
r = r->next;
}
}
if (u == 0) {
error(t, "Can't get '", SIZE(t->b), t->b, "'");
exit(1);
}
memmove(q, t, sizeof(struct input));
t->next = q;
t->p = u;
t->c = 0;
t->file = file;
t->line_number = 1;
}
p = t->p;
continue;
code = read_token(t);
if (code != c_literalstring) {
error1(t, "string omitted after get"); continue;
}
t->get_depth++;
if (t->get_depth > 10) {
fprintf(stderr, "get directives go 10 deep. Looping?\n");
exit(1);
}
{
NEW(input, q);
char * file = b_to_s(t->b);
symbol * u = get_input(file);
if (u == 0) {
struct include * r;
for (r = t->includes; r; r = r->next) {
symbol * b = copy_b(r->b);
b = add_to_b(b, SIZE(t->b), t->b);
free(file);
file = b_to_s(b);
u = get_input(file);
lose_b(b);
if (u != 0) break;
}
}
if (u == 0) {
error(t, "Can't get '", SIZE(t->b), t->b, "'");
exit(1);
}
memmove(q, t, sizeof(struct input));
t->next = q;
t->p = u;
t->c = 0;
t->file = file;
t->file_needs_freeing = true;
t->line_number = 1;
}
p = t->p;
continue;
case -1:
unless (t->next == 0) {
lose_b(p);
{
struct input * q = t->next;
memmove(t, q, sizeof(struct input)); p = t->p;
FREE(q);
}
t->get_depth--;
continue;
}
/* drop through */
if (t->next) {
lose_b(p);
{
struct input * q = t->next;
memmove(t, q, sizeof(struct input)); p = t->p;
FREE(q);
}
t->get_depth--;
continue;
}
/* fall through */
default:
t->previous_token = t->token;
t->token = code;
@@ -425,12 +515,17 @@ extern const char * name_of_token(int code) {
}
}

extern void disable_token(struct tokeniser * t, int code) {
t->token_disabled[code] = 1;
}

extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
NEW(tokeniser, t);
t->next = 0;
t->p = p;
t->c = 0;
t->file = file;
t->file_needs_freeing = false;
t->line_number = 1;
t->b = create_b(0);
t->b2 = create_b(0);
@@ -441,6 +536,8 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
t->token_held = false;
t->token = -2;
t->previous_token = -2;
t->uplusmode = UPLUS_NONE;
memset(t->token_disabled, 0, sizeof(t->token_disabled));
return t;
}

@@ -449,7 +546,7 @@ extern void close_tokeniser(struct tokeniser * t) {
lose_b(t->b2);
{
struct m_pair * q = t->m_pairs;
until (q == 0) {
while (q) {
struct m_pair * q_next = q->next;
lose_b(q->name);
lose_b(q->value);
@@ -459,12 +556,12 @@ extern void close_tokeniser(struct tokeniser * t) {
}
{
struct input * q = t->next;
until (q == 0) {
while (q) {
struct input * q_next = q->next;
FREE(q);
q = q_next;
}
}
free(t->file);
if (t->file_needs_freeing) free(t->file);
FREE(t);
}

+ 0
- 209
contrib/snowball/examples/stemwords.c View File

@@ -1,209 +0,0 @@
/* This is a simple program which uses libstemmer to provide a command
* line interface for stemming using any of the algorithms provided.
*/

#include <stdio.h>
#include <stdlib.h> /* for malloc, free */
#include <string.h> /* for memmove */
#include <ctype.h> /* for isupper, tolower */

#include "libstemmer.h"

const char * progname;
static int pretty = 1;

static void
stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
{
#define INC 10
int lim = INC;
sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));

while(1) {
int ch = getc(f_in);
if (ch == EOF) {
free(b); return;
}
{
int i = 0;
int inlen = 0;
while(1) {
if (ch == '\n' || ch == EOF) break;
if (i == lim) {
sb_symbol * newb;
newb = (sb_symbol *)
realloc(b, (lim + INC) * sizeof(sb_symbol));
if (newb == 0) goto error;
b = newb;
lim = lim + INC;
}
/* Update count of utf-8 characters. */
if (ch < 0x80 || ch > 0xBF) inlen += 1;
/* force lower case: */
if (isupper(ch)) ch = tolower(ch);

b[i] = ch;
i++;
ch = getc(f_in);
}

{
const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
if (stemmed == NULL)
{
fprintf(stderr, "Out of memory");
exit(1);
}
else
{
if (pretty == 1) {
fwrite(b, i, 1, f_out);
fputs(" -> ", f_out);
} else if (pretty == 2) {
fwrite(b, i, 1, f_out);
if (sb_stemmer_length(stemmer) > 0) {
int j;
if (inlen < 30) {
for (j = 30 - inlen; j > 0; j--)
fputs(" ", f_out);
} else {
fputs("\n", f_out);
for (j = 30; j > 0; j--)
fputs(" ", f_out);
}
}
}

fputs((const char *)stemmed, f_out);
putc('\n', f_out);
}
}
}
}
error:
if (b != 0) free(b);
return;
}

/** Display the command line syntax, and then exit.
* @param n The value to exit with.
*/
static void
usage(int n)
{
printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
"\n"
"The input file consists of a list of words to be stemmed, one per\n"
"line. Words should be in lower case, but (for English) A-Z letters\n"
"are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
"used.\n"
"\n"
"If -c is given, the argument is the character encoding of the input\n"
"and output files. If it is omitted, the UTF-8 encoding is used.\n"
"\n"
"If -p is given the output file consists of each word of the input\n"
"file followed by \"->\" followed by its stemmed equivalent.\n"
"If -p2 is given the output file is a two column layout containing\n"
"the input words in the first column and the stemmed equivalents in\n"
"the second column.\n"
"Otherwise, the output file consists of the stemmed words, one per\n"
"line.\n"
"\n"
"-h displays this help\n",
progname);
exit(n);
}

int
main(int argc, char * argv[])
{
char * in = 0;
char * out = 0;
FILE * f_in;
FILE * f_out;
struct sb_stemmer * stemmer;

char * language = "english";
char * charenc = NULL;

char * s;
int i = 1;
pretty = 0;

progname = argv[0];

while(i < argc) {
s = argv[i++];
if (s[0] == '-') {
if (strcmp(s, "-o") == 0) {
if (i >= argc) {
fprintf(stderr, "%s requires an argument\n", s);
exit(1);
}
out = argv[i++];
} else if (strcmp(s, "-i") == 0) {
if (i >= argc) {
fprintf(stderr, "%s requires an argument\n", s);
exit(1);
}
in = argv[i++];
} else if (strcmp(s, "-l") == 0) {
if (i >= argc) {
fprintf(stderr, "%s requires an argument\n", s);
exit(1);
}
language = argv[i++];
} else if (strcmp(s, "-c") == 0) {
if (i >= argc) {
fprintf(stderr, "%s requires an argument\n", s);
exit(1);
}
charenc = argv[i++];
} else if (strcmp(s, "-p2") == 0) {
pretty = 2;
} else if (strcmp(s, "-p") == 0) {
pretty = 1;
} else if (strcmp(s, "-h") == 0) {
usage(0);
} else {
fprintf(stderr, "option %s unknown\n", s);
usage(1);
}
} else {
fprintf(stderr, "unexpected parameter %s\n", s);
usage(1);
}
}

/* prepare the files */
f_in = (in == 0) ? stdin : fopen(in, "r");
if (f_in == 0) {
fprintf(stderr, "file %s not found\n", in);
exit(1);
}
f_out = (out == 0) ? stdout : fopen(out, "w");
if (f_out == 0) {
fprintf(stderr, "file %s cannot be opened\n", out);
exit(1);
}

/* do the stemming process: */
stemmer = sb_stemmer_new(language, charenc);
if (stemmer == 0) {
if (charenc == NULL) {
fprintf(stderr, "language `%s' not available for stemming\n", language);
exit(1);
} else {
fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
exit(1);
}
}
stem_file(stemmer, f_in, f_out);
sb_stemmer_delete(stemmer);

if (in != 0) (void) fclose(f_in);
if (out != 0) (void) fclose(f_out);

return 0;
}


+ 4
- 5
contrib/snowball/include/libstemmer.h View File

@@ -32,9 +32,9 @@ const char ** sb_stemmer_list(void);
*
* @param charenc The character encoding. NULL may be passed as
* this value, in which case UTF-8 encoding will be assumed. Otherwise,
* the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1),
* "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that
* case is significant in this parameter.
* the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1),
* "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is
* significant in this parameter.
*
* @return NULL if the specified algorithm is not recognised, or the
* algorithm is not available for the requested encoding. Otherwise,
@@ -66,7 +66,7 @@ void sb_stemmer_delete(struct sb_stemmer * stemmer);
* If an out-of-memory error occurs, this will return NULL.
*/
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer,
const sb_symbol * word, int size);
const sb_symbol * word, int size);

/** Get the length of the result of the last stemmed word.
* This should not be called before sb_stemmer_stem() has been called.
@@ -76,4 +76,3 @@ int sb_stemmer_length(struct sb_stemmer * stemmer);
#ifdef __cplusplus
}
#endif


+ 0
- 31
contrib/snowball/java/org/tartarus/snowball/Among.java View File

@@ -1,31 +0,0 @@
package org.tartarus.snowball;

import java.lang.reflect.Method;

public class Among {
public Among (String s, int substring_i, int result,
String methodname, SnowballProgram methodobject) {
this.s_size = s.length();
this.s = s.toCharArray();
this.substring_i = substring_i;
this.result = result;
this.methodobject = methodobject;
if (methodname.length() == 0) {
this.method = null;
} else {
try {
this.method = methodobject.getClass().
getDeclaredMethod(methodname, new Class[0]);
} catch (NoSuchMethodException e) {
throw new RuntimeException(e);
}
}
}

public final int s_size; /* search string */
public final char[] s; /* search string */
public final int substring_i; /* index to longest matching substring */
public final int result; /* result of the lookup */
public final Method method; /* method to use if substring matches */
public final SnowballProgram methodobject; /* object to invoke method on */
};

+ 0
- 432
contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java View File

@@ -1,432 +0,0 @@

package org.tartarus.snowball;
import java.lang.reflect.InvocationTargetException;

public class SnowballProgram {
protected SnowballProgram()
{
current = new StringBuffer();
setCurrent("");
}

/**
* Set the current string.
*/
public void setCurrent(String value)
{
current.replace(0, current.length(), value);
cursor = 0;
limit = current.length();
limit_backward = 0;
bra = cursor;
ket = limit;
}

/**
* Get the current string.
*/
public String getCurrent()
{
String result = current.toString();
// Make a new StringBuffer. If we reuse the old one, and a user of
// the library keeps a reference to the buffer returned (for example,
// by converting it to a String in a way which doesn't force a copy),
// the buffer size will not decrease, and we will risk wasting a large
// amount of memory.
// Thanks to Wolfram Esser for spotting this problem.
current = new StringBuffer();
return result;
}

// current string
protected StringBuffer current;

protected int cursor;
protected int limit;
protected int limit_backward;
protected int bra;
protected int ket;

protected void copy_from(SnowballProgram other)
{
current = other.current;
cursor = other.cursor;
limit = other.limit;
limit_backward = other.limit_backward;
bra = other.bra;
ket = other.ket;
}

protected boolean in_grouping(char [] s, int min, int max)
{
if (cursor >= limit) return false;
char ch = current.charAt(cursor);
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor++;
return true;
}

protected boolean in_grouping_b(char [] s, int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current.charAt(cursor - 1);
if (ch > max || ch < min) return false;
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
cursor--;
return true;
}

protected boolean out_grouping(char [] s, int min, int max)
{
if (cursor >= limit) return false;
char ch = current.charAt(cursor);
if (ch > max || ch < min) {
cursor++;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor ++;
return true;
}
return false;
}

protected boolean out_grouping_b(char [] s, int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current.charAt(cursor - 1);
if (ch > max || ch < min) {
cursor--;
return true;
}
ch -= min;
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
cursor--;
return true;
}
return false;
}

protected boolean in_range(int min, int max)
{
if (cursor >= limit) return false;
char ch = current.charAt(cursor);
if (ch > max || ch < min) return false;
cursor++;
return true;
}

protected boolean in_range_b(int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current.charAt(cursor - 1);
if (ch > max || ch < min) return false;
cursor--;
return true;
}

protected boolean out_range(int min, int max)
{
if (cursor >= limit) return false;
char ch = current.charAt(cursor);
if (!(ch > max || ch < min)) return false;
cursor++;
return true;
}

protected boolean out_range_b(int min, int max)
{
if (cursor <= limit_backward) return false;
char ch = current.charAt(cursor - 1);
if(!(ch > max || ch < min)) return false;
cursor--;
return true;
}

protected boolean eq_s(int s_size, String s)
{
if (limit - cursor < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current.charAt(cursor + i) != s.charAt(i)) return false;
}
cursor += s_size;
return true;
}

protected boolean eq_s_b(int s_size, String s)
{
if (cursor - limit_backward < s_size) return false;
int i;
for (i = 0; i != s_size; i++) {
if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false;
}
cursor -= s_size;
return true;
}

protected boolean eq_v(CharSequence s)
{
return eq_s(s.length(), s.toString());
}

protected boolean eq_v_b(CharSequence s)
{ return eq_s_b(s.length(), s.toString());
}

protected int find_among(Among v[], int v_size)
{
int i = 0;
int j = v_size;

int c = cursor;
int l = limit;

int common_i = 0;
int common_j = 0;

boolean first_key_inspected = false;

while(true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; // smaller
Among w = v[k];
int i2;
for (i2 = common; i2 < w.s_size; i2++) {
if (c + common == l) {
diff = -1;
break;
}
diff = current.charAt(c + common) - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break; // v->s has been inspected
if (j == i) break; // only one item in v

// - but now we need to go round once more to get
// v->s inspected. This looks messy, but is actually
// the optimal approach.

if (first_key_inspected) break;
first_key_inspected = true;
}
}
while(true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c + w.s_size;
if (w.method == null) return w.result;
boolean res;
try {
Object resobj = w.method.invoke(w.methodobject,
new Object[0]);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c + w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
}

// find_among_b is for backwards processing. Same comments apply
protected int find_among_b(Among v[], int v_size)
{
int i = 0;
int j = v_size;

int c = cursor;
int lb = limit_backward;

int common_i = 0;
int common_j = 0;

boolean first_key_inspected = false;

while(true) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
Among w = v[k];
int i2;
for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
if (c - common == lb) {
diff = -1;
break;
}
diff = current.charAt(c - 1 - common) - w.s[i2];
if (diff != 0) break;
common++;
}
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break;
if (j == i) break;
if (first_key_inspected) break;
first_key_inspected = true;
}
}
while(true) {
Among w = v[i];
if (common_i >= w.s_size) {
cursor = c - w.s_size;
if (w.method == null) return w.result;

boolean res;
try {
Object resobj = w.method.invoke(w.methodobject,
new Object[0]);
res = resobj.toString().equals("true");
} catch (InvocationTargetException e) {
res = false;
// FIXME - debug message
} catch (IllegalAccessException e) {
res = false;
// FIXME - debug message
}
cursor = c - w.s_size;
if (res) return w.result;
}
i = w.substring_i;
if (i < 0) return 0;
}
}

/* to replace chars between c_bra and c_ket in current by the
* chars in s.
*/
protected int replace_s(int c_bra, int c_ket, String s)
{
int adjustment = s.length() - (c_ket - c_bra);
current.replace(c_bra, c_ket, s);
limit += adjustment;
if (cursor >= c_ket) cursor += adjustment;
else if (cursor > c_bra) cursor = c_bra;
return adjustment;
}

protected void slice_check()
{
if (bra < 0 ||
bra > ket ||
ket > limit ||
limit > current.length()) // this line could be removed
{
System.err.println("faulty slice operation");
// FIXME: report error somehow.
/*
fprintf(stderr, "faulty slice operation:\n");
debug(z, -1, 0);
exit(1);
*/
}
}

protected void slice_from(String s)
{
slice_check();
replace_s(bra, ket, s);
}

protected void slice_from(CharSequence s)
{
slice_from(s.toString());
}

protected void slice_del()
{
slice_from("");
}

protected void insert(int c_bra, int c_ket, String s)
{
int adjustment = replace_s(c_bra, c_ket, s);
if (c_bra <= bra) bra += adjustment;
if (c_bra <= ket) ket += adjustment;
}

protected void insert(int c_bra, int c_ket, CharSequence s)
{
insert(c_bra, c_ket, s.toString());
}

/* Copy the slice into the supplied StringBuffer */
protected StringBuffer slice_to(StringBuffer s)
{
slice_check();
int len = ket - bra;
s.replace(0, s.length(), current.substring(bra, ket));
return s;
}

/* Copy the slice into the supplied StringBuilder */
protected StringBuilder slice_to(StringBuilder s)
{
slice_check();
int len = ket - bra;
s.replace(0, s.length(), current.substring(bra, ket));
return s;
}

protected StringBuffer assign_to(StringBuffer s)
{
s.replace(0, s.length(), current.substring(0, limit));
return s;
}

protected StringBuilder assign_to(StringBuilder s)
{
s.replace(0, s.length(), current.substring(0, limit));
return s;
}

/*
extern void debug(struct SN_env * z, int number, int line_count)
{ int i;
int limit = SIZE(z->p);
//if (number >= 0) printf("%3d (line %4d): '", number, line_count);
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
for (i = 0; i <= limit; i++)
{ if (z->lb == i) printf("{");
if (z->bra == i) printf("[");
if (z->c == i) printf("|");
if (z->ket == i) printf("]");
if (z->l == i) printf("}");
if (i < limit)
{ int ch = z->p[i];
if (ch == 0) ch = '#';
printf("%c", ch);
}
}
printf("'\n");
}
*/

};

+ 0
- 7
contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java View File

@@ -1,7 +0,0 @@

package org.tartarus.snowball;
import java.lang.reflect.InvocationTargetException;

public abstract class SnowballStemmer extends SnowballProgram {
public abstract boolean stem();
};

+ 0
- 77
contrib/snowball/java/org/tartarus/snowball/TestApp.java View File

@@ -1,77 +0,0 @@

package org.tartarus.snowball;

import java.lang.reflect.Method;
import java.io.Reader;
import java.io.Writer;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.OutputStream;
import java.io.FileOutputStream;

public class TestApp {
private static void usage()
{
System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]");
}

public static void main(String [] args) throws Throwable {
if (args.length < 2) {
usage();
return;
}

Class stemClass = Class.forName("org.tartarus.snowball.ext." +
args[0] + "Stemmer");
SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance();

Reader reader;
reader = new InputStreamReader(new FileInputStream(args[1]));
reader = new BufferedReader(reader);

StringBuffer input = new StringBuffer();

OutputStream outstream;

if (args.length > 2) {
if (args.length >= 4 && args[2].equals("-o")) {
outstream = new FileOutputStream(args[3]);
} else {
usage();
return;
}
} else {
outstream = System.out;
}
Writer output = new OutputStreamWriter(outstream);
output = new BufferedWriter(output);

int repeat = 1;
if (args.length > 4) {
repeat = Integer.parseInt(args[4]);
}

Object [] emptyArgs = new Object[0];
int character;
while ((character = reader.read()) != -1) {
char ch = (char) character;
if (Character.isWhitespace((char) ch)) {
if (input.length() > 0) {
stemmer.setCurrent(input.toString());
for (int i = repeat; i != 0; i--) {
stemmer.stem();
}
output.write(stemmer.getCurrent());
output.write('\n');
input.delete(0, input.length());
}
} else {
input.append(Character.toLowerCase(ch));
}
}
output.flush();
}
}

+ 8
- 7
contrib/snowball/libstemmer/libstemmer_c.in View File

@@ -22,10 +22,10 @@ sb_stemmer_list(void)
static stemmer_encoding_t
sb_getenc(const char * charenc)
{
struct stemmer_encoding * encoding;
const struct stemmer_encoding * encoding;
if (charenc == NULL) return ENC_UTF_8;
for (encoding = encodings; encoding->name != 0; encoding++) {
if (strcmp(encoding->name, charenc) == 0) break;
if (strcmp(encoding->name, charenc) == 0) break;
}
if (encoding->name == NULL) return ENC_UNKNOWN;
return encoding->enc;
@@ -35,14 +35,14 @@ extern struct sb_stemmer *
sb_stemmer_new(const char * algorithm, const char * charenc)
{
stemmer_encoding_t enc;
struct stemmer_modules * module;
const struct stemmer_modules * module;
struct sb_stemmer * stemmer;

enc = sb_getenc(charenc);
if (enc == ENC_UNKNOWN) return NULL;

for (module = modules; module->name != 0; module++) {
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break;
}
if (module->name == NULL) return NULL;
@@ -67,9 +67,10 @@ void
sb_stemmer_delete(struct sb_stemmer * stemmer)
{
if (stemmer == 0) return;
if (stemmer->close == 0) return;
stemmer->close(stemmer->env);
stemmer->close = 0;
if (stemmer->close) {
stemmer->close(stemmer->env);
stemmer->close = 0;
}
free(stemmer);
}


+ 18
- 7
contrib/snowball/libstemmer/mkmodules.pl View File

@@ -1,10 +1,12 @@
#!/usr/bin/perl -w
#!/usr/bin/env perl
use strict;
use 5.006;
use warnings;

my $progname = $0;

if (scalar @ARGV < 4 || scalar @ARGV > 5) {
print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<extn>]\n";
print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n";
exit 1;
}

@@ -12,9 +14,11 @@ my $outname = shift(@ARGV);
my $c_src_dir = shift(@ARGV);
my $descfile = shift(@ARGV);
my $srclistfile = shift(@ARGV);
my $enc_only;
my $extn = '';
if (@ARGV) {
$extn = '_'.shift(@ARGV);
$enc_only = shift(@ARGV);
$extn = '_'.$enc_only;
}

my %aliases = ();
@@ -27,6 +31,14 @@ sub addalgenc($$) {
my $alg = shift();
my $enc = shift();

if (defined $enc_only) {
my $norm_enc = lc $enc;
$norm_enc =~ s/_//g;
if ($norm_enc ne $enc_only) {
return;
}
}

if (defined $algorithm_encs{$alg}) {
my $hashref = $algorithm_encs{$alg};
$$hashref{$enc}=1;
@@ -42,7 +54,7 @@ sub readinput()
{
open DESCFILE, $descfile;
my $line;
while($line = <DESCFILE>)
while ($line = <DESCFILE>)
{
next if $line =~ m/^\s*#/;
next if $line =~ m/^\s*$/;
@@ -123,7 +135,7 @@ struct stemmer_encoding {
const char * name;
stemmer_encoding_t enc;
};
static struct stemmer_encoding encodings[] = {
static const struct stemmer_encoding encodings[] = {
EOS
for $enc (sort keys %encs) {
print OUT " {\"${enc}\", ENC_${enc}},\n";
@@ -139,7 +151,7 @@ struct stemmer_modules {
void (*close)(struct SN_env *);
int (*stem)(struct SN_env *);
};
static struct stemmer_modules modules[] = {
static const struct stemmer_modules modules[] = {
EOS

for $lang (sort keys %aliases) {
@@ -162,7 +174,6 @@ static const char * algorithm_names[] = {
EOS

for $lang (@algorithms) {
my $l = $aliases{$lang};
print OUT " \"$lang\", \n";
}


+ 26
- 18
contrib/snowball/libstemmer/modules.txt View File

@@ -9,27 +9,35 @@
# List all the main algorithms for each language, in UTF-8, and also with
# the most commonly used encoding.

danish UTF_8,ISO_8859_1 danish,da,dan
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld
english UTF_8,ISO_8859_1 english,en,eng
finnish UTF_8,ISO_8859_1 finnish,fi,fin
french UTF_8,ISO_8859_1 french,fr,fre,fra
german UTF_8,ISO_8859_1 german,de,ger,deu
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun
italian UTF_8,ISO_8859_1 italian,it,ita
norwegian UTF_8,ISO_8859_1 norwegian,no,nor
portuguese UTF_8,ISO_8859_1 portuguese,pt,por
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron
russian UTF_8,KOI8_R russian,ru,rus
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa
swedish UTF_8,ISO_8859_1 swedish,sv,swe
arabic UTF_8 arabic,ar,ara
danish UTF_8 danish,da,dan
dutch UTF_8 dutch,nl,dut,nld
english UTF_8 english,en,eng
finnish UTF_8 finnish,fi,fin
french UTF_8 french,fr,fre,fra
german UTF_8 german,de,ger,deu
greek UTF_8 greek,el,gre,ell
hindi UTF_8 hindi,hi,hin
hungarian UTF_8 hungarian,hu,hun
indonesian UTF_8 indonesian,id,ind
italian UTF_8 italian,it,ita
lithuanian UTF_8 lithuanian,lt,lit
nepali UTF_8 nepali,ne,nep
norwegian UTF_8 norwegian,no,nor
portuguese UTF_8 portuguese,pt,por
romanian UTF_8 romanian,ro,rum,ron
russian UTF_8 russian,ru,rus
serbian UTF_8 serbian,sr,srp
spanish UTF_8 spanish,es,esl,spa
swedish UTF_8 swedish,sv,swe
tamil UTF_8 tamil,ta,tam
turkish UTF_8 turkish,tr,tur

# Also include the traditional porter algorithm for english.
# The porter algorithm is included in the libstemmer distribution to assist
# with backwards compatibility, but for new systems the english algorithm
# should be used in preference.
porter UTF_8,ISO_8859_1 porter
porter UTF_8 porter english

# Some other stemmers in the snowball project are not included in the standard
# distribution. To compile a libstemmer with them in, add them to this list,
@@ -39,12 +47,12 @@ porter UTF_8,ISO_8859_1 porter
# algorithms are:
#
# german2 - This is a slight modification of the german stemmer.
#german2 UTF_8,ISO_8859_1 german2
#german2 UTF_8,ISO_8859_1 german2 german
#
# kraaij_pohlmann - This is a different dutch stemmer.
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch
#
# lovins - This is an english stemmer, but fairly outdated, and
# only really applicable to a restricted type of input text
# (keywords in academic publications).
#lovins UTF_8,ISO_8859_1 lovins
#lovins UTF_8,ISO_8859_1 lovins english

+ 1
- 9
contrib/snowball/runtime/api.c View File

@@ -2,7 +2,7 @@
#include <stdlib.h> /* for calloc, free */
#include "header.h"

extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
extern struct SN_env * SN_create_env(int S_size, int I_size)
{
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env));
if (z == NULL) return NULL;
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size)
if (z->I == NULL) goto error;
}

if (B_size)
{
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char));
if (z->B == NULL) goto error;
}

return z;
error:
SN_close_env(z, S_size);
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size)
free(z->S);
}
free(z->I);
free(z->B);
if (z->p) lose_s(z->p);
free(z);
}
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s)
z->c = 0;
return err;
}


+ 8
- 2
contrib/snowball/runtime/api.h View File

@@ -16,11 +16,17 @@ struct SN_env {
int c; int l; int lb; int bra; int ket;
symbol * * S;
int * I;
unsigned char * B;
};

extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size);
#ifdef __cplusplus
extern "C" {
#endif

extern struct SN_env * SN_create_env(int S_size, int I_size);
extern void SN_close_env(struct SN_env * z, int S_size);

extern int SN_set_current(struct SN_env * z, int size, const symbol * s);

#ifdef __cplusplus
}
#endif

+ 2
- 1
contrib/snowball/runtime/header.h View File

@@ -54,5 +54,6 @@ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p);
extern symbol * slice_to(struct SN_env * z, symbol * p);
extern symbol * assign_to(struct SN_env * z, symbol * p);

extern void debug(struct SN_env * z, int number, int line_count);
extern int len_utf8(const symbol * p);

extern void debug(struct SN_env * z, int number, int line_count);

+ 108
- 83
contrib/snowball/runtime/utilities.c View File

@@ -5,8 +5,6 @@

#include "header.h"

#define unless(C) if(!(C))

#define CREATE_SIZE 1

extern symbol * create_s(void) {
@@ -15,7 +13,7 @@ extern symbol * create_s(void) {
if (mem == NULL) return NULL;
p = (symbol *) (HEAD + (char *) mem);
CAPACITY(p) = CREATE_SIZE;
SET_SIZE(p, CREATE_SIZE);
SET_SIZE(p, 0);
return p;
}

@@ -27,7 +25,7 @@ extern void lose_s(symbol * p) {
/*
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new
position, or 0 on failure.
position, or -1 on failure.

-- used to implement hop and next in the utf8 case.
*/
@@ -66,77 +64,95 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
/* Code for character groupings: utf8 cases */

static int get_utf8(const symbol * p, int c, int l, int * slot) {
int b0, b1;
int b0, b1, b2;
if (c >= l) return 0;
b0 = p[c++];
if (b0 < 0xC0 || c == l) { /* 1100 0000 */
* slot = b0; return 1;
*slot = b0;
return 1;
}
b1 = p[c++];
b1 = p[c++] & 0x3F;
if (b0 < 0xE0 || c == l) { /* 1110 0000 */
* slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
*slot = (b0 & 0x1F) << 6 | b1;
return 2;
}
b2 = p[c++] & 0x3F;
if (b0 < 0xF0 || c == l) { /* 1111 0000 */
*slot = (b0 & 0xF) << 12 | b1 << 6 | b2;
return 3;
}
* slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
*slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F);
return 4;
}

static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
int b0, b1;
int a, b;
if (c <= lb) return 0;
b0 = p[--c];
if (b0 < 0x80 || c == lb) { /* 1000 0000 */
* slot = b0; return 1;
b = p[--c];
if (b < 0x80 || c == lb) { /* 1000 0000 */
*slot = b;
return 1;
}
b1 = p[--c];
if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */
* slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
a = b & 0x3F;
b = p[--c];
if (b >= 0xC0 || c == lb) { /* 1100 0000 */
*slot = (b & 0x1F) << 6 | a;
return 2;
}
* slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
a |= (b & 0x3F) << 6;
b = p[--c];
if (b >= 0xE0 || c == lb) { /* 1110 0000 */
*slot = (b & 0xF) << 12 | a;
return 3;
}
*slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a;
return 4;
}

extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
int w = get_utf8(z->p, z->c, z->l, & ch);
unless (w) return -1;
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c += w;
int ch;
int w = get_utf8(z->p, z->c, z->l, & ch);
if (!w) return -1;
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c += w;
} while (repeat);
return 0;
}

extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
unless (w) return -1;
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c -= w;
int ch;
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
if (!w) return -1;
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c -= w;
} while (repeat);
return 0;
}

extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
int w = get_utf8(z->p, z->c, z->l, & ch);
unless (w) return -1;
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c += w;
int ch;
int w = get_utf8(z->p, z->c, z->l, & ch);
if (!w) return -1;
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return w;
z->c += w;
} while (repeat);
return 0;
}

extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
unless (w) return -1;
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return w;
z->c -= w;
int ch;
int w = get_b_utf8(z->p, z->c, z->lb, & ch);
if (!w) return -1;
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return w;
z->c -= w;
} while (repeat);
return 0;
}
@@ -145,48 +161,48 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min,

extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
if (z->c >= z->l) return -1;
ch = z->p[z->c];
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return 1;
z->c++;
int ch;
if (z->c >= z->l) return -1;
ch = z->p[z->c];
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return 1;
z->c++;
} while (repeat);
return 0;
}

extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
if (z->c <= z->lb) return -1;
ch = z->p[z->c - 1];
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return 1;
z->c--;
int ch;
if (z->c <= z->lb) return -1;
ch = z->p[z->c - 1];
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return 1;
z->c--;
} while (repeat);
return 0;
}

extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
if (z->c >= z->l) return -1;
ch = z->p[z->c];
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return 1;
z->c++;
int ch;
if (z->c >= z->l) return -1;
ch = z->p[z->c];
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return 1;
z->c++;
} while (repeat);
return 0;
}

extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
do {
int ch;
if (z->c <= z->lb) return -1;
ch = z->p[z->c - 1];
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
return 1;
z->c--;
int ch;
if (z->c <= z->lb) return -1;
ch = z->p[z->c - 1];
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0))
return 1;
z->c--;
} while (repeat);
return 0;
}
@@ -215,7 +231,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;

int c = z->c; int l = z->l;
symbol * q = z->p + c;
const symbol * q = z->p + c;

const struct among * w;

@@ -224,7 +240,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {

int first_key_inspected = 0;

while(1) {
while (1) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j; /* smaller */
@@ -237,8 +253,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
common++;
}
}
if (diff < 0) { j = k; common_j = common; }
else { i = k; common_i = common; }
if (diff < 0) {
j = k;
common_j = common;
} else {
i = k;
common_i = common;
}
if (j - i <= 1) {
if (i > 0) break; /* v->s has been inspected */
if (j == i) break; /* only one item in v */
@@ -251,7 +272,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) {
first_key_inspected = 1;
}
}
while(1) {
while (1) {
w = v + i;
if (common_i >= w->s_size) {
z->c = c + w->s_size;
@@ -275,7 +296,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
int j = v_size;

int c = z->c; int lb = z->lb;
symbol * q = z->p + c - 1;
const symbol * q = z->p + c - 1;

const struct among * w;

@@ -284,7 +305,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {

int first_key_inspected = 0;

while(1) {
while (1) {
int k = i + ((j - i) >> 1);
int diff = 0;
int common = common_i < common_j ? common_i : common_j;
@@ -306,7 +327,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {
first_key_inspected = 1;
}
}
while(1) {
while (1) {
w = v + i;
if (common_i >= w->s_size) {
z->c = c - w->s_size;
@@ -367,11 +388,10 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const
z->l += adjustment;
if (z->c >= c_ket)
z->c += adjustment;
else
if (z->c > c_bra)
z->c = c_bra;
else if (z->c > c_bra)
z->c = c_bra;
}
unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol));
if (adjptr != NULL)
*adjptr = adjustment;
return 0;
@@ -417,12 +437,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo
}

extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) {
int adjustment;
if (replace_s(z, bra, ket, SIZE(p), p, &adjustment))
return -1;
if (bra <= z->bra) z->bra += adjustment;
if (bra <= z->ket) z->ket += adjustment;
return 0;
return insert_s(z, bra, ket, SIZE(p), p);
}

extern symbol * slice_to(struct SN_env * z, symbol * p) {
@@ -455,6 +470,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) {
return p;
}

extern int len_utf8(const symbol * p) {
int size = SIZE(p);
int len = 0;
while (size--) {
symbol b = *p++;
if (b >= 0xC0 || b < 0x80) ++len;
}
return len;
}

#if 0
extern void debug(struct SN_env * z, int number, int line_count) {
int i;

Loading…
Cancel
Save