@@ -1,20 +1,14 @@ | |||
# End of configuration | |||
SET(LIBSTEM_ALGORITHMS danish dutch english finnish french german hungarian | |||
italian norwegian porter portuguese romanian | |||
russian spanish swedish turkish) | |||
SET(KOI8_ALGORITHMS russian) | |||
SET(ISO_8859_1_ALGORITHMS danish dutch english finnish french german italian | |||
norwegian porter portuguese spanish swedish) | |||
SET(ISO_8859_2_ALGORITHMS hungarian romanian) | |||
SET(OTHER_ALGORITHMS german2 kraaij_pohlmann lovins) | |||
SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS} ${OTHER_ALGORITHMS}) | |||
SET(LIBSTEM_ALGORITHMS arabic danish dutch english finnish french german greek hindi hungarian | |||
indonesian italian lithuanian nepali norwegian porter portuguese romanian | |||
russian serbian spanish swedish tamil turkish) | |||
SET(ALL_ALGORITHMS ${LIBSTEM_ALGORITHMS}) | |||
SET(COMPILER_SOURCES compiler/space.c | |||
compiler/tokeniser.c | |||
compiler/analyser.c | |||
compiler/generator.c | |||
compiler/driver.c | |||
compiler/generator_java.c) | |||
compiler/driver.c) | |||
SET(SNOWBALL_RUNTIME runtime/api.c | |||
runtime/utilities.c) | |||
@@ -24,9 +18,15 @@ SET(LIBSTEMMER_UTF8_SOURCES libstemmer/libstemmer_utf8.c) | |||
#LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h | |||
#LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in | |||
SET(STEMWORDS_SOURCES examples/stemwords.c) | |||
SET(MODULES_H "modules.h") | |||
CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/libstemmer_c.in ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c @ONLY) | |||
# Pass -DDISABLE_<LANG> on the compiler command line for JS, Go, Java, Python,
# C#, Pascal and Rust. ADD_DEFINITIONS is directory-scoped, so these macros
# reach every source compiled from this directory, not only the snowball
# compiler sources.
# NOTE(review): presumably each macro compiles out the matching code generator
# in the snowball compiler so that only the C backend is built - confirm
# against the compiler sources.
ADD_DEFINITIONS("-DDISABLE_JS") | |||
ADD_DEFINITIONS("-DDISABLE_GO") | |||
ADD_DEFINITIONS("-DDISABLE_JAVA") | |||
ADD_DEFINITIONS("-DDISABLE_PYTHON") | |||
ADD_DEFINITIONS("-DDISABLE_CSHARP") | |||
ADD_DEFINITIONS("-DDISABLE_PASCAL") | |||
ADD_DEFINITIONS("-DDISABLE_RUST") | |||
MACRO(gen_stem IN ENCODING) | |||
FOREACH(_it ${IN}) | |||
@@ -34,7 +34,7 @@ MACRO(gen_stem IN ENCODING) | |||
SET(_header "${_base}.h") | |||
SET(_source "${_base}.c") | |||
STRING(REPLACE "UTF_8" "Unicode" _in_enc "${ENCODING}") | |||
SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_${_in_enc}.sbl") | |||
SET(_input "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}.sbl") | |||
IF(${_in_enc} STREQUAL "Unicode" AND NOT EXISTS ${_input}) | |||
ADD_CUSTOM_COMMAND(OUTPUT ${_source} | |||
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/snowball "${CMAKE_CURRENT_SOURCE_DIR}/algorithms/${_it}/stem_ISO_8859_1.sbl" -o ${_base} -eprefix ${_it}_${ENCODING}_ -r ../runtime -u | |||
@@ -57,7 +57,7 @@ INCLUDE_DIRECTORIES("include") | |||
ADD_EXECUTABLE(snowball ${COMPILER_SOURCES}) | |||
ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h | |||
COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt libstemmer/mkinc.mak) | |||
COMMAND ${PERL_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/mkmodules.pl ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h libstemmer ${CMAKE_CURRENT_SOURCE_DIR}/libstemmer/modules.txt ${CMAKE_CURRENT_BINARY_DIR}/libstemmer/mkinc.mak) | |||
ADD_CUSTOM_TARGET(modules DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/modules.h") | |||
SET(STEMMER_SOURCES "${CMAKE_CURRENT_BINARY_DIR}/libstemmer/libstemmer.c") | |||
@@ -65,13 +65,6 @@ ADD_CUSTOM_TARGET(stemmer_deps ALL) | |||
ADD_DEPENDENCIES(stemmer_deps modules) | |||
gen_stem("${LIBSTEM_ALGORITHMS}" "UTF_8") | |||
gen_stem("${KOI8_ALGORITHMS}" "KOI8_R") | |||
gen_stem("${ISO_8859_1_ALGORITHMS}" "ISO_8859_1") | |||
gen_stem("${ISO_8859_2_ALGORITHMS}" "ISO_8859_2") | |||
ADD_LIBRARY(stemmer ${LINK_TYPE} ${SNOWBALL_RUNTIME} ${STEMMER_SOURCES}) | |||
ADD_DEPENDENCIES(stemmer stemmer_deps) | |||
ADD_EXECUTABLE(stemwords ${STEMWORDS_SOURCES}) | |||
TARGET_LINK_LIBRARIES(stemwords stemmer) |
@@ -1,300 +0,0 @@ | |||
# -*- makefile -*- | |||
c_src_dir = src_c | |||
java_src_main_dir = java/org/tartarus/snowball | |||
java_src_dir = $(java_src_main_dir)/ext | |||
libstemmer_algorithms = danish dutch english finnish french german hungarian \ | |||
italian \ | |||
norwegian porter portuguese romanian \ | |||
russian spanish swedish turkish | |||
KOI8_R_algorithms = russian | |||
ISO_8859_1_algorithms = danish dutch english finnish french german italian \ | |||
norwegian porter portuguese spanish swedish | |||
ISO_8859_2_algorithms = hungarian romanian | |||
other_algorithms = german2 kraaij_pohlmann lovins | |||
all_algorithms = $(libstemmer_algorithms) $(other_algorithms) | |||
COMPILER_SOURCES = compiler/space.c \ | |||
compiler/tokeniser.c \ | |||
compiler/analyser.c \ | |||
compiler/generator.c \ | |||
compiler/driver.c \ | |||
compiler/generator_java.c | |||
COMPILER_HEADERS = compiler/header.h \ | |||
compiler/syswords.h \ | |||
compiler/syswords2.h | |||
RUNTIME_SOURCES = runtime/api.c \ | |||
runtime/utilities.c | |||
RUNTIME_HEADERS = runtime/api.h \ | |||
runtime/header.h | |||
JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \ | |||
java/org/tartarus/snowball/SnowballProgram.java \ | |||
java/org/tartarus/snowball/SnowballStemmer.java \ | |||
java/org/tartarus/snowball/TestApp.java | |||
LIBSTEMMER_SOURCES = libstemmer/libstemmer.c | |||
LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c | |||
LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h | |||
LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in | |||
STEMWORDS_SOURCES = examples/stemwords.c | |||
ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl) | |||
C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \ | |||
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \ | |||
$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \ | |||
$(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c) | |||
C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \ | |||
$(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \ | |||
$(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \ | |||
$(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h) | |||
C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) | |||
C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) | |||
JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java) | |||
COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o) | |||
RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o) | |||
LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o) | |||
LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o) | |||
STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o) | |||
C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o) | |||
C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o) | |||
JAVA_CLASSES = $(JAVA_SOURCES:.java=.class) | |||
JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class) | |||
CFLAGS=-Iinclude -O2 | |||
CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations | |||
all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) | |||
clean: | |||
rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \ | |||
$(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \ | |||
libstemmer.o stemwords \ | |||
libstemmer/modules.h \ | |||
libstemmer/modules_utf8.h \ | |||
snowball.splint \ | |||
$(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \ | |||
$(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \ | |||
$(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \ | |||
libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \ | |||
libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c | |||
rm -rf dist | |||
rmdir $(c_src_dir) || true | |||
snowball: $(COMPILER_OBJECTS) | |||
$(CC) -o $@ $^ | |||
$(COMPILER_OBJECTS): $(COMPILER_HEADERS) | |||
libstemmer/libstemmer.c: libstemmer/libstemmer_c.in | |||
sed 's/@MODULES_H@/modules.h/' $^ >$@ | |||
libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in | |||
sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@ | |||
libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt | |||
libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak | |||
libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt | |||
libstemmer/mkmodules.pl $@ $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8 | |||
libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS) | |||
libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS) | |||
$(AR) -cru $@ $^ | |||
stemwords: $(STEMWORDS_OBJECTS) libstemmer.o | |||
$(CC) -o $@ $^ | |||
algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl | |||
cp $^ $@ | |||
$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball | |||
@mkdir -p $(c_src_dir) | |||
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ | |||
o="$(c_src_dir)/stem_UTF_8_$${l}"; \ | |||
echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \ | |||
./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u | |||
$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball | |||
@mkdir -p $(c_src_dir) | |||
@l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \ | |||
o="$(c_src_dir)/stem_KOI8_R_$${l}"; \ | |||
echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \ | |||
./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime | |||
$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball | |||
@mkdir -p $(c_src_dir) | |||
@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \ | |||
o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \ | |||
echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \ | |||
./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime | |||
$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball | |||
@mkdir -p $(c_src_dir) | |||
@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \ | |||
o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \ | |||
echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \ | |||
./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime | |||
$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h | |||
$(CC) $(CFLAGS) $(CPPFLAGS) -c -o $@ $< | |||
$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball | |||
@mkdir -p $(java_src_dir) | |||
@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \ | |||
o="$(java_src_dir)/$${l}Stemmer"; \ | |||
echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \ | |||
./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer | |||
splint: snowball.splint | |||
snowball.splint: $(COMPILER_SOURCES) | |||
splint $^ >$@ -weak | |||
# Make a full source distribution | |||
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java | |||
# Make a distribution of all the sources involved in snowball | |||
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \ | |||
$(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ | |||
$(LIBSTEMMER_SOURCES) \ | |||
$(LIBSTEMMER_UTF8_SOURCES) \ | |||
$(LIBSTEMMER_HEADERS) \ | |||
$(LIBSTEMMER_EXTRA) \ | |||
$(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \ | |||
GNUmakefile README doc/TODO libstemmer/mkmodules.pl | |||
destname=snowball_code; \ | |||
dest=dist/$${destname}; \ | |||
rm -rf $${dest} && \ | |||
rm -f $${dest}.tgz && \ | |||
for file in $^; do \ | |||
dir=`dirname $$file` && \ | |||
mkdir -p $${dest}/$${dir} && \ | |||
cp -a $${file} $${dest}/$${dir} || exit 1 ; \ | |||
done && \ | |||
(cd dist && tar zcf $${destname}.tgz $${destname}) && \ | |||
rm -rf $${dest} | |||
# Make a distribution of all the sources required to compile the C library. | |||
dist_libstemmer_c: \ | |||
$(RUNTIME_SOURCES) \ | |||
$(RUNTIME_HEADERS) \ | |||
$(LIBSTEMMER_SOURCES) \ | |||
$(LIBSTEMMER_UTF8_SOURCES) \ | |||
$(LIBSTEMMER_HEADERS) \ | |||
$(LIBSTEMMER_EXTRA) \ | |||
$(C_LIB_SOURCES) \ | |||
$(C_LIB_HEADERS) \ | |||
libstemmer/mkinc.mak \ | |||
libstemmer/mkinc_utf8.mak | |||
destname=libstemmer_c; \ | |||
dest=dist/$${destname}; \ | |||
rm -rf $${dest} && \ | |||
rm -f $${dest}.tgz && \ | |||
mkdir -p $${dest} && \ | |||
cp -a doc/libstemmer_c_README $${dest}/README && \ | |||
mkdir -p $${dest}/examples && \ | |||
cp -a examples/stemwords.c $${dest}/examples && \ | |||
mkdir -p $${dest}/$(c_src_dir) && \ | |||
cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \ | |||
mkdir -p $${dest}/runtime && \ | |||
cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \ | |||
mkdir -p $${dest}/libstemmer && \ | |||
cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \ | |||
mkdir -p $${dest}/include && \ | |||
mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \ | |||
(cd $${dest} && \ | |||
echo "README" >> MANIFEST && \ | |||
ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \ | |||
ls runtime/*.c runtime/*.h >> MANIFEST && \ | |||
ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \ | |||
ls include/*.h >> MANIFEST) && \ | |||
cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \ | |||
echo 'include mkinc.mak' >> $${dest}/Makefile && \ | |||
echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \ | |||
echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \ | |||
echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \ | |||
echo ' $$(AR) -cru $$@ $$^' >> $${dest}/Makefile && \ | |||
echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \ | |||
echo ' $$(CC) -o $$@ $$^' >> $${dest}/Makefile && \ | |||
echo 'clean:' >> $${dest}/Makefile && \ | |||
echo ' rm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o' >> $${dest}/Makefile && \ | |||
(cd dist && tar zcf $${destname}.tgz $${destname}) && \ | |||
rm -rf $${dest} | |||
# Make a distribution of all the sources required to compile the Java library. | |||
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \ | |||
$(LIBSTEMMER_EXTRA) \ | |||
$(JAVA_SOURCES) | |||
destname=libstemmer_java; \ | |||
dest=dist/$${destname}; \ | |||
rm -rf $${dest} && \ | |||
rm -f $${dest}.tgz && \ | |||
mkdir -p $${dest} && \ | |||
cp -a doc/libstemmer_java_README $${dest}/README && \ | |||
mkdir -p $${dest}/$(java_src_dir) && \ | |||
cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \ | |||
mkdir -p $${dest}/$(java_src_main_dir) && \ | |||
cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \ | |||
(cd $${dest} && \ | |||
echo "README" >> MANIFEST && \ | |||
ls $(java_src_dir)/*.java >> MANIFEST && \ | |||
ls $(java_src_main_dir)/*.java >> MANIFEST) && \ | |||
(cd dist && tar zcf $${destname}.tgz $${destname}) && \ | |||
rm -rf $${dest} | |||
check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r | |||
check_utf8: $(libstemmer_algorithms:%=check_utf8_%) | |||
check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%) | |||
check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%) | |||
check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%) | |||
# Where the data files are located - assumed their repo is checked out as | |||
# a sibling to this one. | |||
STEMMING_DATA = ../snowball-data | |||
check_utf8_%: $(STEMMING_DATA)/% stemwords | |||
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8" | |||
@./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt | |||
@diff -u $</output.txt tmp.txt | |||
@if [ -e $</diffs.txt ] ; \ | |||
then \ | |||
./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $</voc.txt -o tmp.txt -p2 && \ | |||
diff -u $</diffs.txt tmp.txt; \ | |||
fi | |||
@rm tmp.txt | |||
check_iso_8859_1_%: $(STEMMING_DATA)/% stemwords | |||
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_1" | |||
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-1"))' | \ | |||
./stemwords -c ISO_8859_1 -l `echo $<|sed 's!.*/!!'` -o tmp.txt | |||
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-1"))' | \ | |||
diff -u - tmp.txt | |||
@rm tmp.txt | |||
check_iso_8859_2_%: $(STEMMING_DATA)/% stemwords | |||
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with ISO_8859_2" | |||
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("iso8859-2"))' | \ | |||
./stemwords -c ISO_8859_2 -l `echo $<|sed 's!.*/!!'` -o tmp.txt | |||
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("iso8859-2"))' | \ | |||
diff -u - tmp.txt | |||
@rm tmp.txt | |||
check_koi8r_%: $(STEMMING_DATA)/% stemwords | |||
@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with KOI8R" | |||
@python -c 'print(open("$</voc.txt").read().decode("utf8").encode("koi8_r"))' | \ | |||
./stemwords -c KOI8_R -l `echo $<|sed 's!.*/!!'` -o tmp.txt | |||
@python -c 'print(open("$</output.txt").read().decode("utf8").encode("koi8_r"))' | \ | |||
diff -u - tmp.txt | |||
@rm tmp.txt |
@@ -0,0 +1,407 @@ | |||
Snowball 2.0.0 (2019-10-02) | |||
=========================== | |||
C/C++ | |||
----- | |||
* Fully handle 4-byte UTF-8 sequences. Previously `hop` and `next` handled | |||
sequences of any length, but commands which look at the character value only | |||
handled sequences up to length 3. Fixes #89. | |||
* Fix handling of a 3-byte UTF-8 sequence in a grouping in `backwardmode`. | |||
Java | |||
---- | |||
* TestApp.java: | |||
- Always use UTF-8 for I/O. Patch from David Corbett (#80). | |||
- Allow reading input from stdin. | |||
- Remove rather pointless "stem n times" feature. | |||
- Only lower case ASCII to match stemwords.c. | |||
- Stem empty lines too to match stemwords.c. | |||
Code Quality Improvements | |||
------------------------- | |||
* Fix various warnings from newer compilers. | |||
* Improve use of `const`. | |||
* Share common functions between compiler backends rather than having multiple | |||
copies of the same code. | |||
* Assorted code clean-up. | |||
* Initialise line_labelled member of struct generator to 0. Previously we were | |||
invoking undefined behaviour, though in practice it'll be zero initialised on | |||
most platforms. | |||
New Code Generators | |||
------------------- | |||
* Add Python generator (#24). Originally written by Yoshiki Shibukawa, with | |||
additional updates by Dmitry Shachnev. | |||
* Add Javascript generator. Based on JSX generator (#26) written by Yoshiki | |||
Shibukawa. | |||
* Add Rust generator from Jakob Demler (#51). | |||
* Add Go generator from Marty Schoch (#57). | |||
* Add C# generator. Based on patch from Cesar Souza (#16, #17). | |||
* Add Pascal generator. Based on Delphi backend from stemming.zip file on old | |||
website (#75). | |||
New Language Features | |||
--------------------- | |||
* Add `len` and `lenof` to measure Unicode length. These are similar to `size` | |||
and `sizeof` (respectively), but `size` and `sizeof` return the length in | |||
bytes under `-utf8`, whereas these new commands give the same result whether | |||
using `-utf8`, `-widechars` or neither (but under `-utf8` they are O(n) in | |||
the length of the string). For compatibility with existing code which might | |||
use these as variable or function names, they stop being treated as tokens if | |||
declared to be a variable or function. | |||
* New `{U+1234}` stringdef notation for Unicode codepoints. | |||
* More versatile integer tests. Now you can compare any two arithmetic | |||
expressions with a relational operator in parentheses after the `$`, so for | |||
example `$(len > 3)` can now be used when previously a temporary variable was | |||
required: `$tmp = len $tmp > 3` | |||
Code generation improvements | |||
---------------------------- | |||
* General: | |||
+ Avoid unnecessary saving and restoring of the cursor for more commands - | |||
`atlimit`, `do`, `set` and `unset` all leave the cursor alone or always | |||
restore its value, and for C `booltest` (which other languages already | |||
handled). | |||
+ Special case handling for `setlimit tomark AE`. All uses of setlimit in | |||
the current stemmers we ship follow this pattern, and by special-casing we | |||
can avoid having to save and restore the cursor (#74). | |||
+ Merge duplicate actions in the same `among`. This reduces the size of the | |||
switch/if-chain in the generated code which dispatches the among for many of | |||
the stemmers. | |||
+ Generate simpler code for `among`. We always check for a zero return value | |||
when we call the among, so there's no point also checking for that in the | |||
switch/if-chain. We can also avoid the switch/if-chain entirely when | |||
there's only one possible outcome (besides the zero return). | |||
+ Optimise code generated for `do <function call>`. This speeds up "make | |||
check_python" by about 2%, and should speed up other interpreted languages | |||
too (#110). | |||
+ Generate more and better comments referencing snowball source. | |||
+ Add homepage URL and compiler version as comments in generated files. | |||
* C/C++: | |||
+ Fix `size` and `sizeof` to not report one too high (reported by Assem | |||
Chelli in #32). | |||
+ If signal `f` from a function call would lead to return from the current | |||
function then handle this and bailing out on an error together with a | |||
simple `if (ret <= 0) return ret;` | |||
+ Inline testing for single character literals. | |||
+ Avoid generating `|| 0` in a corner case - this can result in a compiler | |||
warning when building the generated code. | |||
+ Implement `insert_v()` in terms of `insert_s()`. | |||
+ Add conditional `extern "C"` so `runtime/api.h` can be included from C++ | |||
code. Closes #90, reported by vvarma. | |||
* Java: | |||
+ Fix functions in `among` to work in Java. We seem to need to make the | |||
methods called from among `public` instead of `private`, and to call them | |||
on `this` instead of the `methodObject` (which is cleaner anyway). No | |||
revision in version control seems to generate working code for this case, | |||
but Richard says it definitely used to work - possibly older JVMs failed to | |||
correctly enforce the access controls when methods were invoked by | |||
reflection. | |||
+ Code after handling `f` by returning from the current function is | |||
unreachable too. | |||
+ Previously we incorrectly decided that code after an `or` was | |||
unreachable in certain cases. None of the current stemmers in the | |||
distribution triggered this, but Martin Porter's snowball version | |||
of the Schinke Latin stemmer does. Fixes #58, reported by Alexander | |||
Myltsev. | |||
+ The reachability logic was failing to consider reachability from | |||
the final command in an `or`. Fixes #82, reported by David Corbett. | |||
+ Fix `maxint` and `minint`. Patch from David Corbett in #31. | |||
+ Fix `$` on strings. The previous generated code was just wrong. This | |||
doesn't affect any of the included algorithms, but for example breaks | |||
Martin Porter's snowball implementation of Schinke's Latin Stemmer. | |||
Issue noted by Jakob Demler while working on the Rust backend in #51, | |||
and reported in the Schinke's Latin Stemmer by Alexander Myltsev | |||
in #58. | |||
+ Make SnowballProgram objects serializable. Patch from Oleg Smirnov in #43. | |||
+ Eliminate range-check implementation for groupings. This was removed from | |||
the C generator 10 years earlier, isn't used for any of the existing | |||
algorithms, and it doesn't seem likely it would be - the grouping would | |||
have to consist entirely of a contiguous block of Unicode code-points. | |||
+ Simplify code generated for `repeat` and `atleast`. | |||
+ Eliminate unused return values and variables from runtime functions. | |||
+ Only import the `among` and `SnowballProgram` classes if they're actually | |||
used. | |||
+ Only generate `copy_from()` method if it's used. | |||
+ Merge runtime functions `eq_s` and `eq_v`. | |||
+ Java arrays know their own length so stop storing it separately. | |||
+ Escape char 127 (DEL) in generated Java code. It's unlikely that this | |||
character would actually be used in a real stemmer, so this was more of a | |||
theoretical bug. | |||
+ Drop unused import of InvocationTargetException from SnowballStemmer. | |||
Reported by GerritDeMeulder in #72. | |||
+ Fix lint check issues in generated Java code. The stemmer classes are only | |||
referenced in the example app via reflection, so add | |||
@SuppressWarnings("unused") for them. The stemmer classes override | |||
equals() and hashCode() methods from the standard java Object class, so | |||
mark these with @Override. Both suggested by GerritDeMeulder in #72. | |||
+ Declare Java variables at point of use in generated code. Putting all | |||
declarations at the top of the function was adding unnecessary complexity | |||
to the Java generator code for no benefit. | |||
+ Improve formatting of generated code. | |||
New stemming algorithms | |||
----------------------- | |||
* Add Tamil stemmer from Damodharan Rajalingam (#2, #3). | |||
* Add Arabic stemmer from Assem Chelli (#32, #50). | |||
* Add Irish stemmer from Jim O'Regan (#48). | |||
* Add Nepali stemmer from Arthur Zakirov (#70). | |||
* Add Indonesian stemmer from Olly Betts (#71). | |||
* Add Hindi stemmer from Olly Betts (#73). Thanks to David Corbett for review. | |||
* Add Lithuanian stemmer from Dainius Jocas (#22, #76). | |||
* Add Greek stemmer from Oleg Smirnov (#44). | |||
* Add Catalan and Basque stemmers from Israel Olalla (#104). | |||
Behavioural changes to existing algorithms | |||
------------------------------------------ | |||
* Portuguese: | |||
+ Replace incorrect Spanish suffixes by Portuguese suffixes (#1). | |||
* French: | |||
+ The MSDOS CP850 version of the French algorithm was missing changes present | |||
in the ISO8859-1 and Unicode versions. There's now a single version of | |||
each algorithm which was based on the Unicode version. | |||
+ Recognize French suffixes even when they begin with diaereses. Patch from | |||
David Corbett in #78. | |||
* Russian: | |||
+ We now normalise 'ё' to 'е' before stemming. The documentation has long | |||
said "we assume ['ё'] is mapped into ['е']" but it's more convenient for | |||
the stemmer to actually perform this normalisation. This change has no | |||
effect if the caller is already normalising as we recommend. It's a change | |||
in behaviour if they aren't, but 'ё' occurs rarely (there are currently no | |||
instances in our test vocabulary) and this improves behaviour when it does | |||
occur. Patch from Eugene Mirotin (#65, #68). | |||
* Finnish: | |||
+ Adjust the Finnish algorithm not to mangle numbers. This change also | |||
means it tends to leave foreign words alone. Fixes #66. | |||
* Danish: | |||
+ Adjust Danish algorithm not to mangle alphanumeric codes. In particular | |||
alphanumeric codes ending in a double digit (e.g. 0x0e00, hal9000, | |||
space1999) are no longer mangled. See #81. | |||
Optimisations to existing algorithms | |||
------------------------------------ | |||
* Turkish: | |||
+ Simplify uses of `test` in stemmer code. | |||
+ Check for 'ad' or 'soyad' more efficiently, and without needing the | |||
strlen variable. This speeds up "make check_utf8_turkish" by 11% | |||
on x86 Linux. | |||
* Kraaij-Pohlmann: | |||
+ Eliminate variable x - `$p1 <= cursor` is simpler and a little more efficient | |||
than `setmark x $x >= p1`. | |||
Code clarity improvements to existing algorithms | |||
------------------------------------------------ | |||
* Turkish: | |||
+ Use , for cedilla to match the conventions used in other stemmers. | |||
* Kraaij-Pohlmann: | |||
+ Avoid cryptic `[among ( (])` ... `)` construct - instead use the same | |||
`[substring] among (` ... `)` construct we do in other stemmers. | |||
Compiler | |||
-------- | |||
* Support conventional --help and --version options. | |||
* Warn if -r or -ep used with backend other than C/C++. | |||
* Warn if encoding command line options are specified when generating code in a | |||
language with a fixed encoding. | |||
* The default classname is now set based on the output filename, so `-n` is now | |||
often no longer needed. Fixes #64. | |||
* Avoid potential one byte buffer over-read when parsing snowball code. | |||
* Avoid comparing with uninitialised array element during compilation. | |||
* Improve `-syntax` output for `setlimit L for C`. | |||
* Optimise away double negation so generators don't have to worry about | |||
generating `--` (decrement operator in many languages). Fixes #52, reported | |||
by David Corbett. | |||
* Improved compiler error and warning messages: | |||
- We now report FILE:LINE: before each diagnostic message. | |||
- Improve warnings for unused declarations/definitions. | |||
- Warn for variables which are used, but either never initialised | |||
or never read. | |||
- Flag non-ASCII literal strings. This is an error for wide Unicode, but | |||
only a warning for single-byte and UTF-8 which work so long as the source | |||
encoding matches the encoding used in the generated stemmer code. | |||
- Improve error recovery after an undeclared `define`. We now sniff the | |||
token after the identifier and if it is `as` we parse as a routine, | |||
otherwise we parse as a grouping. Previously we always just assumed it was | |||
a routine, which gave a confusing second error if it was a grouping. | |||
- Improve error recovery after an unexpected token in `among`. Previously | |||
we acted as if the unexpected token closed the `among` (this probably | |||
wasn't intended but just a missing `break;` in a switch statement). Now we | |||
issue an error and try the next token. | |||
* Report error instead of silently truncating character values (e.g. `hex 123` | |||
previously silently became byte 0x23 which is `#` rather than a | |||
g-with-cedilla). | |||
* Enlarge the initial input buffer size to 8192 bytes and double each time we | |||
hit the end. Snowball programs are typically a few KB in size (with the | |||
current largest we ship being the Greek stemmer at 27KB) so the previous | |||
approach of starting with a 10 byte input buffer and increasing its size by | |||
50% plus 40 bytes each time it filled was inefficient, needing up to 15 | |||
reallocations to load greek.sbl. | |||
* Identify variables only used by one `routine`/`external`. This information | |||
isn't yet used, but such variables which are also always written to before | |||
being read can be emitted as local variables in most target languages. | |||
* We now allow multiple source files on command line, and allow them to be | |||
after (or even interspersed with) options to better match modern Unix | |||
conventions. Support for multiple source files allows specifying a single | |||
byte character set mapping via a source file of `stringdef`. | |||
* Avoid infinite recursion in compiler when optimising a recursive snowball | |||
function. Recursive functions aren't typical in snowball programs, but | |||
the compiler shouldn't crash for any input, especially not a valid one. | |||
We now simply limit how deep the compiler will recurse and make the | |||
pessimistic assumption in the unlikely event we hit this limit. | |||
Build system: | |||
* `make clean` in C libstemmer_c distribution now removes `examples/*.o`. | |||
(#59) | |||
* Fix all the places which previously had to have a list of stemmers to work | |||
dynamically or be generated, so now only modules.txt needs updating to add | |||
a new stemmer. | |||
* Add check_java make target which runs tests for java. | |||
* Support gzipped test data (the uncompressed arabic test data is too big for | |||
github). | |||
* GNUmakefile: Drop useless `-eprefix` and `-r` options from snowball | |||
invocations for Java - these are only meaningful when generating C code. | |||
* Pass CFLAGS when linking which matches convention (e.g. automake does it) and | |||
facilitates use of tools such as ASan. Fixes #84, reported by Thomas | |||
Pointhuber. | |||
* Add CI builds with -std=c90 to check compiler and generated code are C90 | |||
(#54) | |||
libstemmer stuff: | |||
* Split out CPPFLAGS from CFLAGS and use CFLAGS when linking stemwords. | |||
* Add -O2 to CFLAGS. | |||
* Make generated tables of encodings and modules const. | |||
* Fix clang static analyzer memory leak warning (in practice this code path | |||
can never actually be taken). Patch from Patrick O. Perry (#56) | |||
documentation | |||
* Added copyright and licensing details (#10). | |||
* Document that libstemmer supports ISO_8859_2 encoding. Currently hungarian | |||
and romanian are available in ISO_8859_2. | |||
* Remove documentation falsely claiming that libstemmer supports CP850 | |||
encoding. | |||
* CONTRIBUTING.rst: Add guidance for contributing new stemming algorithms and | |||
new language backends. | |||
* Overhaul libstemmer_python_README. Most notably, replace the benchmark data | |||
which was very out of date. |
@@ -0,0 +1,561 @@ | |||
/* | |||
* Authors: | |||
* - Assem Chelli, < assem [dot] ch [at] gmail > | |||
* - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz> | |||
* | |||
*/ | |||
stringescapes { } | |||
/* the Arabic letters in Unicode */ | |||
// Hamza | |||
stringdef o '{U+0621}' // Hamza | |||
stringdef ao '{U+0623}' // Hamza above Alef | |||
stringdef ao_ '{U+0625}' // Hamza below Alef | |||
stringdef a~ '{U+0622}' // Alef madda | |||
stringdef wo '{U+0624}' // Hamza above waw | |||
stringdef yo '{U+0626}' // Hamza above yeh | |||
// Letters | |||
stringdef a '{U+0627}' // Alef | |||
stringdef a_ '{U+0649}' // Alef Maksura | |||
stringdef b '{U+0628}' // Beh | |||
stringdef t_ '{U+0629}' // Teh_Marbuta | |||
stringdef t '{U+062A}' // Teh | |||
stringdef th '{U+062B}' // Theh | |||
stringdef j '{U+062C}' // Jeem | |||
stringdef h '{U+062D}' // Hah | |||
stringdef x '{U+062E}' // Khah | |||
stringdef d '{U+062F}' // Dal | |||
stringdef dz '{U+0630}' // Thal | |||
stringdef r '{U+0631}' // Reh | |||
stringdef z '{U+0632}' // Zain | |||
stringdef s '{U+0633}' // Seen | |||
stringdef sh '{U+0634}' // Sheen | |||
stringdef c '{U+0635}' // Sad | |||
stringdef dh '{U+0636}' // Dad | |||
stringdef tt '{U+0637}' // Tah | |||
stringdef zh '{U+0638}' // Zah | |||
stringdef i '{U+0639}' // Ain | |||
stringdef gh '{U+063A}' // Ghain | |||
stringdef f '{U+0641}' // Feh | |||
stringdef q '{U+0642}' // Qaf | |||
stringdef k '{U+0643}' // Kaf | |||
stringdef l '{U+0644}' // Lam | |||
stringdef m '{U+0645}' // Meem | |||
stringdef n '{U+0646}' // Noon | |||
stringdef e '{U+0647}' // Heh | |||
stringdef w '{U+0648}' // Waw | |||
stringdef y '{U+064A}' // Yeh | |||
// Diacritics | |||
stringdef aan '{U+064B}' // FatHatan | |||
stringdef uun '{U+064C}' // Dammatan | |||
stringdef iin '{U+064D}' // Kasratan | |||
stringdef aa '{U+064E}' // FatHa | |||
stringdef uu '{U+064F}' // Damma | |||
stringdef ii '{U+0650}' // Kasra | |||
stringdef oo '{U+0652}' // Sukun | |||
stringdef ~ '{U+0651}' // Shadda | |||
// Hindu–Arabic numerals | |||
stringdef 0 '{U+0660}' | |||
stringdef 1 '{U+0661}' | |||
stringdef 2 '{U+0662}' | |||
stringdef 3 '{U+0663}' | |||
stringdef 4 '{U+0664}' | |||
stringdef 5 '{U+0665}' | |||
stringdef 6 '{U+0666}' | |||
stringdef 7 '{U+0667}' | |||
stringdef 8 '{U+0668}' | |||
stringdef 9 '{U+0669}' | |||
// Kasheeda | |||
stringdef _ '{U+0640}' // Kasheeda, Tatweel | |||
// Shaped forms | |||
stringdef o1 '{U+FE80}' // HAMZA | |||
stringdef ao1 '{U+FE83}' // ALEF_HAMZA_ABOVE | |||
stringdef ao2 '{U+FE84}' // ALEF_HAMZA_ABOVE | |||
stringdef ao_1 '{U+FE87}' // ALEF_HAMZA_BELOW | |||
stringdef ao_2 '{U+FE88}' // ALEF_HAMZA_BELOW | |||
stringdef yo1 '{U+FE8B}' // YEH_HAMZA | |||
stringdef yo2 '{U+FE8C}' // YEH_HAMZA | |||
stringdef yo3 '{U+FE89}' // YEH_HAMZA | |||
stringdef yo4 '{U+FE8A}' // YEH_HAMZA | |||
stringdef a~1 '{U+FE81}' // ALEF_MADDA | |||
stringdef a~2 '{U+FE82}' // ALEF_MADDA | |||
stringdef wo1 '{U+FE85}' // WAW_HAMZA | |||
stringdef wo2 '{U+FE86}' // WAW_HAMZA | |||
stringdef a1 '{U+FE8D}' // ALEF | |||
stringdef a2 '{U+FE8E}' // ALEF | |||
stringdef b1 '{U+FE8F}' // BEH | |||
stringdef b2 '{U+FE90}' // BEH | |||
stringdef b3 '{U+FE91}' // BEH | |||
stringdef b4 '{U+FE92}' // BEH | |||
stringdef t_1 '{U+FE93}' // TEH_MARBUTA | |||
stringdef t_2 '{U+FE94}' // TEH_MARBUTA | |||
stringdef t1 '{U+FE97}' // TEH | |||
stringdef t2 '{U+FE98}' // TEH | |||
stringdef t3 '{U+FE95}' // TEH | |||
stringdef t4 '{U+FE96}' // TEH | |||
stringdef th1 '{U+FE9B}' // THEH | |||
stringdef th2 '{U+FE9C}' // THEH | |||
stringdef th3 '{U+FE9A}' // THEH | |||
stringdef th4 '{U+FE99}' // THEH | |||
stringdef j1 '{U+FE9F}' // JEEM | |||
stringdef j2 '{U+FEA0}' // JEEM | |||
stringdef j3 '{U+FE9D}' // JEEM | |||
stringdef j4 '{U+FE9E}' // JEEM | |||
stringdef h1 '{U+FEA3}' // HAH | |||
stringdef h2 '{U+FEA4}' // HAH | |||
stringdef h3 '{U+FEA1}' // HAH | |||
stringdef h4 '{U+FEA2}' // HAH | |||
stringdef x1 '{U+FEA7}' // KHAH | |||
stringdef x2 '{U+FEA8}' // KHAH | |||
stringdef x3 '{U+FEA5}' // KHAH | |||
stringdef x4 '{U+FEA6}' // KHAH | |||
stringdef d1 '{U+FEA9}' // DAL | |||
stringdef d2 '{U+FEAA}' // DAL | |||
stringdef dz1 '{U+FEAB}' // THAL | |||
stringdef dz2 '{U+FEAC}' // THAL | |||
stringdef r1 '{U+FEAD}' // REH | |||
stringdef r2 '{U+FEAE}' // REH | |||
stringdef z1 '{U+FEAF}' // ZAIN | |||
stringdef z2 '{U+FEB0}' // ZAIN | |||
stringdef s1 '{U+FEB3}' // SEEN | |||
stringdef s2 '{U+FEB4}' // SEEN | |||
stringdef s3 '{U+FEB1}' // SEEN | |||
stringdef s4 '{U+FEB2}' // SEEN | |||
stringdef sh1 '{U+FEB7}' // SHEEN | |||
stringdef sh2 '{U+FEB8}' // SHEEN | |||
stringdef sh3 '{U+FEB5}' // SHEEN | |||
stringdef sh4 '{U+FEB6}' // SHEEN | |||
stringdef c1 '{U+FEBB}' // SAD | |||
stringdef c2 '{U+FEBC}' // SAD | |||
stringdef c3 '{U+FEB9}' // SAD | |||
stringdef c4 '{U+FEBA}' // SAD | |||
stringdef dh1 '{U+FEBF}' // DAD | |||
stringdef dh2 '{U+FEC0}' // DAD | |||
stringdef dh3 '{U+FEBD}' // DAD | |||
stringdef dh4 '{U+FEBE}' // DAD | |||
stringdef tt1 '{U+FEC3}' // TAH | |||
stringdef tt2 '{U+FEC4}' // TAH | |||
stringdef tt3 '{U+FEC1}' // TAH | |||
stringdef tt4 '{U+FEC2}' // TAH | |||
stringdef zh1 '{U+FEC7}' // ZAH | |||
stringdef zh2 '{U+FEC8}' // ZAH | |||
stringdef zh3 '{U+FEC5}' // ZAH | |||
stringdef zh4 '{U+FEC6}' // ZAH | |||
stringdef i1 '{U+FECB}' // AIN | |||
stringdef i2 '{U+FECC}' // AIN | |||
stringdef i3 '{U+FEC9}' // AIN | |||
stringdef i4 '{U+FECA}' // AIN | |||
stringdef gh1 '{U+FECF}' // GHAIN | |||
stringdef gh2 '{U+FED0}' // GHAIN | |||
stringdef gh3 '{U+FECD}' // GHAIN | |||
stringdef gh4 '{U+FECE}' // GHAIN | |||
stringdef f1 '{U+FED3}' // FEH | |||
stringdef f2 '{U+FED4}' // FEH | |||
stringdef f3 '{U+FED1}' // FEH | |||
stringdef f4 '{U+FED2}' // FEH | |||
stringdef q1 '{U+FED7}' // QAF | |||
stringdef q2 '{U+FED8}' // QAF | |||
stringdef q3 '{U+FED5}' // QAF | |||
stringdef q4 '{U+FED6}' // QAF | |||
stringdef k1 '{U+FEDB}' // KAF | |||
stringdef k2 '{U+FEDC}' // KAF | |||
stringdef k3 '{U+FED9}' // KAF | |||
stringdef k4 '{U+FEDA}' // KAF | |||
stringdef l1 '{U+FEDF}' // LAM | |||
stringdef l2 '{U+FEE0}' // LAM | |||
stringdef l3 '{U+FEDD}' // LAM | |||
stringdef l4 '{U+FEDE}' // LAM | |||
stringdef m1 '{U+FEE3}' // MEEM | |||
stringdef m2 '{U+FEE4}' // MEEM | |||
stringdef m3 '{U+FEE1}' // MEEM | |||
stringdef m4 '{U+FEE2}' // MEEM | |||
stringdef n1 '{U+FEE7}' // NOON | |||
stringdef n2 '{U+FEE8}' // NOON | |||
stringdef n3 '{U+FEE5}' // NOON | |||
stringdef n4 '{U+FEE6}' // NOON | |||
stringdef e1 '{U+FEEB}' // HEH | |||
stringdef e2 '{U+FEEC}' // HEH | |||
stringdef e3 '{U+FEE9}' // HEH | |||
stringdef e4 '{U+FEEA}' // HEH | |||
stringdef w1 '{U+FEED}' // WAW | |||
stringdef w2 '{U+FEEE}' // WAW | |||
stringdef a_1 '{U+FEEF}' // ALEF_MAKSURA | |||
stringdef a_2 '{U+FEF0}' // ALEF_MAKSURA | |||
stringdef y1 '{U+FEF3}' // YEH | |||
stringdef y2 '{U+FEF4}' // YEH | |||
stringdef y3 '{U+FEF1}' // YEH | |||
stringdef y4 '{U+FEF2}' // YEH | |||
// Ligatures Lam-Alef | |||
stringdef la '{U+FEFB}' // LAM_ALEF | |||
stringdef la2 '{U+FEFC}' // LAM_ALEF | |||
stringdef lao '{U+FEF7}' // LAM_ALEF_HAMZA_ABOVE | |||
stringdef lao2 '{U+FEF8}' // LAM_ALEF_HAMZA_ABOVE | |||
stringdef lao_ '{U+FEF9}' // LAM_ALEF_HAMZA_BELOW | |||
stringdef lao_2 '{U+FEFA}' // LAM_ALEF_HAMZA_BELOW | |||
stringdef la~ '{U+FEF5}' // LAM_ALEF_MADDA_ABOVE | |||
stringdef la~2 '{U+FEF6}' // LAM_ALEF_MADDA_ABOVE | |||
booleans ( | |||
is_noun | |||
is_verb | |||
is_defined | |||
) | |||
routines ( | |||
Prefix_Step1 | |||
Prefix_Step2 | |||
Prefix_Step3a_Noun | |||
Prefix_Step3b_Noun | |||
Prefix_Step3_Verb | |||
Prefix_Step4_Verb | |||
Suffix_All_alef_maqsura | |||
Suffix_Noun_Step1a | |||
Suffix_Noun_Step1b | |||
Suffix_Noun_Step2a | |||
Suffix_Noun_Step2b | |||
Suffix_Noun_Step2c1 | |||
Suffix_Noun_Step2c2 | |||
Suffix_Noun_Step3 | |||
Suffix_Verb_Step1 | |||
Suffix_Verb_Step2a | |||
Suffix_Verb_Step2b | |||
Suffix_Verb_Step2c | |||
Normalize_post | |||
Normalize_pre | |||
Checks1 | |||
) | |||
externals ( stem ) | |||
groupings ( ) | |||
// Normalizations | |||
// Normalize_pre: pre-stemming clean-up of the input text. | |||
// Starting at the cursor, each step either applies one of the substitutions | |||
// in the table below or, if none matches, advances one character with | |||
// `next`; the `repeat` ends when neither is possible. The table: | |||
//   - deletes the vocalization marks and the kasheeda (tatweel), | |||
//   - rewrites the digits U+0660..U+0669 as ASCII '0'..'9', | |||
//   - rewrites each shaped (presentation) form as its base letter, | |||
//   - expands each lam-alef ligature into lam followed by the alef variant. | |||
// The whole loop is wrapped in `do`, so the routine itself does not fail. | |||
define Normalize_pre as ( | |||
do repeat ( | |||
( | |||
[substring] among ( | |||
'{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}'( delete ) // strip vocalization | |||
'{_}' ( delete ) // strip kasheeda | |||
// Hindu–Arabic numerals | |||
'{0}' ( <- '0') | |||
'{1}' ( <- '1') | |||
'{2}' ( <- '2') | |||
'{3}' ( <- '3') | |||
'{4}' ( <- '4') | |||
'{5}' ( <- '5') | |||
'{6}' ( <- '6') | |||
'{7}' ( <- '7') | |||
'{8}' ( <- '8') | |||
'{9}' ( <- '9') | |||
// Shaped forms | |||
'{o1}' ( <- '{o}' ) // HAMZA | |||
'{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE | |||
'{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW | |||
'{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA | |||
'{a~1}' '{a~2}'( <- '{a~}' ) // ALEF_MADDA | |||
'{wo1}' '{wo2}'( <- '{wo}' ) // WAW_HAMZA | |||
'{a1}' '{a2}' ( <- '{a}' ) // ALEF | |||
'{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH | |||
'{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA | |||
'{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH | |||
'{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH | |||
'{j1}' '{j2}' '{j3}' '{j4}'( <- '{j}' ) // JEEM | |||
'{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH | |||
'{x1}' '{x2}' '{x3}' '{x4}'( <- '{x}' ) // KHAH | |||
'{d1}' '{d2}' ( <- '{d}' ) // DAL | |||
'{dz1}''{dz2}' ( <- '{dz}' ) // THAL | |||
'{r1}' '{r2}'( <- '{r}' ) // REH | |||
'{z1}' '{z2}' ( <- '{z}' ) // ZAIN | |||
'{s1}' '{s2}' '{s3}' '{s4}'( <- '{s}' ) // SEEN | |||
'{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN | |||
'{c1}' '{c2}' '{c3}' '{c4}'( <- '{c}' ) // SAD | |||
'{dh1}' '{dh2}' '{dh3}' '{dh4}'( <- '{dh}' ) // DAD | |||
'{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH | |||
'{zh1}' '{zh2}' '{zh3}' '{zh4}'( <- '{zh}' ) // ZAH | |||
'{i1}' '{i2}' '{i3}' '{i4}'( <- '{i}' ) // AIN | |||
'{gh1}' '{gh2}' '{gh3}' '{gh4}'( <- '{gh}' ) // GHAIN | |||
'{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH | |||
'{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF | |||
'{k1}' '{k2}' '{k3}' '{k4}'( <- '{k}' ) // KAF | |||
'{l1}' '{l2}' '{l3}' '{l4}'( <- '{l}' ) // LAM | |||
'{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM | |||
'{n1}' '{n2}' '{n3}' '{n4}'( <- '{n}' ) // NOON | |||
'{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH | |||
'{w1}' '{w2}' ( <- '{w}' ) // WAW | |||
'{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA | |||
'{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH | |||
// Ligatures Lam-Alef | |||
'{la}' '{la2}' (<- '{l}{a}') | |||
'{lao}' '{lao2}' (<- '{l}{ao}') | |||
'{lao_}' '{lao_2}' (<- '{l}{ao_}') | |||
'{la~}' '{la~2}' (<- '{l}{a~}') | |||
) | |||
) | |||
or | |||
next | |||
) | |||
) | |||
// Normalize_post: post-stemming hamza normalization, in two passes. | |||
// Pass 1 works backwards from the end of the text: if it ends in an alef | |||
// carrying hamza/madda, a waw with hamza or a yeh with hamza, that letter is | |||
// replaced by a bare hamza. | |||
// Pass 2 walks forwards from the cursor: every remaining such letter is | |||
// replaced by plain alef, waw or yeh respectively; positions with no match | |||
// are skipped with `next`. | |||
// Both passes are wrapped in `do`, so the routine itself does not fail. | |||
define Normalize_post as ( | |||
do ( | |||
// normalize last hamza | |||
backwards ( | |||
[substring] among ( | |||
'{ao}''{ao_}' '{a~}' ( <- '{o}') | |||
'{wo}' ( <- '{o}') | |||
'{yo}' ( <- '{o}') | |||
) | |||
) | |||
) | |||
do repeat ( | |||
( | |||
// normalize other hamza's | |||
[substring] among ( | |||
'{ao}''{ao_}' '{a~}' ( <- '{a}') | |||
'{wo}' ( <- '{w}') | |||
'{yo}' ( <- '{y}') | |||
) | |||
) | |||
or | |||
next | |||
) | |||
) | |||
// Checks | |||
// Checks1: classify the word from the prefix found at the cursor. | |||
// A three-letter prefix (beh-alef-lam, kaf-alef-lam) needs len > 4 and a | |||
// two-letter prefix (lam-lam, alef-lam) needs len > 3; when the test passes | |||
// the word is flagged as a noun, as not a verb, and as defined. Nothing is | |||
// removed here. The routine fails if no prefix matches or the word is too | |||
// short. | |||
define Checks1 as ( | |||
[substring] among ( | |||
'{b}{a}{l}' '{k}{a}{l}' ($(len > 4) set is_noun unset is_verb set is_defined) | |||
'{l}{l}' '{a}{l}' ($(len > 3) set is_noun unset is_verb set is_defined) | |||
) | |||
) | |||
//prefixes | |||
// Prefix_Step1: collapse a two-letter sequence that begins with alef-hamza- | |||
// above into a single letter, provided len > 3. The replacement is the second | |||
// letter of the pair, except for hamza-above + waw-hamza, which becomes | |||
// alef-hamza-above. Fails if no pair matches or the word is too short. | |||
define Prefix_Step1 as ( | |||
[substring] among ( | |||
'{ao}{ao}' ($(len > 3) <- '{ao}' ) | |||
'{ao}{a~}' ($(len > 3) <- '{a~}' ) | |||
'{ao}{wo}' ($(len > 3) <- '{ao}' ) | |||
'{ao}{a}' ($(len > 3) <- '{a}' ) | |||
'{ao}{ao_}' ($(len > 3) <- '{ao_}' ) | |||
// '{ao}' ($(len > 3) delete) //rare case | |||
) | |||
) | |||
// Prefix_Step2: delete a single feh or waw at the cursor when len > 3. | |||
// The two `not` guards make the routine fail (and so leave the text alone) | |||
// when that letter is immediately followed by alef. | |||
define Prefix_Step2 as ( | |||
not '{f}{a}' | |||
not '{w}{a}' | |||
[substring] among ( | |||
'{f}' ($(len > 3) delete) | |||
'{w}' ($(len > 3) delete) | |||
) | |||
) | |||
// Prefix_Step3a_Noun: delete one of the prefixes that Checks1 also looks for. | |||
// The three-letter prefixes need len > 5 and the two-letter ones len > 4, | |||
// i.e. the thresholds are one higher than in Checks1. | |||
define Prefix_Step3a_Noun as ( // it is noun and defined | |||
[substring] among ( | |||
'{b}{a}{l}' '{k}{a}{l}' ($(len > 5) delete) | |||
'{l}{l}' '{a}{l}' ($(len > 4) delete) | |||
) | |||
) | |||
// Prefix_Step3b_Noun: when len > 3, delete a single beh at the cursor, or | |||
// reduce a doubled beh / doubled kaf to a single letter. The `not` guard | |||
// makes the routine fail when the text at the cursor is beh followed by alef. | |||
define Prefix_Step3b_Noun as ( // probably noun and defined | |||
not '{b}{a}' // exception | |||
[substring] among ( | |||
'{b}' ($(len > 3) delete) | |||
// '{k}' '{l}' ($(len > 3) delete) // BUG: cause confusion | |||
'{b}{b}' ($(len > 3) <- '{b}' ) | |||
'{k}{k}' ($(len > 3) <- '{k}' ) | |||
) | |||
) | |||
// Prefix_Step3_Verb: when len > 4, drop a seen that is followed by yeh, teh, | |||
// noon or alef-hamza-above; the pair is replaced by its second letter. | |||
define Prefix_Step3_Verb as ( | |||
[substring] among ( | |||
//'{s}' ($(len > 4) delete)// BUG: cause confusion | |||
'{s}{y}' ($(len > 4) <- '{y}' ) | |||
'{s}{t}' ($(len > 4) <- '{t}') | |||
'{s}{n}' ($(len > 4) <- '{n}') | |||
'{s}{ao}' ($(len > 4) <- '{ao}') | |||
) | |||
) | |||
// Prefix_Step4_Verb: when len > 4, rewrite yeh/noon/teh + seen + teh at the | |||
// cursor as alef + seen + teh, and flag the word as a verb and not a noun. | |||
define Prefix_Step4_Verb as ( | |||
[substring] among ( | |||
'{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' ($(len > 4) set is_verb unset is_noun <- '{a}{s}{t}' ) | |||
) | |||
) | |||
// suffixes | |||
// suffixes | |||
// Every routine in this block runs in backward mode: it looks for the longest | |||
// listed string ending at the cursor and applies the action of its group. | |||
// Each action starts with a length test, so a routine fails (and changes | |||
// nothing) when the word is too short for the matched suffix. | |||
backwardmode ( | |||
// Noun step 1a: delete one of the listed one-, two- or three-letter endings; | |||
// the minimum length is 4, 5 or 6 respectively. | |||
define Suffix_Noun_Step1a as ( | |||
[substring] among ( | |||
'{y}' '{k}' '{e}' ($(len >= 4) delete) | |||
'{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ($(len >= 5) delete) | |||
'{k}{m}{a}' '{e}{m}{a}' ($(len >= 6) delete) | |||
) | |||
) | |||
// Noun step 1b: delete a noon when len > 5. | |||
define Suffix_Noun_Step1b as ( | |||
[substring] among ( | |||
'{n}' ($(len > 5) delete) | |||
) | |||
) | |||
// Noun step 2a: delete an alef, yeh or waw when len > 4. | |||
define Suffix_Noun_Step2a as ( | |||
[substring] among ( | |||
'{a}' '{y}' '{w}' ($(len > 4) delete) | |||
) | |||
) | |||
// Noun step 2b: delete alef + teh when len >= 5. | |||
define Suffix_Noun_Step2b as ( | |||
[substring] among ( | |||
'{a}{t}' ($(len >= 5) delete) | |||
) | |||
) | |||
// Noun step 2c1: delete a teh when len >= 4. | |||
define Suffix_Noun_Step2c1 as ( | |||
[substring] among ( | |||
'{t}' ($(len >= 4) delete) | |||
) | |||
) | |||
// Noun step 2c2: delete a teh marbuta when len >= 4. | |||
define Suffix_Noun_Step2c2 as ( // feminine t_ | |||
[substring] among ( | |||
'{t_}' ($(len >= 4) delete) | |||
) | |||
) | |||
// Noun step 3: delete a yeh when len >= 3. | |||
define Suffix_Noun_Step3 as ( // ya' nisbiya | |||
[substring] among ( | |||
'{y}' ($(len >= 3) delete) | |||
) | |||
) | |||
// Verb step 1: delete one of the listed one-, two- or three-letter endings; | |||
// the minimum length is 4, 5 or 6 respectively. | |||
define Suffix_Verb_Step1 as ( | |||
[substring] among ( | |||
'{e}' '{k}' ($(len >= 4) delete) | |||
'{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ($(len >= 5) delete) | |||
'{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'($(len >= 6) delete) | |||
) | |||
) | |||
// Verb step 2a: delete one of the listed endings. Note that the "present" | |||
// group uses a strict test (len > 5) while the other groups use >=. | |||
define Suffix_Verb_Step2a as ( | |||
[substring] among ( | |||
'{t}' ($(len >= 4) delete) | |||
'{a}' '{n}' '{y}' ($(len >= 4) delete) | |||
'{n}{a}' '{t}{a}' '{t}{n}' ($(len >= 5) delete)// past | |||
'{a}{n}' '{w}{n}' '{y}{n}' ($(len > 5) delete) // present | |||
'{t}{m}{a}' ($(len >= 6) delete) | |||
) | |||
) | |||
// Verb step 2b: delete waw + alef or teh + meem when len >= 5. | |||
define Suffix_Verb_Step2b as ( | |||
[substring] among ( | |||
'{w}{a}' '{t}{m}' ($(len >= 5) delete) | |||
) | |||
) | |||
// Verb step 2c: delete a waw (len >= 4) or teh + meem + waw (len >= 6). | |||
define Suffix_Verb_Step2c as ( | |||
[substring] among ( | |||
'{w}' ($(len >= 4) delete) | |||
'{t}{m}{w}' ($(len >= 6) delete) | |||
) | |||
) | |||
// Alef maksura: replace it by yeh, with no length test. | |||
define Suffix_All_alef_maqsura as ( | |||
[substring] among ( | |||
'{a_}' ( <- '{y}' ) // spell error | |||
// '{a_}' ( delete ) // if noun > 3 | |||
// '{a_}' ( <- '{a}') // if verb | |||
) | |||
) | |||
) | |||
// stem: external entry point of the Arabic stemmer. | |||
// is_noun and is_verb both start set and is_defined starts unset; Checks1 may | |||
// then narrow the guess. After pre-normalization, suffixes are removed while | |||
// working backwards (verb rules are tried first, then noun rules, then the | |||
// alef maksura replacement), prefixes are removed working forwards, and | |||
// finally hamza forms are normalized. Every stage is wrapped in `do`, so a | |||
// stage that fails does not make stem fail. | |||
// NOTE(review): Checks1 runs before Normalize_pre, so it only sees prefixes | |||
// spelled with the base letters and with no marks in between - confirm that | |||
// this ordering is intended. | |||
define stem as ( | |||
// set initial values | |||
set is_noun | |||
set is_verb | |||
unset is_defined | |||
// guess type and properties | |||
do Checks1 | |||
// normalization pre-stemming | |||
do Normalize_pre | |||
backwards ( | |||
do ( | |||
//Suffixes for verbs | |||
( | |||
is_verb | |||
( | |||
( | |||
(atleast 1 Suffix_Verb_Step1) | |||
( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next) | |||
) | |||
or Suffix_Verb_Step2b | |||
or Suffix_Verb_Step2a | |||
) | |||
) | |||
//Suffixes for nouns | |||
or ( | |||
is_noun | |||
( | |||
try ( | |||
Suffix_Noun_Step2c2 | |||
or (not is_defined Suffix_Noun_Step1a ( | |||
Suffix_Noun_Step2a | |||
or Suffix_Noun_Step2b | |||
or Suffix_Noun_Step2c1 | |||
or next)) | |||
or (Suffix_Noun_Step1b ( | |||
Suffix_Noun_Step2a | |||
or Suffix_Noun_Step2b | |||
or Suffix_Noun_Step2c1)) | |||
or (not is_defined Suffix_Noun_Step2a) | |||
or (Suffix_Noun_Step2b) | |||
) | |||
Suffix_Noun_Step3 | |||
) | |||
) | |||
// Suffixes for alef maqsura | |||
or Suffix_All_alef_maqsura | |||
) | |||
) | |||
//Prefixes | |||
do ( | |||
try Prefix_Step1 | |||
try Prefix_Step2 | |||
( Prefix_Step3a_Noun | |||
or (is_noun Prefix_Step3b_Noun) | |||
or (is_verb try Prefix_Step3_Verb Prefix_Step4_Verb) | |||
) | |||
) | |||
// normalization post-stemming | |||
do Normalize_post | |||
) | |||
@@ -0,0 +1,149 @@ | |||
routines ( | |||
aditzak | |||
izenak | |||
adjetiboak | |||
mark_regions | |||
RV R2 R1 | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v ) | |||
stringescapes {} | |||
/* special characters */ | |||
stringdef n~ '{U+00F1}' | |||
define v 'aeiou' | |||
// mark_regions: compute the three region marks used by RV, R1 and R2. | |||
// All three default to `limit`. The first `do` sets pV from a test on the | |||
// leading letters of the word (vowel or non-vowel first, then vowel or | |||
// non-vowel second). The second `do` sets p1 just after the first non-vowel | |||
// that follows a vowel, and p2 by applying the same rule again from p1. | |||
// A mark keeps its default when its search runs off the end of the word. | |||
define mark_regions as ( | |||
$pV = limit | |||
$p1 = limit | |||
$p2 = limit // defaults | |||
do ( | |||
( v (non-v gopast v) or (v gopast non-v) ) | |||
or | |||
( non-v (non-v gopast v) or (v next) ) | |||
setmark pV | |||
) | |||
do ( | |||
gopast v gopast non-v setmark p1 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
// Suffix routines; everything in this block runs in backward mode. | |||
backwardmode ( | |||
// Region tests: succeed when the cursor is at or after the given mark. | |||
define RV as $pV <= cursor | |||
define R2 as $p2 <= cursor | |||
define R1 as $p1 <= cursor | |||
// aditzak (the name is Basque for "verbs"): find the longest listed suffix | |||
// ending at the cursor. The first group is deleted if it lies in RV and the | |||
// second if it lies in R2. 'atseden', 'arabera' and 'baditu' are rewritten to | |||
// themselves, so the text is left unchanged while the routine still succeeds. | |||
define aditzak as ( | |||
[substring] among( | |||
'le' 'la' 'tzaile' 'aldatu' 'atu' 'tzailea' 'taile' 'tailea' 'pera' 'gale' 'galea' | |||
'gura' 'kura' 'kor' 'korra' 'or' 'orra' 'tun' 'tuna' 'gaitz' 'gaitza' | |||
'kaitz' 'kaitza' 'ezin' 'ezina' 'tezin' 'tezina' 'errez' 'erreza' | |||
'karri' 'karria' 'tzaga' 'tzaka' 'tzake' 'tzeke' 'ez' 'eza' 'tzez' | |||
'keta' 'eta' 'etan' 'pen' 'pena' 'tze' 'atze' 'kuntza' 'kunde' 'kundea' | |||
'kune' 'kunea' 'kuna' 'kera' 'era' 'kizun' 'kizuna' 'dura' 'tura' 'men' 'mena' | |||
'go' 'ago' 'tio' 'taldi' 'taldia' 'aldi' 'aldia' 'gune' 'gunea' 'bide' 'bidea' | |||
'pide' 'pidea' 'gai' 'gaia' 'ki' 'kin' 'rekin' 'kina' 'kari' 'karia' 'ari' 'tari' 'etari' | |||
'gailu' 'gailua' 'kide' 'kidea' 'ide' 'idea' 'du' 'ka' 'kan' 'an' 'ean' 'tu' 'lari' 'tatu' | |||
'rean' 'tarazi' 'arazi' 'tzat' 'bera' 'dako' | |||
( RV delete ) | |||
'garri' 'garria' 'tza' | |||
(R2 delete) | |||
'atseden' | |||
(<- 'atseden') | |||
'arabera' | |||
(<- 'arabera') | |||
'baditu' | |||
(<- 'baditu') | |||
) | |||
) | |||
// izenak (the name is Basque for "nouns"): find the longest listed suffix | |||
// ending at the cursor. Depending on its group the suffix is deleted if it | |||
// lies in RV, in R2 or in R1; 'joka', 'trako' and 'minutuko' are shortened to | |||
// 'jok', 'tra' and 'minutu'; 'zehar', 'geldi', 'igaro' and 'aurka' are | |||
// rewritten to themselves and so left unchanged. | |||
define izenak as ( | |||
[substring] among( | |||
'ari' 'aria' 'bizia' 'kari' 'karia' 'lari' 'laria' 'tari' 'taria' 'zain' 'zaina' | |||
'tzain' 'tzaina' 'zale' 'zalea' 'tzale' 'tzalea' 'aizun' 'orde' 'ordea' | |||
'burua' 'ohi' 'ohia' 'kintza' 'gintzo' 'gintzu' 'tzu' 'tzua' | |||
'tzo' 'tzoa' 'kuntza' 'talde' 'taldea' 'eria' 'keria' 'teria' 'di' | |||
'za' 'ada' 'tara' 'etara' 'tra' 'ta' 'tegi' 'tegia' 'keta' 'z' 'zko' 'zkoa' | |||
'ti' 'tia' 'tsu' 'tsua' 'zu' 'zua' 'bera' 'pera' 'zto' 'ztoa' 'asi' 'asia' | |||
'gile' 'gilea' 'estu' 'estua' 'larri' 'larria' 'nahi' 'nahia' | |||
'koi' 'koia' 'oi' 'oia' 'goi' 'min' 'mina' 'dun' 'duna' 'duru' 'durua' | |||
'duri' 'duria' 'os' 'osa' 'oso' 'osoa' 'ar' 'ara' 'tar' 'dar' 'dara' | |||
'tiar' 'tiara' 'liar' 'liara' 'gabe' 'gabea' 'kabe' 'kabea' 'ga' 'ge' | |||
'kada' 'tasun' 'tasuna' 'asun' 'asuna' 'go' 'mendu' 'mendua' 'mentu' 'mentua' | |||
'mendi' 'mendia' 'zio' 'zioa' 'zino' 'zinoa' 'zione' 'zionea' 'ezia' | |||
'degi' 'degia' 'egi' 'egia' 'toki' 'tokia' 'leku' 'lekua' 'gintza' 'alde' | |||
'aldea' 'kalde' 'kaldea' 'gune' 'gunea' 'une' 'unea' 'una' 'pe' 'pea' | |||
'gibel' 'gibela' 'ondo' 'ondoa' 'arte' 'artea' 'aurre' 'aurrea' | |||
'etxe' 'etxea' 'ola' 'ontzi' 'ontzia' 'gela' 'denda' 'taldi' 'taldia' | |||
'aldi' 'aldia' 'te' 'tea' 'zaro' 'zaroa' 'taro' 'taroa' 'oro' 'oroa' | |||
'aro' 'aroa' 'ero' 'eroa' 'eroz' 'eroza' 'ka' 'kan' 'kana' 'tako' 'etako' 'takoa' | |||
'kote' 'kotea' 'tzar' 'tzarra' 'handi' 'handia' 'kondo' 'kondoa' 'skila' | |||
'no' 'noa' '{n~}o' '{n~}oa' 'ska' 'xka' 'zka' 'tila' 'to' 'toa' 'tto' 'ttoa' | |||
'txo' 'txoa' 'txu' 'txua' 'anda' 'anga' 'urren' 'urrena' 'gai' 'gaia' | |||
'gei' 'geia' 'eme' 'emea' 'kume' 'kumea' 'sa' 'ko' 'eko' 'koa' 'ena' | |||
'enea' 'ne' 'nea' 'kor' 'korra' 'ez' 'eza' 'eta' 'etan' | |||
'ki' 'kia' 'kin' 'kina' 'tu' 'tua' 'du' 'dua' 'ek' | |||
'tarik' 'tariko' 'tan' 'ordu' 'ordua' 'oste' 'ostea' 'tzara' | |||
'ra' 'antza' 'behar' 'ro' 'giro' 'ak' 'zp' 'ket' | |||
'kail' 'kaila' 'ail' 'kirri' 'kirria' 'ngo' 'ngoa' '{n~}i' 'sko' | |||
'sta' 'koitz' 'koitza' 'na' 'garren' 'garrena' 'kera' | |||
'gerren' 'gerrena' 'garna' 'kide' 'tz' 'tuko' | |||
( RV delete ) | |||
'ora' 'garri' 'garria' 'or' 'buru' 'ren' 'tza' | |||
( R2 delete ) | |||
'joka' | |||
(<- 'jok') | |||
'tzen' 'ten' 'en' 'tatu' | |||
(R1 delete) | |||
'trako' | |||
(<- 'tra') | |||
'minutuko' | |||
(<- 'minutu') | |||
'zehar' | |||
(<- 'zehar') | |||
'geldi' | |||
(<- 'geldi') | |||
'igaro' | |||
(<- 'igaro') | |||
'aurka' | |||
(<- 'aurka') | |||
) | |||
) | |||
// adjetiboak (the name is Basque for "adjectives"): delete the longest listed | |||
// suffix if it lies in RV; 'zlea' is replaced by 'z'. | |||
define adjetiboak as ( | |||
[substring] among( | |||
'era' 'ero' 'go' 'tate' 'tade' 'date' 'dade' 'keria' | |||
'ki' 'to' 'ro' 'la' 'gi' 'larik' 'lanik' 'ik' 'ztik' 'rik' | |||
( RV delete ) | |||
'zlea' | |||
(<- 'z') | |||
) | |||
) | |||
) | |||
// stem: external entry point of the Basque stemmer. Marks the regions, then, | |||
// working backwards from the end of the word, applies aditzak repeatedly | |||
// until it fails, then izenak repeatedly until it fails, then adjetiboak at | |||
// most once. | |||
define stem as ( | |||
do mark_regions | |||
backwards ( | |||
repeat aditzak | |||
repeat izenak | |||
do adjetiboak | |||
) | |||
) | |||
/* | |||
Note 1: additions of 21 Jul 2010 | |||
*/ |
@@ -0,0 +1,202 @@ | |||
routines ( | |||
cleaning mark_regions | |||
R1 R2 | |||
attached_pronoun | |||
standard_suffix | |||
verb_suffix | |||
residual_suffix | |||
) | |||
externals ( stem ) | |||
integers ( p1 p2 ) | |||
groupings ( v ) | |||
stringescapes {} | |||
/* special characters */ | |||
stringdef a' '{U+00E1}' // a-acute | |||
stringdef a` '{U+00E0}' // a-grave | |||
stringdef c, '{U+00E7}' // c-cedilla | |||
stringdef e' '{U+00E9}' // e-acute | |||
stringdef e` '{U+00E8}' // e-grave | |||
stringdef i' '{U+00ED}' // i-acute | |||
stringdef i` '{U+00EC}' // i-grave | |||
stringdef i" '{U+00EF}' // i-diaeresis | |||
stringdef o' '{U+00F3}' // o-acute | |||
stringdef o` '{U+00F2}' // o-grave | |||
stringdef u' '{U+00FA}' // u-acute | |||
stringdef u" '{U+00FC}' // u-diaeresis | |||
stringdef . '{U+00B7}' // middle dot - used for the geminated l | |||
define v 'aeiou{a'}{a`}{e'}{e`}{i'}{i"}{o'}{o`}{u'}{u"}' | |||
// mark_regions: compute the marks used by R1 and R2. Both default to `limit`. | |||
// p1 is set just after the first non-vowel that follows a vowel, and p2 by | |||
// applying the same rule again starting from p1. A mark keeps its default | |||
// when its search runs off the end of the word. | |||
define mark_regions as ( | |||
$p1 = limit | |||
$p2 = limit // defaults | |||
do ( | |||
gopast v gopast non-v setmark p1 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
// cleaning: walk forwards through the text replacing each accented vowel by | |||
// its unaccented letter and the middle dot by '.'. The empty string always | |||
// matches, so any other character is simply stepped over with `next`; the | |||
// `repeat` ends when `next` fails at the end of the text. | |||
define cleaning as repeat ( | |||
[substring] among( | |||
'{a'}' (<- 'a') | |||
'{a`}' (<- 'a') | |||
'{e'}' (<- 'e') | |||
'{e`}' (<- 'e') | |||
'{i'}' (<- 'i') | |||
'{i`}' (<- 'i') | |||
'{o'}' (<- 'o') | |||
'{o`}' (<- 'o') | |||
'{u'}' (<- 'u') | |||
'{u"}' (<- 'u') | |||
'{i"}' (<- 'i') | |||
'{.}' (<- '.') | |||
'' (next) | |||
) | |||
) | |||
// Suffix routines; everything in this block runs in backward mode. | |||
backwardmode ( | |||
// Region tests: succeed when the cursor is at or after the given mark. | |||
define R1 as $p1 <= cursor | |||
define R2 as $p2 <= cursor | |||
// attached_pronoun: delete the longest listed ending (forms written with an | |||
// apostrophe, with a hyphen, or directly attached) if it lies in R1. | |||
define attached_pronoun as ( | |||
[substring] among ( | |||
'{'}s' '{'}hi' '{'}ho' '{'}l' '{'}ls' | |||
'-ls' '-la' '-les' '-li' | |||
'vos' 'se' 'nos' '-nos' '-us' 'us' | |||
'{'}n' '{'}ns' '-n' '-ns' | |||
'{'}m' '-me' '-m' | |||
'-te' '{'}t' | |||
'li' 'lo' 'los' | |||
'me' 'sela' 'selo' 'selas' 'selos' 'le' | |||
'la' 'las' 'les' 'ens' 'ho' 'hi' | |||
(R1 delete) | |||
) | |||
) | |||
// standard_suffix: find the longest listed suffix ending at the cursor and | |||
// apply the action of its group: delete if in R1; delete if in R2; replace by | |||
// 'log' if in R2; replace by 'ic' if in R2; replace by 'c' if in R1. | |||
define standard_suffix as ( | |||
[substring] among( | |||
'ar' 'atge' 'formes' 'icte' 'ictes' | |||
'ell' 'ells' 'ella' '{e'}s' '{e`}s' 'esc' 'essa' 'et' 'ets' 'eta' | |||
'eres' 'eries' 'ers' 'ina' 'ines' 'able' 'ls' | |||
'i{o'}' 'itat' 'itats' 'itzar' 'iva' 'ives' 'ivisme' 'ius' | |||
'fer' 'ment' 'amen' 'ament' 'aments' 'ments' 'ot' 'sfera' 'al' 'als' 'era' 'ana' 'iste' | |||
'aire' 'eria' 'esa' 'eses' 'esos' 'or' '{i'}cia' '{i'}cies' 'icis' 'ici' '{i'}ci' '{i'}cis' | |||
'{a`}ria' '{a`}ries' 'alla' 'ci{o'}' 'cions' 'n{c,}a' 'nces' '{o'}' 'dor' 'all' | |||
'il' '{i'}stic' 'enc' 'enca' '{i'}s' 'issa' 'issos' '{i'}ssem' '{i'}ssiu' 'issem' 'isseu' '{i'}sseu' | |||
'{o'}s' 'osa' 'dora' 'dores' 'dors' 'adura' 'ble' 'bles' '{i'}vol' '{i'}vola' 'd{i'}s' 'egar' 'ejar' 'ificar' | |||
'itar' 'ables' 'adors' 'idores' 'idors' | |||
'adora' 'aci{o'}' 'doras' 'dur' 'dures' 'alleng{u"}es' | |||
'ant' 'ants' 'ancia' 'ancies' 'at{o`}ria' 'at{o`}ries' 'tori' 'toris' | |||
'ats' 'ions' 'ota' 'isam' 'ors' 'ora' 'ores' 'isament' | |||
'bilitat' 'bilitats' 'ivitat' 'ivitats' 'ari' 'aris' 'ionisme' 'ionista' 'ionistes' | |||
'ialista' 'ialistes' 'ialisme' 'ialismes' 'ud' 'uts' 'uds' 'encia' 'encies' '{e`}ncia' '{e`}ncies' | |||
'{i"}tat' '{i"}tats' 'atiu' 'atius' 'atives' 'ativa' 'ativitat' 'ativitats' 'ible' 'ibles' | |||
'assa' 'asses' 'assos' | |||
'ent' 'ents' | |||
'{i'}ssim' '{i'}ssima' '{i'}ssims' '{i'}ssimes' '{i`}ssem' '{i`}sseu' '{i`}ssin' | |||
'ims' 'ima' 'imes' | |||
'isme' 'ista' 'ismes' 'istes' | |||
'inia' 'inies' '{i'}inia' '{i'}nies' 'ita' 'ites' 'triu' 'trius' | |||
'oses' 'osos' 'ient' 'otes' 'ots' | |||
(R1 delete) | |||
'acions' 'ada' 'ades' | |||
(R2 delete) | |||
'log{i'}a' 'log{i'}es''logia' 'logies' 'logi' 'logis' 'l{o'}gica' 'l{o'}gics' 'l{o'}giques' | |||
(R2 <- 'log') | |||
'ic' 'ica' 'ics' 'iques' | |||
(R2 <- 'ic') | |||
'qu{i'}ssim' 'qu{i'}ssims' 'qu{i'}ssimes' 'qu{i'}ssima' | |||
(R1 <- 'c') | |||
) | |||
) | |||
// verb_suffix: delete the longest listed ending if it lies in R1; 'ando' is | |||
// deleted only if it lies in R2. | |||
define verb_suffix as ( | |||
[substring] among( | |||
'ador' 'adora' 'adors' 'adores' 're' 'ie' | |||
'ent' 'ents' 'udes' 'ar{a`}' 'eren' | |||
'ar{a'}' 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' | |||
'aria' 'arian' 'arien' 'aries' 'ar{a`}s' | |||
'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ara' | |||
'ar{e'}' 'ar{e'}s' | |||
'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' | |||
'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' | |||
'er{e'}' 'er' 'erau' 'erass' | |||
'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' | |||
'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' | |||
'ir{e'}' '{i'}rem' '{i'}reu' '{i'}eu' | |||
'ia' 'ies' '{i'}em' '{i`}eu' 'ien' | |||
'at' 'ut' 'uda' 'ava' 'aves' 'avem' '{a'}vem' '{a`}vem' '{a`}veu' '{a'}veu' 'aven' 'au' 'ats' | |||
'asseu' 'esseu' 'eresseu' '{a`}sseu' '{a`}ssem' '{a`}ssim' '{a`}ssiu' | |||
'essen' 'esses' 'assen' 'asses' 'assim' 'assiu' | |||
'{e'}ssen' '{e'}sseu' '{e'}ssim' '{e'}ssiu' '{e'}ssem' | |||
'{i'}' 'ares' '{a`}rem' '{a`}reu' '{a`}ren' | |||
'ar{i'}em' 'ar{i'}eu' | |||
'areu' 'aren' 'ant' '{i"}m' '{i"}u' | |||
'{e'}s' '{i"}en' 'en' 'es' 'em' 'am' 'ams' '{i"}a' '{i"}es' | |||
'dre' 'eix' 'eixer' 'tzar' 'eixes' 'ides' '{i"}des' 'it' '{i"}t' '{i"}da' | |||
'aba' 'ada' 'ades' 'ida' '{i'}a' 'iera' 'ad' 'ed' 'its' | |||
'id' 'ids' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' | |||
'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' | |||
'ido' 'iendo' 'i{o'}' 'ar' 'ir' 'as' | |||
'ieu' 'ii' 'io' 'i{a`}' | |||
'ess' 'essin' 'essis' 'ass' 'assin' 'assis' 'essim' '{e`}ssim' '{e`}ssiu' | |||
'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' | |||
'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' | |||
'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' | |||
'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' 'ques' | |||
'{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' | |||
'ira' 'iran' 'irem' 'iren' 'ires' 'ireu' 'iria' 'irien' | |||
'iries' 'ir{a`}' 'ir{a`}s' 'ir{e`}' 'ir{i`}em' 'ir{i`}eu' | |||
'isquen' 'iguem' 'igueu' 'esqui' 'esquin' 'esquis' 'eixi' 'eixin' 'eixis' | |||
'eixen' 'eixo' 'isin' 'isis' 'esques' 'sis' 'sin' | |||
'int' 'ir{i'}em' 'ir{i'}eu' 'isc' 'atges' 'esca' 'esquen' | |||
'issen' 'isses' 'issin' 'issis' 'isca' 'issiu' 'issim' | |||
'{i"}sc' '{i"}sca' '{i"}ssin' '{i'}ssiu' '{i'}ssim' '{i"}ssis' '{i"}guem' '{i"}gueu' | |||
'{i"}ra' '{i"}ren' '{i"}res' | |||
'{i"}squen' '{i"}sques' '{i"}ssen' '{i"}sses' '{i"}xo' '{i"}xen' '{i"}xes' '{i"}x' | |||
'ixo' 'ixen' 'ixes' 'ix' 'ixa' 'inin' 'inis' 'ini' 'ineu' 'itza' 'itzi' 'itzeu' 'itzis' | |||
'itzo' 'itz' 'itz{a`}' 'arem' 'in' '{a`}s' 'i{i"}' 'i{i"}n' 'i{i"}s' | |||
(R1 delete) | |||
'ando' | |||
(R2 delete) | |||
) | |||
) | |||
// residual_suffix: delete the longest listed ending if it lies in R1; 'iqu' | |||
// in R1 is replaced by 'ic'. | |||
define residual_suffix as ( | |||
[substring] among( | |||
'os' 'a' 'o' '{a'}' '{a`}' '{i'}' '{o'}' 'e' '{e'}' 'eu' 'iu' | |||
'is' 'i' 'ir' 's' '{i`}' 'itz' '{i"}' '{i"}n' '{i"}s' 'it' | |||
(R1 delete) | |||
'iqu' | |||
(R1 <- 'ic') | |||
) | |||
) | |||
) | |||
// stem: external entry point of the Catalan stemmer. Marks the regions, then, | |||
// working backwards from the end of the word, tries attached_pronoun, then | |||
// standard_suffix (falling back to verb_suffix only if that fails), then | |||
// residual_suffix. Accents are stripped last, by cleaning, so the suffix | |||
// tables above are matched against the accented spelling. | |||
define stem as ( | |||
do mark_regions | |||
backwards ( | |||
do attached_pronoun | |||
do ( standard_suffix or | |||
verb_suffix | |||
) | |||
do residual_suffix | |||
) | |||
do cleaning | |||
) | |||
/* | |||
First works 2010/07/19 | |||
First Grammatical Reviews: https://ca.wikipedia.org/wiki/Gram%C3%A0tica_del_catal%C3%A0 | |||
Suffix list: https://ca.wikipedia.org/wiki/Llista_de_sufixos | |||
Irregular Verbs: https://ca.wikipedia.org/wiki/Flexi%C3%B3_verbal_del_catal%C3%A0 | |||
*/ |
@@ -12,15 +12,17 @@ strings ( ch ) | |||
integers ( p1 x ) | |||
groupings ( v s_ending ) | |||
groupings ( c v s_ending ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef ae hex 'E6' | |||
stringdef ao hex 'E5' | |||
stringdef o/ hex 'F8' | |||
stringdef ae '{U+00E6}' | |||
stringdef ao '{U+00E5}' | |||
stringdef o/ '{U+00F8}' | |||
define c 'bcdfghjklmnpqrstvwxz' | |||
define v 'aeiouy{ae}{ao}{o/}' | |||
@@ -73,7 +75,7 @@ backwardmode ( | |||
) | |||
) | |||
define undouble as ( | |||
setlimit tomark p1 for ([non-v] ->ch) | |||
setlimit tomark p1 for ([c] ->ch) | |||
ch | |||
delete | |||
) |
@@ -1,91 +0,0 @@ | |||
routines ( | |||
mark_regions | |||
main_suffix | |||
consonant_pair | |||
other_suffix | |||
undouble | |||
) | |||
externals ( stem ) | |||
strings ( ch ) | |||
integers ( p1 x ) | |||
groupings ( v s_ending ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef ae hex '91' | |||
stringdef ao hex '86' | |||
stringdef o/ hex '9B' | |||
define v 'aeiouy{ae}{ao}{o/}' | |||
define s_ending 'abcdfghjklmnoprtvyz{ao}' | |||
// mark_regions: compute p1, the start of the region in which suffixes may be | |||
// removed. p1 defaults to `limit`. x is marked three characters in from the | |||
// start; `hop 3` fails on shorter words, which makes the whole routine fail | |||
// and leaves p1 at its default. p1 is then set just after the first non-vowel | |||
// that follows a vowel, and raised to x if it would fall before x. | |||
define mark_regions as ( | |||
$p1 = limit | |||
test ( hop 3 setmark x ) | |||
goto v gopast non-v setmark p1 | |||
try ( $p1 < x $p1 = x ) | |||
) | |||
// Suffix routines; everything in this block runs in backward mode. Each | |||
// search is confined to the part of the word from p1 onwards by | |||
// `setlimit tomark p1`. | |||
backwardmode ( | |||
// main_suffix: delete the longest listed ending; a bare 's' is deleted only | |||
// when the character before it belongs to s_ending. | |||
define main_suffix as ( | |||
setlimit tomark p1 for ([substring]) | |||
among( | |||
'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' | |||
'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' | |||
'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' | |||
'erets' 'et' 'eret' | |||
(delete) | |||
's' | |||
(s_ending delete) | |||
) | |||
) | |||
// consonant_pair: if the text ends in 'gd', 'dt', 'gt' or 'kt' (within the | |||
// p1 region), delete its last character. The match is done under `test`, so | |||
// the cursor is back at the end before `next] delete` removes one character. | |||
define consonant_pair as ( | |||
test ( | |||
setlimit tomark p1 for ([substring]) | |||
among( | |||
'gd' // significant in the call from other_suffix | |||
'dt' 'gt' 'kt' | |||
) | |||
) | |||
next] delete | |||
) | |||
// other_suffix: first, delete a final 'st' when it is preceded by 'ig'. | |||
// Then delete 'ig', 'lig', 'elig' or 'els' and try consonant_pair on what | |||
// remains; 'l{o/}st' is replaced by 'l{o/}s'. | |||
define other_suffix as ( | |||
do ( ['st'] 'ig' delete ) | |||
setlimit tomark p1 for ([substring]) | |||
among( | |||
'ig' 'lig' 'elig' 'els' | |||
(delete do consonant_pair) | |||
'l{o/}st' | |||
(<-'l{o/}s') | |||
) | |||
) | |||
// undouble: take the final character into ch if it is a non-vowel inside the | |||
// p1 region, require the same character immediately before it, and then | |||
// delete the final one. | |||
define undouble as ( | |||
setlimit tomark p1 for ([non-v] ->ch) | |||
ch | |||
delete | |||
) | |||
) | |||
// stem: external entry point of this Danish stemmer. Marks the p1 region, | |||
// then, working backwards from the end of the word, applies main_suffix, | |||
// consonant_pair, other_suffix and undouble in that order. Each step is | |||
// wrapped in `do`, so a step that fails does not stop the ones after it. | |||
define stem as ( | |||
do mark_regions | |||
backwards ( | |||
do main_suffix | |||
do consonant_pair | |||
do other_suffix | |||
do undouble | |||
) | |||
) | |||
@@ -18,21 +18,21 @@ groupings ( v v_I v_j ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
/* special characters */ | |||
stringdef a" hex '84' | |||
stringdef e" hex '89' | |||
stringdef i" hex '8B' | |||
stringdef o" hex '94' | |||
stringdef u" hex '81' | |||
stringdef a" '{U+00E4}' | |||
stringdef e" '{U+00EB}' | |||
stringdef i" '{U+00EF}' | |||
stringdef o" '{U+00F6}' | |||
stringdef u" '{U+00FC}' | |||
stringdef a' hex 'A0' | |||
stringdef e' hex '82' | |||
stringdef i' hex 'A1' | |||
stringdef o' hex 'A2' | |||
stringdef u' hex 'A3' | |||
stringdef a' '{U+00E1}' | |||
stringdef e' '{U+00E9}' | |||
stringdef i' '{U+00ED}' | |||
stringdef o' '{U+00F3}' | |||
stringdef u' '{U+00FA}' | |||
stringdef e` hex '8A' | |||
stringdef e` '{U+00E8}' | |||
define v 'aeiouy{e`}' | |||
define v_I v + 'I' |
@@ -1,164 +0,0 @@ | |||
routines ( | |||
prelude postlude | |||
e_ending | |||
en_ending | |||
mark_regions | |||
R1 R2 | |||
undouble | |||
standard_suffix | |||
) | |||
externals ( stem ) | |||
booleans ( e_found ) | |||
integers ( p1 p2 ) | |||
groupings ( v v_I v_j ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
stringdef a" hex 'E4' | |||
stringdef e" hex 'EB' | |||
stringdef i" hex 'EF' | |||
stringdef o" hex 'F6' | |||
stringdef u" hex 'FC' | |||
stringdef a' hex 'E1' | |||
stringdef e' hex 'E9' | |||
stringdef i' hex 'ED' | |||
stringdef o' hex 'F3' | |||
stringdef u' hex 'FA' | |||
stringdef e` hex 'E8' | |||
define v 'aeiouy{e`}' | |||
define v_I v + 'I' | |||
define v_j v + 'j' | |||
// prelude: normalises the word before region marking. | |||
//  1. Under test (so the cursor returns to the start) walk the word and replace the | |||
//     {x"} and {x'} forms of a, e, i, o, u by the plain vowel; '' (next) skips any | |||
//     other character. | |||
//  2. A 'y' at the cursor (the start of the word) becomes 'Y'. | |||
//  3. Repeatedly: after a v character, 'i' followed by v becomes 'I', otherwise 'y' | |||
//     becomes 'Y'. | |||
define prelude as ( | |||
test repeat ( | |||
[substring] among( | |||
'{a"}' '{a'}' | |||
(<- 'a') | |||
'{e"}' '{e'}' | |||
(<- 'e') | |||
'{i"}' '{i'}' | |||
(<- 'i') | |||
'{o"}' '{o'}' | |||
(<- 'o') | |||
'{u"}' '{u'}' | |||
(<- 'u') | |||
'' (next) | |||
) //or next | |||
) | |||
try(['y'] <- 'Y') | |||
repeat goto ( | |||
v [('i'] v <- 'I') or | |||
('y'] <- 'Y') | |||
) | |||
) | |||
// mark_regions: sets p1 and p2. Both default to limit. p1 is marked after the first | |||
// non-v that follows a v, and raised to 3 if it is smaller; p2 is marked the same way | |||
// starting from where the p1 scan stopped. | |||
define mark_regions as ( | |||
$p1 = limit | |||
$p2 = limit | |||
gopast v gopast non-v setmark p1 | |||
try($p1 < 3 $p1 = 3) // at least 3 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
// postlude: turns the marker letters 'Y' and 'I' back into 'y' and 'i'; | |||
// '' (next) steps over every other character. | |||
define postlude as repeat ( | |||
[substring] among( | |||
'Y' (<- 'y') | |||
'I' (<- 'i') | |||
'' (next) | |||
) //or next | |||
) | |||
// Dutch suffix routines; all of them scan from the end of the word. | |||
backwardmode ( | |||
// R1 / R2: succeed when the cursor is at or after mark p1 / p2. | |||
define R1 as $p1 <= cursor | |||
define R2 as $p2 <= cursor | |||
// undouble: if the word ends in 'kk', 'dd' or 'tt' (tested without moving the cursor), | |||
// delete one character. | |||
define undouble as ( | |||
test among('kk' 'dd' 'tt') [next] delete | |||
) | |||
// e_ending: clears e_found, then deletes a final 'e' that is in R1 and preceded by a | |||
// non-v character; on success sets e_found and calls undouble. | |||
define e_ending as ( | |||
unset e_found | |||
['e'] R1 test non-v delete | |||
set e_found | |||
undouble | |||
) | |||
// en_ending: deletes the current slice when the cursor is in R1, a non-v precedes it | |||
// and the preceding text is not 'gem'; then calls undouble. The slice is set by the caller. | |||
define en_ending as ( | |||
R1 non-v and not 'gem' delete | |||
undouble | |||
) | |||
// standard_suffix: five independent passes, each wrapped in do (except e_ending, which | |||
// is called through do directly). | |||
define standard_suffix as ( | |||
// Pass 1: 'heden' -> 'heid' in R1; 'en'/'ene' handled by en_ending; 's'/'se' deleted | |||
// in R1 when preceded by a non-v_j character. | |||
do ( | |||
[substring] among( | |||
'heden' | |||
( R1 <- 'heid' | |||
) | |||
'en' 'ene' | |||
( en_ending | |||
) | |||
's' 'se' | |||
( R1 non-v_j delete | |||
) | |||
) | |||
) | |||
// Pass 2: final 'e'. | |||
do e_ending | |||
// Pass 3: delete 'heid' in R2 unless preceded by 'c', then try 'en' via en_ending. | |||
do ( ['heid'] R2 not 'c' delete | |||
['en'] en_ending | |||
) | |||
// Pass 4: derivational endings, all requiring R2. 'bar' is deleted only if e_found | |||
// is set. | |||
do ( | |||
[substring] among( | |||
'end' 'ing' | |||
( R2 delete | |||
(['ig'] R2 not 'e' delete) or undouble | |||
) | |||
'ig' | |||
( R2 not 'e' delete | |||
) | |||
'lijk' | |||
( R2 delete e_ending | |||
) | |||
'baar' | |||
( R2 delete | |||
) | |||
'bar' | |||
( R2 e_found delete | |||
) | |||
) | |||
) | |||
// Pass 5: for a word ending non-v_I, then one of 'aa' 'ee' 'oo' 'uu', then non-v | |||
// (reading backwards from the end), delete one character of the doubled vowel. | |||
do ( | |||
non-v_I | |||
test ( | |||
among ('aa' 'ee' 'oo' 'uu') | |||
non-v | |||
) | |||
[next] delete | |||
) | |||
) | |||
) | |||
// stem: external entry point. Runs prelude and mark_regions forwards, switches to | |||
// backward processing for standard_suffix, then runs postlude. Every step is wrapped | |||
// in do, so a failing step does not abort the rest. | |||
define stem as ( | |||
do prelude | |||
do mark_regions | |||
backwards | |||
do standard_suffix | |||
do postlude | |||
) |
@@ -24,16 +24,17 @@ externals ( stem ) | |||
integers ( p1 p2 ) | |||
strings ( x ) | |||
booleans ( ending_removed ) | |||
groupings ( AEI V1 V2 particle_end ) | |||
groupings ( AEI C V1 V2 particle_end ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef a" hex 'E4' | |||
stringdef o" hex 'F6' | |||
stringdef a" '{U+00E4}' | |||
stringdef o" '{U+00F6}' | |||
define AEI 'a{a"}ei' | |||
define C 'bcdfghjklmnpqrstvwxz' | |||
define V1 'aeiouy{a"}{o"}' | |||
define V2 'aeiou{a"}{o"}' | |||
define particle_end V1 + 'nt' | |||
@@ -116,7 +117,7 @@ backwardmode ( | |||
) | |||
'a' '{a"}' //-. | |||
(V1 non-V1) // | | |||
(V1 C) // | | |||
'tta' 'tt{a"}' // Partitive [32] | |||
('e') // | | |||
'ta' 't{a"}' //-' | |||
@@ -172,11 +173,11 @@ backwardmode ( | |||
define tidy as ( | |||
setlimit tomark p1 for ( | |||
do ( LONG and ([next] delete ) ) // undouble vowel | |||
do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i | |||
do ( [AEI] C delete ) // remove trailing a, a", e, i | |||
do ( ['j'] 'o' or 'u' delete ) | |||
do ( ['o'] 'j' delete ) | |||
) | |||
goto non-V1 [next] -> x x delete // undouble consonant | |||
goto non-V1 [C] -> x x delete // undouble consonant | |||
) | |||
) | |||
@@ -17,21 +17,21 @@ groupings ( v keep_with_s ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
stringdef a^ hex 'E2' // a-circumflex | |||
stringdef a` hex 'E0' // a-grave | |||
stringdef c, hex 'E7' // c-cedilla | |||
stringdef e" hex 'EB' // e-diaeresis (rare) | |||
stringdef e' hex 'E9' // e-acute | |||
stringdef e^ hex 'EA' // e-circumflex | |||
stringdef e` hex 'E8' // e-grave | |||
stringdef i" hex 'EF' // i-diaeresis | |||
stringdef i^ hex 'EE' // i-circumflex | |||
stringdef o^ hex 'F4' // o-circumflex | |||
stringdef u^ hex 'FB' // u-circumflex | |||
stringdef u` hex 'F9' // u-grave | |||
/* special characters */ | |||
stringdef a^ '{U+00E2}' // a-circumflex | |||
stringdef a` '{U+00E0}' // a-grave | |||
stringdef c, '{U+00E7}' // c-cedilla | |||
stringdef e" '{U+00EB}' // e-diaeresis (rare) | |||
stringdef e' '{U+00E9}' // e-acute | |||
stringdef e^ '{U+00EA}' // e-circumflex | |||
stringdef e` '{U+00E8}' // e-grave | |||
stringdef i" '{U+00EF}' // i-diaeresis | |||
stringdef i^ '{U+00EE}' // i-circumflex | |||
stringdef o^ '{U+00F4}' // o-circumflex | |||
stringdef u^ '{U+00FB}' // u-circumflex | |||
stringdef u` '{U+00F9}' // u-grave | |||
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' | |||
@@ -42,6 +42,10 @@ define prelude as repeat goto ( | |||
('y' ] <- 'Y') | |||
) | |||
or | |||
( [ '{e"}' ] <- 'He' ) | |||
or | |||
( [ '{i"}' ] <- 'Hi' ) | |||
or | |||
( ['y'] v <- 'Y' ) | |||
or | |||
( 'q' ['u'] <- 'U' ) | |||
@@ -78,6 +82,9 @@ define postlude as repeat ( | |||
'I' (<- 'i') | |||
'U' (<- 'u') | |||
'Y' (<- 'y') | |||
'He' (<- '{e"}') | |||
'Hi' (<- '{i"}') | |||
'H' (delete) | |||
'' (next) | |||
) | |||
) | |||
@@ -167,7 +174,7 @@ backwardmode ( | |||
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' | |||
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' | |||
'issez' 'issiez' 'issions' 'issons' 'it' | |||
(non-v delete) | |||
(not 'H' non-v delete) | |||
) | |||
) | |||
@@ -196,14 +203,13 @@ backwardmode ( | |||
define keep_with_s 'aiou{e`}s' | |||
define residual_suffix as ( | |||
try(['s'] test non-keep_with_s delete) | |||
try(['s'] test ('Hi' or non-keep_with_s) delete) | |||
setlimit tomark pV for ( | |||
[substring] among( | |||
'ion' (R2 's' or 't' delete) | |||
'ier' 'i{e`}re' | |||
'Ier' 'I{e`}re' (<-'i') | |||
'e' (delete) | |||
'{e"}' ('gu' delete) | |||
) | |||
) | |||
) |
@@ -1,239 +0,0 @@ | |||
routines ( | |||
prelude postlude mark_regions | |||
RV R1 R2 | |||
standard_suffix | |||
i_verb_suffix | |||
verb_suffix | |||
residual_suffix | |||
un_double | |||
un_accent | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v keep_with_s ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef a^ hex '83' // a-circumflex | |||
stringdef a` hex '85' // a-grave | |||
stringdef c, hex '87' // c-cedilla | |||
stringdef e" hex '89' // e-diaeresis (rare) | |||
stringdef e' hex '82' // e-acute | |||
stringdef e^ hex '88' // e-circumflex | |||
stringdef e` hex '8A' // e-grave | |||
stringdef i" hex '8B' // i-diaeresis | |||
stringdef i^ hex '8C' // i-circumflex | |||
stringdef o^ hex '93' // o-circumflex | |||
stringdef u^ hex '96' // u-circumflex | |||
stringdef u` hex '97' // u-grave | |||
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' | |||
// prelude: marks letters that should not be treated as vowels by upper-casing them. | |||
// After a v character: 'u' or 'i' followed by v becomes 'U' / 'I', otherwise 'y' | |||
// becomes 'Y'. Failing that: 'y' followed by v becomes 'Y', or 'u' after 'q' becomes 'U'. | |||
define prelude as repeat goto ( | |||
( v [ ('u' ] v <- 'U') or | |||
('i' ] v <- 'I') or | |||
('y' ] <- 'Y') | |||
) | |||
or | |||
( ['y'] v <- 'Y' ) | |||
or | |||
( 'q' ['u'] <- 'U' ) | |||
) | |||
// mark_regions: sets pV, p1 and p2, all defaulting to limit. | |||
// pV: if the word starts with two v characters, the mark goes after the third | |||
//     character; otherwise skip the first character and mark after the next v. | |||
// p1: after the first non-v that follows a v; p2: the same rule applied again from p1. | |||
// Each group is wrapped in do, so a failed scan leaves the defaults in place. | |||
define mark_regions as ( | |||
$pV = limit | |||
$p1 = limit | |||
$p2 = limit // defaults | |||
do ( | |||
( v v next ) or ( next gopast v ) | |||
setmark pV | |||
) | |||
do ( | |||
gopast v gopast non-v setmark p1 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
// postlude: lower-cases the marker letters 'I', 'U' and 'Y' again; | |||
// '' (next) steps over every other character. | |||
define postlude as repeat ( | |||
[substring] among( | |||
'I' (<- 'i') | |||
'U' (<- 'u') | |||
'Y' (<- 'y') | |||
'' (next) | |||
) | |||
) | |||
// French suffix routines; all of them scan from the end of the word. | |||
backwardmode ( | |||
// RV / R1 / R2: succeed when the cursor is at or after mark pV / p1 / p2. | |||
define RV as $pV <= cursor | |||
define R1 as $p1 <= cursor | |||
define R2 as $p2 <= cursor | |||
// standard_suffix: takes the longest listed ending as the slice and applies the action | |||
// of its group (delete or replace, guarded by RV/R1/R2). The last three groups end in | |||
// fail(...), so the routine signals failure even though the edit inside fail is done. | |||
define standard_suffix as ( | |||
[substring] among( | |||
'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' | |||
'ances' 'iqUes' 'ismes' 'ables' 'istes' | |||
( R2 delete ) | |||
'atrice' 'ateur' 'ation' | |||
'atrices' 'ateurs' 'ations' | |||
( R2 delete | |||
try ( ['ic'] (R2 delete) or <-'iqU' ) | |||
) | |||
'logie' | |||
'logies' | |||
( R2 <- 'log' ) | |||
'usion' 'ution' | |||
'usions' 'utions' | |||
( R2 <- 'u' ) | |||
'ence' | |||
'ences' | |||
( R2 <- 'ent' ) | |||
'ement' | |||
'ements' | |||
( | |||
RV delete | |||
try ( | |||
[substring] among( | |||
'iv' (R2 delete ['at'] R2 delete) | |||
'eus' ((R2 delete) or (R1<-'eux')) | |||
'abl' 'iqU' | |||
(R2 delete) | |||
'i{e`}r' 'I{e`}r' //) | |||
(RV <-'i') //)--new 2 Sept 02 | |||
) | |||
) | |||
) | |||
'it{e'}' | |||
'it{e'}s' | |||
( | |||
R2 delete | |||
try ( | |||
[substring] among( | |||
'abil' ((R2 delete) or <-'abl') | |||
'ic' ((R2 delete) or <-'iqU') | |||
'iv' (R2 delete) | |||
) | |||
) | |||
) | |||
'if' 'ive' | |||
'ifs' 'ives' | |||
( | |||
R2 delete | |||
try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) | |||
) | |||
'eaux' (<- 'eau') | |||
'aux' (R1 <- 'al') | |||
'euse' | |||
'euses'((R2 delete) or (R1<-'eux')) | |||
'issement' | |||
'issements'(R1 non-v delete) // verbal | |||
// fail(...) below forces entry to verb_suffix. -ment typically | |||
// follows the p.p., e.g 'confus{e'}ment'. | |||
'amment' (RV fail(<- 'ant')) | |||
'emment' (RV fail(<- 'ent')) | |||
'ment' | |||
'ments' (test(v RV) fail(delete)) | |||
// v is e,i,u,{e'},I or U | |||
) | |||
) | |||
// i_verb_suffix: with the search limited by pV, deletes the longest listed ending | |||
// when it is preceded by a non-v character. | |||
define i_verb_suffix as setlimit tomark pV for ( | |||
[substring] among ( | |||
'{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' | |||
'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' | |||
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' | |||
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' | |||
'issez' 'issiez' 'issions' 'issons' 'it' | |||
(non-v delete) | |||
) | |||
) | |||
// verb_suffix: with the search limited by pV. 'ions' is deleted only in R2; the second | |||
// group is deleted unconditionally; the third group is deleted and a preceding 'e' is | |||
// then removed as well if present. | |||
define verb_suffix as setlimit tomark pV for ( | |||
[substring] among ( | |||
'ions' | |||
(R2 delete) | |||
'{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' | |||
'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' | |||
'erons' 'eront' 'ez' 'iez' | |||
// 'ons' //-best omitted | |||
(delete) | |||
'{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' | |||
'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' | |||
'assions' | |||
(delete | |||
try(['e'] delete) | |||
) | |||
) | |||
) | |||
// keep_with_s: characters after which a final 's' is kept by residual_suffix. | |||
define keep_with_s 'aiou{e`}s' | |||
// residual_suffix: deletes a final 's' unless the character before it is in | |||
// keep_with_s; then, limited by pV, handles 'ion' (deleted in R2 after 's' or 't'), | |||
// the 'ier' family (replaced by 'i'), 'e' (deleted) and '{e"}' (deleted after 'gu'). | |||
define residual_suffix as ( | |||
try(['s'] test non-keep_with_s delete) | |||
setlimit tomark pV for ( | |||
[substring] among( | |||
'ion' (R2 's' or 't' delete) | |||
'ier' 'i{e`}re' | |||
'Ier' 'I{e`}re' (<-'i') | |||
'e' (delete) | |||
'{e"}' ('gu' delete) | |||
) | |||
) | |||
) | |||
// un_double: if the word ends in one of the listed strings (tested without moving the | |||
// cursor), delete one character. | |||
define un_double as ( | |||
test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete | |||
) | |||
// un_accent: after skipping at least one non-v character, replace '{e'}' or '{e`}' | |||
// by 'e'. | |||
define un_accent as ( | |||
atleast 1 non-v | |||
[ '{e'}' or '{e`}' ] <-'e' | |||
) | |||
) | |||
// stem: external entry point. | |||
// Forward: prelude, mark_regions. Backward: try standard_suffix, then i_verb_suffix, | |||
// then verb_suffix; if one succeeds, additionally turn a final 'Y' into 'i' or a final | |||
// '{c,}' into 'c'. If none succeeds, run residual_suffix instead. Afterwards un_double | |||
// and un_accent are attempted, and finally postlude runs in forward mode. | |||
define stem as ( | |||
do prelude | |||
do mark_regions | |||
backwards ( | |||
do ( | |||
( | |||
( standard_suffix or | |||
i_verb_suffix or | |||
verb_suffix | |||
) | |||
and | |||
try( [ ('Y' ] <- 'i' ) or | |||
('{c,}'] <- 'c' ) | |||
) | |||
) or | |||
residual_suffix | |||
) | |||
// try(['ent'] RV delete) // is best omitted | |||
do un_double | |||
do un_accent | |||
) | |||
do postlude | |||
) | |||
@@ -18,12 +18,12 @@ groupings ( v s_ending st_ending ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef a" hex 'E4' | |||
stringdef o" hex 'F6' | |||
stringdef u" hex 'FC' | |||
stringdef ss hex 'DF' | |||
stringdef a" '{U+00E4}' | |||
stringdef o" '{U+00F6}' | |||
stringdef u" '{U+00FC}' | |||
stringdef ss '{U+00DF}' | |||
define v 'aeiouy{a"}{o"}{u"}' | |||
@@ -1,139 +0,0 @@ | |||
/* | |||
Extra rule for -nisse ending added 11 Dec 2009 | |||
*/ | |||
routines ( | |||
prelude postlude | |||
mark_regions | |||
R1 R2 | |||
standard_suffix | |||
) | |||
externals ( stem ) | |||
integers ( p1 p2 x ) | |||
groupings ( v s_ending st_ending ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef a" hex '84' | |||
stringdef o" hex '94' | |||
stringdef u" hex '81' | |||
stringdef ss hex 'E1' | |||
define v 'aeiouy{a"}{o"}{u"}' | |||
define s_ending 'bdfghklmnrt' | |||
define st_ending s_ending - 'r' | |||
// prelude: first, under test (cursor restored afterwards), replaces every '{ss}' by | |||
// 'ss', stepping over other characters. Then, repeatedly, after a v character: 'u' | |||
// followed by v becomes 'U', otherwise 'y' followed by v becomes 'Y'. | |||
define prelude as ( | |||
test repeat ( | |||
( | |||
['{ss}'] <- 'ss' | |||
) or next | |||
) | |||
repeat goto ( | |||
v [('u'] v <- 'U') or | |||
('y'] v <- 'Y') | |||
) | |||
) | |||
// mark_regions: sets p1 and p2, both defaulting to limit. x is set to the position | |||
// three characters in (hop 3 under test, so the cursor is not moved). p1 is marked after | |||
// the first non-v following a v and raised to x if smaller; p2 uses the same rule from p1. | |||
define mark_regions as ( | |||
$p1 = limit | |||
$p2 = limit | |||
test(hop 3 setmark x) | |||
gopast v gopast non-v setmark p1 | |||
try($p1 < x $p1 = x) // at least 3 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
// postlude: lower-cases the markers 'Y' and 'U' and replaces '{a"}', '{o"}', '{u"}' | |||
// by 'a', 'o', 'u'; '' (next) steps over every other character. | |||
define postlude as repeat ( | |||
[substring] among( | |||
'Y' (<- 'y') | |||
'U' (<- 'u') | |||
'{a"}' (<- 'a') | |||
'{o"}' (<- 'o') | |||
'{u"}' (<- 'u') | |||
'' (next) | |||
) | |||
) | |||
// German suffix routines; all of them scan from the end of the word. | |||
backwardmode ( | |||
// R1 / R2: succeed when the cursor is at or after mark p1 / p2. | |||
define R1 as $p1 <= cursor | |||
define R2 as $p2 <= cursor | |||
// standard_suffix: three independent passes, each wrapped in do. | |||
define standard_suffix as ( | |||
// Pass 1 (ending must be in R1): 'em' 'ern' 'er' are deleted; 'e' 'en' 'es' are | |||
// deleted and a following final 's' preceded by 'nis' is deleted too; 's' is deleted | |||
// only after a character in s_ending. | |||
do ( | |||
[substring] R1 among( | |||
'em' 'ern' 'er' | |||
( delete | |||
) | |||
'e' 'en' 'es' | |||
( delete | |||
try (['s'] 'nis' delete) | |||
) | |||
's' | |||
( s_ending delete | |||
) | |||
) | |||
) | |||
// Pass 2 (ending must be in R1): 'en' 'er' 'est' are deleted; 'st' is deleted only | |||
// after a character in st_ending and only if hop 3 succeeds from there. | |||
do ( | |||
[substring] R1 among( | |||
'en' 'er' 'est' | |||
( delete | |||
) | |||
'st' | |||
( st_ending hop 3 delete | |||
) | |||
) | |||
) | |||
// Pass 3 (ending must be in R2): derivational endings, some followed by a second, | |||
// optional removal inside try. | |||
do ( | |||
[substring] R2 among( | |||
'end' 'ung' | |||
( delete | |||
try (['ig'] not 'e' R2 delete) | |||
) | |||
'ig' 'ik' 'isch' | |||
( not 'e' delete | |||
) | |||
'lich' 'heit' | |||
( delete | |||
try ( | |||
['er' or 'en'] R1 delete | |||
) | |||
) | |||
'keit' | |||
( delete | |||
try ( | |||
[substring] R2 among( | |||
'lich' 'ig' | |||
( delete | |||
) | |||
) | |||
) | |||
) | |||
) | |||
) | |||
) | |||
) | |||
// stem: external entry point. Runs prelude and mark_regions forwards, switches to | |||
// backward processing for standard_suffix, then runs postlude. Every step is wrapped | |||
// in do, so a failing step does not abort the rest. | |||
define stem as ( | |||
do prelude | |||
do mark_regions | |||
backwards | |||
do standard_suffix | |||
do postlude | |||
) |
@@ -18,12 +18,12 @@ groupings ( v s_ending st_ending ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef a" hex 'E4' | |||
stringdef o" hex 'F6' | |||
stringdef u" hex 'FC' | |||
stringdef ss hex 'DF' | |||
stringdef a" '{U+00E4}' | |||
stringdef o" '{U+00F6}' | |||
stringdef u" '{U+00FC}' | |||
stringdef ss '{U+00DF}' | |||
define v 'aeiouy{a"}{o"}{u"}' | |||
@@ -0,0 +1,706 @@ | |||
// A stemmer for Modern Greek language, based on: | |||
// | |||
// Ntais, Georgios. Development of a Stemmer for the Greek | |||
// Language. Diss. Royal Institute of Technology, 2006. | |||
// https://sais.se/mthprize/2007/ntais2007.pdf | |||
// | |||
// Saroukos, Spyridon. Enhancing a Greek language stemmer. | |||
// University of Tampere, 2008. | |||
// https://tampub.uta.fi/bitstream/handle/10024/80480/gradu03463.pdf | |||
stringescapes {} | |||
stringdef a '{U+03B1}' // alpha | |||
stringdef v '{U+03B2}' // beta | |||
stringdef g '{U+03B3}' // gamma | |||
stringdef d '{U+03B4}' // delta | |||
stringdef e '{U+03B5}' // epsilon | |||
stringdef z '{U+03B6}' // zeta | |||
stringdef i '{U+03B7}' // eta | |||
stringdef th '{U+03B8}' // theta | |||
stringdef y '{U+03B9}' // iota | |||
stringdef k '{U+03BA}' // kappa | |||
stringdef l '{U+03BB}' // lamda | |||
stringdef m '{U+03BC}' // mu | |||
stringdef n '{U+03BD}' // nu | |||
stringdef x '{U+03BE}' // xi | |||
stringdef o '{U+03BF}' // omicron | |||
stringdef p '{U+03C0}' // pi | |||
stringdef r '{U+03C1}' // rho | |||
stringdef ss '{U+03C2}' // sigma final | |||
stringdef s '{U+03C3}' // sigma | |||
stringdef t '{U+03C4}' // tau | |||
stringdef u '{U+03C5}' // upsilon | |||
stringdef f '{U+03C6}' // phi | |||
stringdef ch '{U+03C7}' // chi | |||
stringdef ps '{U+03C8}' // psi | |||
stringdef oo '{U+03C9}' // omega | |||
stringdef A '{U+0391}' // Alpha | |||
stringdef V '{U+0392}' // Beta | |||
stringdef G '{U+0393}' // Gamma | |||
stringdef D '{U+0394}' // Delta | |||
stringdef E '{U+0395}' // Epsilon | |||
stringdef Z '{U+0396}' // Zeta | |||
stringdef I '{U+0397}' // Eta | |||
stringdef Th '{U+0398}' // Theta | |||
stringdef Y '{U+0399}' // Iota | |||
stringdef K '{U+039A}' // Kappa | |||
stringdef L '{U+039B}' // Lamda | |||
stringdef M '{U+039C}' // Mu | |||
stringdef N '{U+039D}' // Nu | |||
stringdef X '{U+039E}' // Xi | |||
stringdef O '{U+039F}' // Omicron | |||
stringdef P '{U+03A0}' // Pi | |||
stringdef R '{U+03A1}' // Rho | |||
stringdef S '{U+03A3}' // Sigma | |||
stringdef T '{U+03A4}' // Tau | |||
stringdef U '{U+03A5}' // Upsilon | |||
stringdef F '{U+03A6}' // Phi | |||
stringdef Ch '{U+03A7}' // Chi | |||
stringdef Ps '{U+03A8}' // Psi | |||
stringdef Oo '{U+03A9}' // Omega | |||
stringdef Y: '{U+03AA}' // Iota with dialytika | |||
stringdef U: '{U+03AB}' // Upsilon with dialytika | |||
stringdef a' '{U+03AC}' // alpha with tonos | |||
stringdef e' '{U+03AD}' // epsilon with tonos | |||
stringdef i' '{U+03AE}' // eta with tonos | |||
stringdef y' '{U+03AF}' // iota with tonos | |||
stringdef o' '{U+03CC}' // omicron with tonos | |||
stringdef u' '{U+03CD}' // upsilon with tonos | |||
stringdef oo' '{U+03CE}' // omega with tonos | |||
stringdef i:' '{U+0390}' // iota with dialytika and tonos | |||
stringdef u:' '{U+03B0}' // upsilon with dialytika and tonos | |||
stringdef i: '{U+03CA}' // iota with dialytika | |||
stringdef u: '{U+03CB}' // upsilon with dialytika | |||
stringdef A' '{U+0386}' // Alpha with tonos | |||
stringdef E' '{U+0388}' // Epsilon with tonos | |||
stringdef I' '{U+0389}' // Eta with tonos | |||
stringdef Y' '{U+038A}' // Iota with tonos | |||
stringdef O' '{U+038C}' // Omicron with tonos | |||
stringdef U' '{U+038E}' // Upsilon with tonos | |||
stringdef OO' '{U+038F}' // Omega with tonos | |||
externals ( stem ) | |||
booleans ( test1 ) | |||
groupings ( v v2 ) | |||
routines ( tolower has_min_length | |||
steps1 steps2 steps3 steps4 steps5 steps6 steps7 | |||
steps8 steps9 steps10 | |||
step1 step2a step2b step2c step2d step3 step4 | |||
step5a step5b step5c step5d step5e step5f | |||
step5g step5h step5i | |||
step5j step5k step5l step5m | |||
step6 step7 ) | |||
define v '{a}{e}{i}{y}{o}{u}{oo}' | |||
define v2 '{a}{e}{i}{y}{o}{oo}' | |||
backwardmode ( | |||
// has_min_length: succeeds only when len is at least 3. | |||
define has_min_length as ( | |||
$(len >= 3) | |||
) | |||
// tolower: walks the word (backwards, since this is inside backwardmode) and rewrites | |||
// each listed character: capital letters to their lower-case stringdef, letters with | |||
// tonos and/or dialytika to the plain lower-case letter, and '{ss}' (final sigma) to | |||
// '{s}'. '' (next) steps over any character that is not listed. | |||
define tolower as ( | |||
repeat ( | |||
[substring] among ( | |||
'{A}' (<- '{a}') | |||
'{V}' (<- '{v}') | |||
'{G}' (<- '{g}') | |||
'{D}' (<- '{d}') | |||
'{E}' (<- '{e}') | |||
'{Z}' (<- '{z}') | |||
'{I}' (<- '{i}') | |||
'{Th}' (<- '{th}') | |||
'{Y}' (<- '{y}') | |||
'{K}' (<- '{k}') | |||
'{L}' (<- '{l}') | |||
'{M}' (<- '{m}') | |||
'{N}' (<- '{n}') | |||
'{X}' (<- '{x}') | |||
'{O}' (<- '{o}') | |||
'{P}' (<- '{p}') | |||
'{R}' (<- '{r}') | |||
'{S}' (<- '{s}') | |||
'{T}' (<- '{t}') | |||
'{U}' (<- '{u}') | |||
'{F}' (<- '{f}') | |||
'{Ch}' (<- '{ch}') | |||
'{Ps}' (<- '{ps}') | |||
'{Oo}' (<- '{oo}') | |||
'{Y:}' (<- '{y}') | |||
'{U:}' (<- '{u}') | |||
'{a'}' (<- '{a}') | |||
'{e'}' (<- '{e}') | |||
'{i'}' (<- '{i}') | |||
'{y'}' (<- '{y}') | |||
'{o'}' (<- '{o}') | |||
'{u'}' (<- '{u}') | |||
'{oo'}' (<- '{oo}') | |||
'{i:'}' (<- '{i}') | |||
'{u:'}' (<- '{u}') | |||
'{i:}' (<- '{i}') | |||
'{u:}' (<- '{u}') | |||
'{A'}' (<- '{a}') | |||
'{E'}' (<- '{e}') | |||
'{I'}' (<- '{i}') | |||
'{Y'}' (<- '{y}') | |||
'{O'}' (<- '{o}') | |||
'{U'}' (<- '{u}') | |||
'{OO'}' (<- '{oo}') | |||
'{ss}' (<- '{s}') | |||
'' (next) | |||
) | |||
) | |||
) | |||
// step1: replaces the longest listed word ending by the shortened form given for its | |||
// group (each line lists the inflected forms followed by the replacement). When a | |||
// replacement was made, test1 is cleared. | |||
define step1 as ( | |||
[substring] among ( | |||
'{f}{a}{g}{y}{a}' '{f}{a}{g}{y}{o}{u}' '{f}{a}{g}{y}{oo}{n}' (<- '{f}{a}') | |||
'{s}{k}{a}{g}{y}{a}' '{s}{k}{a}{g}{y}{o}{u}' '{s}{k}{a}{g}{y}{oo}{n}' (<- '{s}{k}{a}') | |||
'{o}{l}{o}{g}{y}{o}{u}' '{o}{l}{o}{g}{y}{a}' '{o}{l}{o}{g}{y}{oo}{n}' (<- '{o}{l}{o}') | |||
'{s}{o}{g}{y}{o}{u}' '{s}{o}{g}{y}{a}' '{s}{o}{g}{y}{oo}{n}' (<- '{s}{o}') | |||
'{t}{a}{t}{o}{g}{y}{a}' '{t}{a}{t}{o}{g}{y}{o}{u}' '{t}{a}{t}{o}{g}{y}{oo}{n}' (<- '{t}{a}{t}{o}') | |||
'{k}{r}{e}{a}{s}' '{k}{r}{e}{a}{t}{o}{s}' '{k}{r}{e}{a}{t}{a}' '{k}{r}{e}{a}{t}{oo}{n}' (<- '{k}{r}{e}') | |||
'{p}{e}{r}{a}{s}' '{p}{e}{r}{a}{t}{o}{s}' '{p}{e}{r}{a}{t}{i}' '{p}{e}{r}{a}{t}{a}' '{p}{e}{r}{a}{t}{oo}{n}' (<- '{p}{e}{r}') | |||
'{t}{e}{r}{a}{s}' '{t}{e}{r}{a}{t}{o}{s}' '{t}{e}{r}{a}{t}{a}' '{t}{e}{r}{a}{t}{oo}{n}' (<- '{t}{e}{r}') | |||
'{f}{oo}{s}' '{f}{oo}{t}{o}{s}' '{f}{oo}{t}{a}' '{f}{oo}{t}{oo}{n}' (<- '{f}{oo}') | |||
'{k}{a}{th}{e}{s}{t}{oo}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{o}{s}' '{k}{a}{th}{e}{s}{t}{oo}{t}{a}' '{k}{a}{th}{e}{s}{t}{oo}{t}{oo}{n}' (<- '{k}{a}{th}{e}{s}{t}') | |||
'{g}{e}{g}{o}{n}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{o}{s}' '{g}{e}{g}{o}{n}{o}{t}{a}' '{g}{e}{g}{o}{n}{o}{t}{oo}{n}' (<- '{g}{e}{g}{o}{n}') | |||
) | |||
unset test1 | |||
) | |||
// steps1: deletes the longest listed '{y}{z}...' ending and clears test1. Then, if the | |||
// whole remaining word (atlimit) is one of the first list, '{y}' is put back; otherwise, | |||
// if it is one of the second list, '{y}{z}' is put back. The [] marks an empty slice at | |||
// the cursor, so <- inserts there instead of overwriting text. | |||
define steps1 as ( | |||
[substring] among ( | |||
'{y}{z}{a}' '{y}{z}{e}{s}' '{y}{z}{e}' '{y}{z}{a}{m}{e}' '{y}{z}{a}{t}{e}' '{y}{z}{a}{n}' '{y}{z}{a}{n}{e}' '{y}{z}{oo}' '{y}{z}{e}{y}{s}' '{y}{z}{e}{y}' | |||
'{y}{z}{o}{u}{m}{e}' '{y}{z}{e}{t}{e}' '{y}{z}{o}{u}{n}' '{y}{z}{o}{u}{n}{e}' ( | |||
delete | |||
unset test1 | |||
([] substring atlimit among ( | |||
'{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{p}{a}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' '{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' | |||
(<- '{y}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{m}{a}{r}{k}' '{k}{o}{r}{n}' '{a}{m}{p}{a}{r}' '{a}{r}{r}' '{v}{a}{th}{u}{r}{y}' '{v}{a}{r}{k}' '{v}' '{v}{o}{l}{v}{o}{r}' '{g}{k}{r}' | |||
'{g}{l}{u}{k}{o}{r}' '{g}{l}{u}{k}{u}{r}' '{y}{m}{p}' '{l}' '{l}{o}{u}' '{m}{a}{r}' '{m}' '{p}{r}' '{m}{p}{r}' '{p}{o}{l}{u}{r}' '{p}' | |||
'{r}' '{p}{y}{p}{e}{r}{o}{r}' | |||
(<- '{y}{z}') | |||
)) | |||
) | |||
) | |||
) | |||
// steps2: deletes the longest listed '{oo}{th}{i}{k}...' ending and clears test1; if the | |||
// whole remaining word (atlimit) is one of the listed stems, '{oo}{n}' is inserted at | |||
// the empty slice marked by []. | |||
define steps2 as ( | |||
[substring] among ( | |||
'{oo}{th}{i}{k}{a}' '{oo}{th}{i}{k}{e}{s}' '{oo}{th}{i}{k}{e}' '{oo}{th}{i}{k}{a}{m}{e}' '{oo}{th}{i}{k}{a}{t}{e}' '{oo}{th}{i}{k}{a}{n}' '{oo}{th}{i}{k}{a}{n}{e}' ( | |||
delete | |||
unset test1 | |||
[] substring atlimit among ( | |||
'{a}{l}' '{v}{y}' '{e}{n}' '{u}{ps}' '{l}{y}' '{z}{oo}' '{s}' '{ch}' (<- '{oo}{n}') | |||
) | |||
) | |||
) | |||
) | |||
// steps3: deletes the longest listed '{y}{s}...' ending and clears test1, then tries | |||
// three alternatives in order: the '{y}{s}{a}' special case, a list of whole-word stems | |||
// that get '{y}' put back, and a list of whole-word stems that get '{y}{s}' put back. | |||
define steps3 as ( | |||
[substring] among ( | |||
'{y}{s}{a}' '{y}{s}{e}{s}' '{y}{s}{e}' '{y}{s}{a}{m}{e}' '{y}{s}{a}{t}{e}' '{y}{s}{a}{n}' '{y}{s}{a}{n}{e}' ( | |||
delete | |||
unset test1 | |||
// NOTE(review): the '{y}{s}{a}' test below is not wrapped in [ ], so <- acts on | |||
// whatever slice is current at this point rather than on the matched text - confirm | |||
// this is intended. | |||
('{y}{s}{a}' atlimit <- '{y}{s}') or | |||
([] substring atlimit among ( | |||
'{a}{n}{a}{m}{p}{a}' '{a}{th}{r}{o}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' | |||
'{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' | |||
'{a}{p}{e}{k}{l}{e}' '{e}{k}{l}{e}' '{p}{e}' | |||
(<- '{y}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{a}{n}' '{a}{f}' '{g}{e}' '{g}{y}{g}{a}{n}{t}{o}{a}{f}' '{g}{k}{e}' '{d}{i}{m}{o}{k}{r}{a}{t}' '{k}{o}{m}' '{g}{k}' '{m}' '{p}' | |||
'{p}{o}{u}{k}{a}{m}' '{o}{l}{o}' '{l}{a}{r}' | |||
(<- '{y}{s}') | |||
)) | |||
) | |||
) | |||
) | |||
// steps4: deletes the longest listed '{y}{s}...' ending and clears test1; if the whole | |||
// remaining word (atlimit) is one of the listed stems, '{y}' is inserted at the empty | |||
// slice marked by []. | |||
define steps4 as ( | |||
[substring] among ( | |||
'{y}{s}{oo}' '{y}{s}{e}{y}{s}' '{y}{s}{e}{y}' '{y}{s}{o}{u}{m}{e}' '{y}{s}{e}{t}{e}' '{y}{s}{o}{u}{n}' '{y}{s}{o}{u}{n}{e}' ( | |||
delete | |||
unset test1 | |||
[] substring atlimit among ( | |||
'{a}{n}{a}{m}{p}{a}' '{e}{m}{p}{a}' '{e}{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{e}{p}{a}' '{x}{a}{n}{a}{p}{a}' '{e}{p}{e}' '{p}{e}{r}{y}{p}{a}' '{a}{th}{r}{o}' | |||
'{s}{u}{n}{a}{th}{r}{o}' '{d}{a}{n}{e}' '{k}{l}{e}' '{ch}{a}{r}{t}{o}{p}{a}' '{e}{x}{a}{r}{ch}{a}' '{m}{e}{t}{e}{p}{e}' '{a}{p}{o}{k}{l}{e}' '{a}{p}{e}{k}{l}{e}' | |||
'{e}{k}{l}{e}' '{p}{e}' | |||
(<- '{y}') | |||
) | |||
) | |||
) | |||
) | |||
// steps5: deletes the longest listed '{y}{s}{t}...' ending and clears test1. If the | |||
// whole remaining word (atlimit) is in the first list, '{y}' is put back; otherwise, if | |||
// it is in the second list, '{y}{s}{t}' is put back (inserted at the empty [] slice). | |||
define steps5 as ( | |||
[substring] among ( | |||
'{y}{s}{t}{o}{s}' '{y}{s}{t}{o}{u}' '{y}{s}{t}{o}' '{y}{s}{t}{e}' '{y}{s}{t}{o}{y}' '{y}{s}{t}{oo}{n}' '{y}{s}{t}{o}{u}{s}' '{y}{s}{t}{i}' '{y}{s}{t}{i}{s}' | |||
'{y}{s}{t}{a}' '{y}{s}{t}{e}{s}' ( | |||
delete | |||
unset test1 | |||
([] substring atlimit among ( | |||
'{d}{a}{n}{e}' '{s}{u}{n}{a}{th}{r}{o}' '{k}{l}{e}' '{s}{e}' '{e}{s}{oo}{k}{l}{e}' '{a}{s}{e}' '{p}{l}{e}' | |||
(<- '{y}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{m}' '{p}' '{a}{p}' '{a}{r}' '{i}{d}' '{k}{t}' '{s}{k}' '{s}{ch}' '{u}{ps}' '{f}{a}' '{ch}{r}' '{ch}{t}' '{a}{k}{t}' | |||
'{a}{o}{r}' '{a}{s}{ch}' '{a}{t}{a}' '{a}{ch}{n}' '{a}{ch}{t}' '{g}{e}{m}' '{g}{u}{r}' '{e}{m}{p}' '{e}{u}{p}' '{e}{ch}{th}' '{i}{f}{a}' | |||
'{k}{a}{th}' '{k}{a}{k}' '{k}{u}{l}' '{l}{u}{g}' '{m}{a}{k}' '{m}{e}{g}' '{t}{a}{ch}' '{f}{y}{l}' '{ch}{oo}{r}' | |||
(<- '{y}{s}{t}') | |||
)) | |||
) | |||
) | |||
) | |||
// steps6: deletes the longest listed '{y}{s}{m}...' ending and clears test1, then tries | |||
// three alternatives in order: | |||
//  - whole remaining word in the first list: insert '{y}{s}{m}'; | |||
//  - whole remaining word in the second list: insert '{y}'; | |||
//  - remaining word ends in one of the listed '...{y}{k}' / '...{y}{n}' strings (no | |||
//    atlimit here): replace that ending by the form without the final two letters. | |||
define steps6 as ( | |||
[substring] among ( | |||
'{y}{s}{m}{o}' '{y}{s}{m}{o}{y}' '{y}{s}{m}{o}{s}' '{y}{s}{m}{o}{u}' '{y}{s}{m}{o}{u}{s}' '{y}{s}{m}{oo}{n}' ( | |||
delete | |||
unset test1 | |||
([] substring atlimit among ( | |||
'{s}{e}' '{m}{e}{t}{a}{s}{e}' '{m}{y}{k}{r}{o}{s}{e}' '{e}{g}{k}{l}{e}' '{a}{p}{o}{k}{l}{e}' | |||
(<- '{y}{s}{m}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{d}{a}{n}{e}' '{a}{n}{t}{y}{d}{a}{n}{e}' | |||
(<- '{y}') | |||
)) or | |||
([substring] among ( | |||
'{a}{g}{n}{oo}{s}{t}{y}{k}' (<- '{a}{g}{n}{oo}{s}{t}') | |||
'{a}{t}{o}{m}{y}{k}' (<- '{a}{t}{o}{m}') | |||
'{g}{n}{oo}{s}{t}{y}{k}' (<- '{g}{n}{oo}{s}{t}') | |||
'{e}{th}{n}{y}{k}' (<- '{e}{th}{n}') | |||
'{e}{k}{l}{e}{k}{t}{y}{k}' (<- '{e}{k}{l}{e}{k}{t}') | |||
'{s}{k}{e}{p}{t}{y}{k}' (<- '{s}{k}{e}{p}{t}') | |||
'{t}{o}{p}{y}{k}' (<- '{t}{o}{p}') | |||
'{a}{l}{e}{x}{a}{n}{d}{r}{y}{n}' (<- '{a}{l}{e}{x}{a}{n}{d}{r}') | |||
'{v}{u}{z}{a}{n}{t}{y}{n}' (<- '{v}{u}{z}{a}{n}{t}') | |||
'{th}{e}{a}{t}{r}{y}{n}' (<- '{th}{e}{a}{t}{r}') | |||
)) | |||
) | |||
) | |||
) | |||
// steps7: deletes the longest of the four listed diminutive endings and clears test1; | |||
// if the whole remaining word (atlimit) is '{s}' or '{ch}', '{a}{r}{a}{k}' is inserted | |||
// at the empty slice marked by []. | |||
define steps7 as ( | |||
[substring] among ( | |||
'{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' '{o}{u}{d}{a}{k}{y}' '{o}{u}{d}{a}{k}{y}{a}' ( | |||
delete | |||
unset test1 | |||
[] substring atlimit among ( | |||
'{s}' '{ch}' | |||
(<- '{a}{r}{a}{k}') | |||
) | |||
) | |||
) | |||
) | |||
// steps8: deletes the longest listed ending and clears test1, then tries in order: | |||
//  - whole remaining word in the first list: insert '{a}{k}'; | |||
//  - whole remaining word in the second list: insert '{y}{t}{s}'; | |||
//  - remaining word ends in '{k}{o}{r}' (no atlimit): insert '{y}{t}{s}'. | |||
// The [] marks an empty slice at the cursor, so <- inserts there. | |||
define steps8 as ( | |||
[substring] among ( | |||
'{a}{k}{y}' '{a}{k}{y}{a}' '{y}{t}{s}{a}' '{y}{t}{s}{a}{s}' '{y}{t}{s}{e}{s}' '{y}{t}{s}{oo}{n}' '{a}{r}{a}{k}{y}' '{a}{r}{a}{k}{y}{a}' ( | |||
delete | |||
unset test1 | |||
([] substring atlimit among ( | |||
'{v}{a}{m}{v}' '{v}{r}' '{k}{a}{y}{m}' '{k}{o}{n}' '{k}{o}{r}' '{l}{a}{v}{r}' '{l}{o}{u}{l}' '{m}{e}{r}' '{m}{o}{u}{s}{t}' | |||
'{n}{a}{g}{k}{a}{s}' '{p}{l}' '{r}' '{r}{u}' '{s}' '{s}{k}' '{s}{o}{k}' '{s}{p}{a}{n}' '{t}{z}' '{f}{a}{r}{m}' '{ch}' '{k}{a}{p}{a}{k}' | |||
'{a}{l}{y}{s}{f}' '{a}{m}{v}{r}' '{a}{n}{th}{r}' '{k}' '{f}{u}{l}' '{k}{a}{t}{r}{a}{p}' '{k}{l}{y}{m}' '{m}{a}{l}' '{s}{l}{o}{v}' '{f}' | |||
'{s}{f}' '{t}{s}{e}{ch}{o}{s}{l}{o}{v}' | |||
(<- '{a}{k}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{v}' '{v}{a}{l}' '{g}{y}{a}{n}' '{g}{l}' '{z}' '{i}{g}{o}{u}{m}{e}{n}' '{k}{a}{r}{d}' '{k}{o}{n}' '{m}{a}{k}{r}{u}{n}' '{n}{u}{f}' | |||
'{p}{a}{t}{e}{r}' '{p}' '{s}{k}' '{t}{o}{s}' '{t}{r}{y}{p}{o}{l}' | |||
(<- '{y}{t}{s}') | |||
)) or | |||
([] '{k}{o}{r}' <- '{y}{t}{s}') | |||
) | |||
) | |||
) | |||
// steps9: deletes the longest listed '{y}{d}{y}...' ending and clears test1; then | |||
// inserts '{y}{d}' either when the whole remaining word (atlimit) is in the first list, | |||
// or when the remaining word merely ends in '{e}' or '{p}{a}{y}{ch}{n}' (the second | |||
// among has no atlimit). | |||
define steps9 as ( | |||
[substring] among ( | |||
'{y}{d}{y}{o}' '{y}{d}{y}{a}' '{y}{d}{y}{oo}{n}' ( | |||
delete | |||
unset test1 | |||
([] substring atlimit among ( | |||
'{a}{y}{f}{n}' '{y}{r}' '{o}{l}{o}' '{ps}{a}{l}' (<- '{y}{d}') | |||
)) or | |||
([] substring among ( | |||
'{e}' '{p}{a}{y}{ch}{n}' (<- '{y}{d}') | |||
)) | |||
) | |||
) | |||
) | |||
// steps10: deletes the longest listed '{y}{s}{k}...' ending and clears test1; if the | |||
// whole remaining word (atlimit) is one of the listed stems, '{y}{s}{k}' is inserted at | |||
// the empty slice marked by []. | |||
define steps10 as ( | |||
[substring] among ( | |||
'{y}{s}{k}{o}{s}' '{y}{s}{k}{o}{u}' '{y}{s}{k}{o}' '{y}{s}{k}{e}' ( | |||
delete | |||
unset test1 | |||
[] substring atlimit among ( | |||
'{d}' '{y}{v}' '{m}{i}{n}' '{r}' '{f}{r}{a}{g}{k}' '{l}{u}{k}' '{o}{v}{e}{l}' | |||
(<- '{y}{s}{k}') | |||
) | |||
) | |||
) | |||
) | |||
// step2a: deletes a final '{a}{d}{e}{s}' or '{a}{d}{oo}{n}'. Unless the remaining word | |||
// ends in one of the listed strings, '{a}{d}' is then inserted at the cursor. | |||
define step2a as ( | |||
[substring] among ( | |||
'{a}{d}{e}{s}' '{a}{d}{oo}{n}' (delete) | |||
) | |||
not ([substring] among ( | |||
'{o}{k}' '{m}{a}{m}' '{m}{a}{n}' '{m}{p}{a}{m}{p}' '{p}{a}{t}{e}{r}' '{g}{y}{a}{g}{y}' '{n}{t}{a}{n}{t}' '{k}{u}{r}' '{th}{e}{y}' '{p}{e}{th}{e}{r}' | |||
)) | |||
insert '{a}{d}' | |||
) | |||
// step2b: deletes a final '{e}{d}{e}{s}' or '{e}{d}{oo}{n}'; if the remaining word ends | |||
// in one of the listed strings (no atlimit), '{e}{d}' is inserted at the empty slice | |||
// marked by []. | |||
define step2b as ( | |||
[substring] among ( | |||
'{e}{d}{e}{s}' '{e}{d}{oo}{n}' (delete) | |||
) | |||
[] substring among ( | |||
'{o}{p}' '{y}{p}' '{e}{m}{p}' '{u}{p}' '{g}{i}{p}' '{d}{a}{p}' '{k}{r}{a}{s}{p}' '{m}{y}{l}' (<- '{e}{d}') | |||
) | |||
) | |||
// step2c: deletes a final '{o}{u}{d}{e}{s}' or '{o}{u}{d}{oo}{n}'; if the remaining | |||
// word ends in one of the listed strings (no atlimit), '{o}{u}{d}' is inserted at the | |||
// empty slice marked by []. | |||
define step2c as ( | |||
[substring] among ( | |||
'{o}{u}{d}{e}{s}' '{o}{u}{d}{oo}{n}' (delete) | |||
) | |||
[] substring among ( | |||
'{a}{r}{k}' '{k}{a}{l}{y}{a}{k}' '{p}{e}{t}{a}{l}' '{l}{y}{ch}' '{p}{l}{e}{x}' '{s}{k}' '{s}' '{f}{l}' '{f}{r}' '{v}{e}{l}' '{l}{o}{u}{l}' '{ch}{n}' | |||
'{s}{p}' '{t}{r}{a}{g}' '{f}{e}' (<- '{o}{u}{d}') | |||
) | |||
) | |||
// step2d: deletes a final '{e}{oo}{s}' or '{e}{oo}{n}' and clears test1; if the whole | |||
// remaining word (atlimit) is one of the listed stems, '{e}' is inserted at the empty | |||
// slice marked by []. | |||
define step2d as ( | |||
[substring] among ( | |||
'{e}{oo}{s}' '{e}{oo}{n}' (delete unset test1) | |||
) | |||
[] substring atlimit among ( | |||
'{th}' '{d}' '{e}{l}' '{g}{a}{l}' '{n}' '{p}' '{y}{d}' '{p}{a}{r}' (<- '{e}') | |||
) | |||
) | |||
// step3: deletes a final '{y}{a}', '{y}{o}{u}' or '{y}{oo}{n}' and clears test1; if the | |||
// remaining word then ends in a character of grouping v, '{y}' is inserted at the empty | |||
// slice marked by []. | |||
define step3 as ( | |||
[substring] among ( | |||
'{y}{a}' '{y}{o}{u}' '{y}{oo}{n}' (delete unset test1) | |||
) | |||
([] v <- '{y}') | |||
) | |||
// step4: deletes a final '{y}{k}{a}', '{y}{k}{o}', '{y}{k}{o}{u}' or '{y}{k}{oo}{n}' and | |||
// clears test1. '{y}{k}' is then put back (inserted at the empty [] slice) if the | |||
// remaining word ends in a character of grouping v, or else if the whole remaining word | |||
// (atlimit) is one of the listed stems. | |||
define step4 as ( | |||
[substring] among ( | |||
'{y}{k}{a}' '{y}{k}{o}' '{y}{k}{o}{u}' '{y}{k}{oo}{n}' (delete unset test1) | |||
) | |||
([] v <- '{y}{k}') or | |||
[] substring atlimit among ( | |||
'{a}{l}' '{a}{d}' '{e}{n}{d}' '{a}{m}{a}{n}' '{a}{m}{m}{o}{ch}{a}{l}' '{i}{th}' '{a}{n}{i}{th}' '{a}{n}{t}{y}{d}' '{f}{u}{s}' '{v}{r}{oo}{m}' '{g}{e}{r}' | |||
'{e}{x}{oo}{d}' '{k}{a}{l}{p}' '{k}{a}{l}{l}{y}{n}' '{k}{a}{t}{a}{d}' '{m}{o}{u}{l}' '{m}{p}{a}{n}' '{m}{p}{a}{g}{y}{a}{t}' '{m}{p}{o}{l}' '{m}{p}{o}{s}' | |||
'{n}{y}{t}' '{x}{y}{k}' '{s}{u}{n}{o}{m}{i}{l}' '{p}{e}{t}{s}' '{p}{y}{t}{s}' '{p}{y}{k}{a}{n}{t}' '{p}{l}{y}{a}{t}{s}' '{p}{o}{s}{t}{e}{l}{n}' '{p}{r}{oo}{t}{o}{d}' | |||
'{s}{e}{r}{t}' '{s}{u}{n}{a}{d}' '{t}{s}{a}{m}' '{u}{p}{o}{d}' '{f}{y}{l}{o}{n}' '{f}{u}{l}{o}{d}' '{ch}{a}{s}' | |||
(<- '{y}{k}') | |||
) | |||
) | |||
// step5a: handles endings in '{a}{m}{e}'. | |||
//  1. Special case for the whole word '{a}{g}{a}{m}{e}'. | |||
//  2. Optionally delete one of the longer listed '...{a}{m}{e}' endings (clears test1). | |||
//  3. Require and delete a final '{a}{m}{e}', clear test1, and if the whole remaining | |||
//     word (atlimit) is one of the listed stems, insert '{a}{m}' at the empty [] slice. | |||
define step5a as ( | |||
// NOTE(review): the literal below is not wrapped in [ ], so <- acts on whatever slice | |||
// is current on entry rather than on the matched text - confirm this is intended. | |||
do ('{a}{g}{a}{m}{e}' atlimit <- '{a}{g}{a}{m}') | |||
do ( | |||
[substring] among ( | |||
'{a}{g}{a}{m}{e}' '{i}{s}{a}{m}{e}' '{o}{u}{s}{a}{m}{e}' '{i}{k}{a}{m}{e}' '{i}{th}{i}{k}{a}{m}{e}' (delete unset test1) | |||
) | |||
) | |||
['{a}{m}{e}'] | |||
delete | |||
unset test1 | |||
[] substring atlimit among ( | |||
'{a}{n}{a}{p}' '{a}{p}{o}{th}' '{a}{p}{o}{k}' '{a}{p}{o}{s}{t}' '{v}{o}{u}{v}' '{x}{e}{th}' '{o}{u}{l}' '{p}{e}{th}' '{p}{y}{k}{r}' '{p}{o}{t}' '{s}{y}{ch}' '{ch}' | |||
(<- '{a}{m}') | |||
) | |||
) | |||
// step5b: handles endings in '{a}{n}{e}'. | |||
//  1. Optionally delete one of the longer listed '...{a}{n}{e}' endings (clears test1); | |||
//     if the whole remaining word is '{t}{r}' or '{t}{s}', insert '{a}{g}{a}{n}'. | |||
//  2. Require and delete a final '{a}{n}{e}' and clear test1. | |||
//  3. Put '{a}{n}' back (inserted at the empty [] slice) if the remaining word ends in a | |||
//     character of grouping v2, or else if the whole remaining word (atlimit) is one of | |||
//     the listed stems. | |||
define step5b as ( | |||
do ( | |||
[substring] among ( | |||
'{a}{g}{a}{n}{e}' '{i}{s}{a}{n}{e}' '{o}{u}{s}{a}{n}{e}' '{y}{o}{n}{t}{a}{n}{e}' '{y}{o}{t}{a}{n}{e}' '{y}{o}{u}{n}{t}{a}{n}{e}' '{o}{n}{t}{a}{n}{e}' '{o}{t}{a}{n}{e}' | |||
'{o}{u}{n}{t}{a}{n}{e}' '{i}{k}{a}{n}{e}' '{i}{th}{i}{k}{a}{n}{e}' ( | |||
delete | |||
unset test1 | |||
[] substring atlimit among ( | |||
'{t}{r}' '{t}{s}' (<- '{a}{g}{a}{n}') | |||
) | |||
) | |||
) | |||
) | |||
['{a}{n}{e}'] | |||
delete | |||
unset test1 | |||
([] v2 <- '{a}{n}') or | |||
[] substring atlimit among ( | |||
'{v}{e}{t}{e}{r}' '{v}{o}{u}{l}{k}' '{v}{r}{a}{ch}{m}' '{g}' '{d}{r}{a}{d}{o}{u}{m}' | |||
'{th}' '{k}{a}{l}{p}{o}{u}{z}' '{k}{a}{s}{t}{e}{l}' '{k}{o}{r}{m}{o}{r}' '{l}{a}{o}{p}{l}' '{m}{oo}{a}{m}{e}{th}' | |||
'{m}' '{m}{o}{u}{s}{o}{u}{l}{m}' '{n}' '{o}{u}{l}' '{p}' '{p}{e}{l}{e}{k}' '{p}{l}' '{p}{o}{l}{y}{s}' | |||
'{p}{o}{r}{t}{o}{l}' '{s}{a}{r}{a}{k}{a}{t}{s}' '{s}{o}{u}{l}{t}' '{t}{s}{a}{r}{l}{a}{t}' '{o}{r}{f}' | |||
'{t}{s}{y}{g}{g}' '{t}{s}{o}{p}' '{f}{oo}{t}{o}{s}{t}{e}{f}' '{ch}' '{ps}{u}{ch}{o}{p}{l}' '{a}{g}' | |||
'{g}{a}{l}' '{g}{e}{r}' '{d}{e}{k}' '{d}{y}{p}{l}' '{a}{m}{e}{r}{y}{k}{a}{n}' '{o}{u}{r}' '{p}{y}{th}' | |||
'{p}{o}{u}{r}{y}{t}' '{s}' '{z}{oo}{n}{t}' '{y}{k}' '{k}{a}{s}{t}' '{k}{o}{p}' '{l}{y}{ch}' | |||
'{l}{o}{u}{th}{i}{r}' '{m}{a}{y}{n}{t}' '{m}{e}{l}' '{s}{y}{g}' '{s}{p}' '{s}{t}{e}{g}' '{t}{r}{a}{g}' | |||
'{t}{s}{a}{g}' '{f}' '{e}{r}' '{a}{d}{a}{p}' '{a}{th}{y}{g}{g}' '{a}{m}{i}{ch}' '{a}{n}{y}{k}' | |||
'{a}{n}{o}{r}{g}' '{a}{p}{i}{g}' '{a}{p}{y}{th}' '{a}{t}{s}{y}{g}{g}' '{v}{a}{s}' '{v}{a}{s}{k}' | |||
'{v}{a}{th}{u}{g}{a}{l}' '{v}{y}{o}{m}{i}{ch}' '{v}{r}{a}{ch}{u}{k}' '{d}{y}{a}{t}' '{d}{y}{a}{f}' '{e}{n}{o}{r}{g}' | |||
'{th}{u}{s}' '{k}{a}{p}{n}{o}{v}{y}{o}{m}{i}{ch}' '{k}{a}{t}{a}{g}{a}{l}' '{k}{l}{y}{v}' '{k}{o}{y}{l}{a}{r}{f}' | |||
'{l}{y}{v}' '{m}{e}{g}{l}{o}{v}{y}{o}{m}{i}{ch}' '{m}{y}{k}{r}{o}{v}{y}{o}{m}{i}{ch}' '{n}{t}{a}{v}' | |||
'{x}{i}{r}{o}{k}{l}{y}{v}' '{o}{l}{y}{g}{o}{d}{a}{m}' '{o}{l}{o}{g}{a}{l}' '{p}{e}{n}{t}{a}{r}{f}' '{p}{e}{r}{i}{f}' | |||
'{p}{e}{r}{y}{t}{r}' '{p}{l}{a}{t}' '{p}{o}{l}{u}{d}{a}{p}' '{p}{o}{l}{u}{m}{i}{ch}' '{s}{t}{e}{f}' '{t}{a}{v}' | |||
'{t}{e}{t}' '{u}{p}{e}{r}{i}{f}' '{u}{p}{o}{k}{o}{p}' '{ch}{a}{m}{i}{l}{o}{d}{a}{p}' '{ps}{i}{l}{o}{t}{a}{v}' | |||
(<- '{a}{n}') | |||
) | |||
) | |||
// step5c: remove the '{e}{t}{e}' family of endings. | |||
define step5c as ( | |||
// The longer ending '{i}{s}{e}{t}{e}' is deleted outright. | |||
do ( | |||
[substring] among ( | |||
'{i}{s}{e}{t}{e}' (delete unset test1) | |||
) | |||
) | |||
// From here on the routine fails unless the word ends in '{e}{t}{e}'. | |||
['{e}{t}{e}'] | |||
delete | |||
unset test1 | |||
// Alternatives that append '{e}{t}' again: remainder ends in a v2 | |||
// character (grouping declared outside this chunk), ends in one of the | |||
// first list, or is exactly one of the second list (atlimit). | |||
// NOTE(review): in Snowball `or` takes single commands as operands, so | |||
// confirm how the final un-parenthesised `[] substring atlimit among | |||
// (...)` is meant to group. | |||
([] v2 <- '{e}{t}') or | |||
([] substring among ( | |||
'{o}{d}' '{a}{y}{r}' '{f}{o}{r}' '{t}{a}{th}' '{d}{y}{a}{th}' '{s}{ch}' '{e}{n}{d}' '{e}{u}{r}' '{t}{y}{th}' '{u}{p}{e}{r}{th}' | |||
'{r}{a}{th}' '{e}{n}{th}' '{r}{o}{th}' '{s}{th}' '{p}{u}{r}' '{a}{y}{n}' '{s}{u}{n}{d}' '{s}{u}{n}' '{s}{u}{n}{th}' '{ch}{oo}{r}' | |||
'{p}{o}{n}' '{v}{r}' '{k}{a}{th}' '{e}{u}{th}' '{e}{k}{th}' '{n}{e}{t}' '{r}{o}{n}' '{a}{r}{k}' '{v}{a}{r}' '{v}{o}{l}' '{oo}{f}{e}{l}' | |||
(<- '{e}{t}') | |||
)) or | |||
[] substring atlimit among ( | |||
'{a}{v}{a}{r}' '{v}{e}{n}' '{e}{n}{a}{r}' '{a}{v}{r}' '{a}{d}' '{a}{th}' '{a}{n}' '{a}{p}{l}' '{v}{a}{r}{o}{n}' '{n}{t}{r}' '{s}{k}' '{k}{o}{p}' | |||
'{m}{p}{o}{r}' '{n}{y}{f}' '{p}{a}{g}' '{p}{a}{r}{a}{k}{a}{l}' '{s}{e}{r}{p}' '{s}{k}{e}{l}' '{s}{u}{r}{f}' '{t}{o}{k}' '{u}' '{d}' '{e}{m}' | |||
'{th}{a}{r}{r}' '{th}' | |||
(<- '{e}{t}') | |||
) | |||
) | |||
// step5d: delete the ending '{o}{n}{t}{a}{s}' / '{oo}{n}{t}{a}{s}' and clear | |||
// test1; then, if the remainder is exactly '{a}{r}{ch}', append '{o}{n}{t}', | |||
// otherwise if it ends in '{k}{r}{e}', append '{oo}{n}{t}'. | |||
define step5d as ( | |||
    [substring] among ( | |||
        '{oo}{n}{t}{a}{s}' | |||
        '{o}{n}{t}{a}{s}' | |||
    ) | |||
    delete | |||
    unset test1 | |||
    // [] marks an empty slice, so each <- below is an insertion. | |||
    ([] '{a}{r}{ch}' atlimit <- '{o}{n}{t}') or | |||
    ([] '{k}{r}{e}' <- '{oo}{n}{t}') | |||
) | |||
// step5e: delete the ending '{o}{m}{a}{s}{t}{e}' / '{y}{o}{m}{a}{s}{t}{e}' | |||
// and clear test1; if the remainder is exactly '{o}{n}', append | |||
// '{o}{m}{a}{s}{t}'. | |||
define step5e as ( | |||
    [substring] among ( | |||
        '{y}{o}{m}{a}{s}{t}{e}' | |||
        '{o}{m}{a}{s}{t}{e}' | |||
    ) | |||
    delete | |||
    unset test1 | |||
    // Empty slice at the cursor: <- acts as an insertion. | |||
    [] '{o}{n}' atlimit <- '{o}{m}{a}{s}{t}' | |||
) | |||
// step5f: remove the endings '{y}{e}{s}{t}{e}' and '{e}{s}{t}{e}'. | |||
// | |||
// Each branch deletes its ending, clears test1 and then, when the remainder | |||
// is exactly one of the listed stems (atlimit), appends the ending again | |||
// without its final '{e}'. | |||
// | |||
// Fix: the '{e}{s}{t}{e}' branch used to append '{y}{e}{s}{t}' (apparently | |||
// copied from the branch above), which grew the word by a letter it never | |||
// contained. It now appends '{e}{s}{t}'. | |||
define step5f as ( | |||
    do ( | |||
        ['{y}{e}{s}{t}{e}'] | |||
        delete | |||
        unset test1 | |||
        // [] marks an empty slice at the cursor, so <- is an insertion. | |||
        [] substring atlimit among ( | |||
            '{p}' '{a}{p}' '{s}{u}{m}{p}' '{a}{s}{u}{m}{p}' '{a}{k}{a}{t}{a}{p}' '{a}{m}{e}{t}{a}{m}{f}' (<- '{y}{e}{s}{t}') | |||
        ) | |||
    ) | |||
    // From here on the routine fails unless the word ends in '{e}{s}{t}{e}'. | |||
    ['{e}{s}{t}{e}'] | |||
    delete | |||
    unset test1 | |||
    [] substring atlimit among ( | |||
        '{a}{l}' '{a}{r}' '{e}{k}{t}{e}{l}' '{z}' '{m}' '{x}' '{p}{a}{r}{a}{k}{a}{l}' '{p}{r}{o}' '{n}{y}{s}' | |||
        (<- '{e}{s}{t}') | |||
    ) | |||
) | |||
// step5g: remove the '{i}{k}{a}' / '{i}{k}{e}{s}' / '{i}{k}{e}' endings. | |||
define step5g as ( | |||
// The longer '{i}{th}{i}{k}...' forms are deleted outright. | |||
do ( | |||
[substring] among ( | |||
'{i}{th}{i}{k}{a}' '{i}{th}{i}{k}{e}{s}' '{i}{th}{i}{k}{e}' (delete unset test1) | |||
) | |||
) | |||
[substring] among ( | |||
'{i}{k}{a}' '{i}{k}{e}{s}' '{i}{k}{e}' ( | |||
delete | |||
unset test1 | |||
// Append '{i}{k}' again when the remainder ends in one of the first | |||
// list, or is exactly one of the second list (atlimit). | |||
([] substring among ( | |||
'{s}{k}{oo}{l}' '{s}{k}{o}{u}{l}' '{n}{a}{r}{th}' '{s}{f}' '{o}{th}' '{p}{y}{th}' (<- '{i}{k}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{d}{y}{a}{th}' '{th}' '{p}{a}{r}{a}{k}{a}{t}{a}{th}' '{p}{r}{o}{s}{th}' '{s}{u}{n}{th}' (<- '{i}{k}') | |||
)) | |||
) | |||
) | |||
) | |||
// step5h: remove the '{o}{u}{s}{a}' / '{o}{u}{s}{e}{s}' / '{o}{u}{s}{e}' | |||
// endings. | |||
define step5h as ( | |||
[substring] among ( | |||
'{o}{u}{s}{a}' '{o}{u}{s}{e}{s}' '{o}{u}{s}{e}' ( | |||
delete | |||
unset test1 | |||
// Append '{o}{u}{s}' again when the remainder ends in one of the first | |||
// list, or is exactly one of the second list (atlimit). | |||
([] substring among ( | |||
'{p}{o}{d}{a}{r}' '{v}{l}{e}{p}' '{p}{a}{n}{t}{a}{ch}' '{f}{r}{u}{d}' '{m}{a}{n}{t}{y}{l}' '{m}{a}{l}{l}' '{k}{u}{m}{a}{t}' '{l}{a}{ch}' '{l}{i}{g}' | |||
'{f}{a}{g}' '{o}{m}' '{p}{r}{oo}{t}' (<- '{o}{u}{s}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{f}{a}{r}{m}{a}{k}' '{ch}{a}{d}' '{a}{g}{k}' '{a}{n}{a}{r}{r}' '{v}{r}{o}{m}' '{e}{k}{l}{y}{p}' '{l}{a}{m}{p}{y}{d}' '{l}{e}{ch}' '{m}' '{p}{a}{t}' | |||
'{r}' '{l}' '{m}{e}{d}' '{m}{e}{s}{a}{z}' '{u}{p}{o}{t}{e}{y}{n}' '{a}{m}' '{a}{y}{th}' '{a}{n}{i}{k}' '{d}{e}{s}{p}{o}{z}' | |||
'{e}{n}{d}{y}{a}{f}{e}{r}' '{d}{e}' '{d}{e}{u}{t}{e}{r}{e}{u}' '{k}{a}{th}{a}{r}{e}{u}' '{p}{l}{e}' '{t}{s}{a}' | |||
(<- '{o}{u}{s}') | |||
)) | |||
) | |||
) | |||
) | |||
// step5i: remove the '{a}{g}{a}' / '{a}{g}{e}{s}' / '{a}{g}{e}' endings. | |||
define step5i as ( | |||
[substring] among ( | |||
'{a}{g}{a}' '{a}{g}{e}{s}' '{a}{g}{e}' ( | |||
delete | |||
unset test1 | |||
// Remainder ends in '{k}{o}{l}{l}': append '{a}{g}' straight away. | |||
([] '{k}{o}{l}{l}' <- '{a}{g}') or ( | |||
// Remainders ending in these two strings never get '{a}{g}' back. | |||
not ([substring] among ('{ps}{o}{f}' '{n}{a}{u}{l}{o}{ch}')) | |||
// Otherwise append '{a}{g}' when the remainder ends in one of the | |||
// first list, or is exactly one of the second list (atlimit). | |||
([] substring among ( | |||
'{o}{f}' '{p}{e}{l}' '{ch}{o}{r}{t}' '{l}{l}' '{s}{f}' '{r}{p}' '{f}{r}' '{p}{r}' '{l}{o}{ch}' '{s}{m}{i}{n}' | |||
(<- '{a}{g}') | |||
)) or | |||
([] substring atlimit among ( | |||
'{a}{v}{a}{s}{t}' '{p}{o}{l}{u}{f}' '{a}{d}{i}{f}' '{p}{a}{m}{f}' '{r}' '{a}{s}{p}' '{a}{f}' '{a}{m}{a}{l}' '{a}{m}{a}{l}{l}{y}' | |||
'{a}{n}{u}{s}{t}' '{a}{p}{e}{r}' '{a}{s}{p}{a}{r}' '{a}{ch}{a}{r}' '{d}{e}{r}{v}{e}{n}' '{d}{r}{o}{s}{o}{p}' '{x}{e}{f}' '{n}{e}{o}{p}' | |||
'{n}{o}{m}{o}{t}' '{o}{l}{o}{p}' '{o}{m}{o}{t}' '{p}{r}{o}{s}{t}' '{p}{r}{o}{s}{oo}{p}{o}{p}' '{s}{u}{m}{p}' '{s}{u}{n}{t}' '{t}' '{u}{p}{o}{t}' | |||
'{ch}{a}{r}' '{a}{e}{y}{p}' '{a}{y}{m}{o}{s}{t}' '{a}{n}{u}{p}' '{a}{p}{o}{t}' '{a}{r}{t}{y}{p}' '{d}{y}{a}{t}' '{e}{n}' '{e}{p}{y}{t}' | |||
'{k}{r}{o}{k}{a}{l}{o}{p}' '{s}{y}{d}{i}{r}{o}{p}' '{l}' '{n}{a}{u}' '{o}{u}{l}{a}{m}' '{o}{u}{r}' '{p}' '{t}{r}' '{m}' | |||
(<- '{a}{g}') | |||
)) | |||
) | |||
) | |||
) | |||
) | |||
// step5j: delete the ending '{i}{s}{e}' / '{i}{s}{o}{u}' / '{i}{s}{a}' and | |||
// clear test1; when the remainder is exactly one of the listed stems, | |||
// append '{i}{s}'. | |||
define step5j as ( | |||
    [substring] among ( | |||
        '{i}{s}{o}{u}' | |||
        '{i}{s}{e}' | |||
        '{i}{s}{a}' | |||
    ) | |||
    delete | |||
    unset test1 | |||
    // Empty slice at the cursor: <- acts as an insertion. | |||
    [] substring atlimit among ( | |||
        '{n}' | |||
        '{ch}{e}{r}{s}{o}{n}' | |||
        '{d}{oo}{d}{e}{k}{a}{n}' | |||
        '{e}{r}{i}{m}{o}{n}' | |||
        '{m}{e}{g}{a}{l}{o}{n}' | |||
        '{e}{p}{t}{a}{n}' | |||
        (<- '{i}{s}') | |||
    ) | |||
) | |||
// step5k: delete the ending '{i}{s}{t}{e}' and clear test1; when the | |||
// remainder is exactly one of the listed stems, append '{i}{s}{t}'. | |||
define step5k as ( | |||
    ['{i}{s}{t}{e}'] | |||
    delete | |||
    unset test1 | |||
    // Empty slice at the cursor: <- acts as an insertion. | |||
    [] substring atlimit among ( | |||
        '{a}{s}{v}' '{s}{v}' | |||
        '{a}{ch}{r}' '{ch}{r}' | |||
        '{a}{p}{l}' | |||
        '{a}{e}{y}{m}{n}' | |||
        '{d}{u}{s}{ch}{r}' '{e}{u}{ch}{r}' '{k}{o}{y}{n}{o}{ch}{r}' | |||
        '{p}{a}{l}{y}{m}{ps}' | |||
        (<- '{i}{s}{t}') | |||
    ) | |||
) | |||
// step5l: delete the ending '{o}{u}{n}{e}' (or its '{i}{s}...' / '{i}{th}...' | |||
// forms) and clear test1; when the remainder is exactly one of the listed | |||
// stems, append '{o}{u}{n}'. | |||
define step5l as ( | |||
    [substring] among ( | |||
        '{i}{th}{o}{u}{n}{e}' | |||
        '{i}{s}{o}{u}{n}{e}' | |||
        '{o}{u}{n}{e}' | |||
    ) | |||
    delete | |||
    unset test1 | |||
    // Empty slice at the cursor: <- acts as an insertion. | |||
    [] substring atlimit among ( | |||
        '{n}' | |||
        '{r}' | |||
        '{s}{p}{y}' | |||
        '{s}{t}{r}{a}{v}{o}{m}{o}{u}{t}{s}' | |||
        '{k}{a}{k}{o}{m}{o}{u}{t}{s}' | |||
        '{e}{x}{oo}{n}' | |||
        (<- '{o}{u}{n}') | |||
    ) | |||
) | |||
// step5m: delete the ending '{o}{u}{m}{e}' (or its '{i}{s}...' / '{i}{th}...' | |||
// forms) and clear test1; when the remainder is exactly one of the listed | |||
// stems, append '{o}{u}{m}'. | |||
define step5m as ( | |||
    [substring] among ( | |||
        '{i}{th}{o}{u}{m}{e}' | |||
        '{i}{s}{o}{u}{m}{e}' | |||
        '{o}{u}{m}{e}' | |||
    ) | |||
    delete | |||
    unset test1 | |||
    // Empty slice at the cursor: <- acts as an insertion. | |||
    [] substring atlimit among ( | |||
        '{p}{a}{r}{a}{s}{o}{u}{s}' | |||
        '{f}' | |||
        '{ch}' | |||
        '{oo}{r}{y}{o}{p}{l}' | |||
        '{a}{z}' | |||
        '{a}{l}{l}{o}{s}{o}{u}{s}' | |||
        '{a}{s}{o}{u}{s}' | |||
        (<- '{o}{u}{m}') | |||
    ) | |||
) | |||
// step6: generic ending removal, applied only to words that no earlier | |||
// step has already shortened (test1 still set). | |||
define step6 as ( | |||
// '{m}{a}{t}{a}' / '{m}{a}{t}{oo}{n}' / '{m}{a}{t}{o}{s}' are reduced to | |||
// '{m}{a}' regardless of test1. | |||
do ( | |||
[substring] among ( | |||
'{m}{a}{t}{a}' '{m}{a}{t}{oo}{n}' '{m}{a}{t}{o}{s}' (<- '{m}{a}') | |||
) | |||
) | |||
// Boolean test: the routine fails here if any step executed `unset test1`. | |||
test1 | |||
// Delete the longest matching ending from the list. | |||
[substring] among ( | |||
'{a}' '{a}{g}{a}{t}{e}' '{a}{g}{a}{n}' '{a}{e}{y}' '{a}{m}{a}{y}' '{a}{n}' '{a}{s}' '{a}{s}{a}{y}' '{a}{t}{a}{y}' '{a}{oo}' '{e}' '{e}{y}' | |||
'{e}{y}{s}' '{e}{y}{t}{e}' '{e}{s}{a}{y}' '{e}{s}' '{e}{t}{a}{y}' '{y}' '{y}{e}{m}{a}{y}' '{y}{e}{m}{a}{s}{t}{e}' '{y}{e}{t}{a}{y}' '{y}{e}{s}{a}{y}' | |||
'{y}{e}{s}{a}{s}{t}{e}' '{y}{o}{m}{a}{s}{t}{a}{n}' '{y}{o}{m}{o}{u}{n}' '{y}{o}{m}{o}{u}{n}{a}' '{y}{o}{n}{t}{a}{n}' '{y}{o}{n}{t}{o}{u}{s}{a}{n}' '{y}{o}{s}{a}{s}{t}{a}{n}' | |||
'{y}{o}{s}{a}{s}{t}{e}' '{y}{o}{s}{o}{u}{n}' '{y}{o}{s}{o}{u}{n}{a}' '{y}{o}{t}{a}{n}' '{y}{o}{u}{m}{a}' '{y}{o}{u}{m}{a}{s}{t}{e}' '{y}{o}{u}{n}{t}{a}{y}' | |||
'{y}{o}{u}{n}{t}{a}{n}' '{i}' '{i}{d}{e}{s}' '{i}{d}{oo}{n}' '{i}{th}{e}{y}' '{i}{th}{e}{y}{s}' '{i}{th}{e}{y}{t}{e}' '{i}{th}{i}{k}{a}{t}{e}' '{i}{th}{i}{k}{a}{n}' | |||
'{i}{th}{o}{u}{n}' '{i}{th}{oo}' '{i}{k}{a}{t}{e}' '{i}{k}{a}{n}' '{i}{s}' '{i}{s}{a}{n}' '{i}{s}{a}{t}{e}' '{i}{s}{e}{y}' '{i}{s}{e}{s}' '{i}{s}{o}{u}{n}' | |||
'{i}{s}{oo}' '{o}' '{o}{y}' '{o}{m}{a}{y}' '{o}{m}{a}{s}{t}{a}{n}' '{o}{m}{o}{u}{n}' '{o}{m}{o}{u}{n}{a}' '{o}{n}{t}{a}{y}' '{o}{n}{t}{a}{n}' | |||
'{o}{n}{t}{o}{u}{s}{a}{n}' '{o}{s}' '{o}{s}{a}{s}{t}{a}{n}' '{o}{s}{a}{s}{t}{e}' '{o}{s}{o}{u}{n}' '{o}{s}{o}{u}{n}{a}' '{o}{t}{a}{n}' '{o}{u}' '{o}{u}{m}{a}{y}' | |||
'{o}{u}{m}{a}{s}{t}{e}' '{o}{u}{n}' '{o}{u}{n}{t}{a}{y}' '{o}{u}{n}{t}{a}{n}' '{o}{u}{s}' '{o}{u}{s}{a}{n}' '{o}{u}{s}{a}{t}{e}' '{u}' '{u}{s}' '{oo}' | |||
'{oo}{n}' (delete) | |||
) | |||
) | |||
// step7: delete the longest of the listed '...{t}{e}{r}' / '...{t}{a}{t}' | |||
// endings, if the word ends in one. | |||
define step7 as ( | |||
    [substring] among ( | |||
        '{e}{s}{t}{e}{r}' '{e}{s}{t}{a}{t}' | |||
        '{o}{t}{e}{r}' '{o}{t}{a}{t}' | |||
        '{u}{t}{e}{r}' '{u}{t}{a}{t}' | |||
        '{oo}{t}{e}{r}' '{oo}{t}{a}{t}' | |||
    ) | |||
    delete | |||
) | |||
) | |||
// stem: entry point of the Greek stemmer. All work is done right-to-left | |||
// (backwards). Every step is wrapped in `do`, so a step that fails does not | |||
// abort the pipeline. | |||
define stem as ( | |||
backwards ( | |||
do tolower | |||
// Not wrapped in `do`: if has_min_length fails, stemming stops here. | |||
has_min_length | |||
// test1 stays set until a step removes an ending; step6 checks it. | |||
set test1 | |||
do step1 | |||
do steps1 | |||
do steps2 | |||
do steps3 | |||
do steps4 | |||
do steps5 | |||
do steps6 | |||
do steps7 | |||
do steps8 | |||
do steps9 | |||
do steps10 | |||
do step2a | |||
do step2b | |||
do step2c | |||
do step2d | |||
do step3 | |||
do step4 | |||
do step5a | |||
do step5b | |||
do step5c | |||
do step5d | |||
do step5e | |||
do step5f | |||
do step5g | |||
do step5h | |||
// NOTE(review): step5j is invoked before step5i -- confirm the order is | |||
// intentional. | |||
do step5j | |||
do step5i | |||
do step5k | |||
do step5l | |||
do step5m | |||
do step6 | |||
do step7 | |||
) | |||
) |
@@ -0,0 +1,323 @@ | |||
// An implementation of "A Lightweight Stemmer for Hindi": | |||
// http://www.kbcs.in/downloads/papers/StmmerHindi.pdf | |||
externals ( stem ) | |||
stringescapes {} | |||
// The transliteration scheme used for our stringdefs matches that used in the | |||
// paper, as documented in the appendix. It appears to match the WX notation | |||
// (https://en.wikipedia.org/wiki/WX_notation) except that WX apparently | |||
// uses 'z' for Anunasika whereas the paper uses Mh. | |||
// | |||
// We discriminate dependent vowels by adding a leading "_" to their stringdef | |||
// names (mnemonic: the _ signifies removing the implicit a from the preceding | |||
// character). | |||
// Vowels and sonorants: | |||
stringdef a '{U+0905}' | |||
stringdef A '{U+0906}' | |||
stringdef i '{U+0907}' | |||
stringdef I '{U+0908}' | |||
stringdef u '{U+0909}' | |||
stringdef U '{U+090A}' | |||
stringdef q '{U+090B}' | |||
stringdef e '{U+090F}' | |||
stringdef E '{U+0910}' | |||
stringdef o '{U+0913}' | |||
stringdef O '{U+0914}' | |||
// Vowel signs: | |||
stringdef _A '{U+093E}' | |||
stringdef _i '{U+093F}' | |||
stringdef _I '{U+0940}' | |||
stringdef _u '{U+0941}' | |||
stringdef _U '{U+0942}' | |||
stringdef _q '{U+0943}' | |||
stringdef _e '{U+0947}' | |||
stringdef _E '{U+0948}' | |||
stringdef _o '{U+094B}' | |||
stringdef _O '{U+094C}' | |||
// Diacritics: | |||
stringdef M '{U+0902}' | |||
stringdef H '{U+0903}' | |||
stringdef Mh '{U+0901}' | |||
stringdef Z '{U+093C}' // Nukta | |||
stringdef virama '{U+094D}' | |||
// Velar consonants: | |||
stringdef k '{U+0915}' | |||
stringdef K '{U+0916}' | |||
stringdef g '{U+0917}' | |||
stringdef G '{U+0918}' | |||
stringdef f '{U+0919}' | |||
// Palatal consonants: | |||
stringdef c '{U+091A}' | |||
stringdef C '{U+091B}' | |||
stringdef j '{U+091C}' | |||
stringdef J '{U+091D}' | |||
stringdef F '{U+091E}' | |||
// Retroflex consonants: | |||
stringdef t '{U+091F}' | |||
stringdef T '{U+0920}' | |||
stringdef d '{U+0921}' | |||
stringdef D '{U+0922}' | |||
stringdef N '{U+0923}' | |||
// Dental consonants: | |||
stringdef w '{U+0924}' | |||
stringdef W '{U+0925}' | |||
stringdef x '{U+0926}' | |||
stringdef X '{U+0927}' | |||
stringdef n '{U+0928}' | |||
// Labial consonants: | |||
stringdef p '{U+092A}' | |||
stringdef P '{U+092B}' | |||
stringdef b '{U+092C}' | |||
stringdef B '{U+092D}' | |||
stringdef m '{U+092E}' | |||
// Semi-vowels: | |||
stringdef y '{U+092F}' | |||
stringdef r '{U+0930}' | |||
stringdef l '{U+0932}' | |||
stringdef v '{U+0935}' | |||
// Fricatives: | |||
stringdef S '{U+0936}' | |||
stringdef R '{U+0937}' | |||
stringdef s '{U+0938}' | |||
stringdef h '{U+0939}' | |||
stringdef lY '{U+0933}' | |||
// Precomposed characters - letters + nukta: | |||
stringdef nZ '{U+0929}' // ≡ {n}{Z} | |||
stringdef rZ '{U+0931}' // ≡ {r}{Z} | |||
stringdef lYZ '{U+0934}' // ≡ {lY}{Z} | |||
stringdef kZ '{U+0958}' // ≡ {k}{Z} | |||
stringdef KZ '{U+0959}' // ≡ {K}{Z} | |||
stringdef gZ '{U+095A}' // ≡ {g}{Z} | |||
stringdef jZ '{U+095B}' // ≡ {j}{Z} | |||
stringdef dZ '{U+095C}' // ≡ {d}{Z} | |||
stringdef DZ '{U+095D}' // ≡ {D}{Z} | |||
stringdef PZ '{U+095E}' // ≡ {P}{Z} | |||
stringdef yZ '{U+095F}' // ≡ {y}{Z} | |||
integers ( p ) | |||
groupings ( consonant ) | |||
routines ( CONSONANT ) | |||
// Grouping of every character treated as a consonant: the base consonant | |||
// letters, the nukta sign, and the precomposed letter+nukta characters. | |||
define consonant | |||
    // Velar and palatal consonants: | |||
    '{k}{K}{g}{G}{f}{c}{C}{j}{J}{F}' + | |||
    // Retroflex and dental consonants: | |||
    '{t}{T}{d}{D}{N}{w}{W}{x}{X}{n}' + | |||
    // Labial consonants and semi-vowels: | |||
    '{p}{P}{b}{B}{m}{y}{r}{l}{v}' + | |||
    // Fricatives: | |||
    '{S}{R}{s}{h}{lY}' + | |||
    // Nukta: | |||
    '{Z}' + | |||
    // Precomposed characters - letter and nukta: | |||
    '{nZ}{rZ}{lYZ}{kZ}{KZ}{gZ}{jZ}{dZ}{DZ}{PZ}{yZ}' | |||
// Routine form of the consonant grouping, so it can be used as an among | |||
// condition while scanning right-to-left. | |||
backwardmode ( | |||
    define CONSONANT as ( | |||
        consonant | |||
    ) | |||
) | |||
// stem: entry point of the Hindi stemmer. Removes the longest suffix from | |||
// the list below, never touching the first character of the word. | |||
define stem as ( | |||
// Mark p just after the first character; `test` restores the cursor. | |||
test ( next setmark p ) | |||
backwards ( | |||
// We assume in this implementation that the whole word doesn't count | |||
// as a valid suffix to remove, so we remove the longest suffix from | |||
// the list which leaves at least one character. This change affects | |||
// 47 words out of the 65,140 in the sample vocabulary from Hindi | |||
// wikipedia. | |||
setlimit tomark p for ([substring]) | |||
among ( | |||
// The list below is derived from figure 3 in the paper. | |||
// | |||
// We perform the stemming on the Devanagari characters rather than | |||
// transliterating to Latin, so we have adapted the list below to | |||
// reflect this by converting suffixes back to Devanagari as | |||
// follows: | |||
// | |||
// * within the suffixes, "a" after a consonant is dropped since | |||
// consonants have an implicit "a". | |||
// | |||
// * within the suffixes, a vowel other than "a" after a consonant | |||
// is a dependent vowel (vowel sign); a vowel (including "a") | |||
// after a non-consonant is an independent vowel. | |||
// | |||
// * to allow the vowel at the start of each suffix being dependent | |||
// or independent, we include each suffix twice. For the | |||
// dependent version, a leading "a" is dropped and we check that | |||
// the suffix is preceded by a consonant (which will have an | |||
// implicit "a"). | |||
// | |||
// * we add '{a}', which is needed for the example given right at | |||
// the end of section 5 to work (conflating BarawIya and | |||
// BarawIyawA), and which 3.1 a.v strongly suggests should be in | |||
// the list: | |||
// | |||
// Thus, the following suffix deletions (longest possible | |||
// match) are required to reduce inflected forms of masculine | |||
// nouns to a common stem: | |||
// a A i [...] | |||
// | |||
// Adding '{a}' only affects 2 words out of the 65,140 in the | |||
// sample vocabulary. | |||
// | |||
// * The transliterations of our stems would end with "a" when our | |||
// stems end in a consonant, so we also include {virama} in the | |||
// list of suffixes to remove (this affects 222 words from the | |||
// sample vocabulary). | |||
// | |||
// We've also assumed that Mh in the suffix list always means {Mh} | |||
// and never {M}{h}{virama}. Only one of the 65,140 words in the | |||
// sample vocabulary stems differently due to this (and that word | |||
// seems to be a typo). | |||
'{virama}' | |||
'{a}' | |||
'{A}' | |||
'{i}' | |||
'{I}' | |||
'{u}' | |||
'{U}' | |||
'{e}' | |||
'{o}' | |||
'{e}{M}' | |||
'{o}{M}' | |||
'{A}{M}' | |||
'{u}{A}{M}' | |||
'{u}{e}{M}' | |||
'{u}{o}{M}' | |||
'{A}{e}{M}' | |||
'{A}{o}{M}' | |||
'{i}{y}{_A}{M}' | |||
'{i}{y}{_o}{M}' | |||
'{A}{i}{y}{_A}{M}' | |||
'{A}{i}{y}{_o}{M}' | |||
'{A}{Mh}' | |||
'{i}{y}{_A}{Mh}' | |||
'{A}{i}{y}{_A}{Mh}' | |||
'{a}{w}{_A}{e}{M}' | |||
'{a}{w}{_A}{o}{M}' | |||
'{a}{n}{_A}{e}{M}' | |||
'{a}{n}{_A}{o}{M}' | |||
'{a}{w}{_A}' | |||
'{a}{w}{_I}' | |||
'{I}{M}' | |||
'{a}{w}{_I}{M}' | |||
'{a}{w}{_e}' | |||
'{A}{w}{_A}' | |||
'{A}{w}{_I}' | |||
'{A}{w}{_I}{M}' | |||
'{A}{w}{_e}' | |||
'{a}{n}{_A}' | |||
'{a}{n}{_I}' | |||
'{a}{n}{_e}' | |||
'{A}{n}{_A}' | |||
'{A}{n}{_e}' | |||
'{U}{M}{g}{_A}' | |||
'{U}{M}{g}{_I}' | |||
'{A}{U}{M}{g}{_A}' | |||
'{A}{U}{M}{g}{_I}' | |||
'{e}{M}{g}{_e}' | |||
'{e}{M}{g}{_I}' | |||
'{A}{e}{M}{g}{_e}' | |||
'{A}{e}{M}{g}{_I}' | |||
'{o}{g}{_e}' | |||
'{o}{g}{_I}' | |||
'{A}{o}{g}{_e}' | |||
'{A}{o}{g}{_I}' | |||
'{e}{g}{_A}' | |||
'{e}{g}{_I}' | |||
'{A}{e}{g}{_A}' | |||
'{A}{e}{g}{_I}' | |||
'{A}{y}{_A}' | |||
'{A}{e}' | |||
'{A}{I}' | |||
'{A}{I}{M}' | |||
'{i}{e}' | |||
'{A}{o}' | |||
'{A}{i}{e}' | |||
'{a}{k}{r}' | |||
'{A}{k}{r}' | |||
'{_A}' | |||
'{_i}' | |||
'{_I}' | |||
'{_u}' | |||
'{_U}' | |||
'{_e}' | |||
'{_o}' | |||
'{_e}{M}' | |||
'{_o}{M}' | |||
'{_A}{M}' | |||
'{_u}{A}{M}' | |||
'{_u}{e}{M}' | |||
'{_u}{o}{M}' | |||
'{_A}{e}{M}' | |||
'{_A}{o}{M}' | |||
'{_i}{y}{_A}{M}' | |||
'{_i}{y}{_o}{M}' | |||
'{_A}{i}{y}{_A}{M}' | |||
'{_A}{i}{y}{_o}{M}' | |||
'{_A}{Mh}' | |||
'{_i}{y}{_A}{Mh}' | |||
'{_A}{i}{y}{_A}{Mh}' | |||
'{_I}{M}' | |||
'{_A}{w}{_A}' | |||
'{_A}{w}{_I}' | |||
'{_A}{w}{_I}{M}' | |||
'{_A}{w}{_e}' | |||
'{_A}{n}{_A}' | |||
'{_A}{n}{_e}' | |||
'{_U}{M}{g}{_A}' | |||
'{_U}{M}{g}{_I}' | |||
'{_A}{U}{M}{g}{_A}' | |||
'{_A}{U}{M}{g}{_I}' | |||
'{_e}{M}{g}{_e}' | |||
'{_e}{M}{g}{_I}' | |||
'{_A}{e}{M}{g}{_e}' | |||
'{_A}{e}{M}{g}{_I}' | |||
'{_o}{g}{_e}' | |||
'{_o}{g}{_I}' | |||
'{_A}{o}{g}{_e}' | |||
'{_A}{o}{g}{_I}' | |||
'{_e}{g}{_A}' | |||
'{_e}{g}{_I}' | |||
'{_A}{e}{g}{_A}' | |||
'{_A}{e}{g}{_I}' | |||
'{_A}{y}{_A}' | |||
'{_A}{e}' | |||
'{_A}{I}' | |||
'{_A}{I}{M}' | |||
'{_i}{e}' | |||
'{_A}{o}' | |||
'{_A}{i}{e}' | |||
'{_A}{k}{r}' | |||
/* Suffixes with a leading implicit a: */ | |||
'{w}{_A}{e}{M}' CONSONANT | |||
'{w}{_A}{o}{M}' CONSONANT | |||
'{n}{_A}{e}{M}' CONSONANT | |||
'{n}{_A}{o}{M}' CONSONANT | |||
'{w}{_A}' CONSONANT | |||
'{w}{_I}' CONSONANT | |||
'{w}{_I}{M}' CONSONANT | |||
'{w}{_e}' CONSONANT | |||
'{n}{_A}' CONSONANT | |||
'{n}{_I}' CONSONANT | |||
'{n}{_e}' CONSONANT | |||
'{k}{r}' CONSONANT | |||
) | |||
// Remove the matched suffix (the slice marked by [substring] above). | |||
delete | |||
) | |||
) |
@@ -27,17 +27,17 @@ groupings ( v ) | |||
stringescapes {} | |||
/* special characters (in Unicode) */ | |||
stringdef a' hex 'E1' //a-acute | |||
stringdef e' hex 'E9' //e-acute | |||
stringdef i' hex 'ED' //i-acute | |||
stringdef o' hex 'F3' //o-acute | |||
stringdef o" hex 'F6' //o-umlaut | |||
stringdef oq hex '151' //o-double acute | |||
stringdef u' hex 'FA' //u-acute | |||
stringdef u" hex 'FC' //u-umlaut | |||
stringdef uq hex '171' //u-double acute | |||
/* special characters */ | |||
stringdef a' '{U+00E1}' //a-acute | |||
stringdef e' '{U+00E9}' //e-acute | |||
stringdef i' '{U+00ED}' //i-acute | |||
stringdef o' '{U+00F3}' //o-acute | |||
stringdef o" '{U+00F6}' //o-umlaut | |||
stringdef oq '{U+0151}' //o-double acute | |||
stringdef u' '{U+00FA}' //u-acute | |||
stringdef u" '{U+00FC}' //u-umlaut | |||
stringdef uq '{U+0171}' //u-double acute | |||
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' | |||
@@ -1,241 +0,0 @@ | |||
/* | |||
Hungarian Stemmer | |||
Removes noun inflections | |||
*/ | |||
routines ( | |||
mark_regions | |||
R1 | |||
v_ending | |||
case | |||
case_special | |||
case_other | |||
plural | |||
owned | |||
sing_owner | |||
plur_owner | |||
instrum | |||
factive | |||
undouble | |||
double | |||
) | |||
externals ( stem ) | |||
integers ( p1 ) | |||
groupings ( v ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin 2) */ | |||
stringdef a' hex 'E1' //a-acute | |||
stringdef e' hex 'E9' //e-acute | |||
stringdef i' hex 'ED' //i-acute | |||
stringdef o' hex 'F3' //o-acute | |||
stringdef o" hex 'F6' //o-umlaut | |||
stringdef oq hex 'F5' //o-double acute | |||
stringdef u' hex 'FA' //u-acute | |||
stringdef u" hex 'FC' //u-umlaut | |||
stringdef uq hex 'FB' //u-double acute | |||
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' | |||
// mark_regions: set p1, the position from which R1 starts. | |||
define mark_regions as ( | |||
// Default: R1 is empty. | |||
$p1 = limit | |||
// Word starts with a vowel: move to the first non-vowel, step over it | |||
// (a listed digraph/trigraph counts as one unit) and mark p1 there. | |||
(v goto non-v | |||
among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next | |||
setmark p1) | |||
or | |||
// Word starts with a non-vowel: mark p1 just after the first vowel. | |||
(non-v gopast v setmark p1) | |||
) | |||
backwardmode ( | |||
// R1: succeeds when the cursor lies at or after p1. | |||
define R1 as ( | |||
    $p1 <= cursor | |||
) | |||
// v_ending: if the word now ends (within R1) in a-acute or e-acute, | |||
// replace it with the unaccented vowel. | |||
define v_ending as ( | |||
[substring] R1 among( | |||
'{a'}' (<- 'a') | |||
'{e'}' (<- 'e') | |||
) | |||
) | |||
// double: succeeds, without moving the cursor, when the text before the | |||
// cursor ends in one of the listed doubled consonants. | |||
define double as ( | |||
test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' | |||
'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') | |||
) | |||
// undouble: skip one character backwards, then delete the single | |||
// character before it. | |||
define undouble as ( | |||
next [hop 1] delete | |||
) | |||
// instrum: remove 'al' / 'el' in R1 when it follows a doubled consonant, | |||
// then reduce that doubled consonant via undouble. | |||
define instrum as ( | |||
    [substring] R1 among ( | |||
        'al' 'el' (double) | |||
    ) | |||
    delete | |||
    undouble | |||
) | |||
// case: delete the longest listed case ending found in R1, then | |||
// normalise a trailing accented vowel via v_ending. | |||
define case as ( | |||
[substring] R1 among( | |||
'ban' 'ben' | |||
'ba' 'be' | |||
'ra' 're' | |||
'nak' 'nek' | |||
'val' 'vel' | |||
't{o'}l' 't{oq}l' | |||
'r{o'}l' 'r{oq}l' | |||
'b{o'}l' 'b{oq}l' | |||
'hoz' 'hez' 'h{o"}z' | |||
'n{a'}l' 'n{e'}l' | |||
'ig' | |||
'at' 'et' 'ot' '{o"}t' | |||
'{e'}rt' | |||
'k{e'}pp' 'k{e'}ppen' | |||
'kor' | |||
'ul' '{u"}l' | |||
'v{a'}' 'v{e'}' | |||
'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' | |||
'k{e'}nt' | |||
'en' 'on' 'an' '{o"}n' | |||
'n' | |||
't' | |||
) | |||
delete | |||
// Not wrapped in try/do: if v_ending fails, this routine fails too | |||
// (the deletion above has already been applied). | |||
v_ending | |||
) | |||
// case_special: endings in R1 that begin with an accented vowel are | |||
// replaced by the unaccented vowel alone. | |||
define case_special as( | |||
[substring] R1 among( | |||
'{e'}n' (<- 'e') | |||
'{a'}n' (<- 'a') | |||
'{a'}nk{e'}nt' (<- 'a') | |||
) | |||
) | |||
// case_other: handle the '-stul' / '-st{u"}l' endings in R1. Forms with an | |||
// unaccented (or no) leading vowel are deleted; forms with an accented | |||
// leading vowel keep that vowel, unaccented. | |||
define case_other as( | |||
[substring] R1 among( | |||
'astul' 'est{u"}l' (delete) | |||
'stul' 'st{u"}l' (delete) | |||
'{a'}stul' (<- 'a') | |||
'{e'}st{u"}l' (<- 'e') | |||
) | |||
) | |||
// factive: remove a final a-acute / e-acute in R1 when it follows a | |||
// doubled consonant, then reduce that doubled consonant via undouble. | |||
define factive as ( | |||
    [substring] R1 among ( | |||
        '{a'}' '{e'}' (double) | |||
    ) | |||
    delete | |||
    undouble | |||
) | |||
// plural: remove the plural marker 'k' (with its linking vowel) in R1. | |||
// An accented linking vowel is kept, unaccented. | |||
define plural as ( | |||
[substring] R1 among( | |||
'{a'}k' (<- 'a') | |||
'{e'}k' (<- 'e') | |||
'{o"}k' (delete) | |||
'ak' (delete) | |||
'ok' (delete) | |||
'ek' (delete) | |||
'k' (delete) | |||
) | |||
) | |||
// owned: remove the endings built on e-acute listed below, in R1. | |||
// Where the ending starts with an accented vowel that belongs to the | |||
// stem, that vowel is kept, unaccented. | |||
define owned as ( | |||
[substring] R1 among ( | |||
'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) | |||
'{e'}k{e'}' (<- 'e') | |||
'{a'}k{e'}' (<- 'a') | |||
'k{e'}' (delete) | |||
'{e'}{e'}i' (<- 'e') | |||
'{a'}{e'}i' (<- 'a') | |||
'{e'}i' (delete) | |||
'{e'}{e'}' (<- 'e') | |||
'{e'}' (delete) | |||
) | |||
) | |||
// sing_owner: remove the longest listed ending in R1. Endings that begin | |||
// with an accented vowel are replaced by the unaccented vowel; the rest | |||
// are deleted. | |||
define sing_owner as ( | |||
[substring] R1 among( | |||
'{u"}nk' 'unk' (delete) | |||
'{a'}nk' (<- 'a') | |||
'{e'}nk' (<- 'e') | |||
'nk' (delete) | |||
'{a'}juk' (<- 'a') | |||
'{e'}j{u"}k' (<- 'e') | |||
'juk' 'j{u"}k' (delete) | |||
'uk' '{u"}k' (delete) | |||
'em' 'om' 'am' (delete) | |||
'{a'}m' (<- 'a') | |||
'{e'}m' (<- 'e') | |||
'm' (delete) | |||
'od' 'ed' 'ad' '{o"}d' (delete) | |||
'{a'}d' (<- 'a') | |||
'{e'}d' (<- 'e') | |||
'd' (delete) | |||
'ja' 'je' (delete) | |||
'a' 'e' 'o' (delete) | |||
'{a'}' (<- 'a') | |||
'{e'}' (<- 'e') | |||
) | |||
) | |||
// plur_owner: remove the longest listed '-i...' ending in R1. Endings that | |||
// begin with an accented vowel are replaced by the unaccented vowel; the | |||
// rest are deleted. | |||
define plur_owner as ( | |||
[substring] R1 among( | |||
'jaim' 'jeim' (delete) | |||
'{a'}im' (<- 'a') | |||
'{e'}im' (<- 'e') | |||
'aim' 'eim' (delete) | |||
'im' (delete) | |||
'jaid' 'jeid' (delete) | |||
'{a'}id' (<- 'a') | |||
'{e'}id' (<- 'e') | |||
'aid' 'eid' (delete) | |||
'id' (delete) | |||
'jai' 'jei' (delete) | |||
'{a'}i' (<- 'a') | |||
'{e'}i' (<- 'e') | |||
'ai' 'ei' (delete) | |||
'i' (delete) | |||
'jaink' 'jeink' (delete) | |||
'eink' 'aink' (delete) | |||
'{a'}ink' (<- 'a') | |||
'{e'}ink' (<- 'e') | |||
// 'ink' has no action of its own: it shares the (delete) that follows | |||
// the next two strings. | |||
'ink' | |||
'jaitok' 'jeitek' (delete) | |||
'aitok' 'eitek' (delete) | |||
'{a'}itok' (<- 'a') | |||
'{e'}itek' (<- 'e') | |||
'itek' (delete) | |||
'jeik' 'jaik' (delete) | |||
'aik' 'eik' (delete) | |||
'{a'}ik' (<- 'a') | |||
'{e'}ik' (<- 'e') | |||
'ik' (delete) | |||
) | |||
) | |||
) | |||
// stem: entry point of the Hungarian stemmer. Computes p1, then applies | |||
// each suffix routine once, right-to-left, in the order listed. Every call | |||
// is wrapped in `do`, so a routine that fails does not abort the rest. | |||
define stem as ( | |||
do mark_regions | |||
backwards ( | |||
do instrum | |||
do case | |||
do case_special | |||
do case_other | |||
do factive | |||
do owned | |||
do sing_owner | |||
do plur_owner | |||
do plural | |||
) | |||
) |
@@ -0,0 +1,192 @@ | |||
// An implementation of the "Porter Stemmer for Bahasa Indonesia" from: | |||
// http://www.illc.uva.nl/Research/Publications/Reports/MoL-2003-02.text.pdf | |||
integers ( | |||
// The paper defines measure as the number of vowels in the word. We | |||
// count this initially, then adjust the count each time we remove a | |||
// prefix or suffix. | |||
measure | |||
// Numeric code for the type of prefix removed: | |||
// | |||
// 0 other/none | |||
// 1 'di' or 'meng' or 'ter' | |||
// 2 'per' | |||
// 3 'ke' or 'peng' | |||
// 4 'ber' | |||
// | |||
// Some of these have variant forms, so e.g. "meng" includes "men", "me", | |||
// "meny", "mem". | |||
// | |||
// Note that the value of prefix is only used in remove_suffix (and | |||
// routines it calls) so we don't need to worry about | |||
// remove_second_order_prefix overwriting a value of prefix set by | |||
// remove_first_order_prefix since remove_suffix gets called between | |||
// the two. | |||
prefix | |||
) | |||
groupings ( vowel ) | |||
routines ( | |||
remove_particle | |||
remove_possessive_pronoun | |||
remove_first_order_prefix | |||
remove_second_order_prefix | |||
remove_suffix | |||
KER | |||
SUFFIX_KAN_OK | |||
SUFFIX_AN_OK | |||
SUFFIX_I_OK | |||
VOWEL | |||
) | |||
externals ( stem ) | |||
stringescapes {} | |||
backwardmode ( | |||
// remove_particle: delete a trailing particle ('kah', 'lah' or 'pun') and | |||
// reduce the vowel count by one. | |||
define remove_particle as ( | |||
    [substring] among ( | |||
        'kah' | |||
        'lah' | |||
        'pun' | |||
    ) | |||
    delete | |||
    $measure -= 1 | |||
) | |||
// remove_possessive_pronoun: delete a trailing 'ku', 'mu' or 'nya' and | |||
// reduce the vowel count by one. | |||
define remove_possessive_pronoun as ( | |||
    [substring] among ( | |||
        'nya' | |||
        'ku' | |||
        'mu' | |||
    ) | |||
    delete | |||
    $measure -= 1 | |||
) | |||
// prefix not in {ke, peng, per} | |||
// Condition attached to suffix 'kan' in remove_suffix. | |||
define SUFFIX_KAN_OK as ( | |||
// On page 29, the example "kompas Q.31" says "Both Nazief and Porter | |||
// stemmer converted the word peledakan (blast, explotion) to ledak (to | |||
// blast, to explode)". However, the algorithm as described doesn't | |||
// behave in this way - grammatically the prefix pe- occurs as a | |||
// variation of both the first-order derivational prefix peng- and the | |||
// second-order derivational prefix per-, but table 2.5 doesn't include | |||
// "pe", only table 2.6 does, so "peledakan" is handled (incorrectly) | |||
// as having prefix "per" not "peng", and so we remove derivational | |||
// suffix "kan" rather than "an" to give stem leda. (Porter-style | |||
// stemmers remove the longest suffix they can amongst those available, | |||
// which this paper notes in the last paragraph on page 15). | |||
// | |||
// We resolve this by amending the condition on suffix "kan" to | |||
// "prefix ∉ {ke, peng, per}", which seems to make the stemmer's | |||
// behaviour match all the examples in the paper except for one: | |||
// "perbaikan" is shown in table 3.4 as stemming to "bai", but with | |||
// this change it now stems to "baik". The table notes that "baik" is | |||
// the actual root so this deviation is an improvement. In a sample | |||
// vocabulary derived from the most common words in id.wikipedia.org, | |||
// this change only affects 0.12% of words (76 out of 64,587, including | |||
// "peledakan" and "perbaikan"). | |||
// Prefix codes: 3 is 'ke'/'peng', 2 is 'per'. | |||
$prefix != 3 and $prefix != 2 | |||
) | |||
// Condition attached to suffix 'an' in remove_suffix: the prefix removed | |||
// earlier must not be one of {di, meng, ter} (prefix code 1). | |||
define SUFFIX_AN_OK as ( | |||
    $prefix != 1 | |||
) | |||
// Condition attached to suffix 'i' in remove_suffix. | |||
define SUFFIX_I_OK as ( | |||
// prefix not in {ke, peng, ber} | |||
// (prefix codes 3 and 4 are excluded; 0, 1 and 2 pass) | |||
$prefix <= 2 | |||
// The rest of the condition from the paper is: | |||
// V|K...c₁c₁, c₁ ≠ s, c₂ ≠ i | |||
// | |||
// The meaning of this is unclear in several ways, and none of the | |||
// examples given of the stemmer's behaviour in the paper help to | |||
// resolve these issues. | |||
// | |||
// Notice that c₂ isn't actually used - the most obvious explanation | |||
// seems to be that "c₁c₁" should read "c₁c₂", or maybe "c₂c₁". | |||
// | |||
// Elsewhere the paper defines V... as meaning "the stem starts with | |||
// a vowel" and K... as meaning "the stem starts with a consonant". | |||
// | |||
// In other places where it says X|Y... it seems the | binds more | |||
// tightly, so it's (V|K)...cᵢcⱼ not V|(K...cᵢcⱼ). That seems a bit | |||
// odd as the first letter must be either a vowel or a consonant, so | |||
// that really just means "ends cᵢcⱼ". However, nowhere in the paper | |||
// uses or defines a notation such as ...X, which may explain this | |||
// seemingly redundant way of specifying this. | |||
// | |||
// The conditions elsewhere on prefix removal (e.g. V...) are clearly | |||
// on the stem left after the prefix is removed. None of the other | |||
// rules for suffix removal have conditions on the stem, but for | |||
// consistency with the prefix rules we might expect that the cᵢcⱼ | |||
// test is on what's left *after* removing the "i" suffix. | |||
// | |||
// However, studying Indonesian wordlists and discussion with a native | |||
// speaker leads us to conclude that the purpose of this check is to | |||
// protect words of foreign origin (e.g. "televisi", "organisasi", | |||
// "komunikasi") from stemming, and the common feature of these is | |||
// that the word ends "-si", so we conclude that the condition here | |||
// should be read as "word does not end -si", and this is what we | |||
// have implemented. | |||
// Evaluated in backward mode after 'i' has been matched, so this checks | |||
// the character immediately before that 'i'. | |||
not 's' | |||
) | |||
// remove_suffix: delete the longest of 'kan', 'an', 'i' whose attached | |||
// condition routine succeeds, and reduce the vowel count by one. | |||
define remove_suffix as ( | |||
[substring] among ( | |||
'kan' SUFFIX_KAN_OK 'an' SUFFIX_AN_OK 'i' SUFFIX_I_OK | |||
(delete $measure-=1) | |||
) | |||
) | |||
) | |||
// Grouping of the characters counted as vowels. | |||
define vowel 'aeiou' | |||
// Routine form of the grouping, usable as an among condition. | |||
define VOWEL as ( vowel ) | |||
// KER: one non-vowel character followed by 'er'. Used as the condition on | |||
// prefix 'be' in remove_second_order_prefix. | |||
define KER as ( non-vowel 'er' ) | |||
// remove_first_order_prefix: strip a first-order prefix from the start of | |||
// the word, record its class in prefix (1 or 3) and reduce the vowel count | |||
// by one. | |||
define remove_first_order_prefix as ( | |||
[substring] among ( | |||
'di' 'meng' 'men' 'me' 'ter' (delete $prefix=1 $measure-=1) | |||
'ke' 'peng' 'pen' (delete $prefix=3 $measure-=1) | |||
// 'meny' / 'peny' only match when followed by a vowel; the prefix is | |||
// then replaced by 's'. | |||
'meny' VOWEL ($prefix=1 <-'s' $measure-=1) | |||
'peny' VOWEL ($prefix=3 <-'s' $measure-=1) | |||
// 'mem' / 'pem': replaced by 'p' when a vowel follows, otherwise | |||
// deleted. | |||
'mem' ($prefix=1 $measure-=1 vowel and <-'p' or delete) | |||
'pem' ($prefix=3 $measure-=1 vowel and <-'p' or delete) | |||
) | |||
) | |||
// remove_second_order_prefix: strip a second-order prefix from the start | |||
// of the word and reduce the vowel count by one. Sets prefix to 2 for | |||
// 'per'/'pe' and to 4 for the 'ber' family. | |||
define remove_second_order_prefix as ( | |||
// The paper has the condition on removal of prefix "bel" and "pel" as | |||
// just "ajar" not "ajar..." but it seems that the latter must be what | |||
// is intended so that e.g. "pelajaran" stems to "ajar" not "lajar". | |||
// This change only affects a very small number of words (11 out of | |||
// 64,587) and only for the better. | |||
[substring] among ( | |||
'per' 'pe' (delete $prefix=2 $measure-=1) | |||
// Note: this action does not assign prefix. | |||
'pelajar' (<-'ajar' $measure-=1) | |||
'ber' (delete $prefix=4 $measure-=1) | |||
'belajar' (<-'ajar' $prefix=4 $measure-=1) | |||
// 'be' is only removed when followed by a consonant and then 'er'. | |||
'be' KER (delete $prefix=4 $measure-=1) | |||
) | |||
) | |||
// stem: entry point of the Indonesian stemmer. | |||
define stem as ( | |||
// Count the vowels in the word. | |||
$measure = 0 | |||
do ( repeat ( gopast vowel $measure+=1 ) ) | |||
// Each bare `$measure > 2` below is a guard: stemming stops at that | |||
// point (keeping changes made so far) unless more than two vowels remain. | |||
$measure > 2 | |||
$prefix = 0 | |||
backwards ( | |||
do remove_particle | |||
$measure > 2 | |||
do remove_possessive_pronoun | |||
) | |||
$measure > 2 | |||
// Either a first-order prefix is removed, followed by an optional suffix | |||
// and then a second-order prefix ... | |||
test ( | |||
remove_first_order_prefix | |||
do ( | |||
test ($measure > 2 backwards remove_suffix) | |||
$measure > 2 remove_second_order_prefix | |||
) | |||
) or ( | |||
// ... or only a second-order prefix and then a suffix are tried. | |||
do remove_second_order_prefix | |||
do ($measure > 2 backwards remove_suffix) | |||
) | |||
) |
@@ -0,0 +1,151 @@ | |||
routines ( | |||
R1 R2 RV | |||
initial_morph | |||
mark_regions | |||
noun_sfx | |||
deriv | |||
verb_sfx | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v ) | |||
stringescapes {} | |||
/* Accented characters */ | |||
stringdef a' '{U+00E1}' // a-acute | |||
stringdef e' '{U+00E9}' // e-acute | |||
stringdef i' '{U+00ED}' // i-acute | |||
stringdef o' '{U+00F3}' // o-acute | |||
stringdef u' '{U+00FA}' // u-acute | |||
define v 'aeiou{a'}{e'}{i'}{o'}{u'}' | |||
// mark_regions: compute pV, p1 and p2, the start positions tested by RV, | |||
// R1 and R2. | |||
define mark_regions as ( | |||
$pV = limit | |||
$p1 = limit | |||
$p2 = limit // defaults | |||
// pV: just after the first vowel. | |||
do ( | |||
gopast v setmark pV | |||
) | |||
// p1: after the first non-vowel that follows a vowel; p2: the same rule | |||
// applied again starting from p1. | |||
do ( | |||
gopast v gopast non-v setmark p1 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
// initial_morph: undo word-initial changes. The longest listed prefix | |||
// found at the start of the word is deleted or replaced by the plain | |||
// consonant given in its action. | |||
define initial_morph as ( | |||
[substring] among ( | |||
'h-' 'n-' 't-' //nAthair -> n-athair, but alone are problematic | |||
(delete) | |||
// verbs | |||
// ({'} is the escape for an apostrophe inside a string) | |||
'd{'}' | |||
(delete) | |||
'd{'}fh' | |||
(<- 'f') | |||
// other contractions | |||
'm{'}' 'b{'}' | |||
(delete) | |||
'sh' | |||
(<- 's') | |||
'mb' | |||
(<- 'b') | |||
'gc' | |||
(<- 'c') | |||
'nd' | |||
(<- 'd') | |||
'bhf' | |||
(<- 'f') | |||
'ng' | |||
(<- 'g') | |||
'bp' | |||
(<- 'p') | |||
'ts' | |||
(<- 's') | |||
'dt' | |||
(<- 't') | |||
// Lenition | |||
'bh' | |||
(<- 'b') | |||
'ch' | |||
(<- 'c') | |||
'dh' | |||
(<- 'd') | |||
'fh' | |||
(<- 'f') | |||
'gh' | |||
(<- 'g') | |||
'mh' | |||
(<- 'm') | |||
'ph' | |||
(<- 'p') | |||
'th' | |||
(<- 't') | |||
) | |||
) | |||
backwardmode ( | |||
// Region tests: each succeeds when the cursor is at or after the matching mark. | |||
define RV as ( $pV <= cursor ) | |||
define R1 as ( $p1 <= cursor ) | |||
define R2 as ( $p2 <= cursor ) | |||
// noun_sfx: deletes the longest matching noun ending. Endings in the first | |||
// group must lie in R1, endings in the second group must lie in R2. | |||
define noun_sfx as ( | |||
[substring] among ( | |||
'amh' 'eamh' 'abh' 'eabh' | |||
'aibh' 'ibh' 'aimh' 'imh' | |||
'a{i'}ocht' '{i'}ocht' 'a{i'}ochta' '{i'}ochta' | |||
(R1 delete) | |||
'ire' 'ir{i'}' 'aire' 'air{i'}' | |||
(R2 delete) | |||
) | |||
) | |||
// deriv: handles derivational endings. The first group is deleted when it lies | |||
// in R2; every other group is replaced by the fixed stem shown in its action, | |||
// with no region test. | |||
define deriv as ( | |||
[substring] among ( | |||
'acht' 'eacht' 'ach' 'each' 'eacht{u'}il' 'eachta' 'acht{u'}il' 'achta' | |||
(R2 delete) //siopadóireacht -> siopadóir but not poblacht -> pobl | |||
'arcacht' 'arcachta{i'}' 'arcachta' | |||
(<- 'arc') // monarcacht -> monarc | |||
'gineach' 'gineas' 'ginis' | |||
(<- 'gin') | |||
'grafa{i'}och' 'grafa{i'}ocht' 'grafa{i'}ochta' 'grafa{i'}ochta{i'}' | |||
(<- 'graf') | |||
// note: 'paite' itself is in the list, so it is rewritten to itself | |||
'paite' 'patach' 'pataigh' 'patacha' | |||
(<- 'paite') | |||
'{o'}ideach' '{o'}ideacha' '{o'}idigh' | |||
(<- '{o'}id') | |||
) | |||
) | |||
// verb_sfx: deletes the longest matching verb ending. The first group is | |||
// tested against RV, the second group against R1. | |||
define verb_sfx as ( | |||
[substring] among ( | |||
'imid' 'aimid' '{i'}mid' 'a{i'}mid' | |||
'faidh' 'fidh' | |||
(RV delete) | |||
'ain' | |||
'eadh' 'adh' | |||
'{a'}il' | |||
'tear' 'tar' | |||
(R1 delete) | |||
) | |||
) | |||
) | |||
// stem: external entry point. The word start is normalised first, the region | |||
// marks are computed on the result, then the three suffix routines run once | |||
// each in backward mode. Every step is wrapped in `do`, so none can fail stem. | |||
define stem as ( | |||
    do initial_morph   do mark_regions | |||
    backwards ( do noun_sfx   do deriv   do verb_sfx ) | |||
) |
@@ -16,18 +16,18 @@ groupings ( v AEIO CG ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef a' hex 'A0' | |||
stringdef a` hex '85' | |||
stringdef e' hex '82' | |||
stringdef e` hex '8A' | |||
stringdef i' hex 'A1' | |||
stringdef i` hex '8D' | |||
stringdef o' hex 'A2' | |||
stringdef o` hex '95' | |||
stringdef u' hex 'A3' | |||
stringdef u` hex '97' | |||
/* special characters */ | |||
stringdef a' '{U+00E1}' | |||
stringdef a` '{U+00E0}' | |||
stringdef e' '{U+00E9}' | |||
stringdef e` '{U+00E8}' | |||
stringdef i' '{U+00ED}' | |||
stringdef i` '{U+00EC}' | |||
stringdef o' '{U+00F3}' | |||
stringdef o` '{U+00F2}' | |||
stringdef u' '{U+00FA}' | |||
stringdef u` '{U+00F9}' | |||
define v 'aeiou{a`}{e`}{i`}{o`}{u`}' | |||
@@ -1,195 +0,0 @@ | |||
routines ( | |||
prelude postlude mark_regions | |||
RV R1 R2 | |||
attached_pronoun | |||
standard_suffix | |||
verb_suffix | |||
vowel_suffix | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v AEIO CG ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
stringdef a' hex 'E1' | |||
stringdef a` hex 'E0' | |||
stringdef e' hex 'E9' | |||
stringdef e` hex 'E8' | |||
stringdef i' hex 'ED' | |||
stringdef i` hex 'EC' | |||
stringdef o' hex 'F3' | |||
stringdef o` hex 'F2' | |||
stringdef u' hex 'FA' | |||
stringdef u` hex 'F9' | |||
define v 'aeiou{a`}{e`}{i`}{o`}{u`}' | |||
// prelude: two forward passes over the word. | |||
// 1. acute-accented vowels are rewritten as their grave forms and 'qu' | |||
// becomes 'qU'; any other character is stepped over by the '' (next) case. | |||
// `test` restores the cursor to the start afterwards. | |||
// 2. a 'u' or 'i' standing between two vowels is replaced by 'U' or 'I'. | |||
define prelude as ( | |||
test repeat ( | |||
[substring] among( | |||
'{a'}' (<- '{a`}') | |||
'{e'}' (<- '{e`}') | |||
'{i'}' (<- '{i`}') | |||
'{o'}' (<- '{o`}') | |||
'{u'}' (<- '{u`}') | |||
'qu' (<- 'qU') | |||
'' (next) | |||
) | |||
) | |||
repeat goto ( | |||
v [ ('u' ] v <- 'U') or | |||
('i' ] v <- 'I') | |||
) | |||
) | |||
// mark_regions: computes pV, p1 and p2; each stays at the end of the word when | |||
// its pattern cannot be matched. | |||
define mark_regions as ( | |||
    $pV = limit   $p1 = limit   $p2 = limit   // defaults | |||
    // pV depends on the first two letters: | |||
    //   vowel + non-vowel  -> after the next vowel | |||
    //   vowel + vowel      -> after the next non-vowel | |||
    //   non-vowel + non-vowel -> after the next vowel | |||
    //   non-vowel + vowel  -> after the third letter | |||
    do ( | |||
        ( v (non-v gopast v) or (v gopast non-v) ) | |||
        or | |||
        ( non-v (non-v gopast v) or (v next) ) | |||
        setmark pV | |||
    ) | |||
    // p1 follows the first vowel/non-vowel pair, p2 the next such pair after p1 | |||
    do ( | |||
        gopast v   gopast non-v   setmark p1 | |||
        gopast v   gopast non-v   setmark p2 | |||
    ) | |||
) | |||
// postlude: walks the word turning every 'I' and 'U' back into lower case; | |||
// the empty string case steps over all other characters. | |||
define postlude as repeat ( | |||
    [substring] among( | |||
        'U' (<- 'u') | |||
        'I' (<- 'i') | |||
        ''  (next) | |||
    ) | |||
) | |||
backwardmode ( | |||
// Region tests: true when the cursor has not moved back past the given mark. | |||
define RV as ( $pV <= cursor ) | |||
define R1 as ( $p1 <= cursor ) | |||
define R2 as ( $p2 <= cursor ) | |||
// attached_pronoun: the first among brackets the longest pronoun ending as the | |||
// slice. The second among (no brackets, so the slice is still the pronoun) | |||
// checks what precedes it and requires RV: after 'ando'/'endo' the pronoun is | |||
// deleted, after 'ar'/'er'/'ir' the pronoun is replaced by 'e'. | |||
define attached_pronoun as ( | |||
[substring] among( | |||
'ci' 'gli' 'la' 'le' 'li' 'lo' | |||
'mi' 'ne' 'si' 'ti' 'vi' | |||
// the compound forms are: | |||
'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' | |||
'mela' 'mele' 'meli' 'melo' 'mene' | |||
'tela' 'tele' 'teli' 'telo' 'tene' | |||
'cela' 'cele' 'celi' 'celo' 'cene' | |||
'vela' 'vele' 'veli' 'velo' 'vene' | |||
) | |||
among( (RV) | |||
'ando' 'endo' (delete) | |||
'ar' 'er' 'ir' (<- 'e') | |||
) | |||
) | |||
// standard_suffix: matches the longest listed suffix and applies the action of | |||
// its group. Each action starts with a region test (R1, R2 or RV); if that | |||
// test fails the whole routine fails. The `try` parts remove a further | |||
// preceding suffix when one is present and do not affect success. | |||
define standard_suffix as ( | |||
[substring] among( | |||
'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' | |||
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' | |||
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' | |||
'atrice' 'atrici' | |||
'ante' 'anti' // Note 1 | |||
( R2 delete ) | |||
'azione' 'azioni' 'atore' 'atori' | |||
( R2 delete | |||
try ( ['ic'] R2 delete ) | |||
) | |||
'logia' 'logie' | |||
( R2 <- 'log' ) | |||
'uzione' 'uzioni' 'usione' 'usioni' | |||
( R2 <- 'u' ) | |||
'enza' 'enze' | |||
( R2 <- 'ente' ) | |||
'amento' 'amenti' 'imento' 'imenti' | |||
( RV delete ) | |||
'amente' ( | |||
R1 delete | |||
try ( | |||
// the R2 test and delete run before the matched string's own action | |||
[substring] R2 delete among( | |||
'iv' ( ['at'] R2 delete ) | |||
'os' 'ic' 'abil' | |||
) | |||
) | |||
) | |||
'it{a`}' ( | |||
R2 delete | |||
try ( | |||
[substring] among( | |||
'abil' 'ic' 'iv' (R2 delete) | |||
) | |||
) | |||
) | |||
'ivo' 'ivi' 'iva' 'ive' ( | |||
R2 delete | |||
// 'ic' is only examined after a preceding 'at' has been removed | |||
try ( ['at'] R2 delete ['ic'] R2 delete ) | |||
) | |||
) | |||
) | |||
// verb_suffix: with backward movement limited to the pV mark, deletes the | |||
// longest verb ending from the list. | |||
// NOTE(review): 'Yamo' contains an upper-case 'Y'; the prelude in this file only | |||
// produces 'U' and 'I', so confirm whether this entry can ever match. | |||
define verb_suffix as setlimit tomark pV for ( | |||
[substring] among( | |||
'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' | |||
'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' | |||
'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' | |||
'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' | |||
'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' | |||
'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' | |||
'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' | |||
'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' | |||
'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' | |||
'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' | |||
'ono' 'uta' 'ute' 'uti' 'uto' | |||
'ar' 'ir' // but 'er' is problematical | |||
(delete) | |||
) | |||
) | |||
// AEIO: final vowels (plain and grave) that vowel_suffix may strip. | |||
define AEIO 'aeio{a`}{e`}{i`}{o`}' | |||
// CG: the letters that must precede a final 'h' for it to be stripped. | |||
define CG 'cg' | |||
// vowel_suffix: two optional clean-ups, both restricted to RV: | |||
// - delete a final AEIO vowel, and then a preceding 'i' if there is one; | |||
// - delete a final 'h' when it is preceded by 'c' or 'g'. | |||
define vowel_suffix as ( | |||
try ( | |||
[AEIO] RV delete | |||
['i'] RV delete | |||
) | |||
try ( | |||
['h'] CG RV delete | |||
) | |||
) | |||
) | |||
// stem: external entry point. prelude and mark_regions run forwards, the | |||
// suffix routines run backwards, and postlude restores the letters that | |||
// prelude put into upper case. verb_suffix is tried only when standard_suffix | |||
// fails. | |||
define stem as ( | |||
    do prelude   do mark_regions | |||
    backwards ( | |||
        do attached_pronoun | |||
        do ( standard_suffix or verb_suffix ) | |||
        do vowel_suffix | |||
    ) | |||
    do postlude | |||
) | |||
/* | |||
Note 1: additions of 15 Jun 2005 | |||
*/ | |||
@@ -1,5 +1,5 @@ | |||
strings ( ch ) | |||
integers ( x p1 p2 ) | |||
integers ( p1 p2 ) | |||
booleans ( Y_found stemmed GE_removed ) | |||
routines ( | |||
@@ -20,8 +20,6 @@ groupings ( v v_WX AOU AIOU ) | |||
stringescapes {} | |||
stringdef ' hex '27' // yuk | |||
define v 'aeiouy' | |||
define v_WX v + 'wx' | |||
define AOU 'aou' | |||
@@ -29,8 +27,8 @@ define AIOU 'aiou' | |||
backwardmode ( | |||
define R1 as (setmark x $x >= p1) | |||
define R2 as (setmark x $x >= p2) | |||
define R1 as ($p1 <= cursor) | |||
define R2 as ($p2 <= cursor) | |||
define V as test (v or 'ij') | |||
define VX as test (next v or 'ij') | |||
@@ -46,7 +44,7 @@ backwardmode ( | |||
define Step_1 as | |||
( | |||
[among ( (]) | |||
[substring] among ( | |||
'{'}s' (delete) | |||
's' (R1 not ('t' R1) C delete) | |||
@@ -68,7 +66,7 @@ backwardmode ( | |||
define Step_2 as | |||
( | |||
[among ( (]) | |||
[substring] among ( | |||
'je' (('{'}t' ] delete) or | |||
('et' ] R1 C delete) or | |||
('rnt' ] <-'rn') or | |||
@@ -92,7 +90,7 @@ backwardmode ( | |||
define Step_3 as | |||
( | |||
[among ( (]) | |||
[substring] among ( | |||
'atie' (R1 <-'eer') | |||
'iteit' (R1 delete lengthen_V) | |||
'heid' | |||
@@ -112,7 +110,7 @@ backwardmode ( | |||
define Step_4 as | |||
( | |||
( [among ( (]) | |||
( [substring] among ( | |||
'ioneel' (R1 <-'ie') | |||
'atief' (R1 <-'eer') | |||
'baar' (R1 delete) | |||
@@ -132,7 +130,7 @@ backwardmode ( | |||
) | |||
) | |||
or | |||
( [among ( (]) | |||
( [substring] among ( | |||
'iger' | |||
'igst' | |||
'ig' (R1 C delete lengthen_V) | |||
@@ -142,7 +140,7 @@ backwardmode ( | |||
define Step_7 as | |||
( | |||
[among ( (]) | |||
[substring] among ( | |||
'kt' (<-'k') | |||
'ft' (<-'f') | |||
'pt' (<-'p') | |||
@@ -151,7 +149,7 @@ backwardmode ( | |||
define Step_6 as | |||
( | |||
[among ( (]) | |||
[substring] among ( | |||
'bb' (<-'b') | |||
'cc' (<-'c') | |||
'dd' (<-'d') | |||
@@ -179,7 +177,7 @@ backwardmode ( | |||
define Step_1c as | |||
( | |||
[among ( (] R1 C) | |||
[substring] among ( (R1 C) | |||
'd' (not ('n' R1) delete) | |||
't' (not ('h' R1) delete) | |||
) | |||
@@ -200,11 +198,8 @@ define Lose_infix as ( | |||
) | |||
define measure as ( | |||
do ( | |||
tolimit | |||
setmark p1 | |||
setmark p2 | |||
) | |||
$p1 = limit | |||
$p2 = limit | |||
do( | |||
repeat non-v atleast 1 ('ij' or v) non-v setmark p1 | |||
repeat non-v atleast 1 ('ij' or v) non-v setmark p2 |
@@ -0,0 +1,373 @@ | |||
externals ( stem ) | |||
// escape symbols for substituting lithuanian characters | |||
stringescapes { } | |||
/* Special characters in Unicode Latin Extended-A */ | |||
// ' nosine | |||
stringdef a' '{U+0105}' // ą a + ogonek | |||
stringdef e' '{U+0119}' // ę e + ogonek | |||
stringdef i' '{U+012F}' // į i + ogonek | |||
stringdef u' '{U+0173}' // ų u + ogonek | |||
// . taskas | |||
stringdef e. '{U+0117}' // ė e + dot | |||
// - ilgoji | |||
stringdef u- '{U+016B}' // ū u + macron | |||
// * varnele | |||
stringdef c* '{U+010D}' // č c + caron (haček) | |||
stringdef s* '{U+0161}' // š s + caron (haček) | |||
stringdef z* '{U+017E}' // ž z + caron (haček) | |||
// [C](VC)^m[V|C] | |||
// definitions of variables for | |||
// p1 - position of m = 0 | |||
integers ( p1 ) | |||
// groupings | |||
// v - lithuanian vowels | |||
groupings ( v ) | |||
// v - all lithuanian vowels | |||
define v 'aeiyou{a'}{e'}{i'}{u'}{e.}{u-}' | |||
// all lithuanian stemmer routines: 4 steps | |||
routines ( | |||
step2 R1 step1 fix_chdz fix_gd fix_conflicts | |||
) | |||
backwardmode ( | |||
// R1: succeeds while the cursor is at or after the p1 mark. | |||
define R1 as ( $p1 <= cursor ) | |||
// step1: removes one inflectional ending. The longest listed ending is matched | |||
// with backward movement limited to the p1 mark, R1 is checked, and the matched | |||
// slice is then deleted. Entries that are commented out were left out on | |||
// purpose because they clash with the example words given beside them. | |||
define step1 as ( | |||
setlimit tomark p1 for ([substring]) R1 among ( | |||
// Nouns | |||
// Declension I | |||
'as' 'ias' 'is' 'ys' // vyras, kelias, brolis, gaidys | |||
'o' 'io' // vyro, kelio | |||
'ui' 'iui' // vyrui, keliui | |||
'{a'}' 'i{a'}' '{i'}' // vyrą, kelią, brolį | |||
'u' 'iu' // vyru, keliu | |||
'e' 'yje' // vyre, kelyje | |||
'y' 'au' 'i' // kely, brolau, broli, | |||
'an' // nusižengiman | |||
'ai' 'iai' // vyrai, keliai | |||
'{u'}' 'i{u'}' // vyrų, kelių | |||
'ams' 'am' // vyrams, vyram | |||
'iams' 'iam' // broliams, broliam | |||
'us' 'ius' // vyrus, brolius | |||
'ais' 'iais' // vyrais, keliais | |||
'uose' 'iuose' 'uos' 'iuos' // vyruose, keliuose, vyruos, keliuos | |||
'uosna' 'iuosna' // vyruosna, keliuosna | |||
'ysna' // žutysna | |||
'asis' 'aisi' // sukimasis, sukimaisi | |||
'osi' '{u'}si' // sukimosi, sukimųsi | |||
'uisi' // sukimuisi | |||
'{a'}si' // sukimąsi | |||
'usi' // sukimusi | |||
'esi' // sukimesi | |||
'uo' // mėnuo | |||
// Declension II | |||
'a' 'ia' // galva, vysnios | |||
'os' 'ios' // galvos, vysnios | |||
'oj' 'oje' 'ioje' // galvoje, vysnioje | |||
'osna' 'iosna' // galvosna, vyšniosna | |||
'om' 'oms' 'ioms' // galvoms, vysnioms | |||
'omis' 'iomis' // galvomis, vysniomis | |||
'ose' 'iose' // galvose, vysniose | |||
'on' 'ion' // galvon, vyšnion | |||
// Declension III | |||
'{e.}' // gervė | |||
'{e.}s' // gervės | |||
'ei' // gervei | |||
'{e'}' // gervę | |||
'{e.}j' '{e.}je' // gervėj, gervėje | |||
'{e.}ms' // gervėms | |||
'es' // gerves | |||
'{e.}mis' // gervėmis | |||
'{e.}se' // gervėse | |||
'{e.}sna' // gervėsna | |||
'{e.}n' // žydaitėn | |||
// Declension IV | |||
'aus' 'iaus' // sūnaus, skaičiaus | |||
'umi' 'iumi' // sūnumi, skaičiumi | |||
'uje' 'iuje' // sūnuje, skaičiuje | |||
'iau' // skaičiau | |||
'{u-}s' // sūnūs | |||
'ums' // sūnums | |||
'umis' // sūnumis | |||
'un' 'iun' // sūnun, administratoriun | |||
// Declension V | |||
'ies' 'ens' 'enio' 'ers' // avies, vandens, sesers | |||
'eniui' 'eriai' // vandeniui, eriai | |||
'en{i'}' 'er{i'}' // vandenį, seserį | |||
'imi' 'eniu' 'erimi' 'eria' // avimi, vandeniu, seserimi, seseria | |||
'enyje' 'eryje' // vandenyje, seseryje | |||
'ie' 'enie' 'erie' // avie, vandenie, seserie | |||
'enys' 'erys' // vandenys, seserys | |||
// 'en{u'}' conflicts with 'žandenų' 'antenų' | |||
'er{u'}' // seserų | |||
'ims' 'enims' 'erims' // avims, vandemins, seserims | |||
'enis' // vandenis | |||
'imis' // žebenkštimis | |||
'enimis' // vandenimis | |||
'yse' 'enyse' 'eryse' // avyse, vandenyse, seseryse | |||
// Adjectives | |||
// (i)a declension | |||
'iem' 'iems' // geriem, geriems | |||
'ame' 'iame' // naujame, mediniame | |||
// Verbs | |||
// Indicative mood | |||
// present tense | |||
// (i)a conjugation | |||
'uosi' 'iuosi' // dirbuosi, traukiuosi | |||
'iesi' // dirbiesi | |||
'asi' 'iasi' // dirbasi, traukiasi | |||
'am{e.}s' 'iam{e.}s' // dirbamės, traukiamės | |||
'at' 'ate' 'iat' 'iate' // dirbat, dirbate, ariat, traukiate | |||
'at{e.}s' 'iat{e.}s' // dirbatės, traukiatės | |||
// i conjugation | |||
'isi' // tikisi | |||
'im' // mylim | |||
// 'ime' conflicts with the noun locative, e.g. 'gėrime' | |||
'im{e.}s' // tikimės | |||
'it' 'ite' // mylit, mylite, tikitės | |||
// 'it{e.}s' conflicts with the suffix plus nominative-plural ending -ait-ės, e.g. žydaitės | |||
// o conjugation | |||
'ome' // mokome | |||
'ot' 'ote' // mokot, mokote | |||
// past tense | |||
// o conjugation | |||
'{e.}jo' '{e.}josi' // tikėjo, tikėjosi | |||
'ot{e.}s' // tikėjotės/bijotės | |||
// ė conjugation | |||
'eisi' // mokeisi | |||
'{e.}si' // mokėsi | |||
'{e.}m' '{e.}me' // mokėm, mokėme | |||
'{e.}m{e.}s' // mokėmės | |||
'{e.}t' '{e.}te' // mokėt, mokėte | |||
'{e.}t{e.}s' // mokėtės | |||
// frequentative past tense | |||
'ausi' // mokydavausi | |||
'om{e.}s' // mokydavomės/bijomės | |||
// future tense | |||
'siu' 'siuosi' // dirbsiu, mokysiuosi | |||
'si' 'siesi' // dirbsi, dirbsiesi | |||
's' 'ysis' // dirbs, mokysis | |||
'sim' 'sime' // dirbsim, dirbsime | |||
'sit' 'site' // gersit, gersite | |||
// subjunctive mood | |||
'{c*}iau' '{c*}iausi' // dirbčiau | |||
'tum' 'tumei' // dirbtum, dirbtumei | |||
'tumeis' 'tumeisi' // mokytumeis, mokytumeisi | |||
// 't{u'}' omitted because it wrongly gives batutų -> batų | |||
't{u'}si' // mokytųsi | |||
// 'tume' conflicts with 'šventume' | |||
'tum{e.}m' // dirbtumėm | |||
'tum{e.}me' // dirbtumėme | |||
'tum{e.}m{e.}s' // mokytumėmės | |||
'tute' 'tum{e.}t' // dirbtute, dirbtumėt | |||
'tum{e.}te' // dirbtumėte | |||
'tum{e.}t{e.}s' // mokytumėtės | |||
// imperative mood | |||
'k' 'ki' // dirbk, dirbki, mokykis | |||
// 'kis' conflicts with viln-išk-is | |||
// 'kime' conflicts because of pirkime | |||
'kim{e.}s' // mokykimės | |||
// infinitive | |||
'uoti' 'iuoti' // meluoti, dygsniuoti | |||
'auti' 'iauti' // draugauti, girtuokliauti | |||
'oti' 'ioti' // dovanoti, meškerioti | |||
'{e.}ti' // auklėti | |||
'yti' // akyti | |||
'inti' // auginti | |||
'in{e.}ti' // blusinėti | |||
'enti' // gyventi | |||
'tel{e.}ti' // bumbtelėti | |||
'ter{e.}ti' // bumbterėti | |||
'ti' // skalbti | |||
// 'tis' conflicts because rytme-tis -> rytme | |||
// participles | |||
'{a'}s' 'i{a'}s' '{i'}s' // dirbąs, žaidžiąs, gulįs | |||
't{u'}s' // suktųs -> suk | |||
'sim{e.}s' // suksimės | |||
'sit{e.}s' // suksitės | |||
'kite' // supkite | |||
) | |||
delete | |||
) | |||
// step2: repeatedly removes derivational suffixes. On each round the longest | |||
// listed suffix is matched with backward movement limited to the p1 mark and | |||
// deleted; the loop ends when nothing matches. Commented-out entries were | |||
// left out because of the clashing words named beside them. | |||
define step2 as repeat ( | |||
setlimit tomark p1 for ([substring]) among ( | |||
// Noun suffixes | |||
// Adjective suffixes | |||
// 'in' // conflicts with 'augintinis' and 'akiniais' // lauk-in-is | |||
'ing' // tvark-ing-as | |||
'i{s*}k' // lenk-išk-as | |||
'{e.}t' // dem-ėt-as | |||
'ot' // garban-ot-as | |||
'uot' 'iuot' // lang-uot-as, akin-iuot-as | |||
// 'tin' omitted because of augintinis // dirb-tin-is | |||
// 'ut' omitted because of batutas, degutas etc. // maž-ut-is | |||
'yt' // maž-yt-is | |||
'iuk' // maž-iuk-as | |||
'iul' // maž-ul-is | |||
'{e.}l' // maž-ėl-is | |||
'yl' // maž-yl-is | |||
'u{c*}iuk' // maž-učiuk-as | |||
'uliuk' // maž-uliuk-as | |||
'ut{e.}ait' // maž-utėlait-is | |||
'ok' // did-ok-as | |||
'iok' // višč-iok-as | |||
'sv' '{s*}v' 'zgan' // sal-sv-as, pilk-šv-as, bal-zgan-as | |||
'op' 'iop' // dvej-op-as, viener-iop-as | |||
'ain' // apval-ain-as | |||
'yk{s*}t' 'yk{s*}{c*}' // ten-ykšt-is, vakar-ykšč-ias | |||
// degrees of comparison | |||
'esn' // did-esn-is | |||
'aus' 'iaus' // nauj-aus-ias, ger-iaus-ias | |||
// Pronominal adjectives | |||
// masculine gender | |||
'ias' // žaliasis | |||
'oj' 'ioj' // gerojo, žaliojo | |||
'aj' 'iaj' // gerajam, žaliajam | |||
'{a'}j' 'i{a'}j' // garąjį, žaliąjį | |||
'uoj' 'iuoj' // geruoju, žaliuoju | |||
'iej' // gerieji | |||
'{u'}j' 'i{u'}j' // gerųjų, žaliųjų | |||
'ies' // geriesiems | |||
'uos' 'iuos' // geruosius, žaliuosius | |||
'ais' 'iais' // geraisiais, žaliaisiais | |||
// feminine gender | |||
'os' 'ios' // gerosios, žaliosios | |||
'{a'}s' 'i{a'}s' // gerąsios, žaliąsias | |||
// frequentative past tense | |||
'dav' // ei-dav-o | |||
// participle suffixes | |||
'ant' 'iant' | |||
'int' // tur-int-is | |||
'{e.}j' // tur-ėj-o | |||
'{e'}' // | |||
'{e.}j{e'}' | |||
'{e'}s' // dirb-ęs-is | |||
'siant' // dirb-siant | |||
// half-participles | |||
'dam' // bėg-dam-as | |||
'auj' // ūkinink-auj-a | |||
'jam' | |||
'iau' | |||
'am' // baiminim-ams-i | |||
) | |||
delete | |||
) | |||
// fix_conflicts: rewrites endings whose removal would otherwise clash with a | |||
// verb ending. The longest matching ending is replaced by the form in its | |||
// action; no region test is applied. | |||
define fix_conflicts as ( | |||
[substring] among ( | |||
// 'lietuvaite' -> 'lietuvaitė', conflicts with 'myl-ite' | |||
'aite' (<-'ait{e.}') | |||
// 'lietuvaitės' -> 'lietuvaitė', conflicts with 'myl-itės' | |||
'ait{e.}s' (<-'ait{e.}') | |||
// 'ūs-uotės' -> 'ūs-uotė', conflicts with 'mokotės' | |||
'uot{e.}s' (<-'uot{e.}') | |||
// 'ūs-uote' -> 'ūs-uotė', conflicts with 'mokote' | |||
'uote' (<-'uot{e.}') | |||
// 'žerėjime' -> 'žėrėjimas', conflicts with 'žais-ime' | |||
'{e.}jime' (<-'{e.}jimas') | |||
// 'žvilgesiu' -> 'žvilgesys', conflicts with 'dirb-siu' | |||
'esiu' (<-'esys') | |||
// 'duobkasiu' -> 'duobkasys', conflicts with 'pakasiu' | |||
'asius' (<-'asys') | |||
// 'žioravime' -> 'žioravimas', conflicts with 'myl-ime' | |||
'avime' (<-'avimas') | |||
'ojime' (<-'ojimas') | |||
// 'advokatės' -> 'advokatė', conflicts with 'dirb-atės' | |||
'okat{e.}s' (<-'okat{e.}') | |||
// 'advokate' -> 'advokatė', conflicts with 'dirb-ate' | |||
'okate' (<-'okat{e.}') | |||
) | |||
) | |||
// fix_chdz: a final c-caron becomes 't' and a final 'd' + z-caron becomes 'd'. | |||
define fix_chdz as ( | |||
    [substring] among ( | |||
        'd{z*}' (<-'d') | |||
        '{c*}'  (<-'t') | |||
    ) | |||
) | |||
// fix_gd: a final 'gd' is reduced to 'g'. | |||
define fix_gd as ( | |||
    [substring] among ( | |||
        'gd' ( <-'g' ) | |||
        // '{e.}k' (<-'{e.}g') | |||
    ) | |||
) | |||
) | |||
// stem: external entry point. p1 defaults to the end of the word and is then | |||
// placed after the first vowel/non-vowel pair; the fix-up and removal steps | |||
// run in backward mode, each inside `do` so that none of them can fail stem. | |||
define stem as ( | |||
    $p1 = limit | |||
    do ( | |||
        // prefix 'a' in words longer than 6 letters, e.g. 'a-liejus': | |||
        // skip it before looking for the vowel | |||
        try ( test 'a'   $(len > 6)   hop 1 ) | |||
        gopast v   gopast non-v   setmark p1 | |||
    ) | |||
    backwards ( | |||
        do fix_conflicts   do step1 | |||
        do fix_chdz        do step2 | |||
        do fix_chdz        do fix_gd | |||
    ) | |||
) |
@@ -0,0 +1,92 @@ | |||
/* | |||
* Authors: | |||
* - Ingroj Shrestha <ing.stha@gmail.com>, Nepali NLP Group | |||
* - Oleg Bartunov <obartunov@gmail.com>, Postgres Professional Ltd. | |||
* - Shreeya Singh Dhakal, Nepali NLP Group | |||
*/ | |||
routines ( | |||
remove_category_1 | |||
check_category_2 | |||
remove_category_2 | |||
remove_category_3 | |||
) | |||
stringescapes {} | |||
stringdef dsc '{U+0901}' // DEVANAGARI_SIGN_CANDRABINDU | |||
stringdef dsa '{U+0902}' // DEVANAGARI_SIGN_ANUSVARA | |||
stringdef dli '{U+0907}' // DEVANAGARI_LETTER_I | |||
stringdef dlii '{U+0908}' // DEVANAGARI_LETTER_II | |||
stringdef dle '{U+090F}' // DEVANAGARI_LETTER_E | |||
stringdef dlka '{U+0915}' // DEVANAGARI_LETTER_KA | |||
stringdef dlkha '{U+0916}' // DEVANAGARI_LETTER_KHA | |||
stringdef dlg '{U+0917}' // DEVANAGARI_LETTER_GA | |||
stringdef dlc '{U+091B}' // DEVANAGARI_LETTER_CHA | |||
stringdef dlta '{U+0924}' // DEVANAGARI_LETTER_TA | |||
stringdef dltha '{U+0925}' // DEVANAGARI_LETTER_THA | |||
stringdef dld '{U+0926}' // DEVANAGARI_LETTER_DA | |||
stringdef dln '{U+0928}' // DEVANAGARI_LETTER_NA | |||
stringdef dlpa '{U+092A}' // DEVANAGARI_LETTER_PA | |||
stringdef dlpha '{U+092B}' // DEVANAGARI_LETTER_PHA | |||
stringdef dlb '{U+092D}' // DEVANAGARI_LETTER_BHA | |||
stringdef dlm '{U+092E}' // DEVANAGARI_LETTER_MA | |||
stringdef dly '{U+092F}' // DEVANAGARI_LETTER_YA | |||
stringdef dlr '{U+0930}' // DEVANAGARI_LETTER_RA | |||
stringdef dll '{U+0932}' // DEVANAGARI_LETTER_LA | |||
stringdef dlv '{U+0935}' // DEVANAGARI_LETTER_VA | |||
stringdef dls '{U+0938}' // DEVANAGARI_LETTER_SA | |||
stringdef dlh '{U+0939}' // DEVANAGARI_LETTER_HA | |||
stringdef dvsaa '{U+093E}' // DEVANAGARI_VOWEL_SIGN_AA | |||
stringdef dvsi '{U+093F}' // DEVANAGARI_VOWEL_SIGN_I | |||
stringdef dvsii '{U+0940}' // DEVANAGARI_VOWEL_SIGN_II | |||
stringdef dvsu '{U+0941}' // DEVANAGARI_VOWEL_SIGN_U | |||
stringdef dvsuu '{U+0942}' // DEVANAGARI_VOWEL_SIGN_UU | |||
stringdef dvse '{U+0947}' // DEVANAGARI_VOWEL_SIGN_E | |||
stringdef dvsai '{U+0948}' // DEVANAGARI_VOWEL_SIGN_AI | |||
stringdef dvso '{U+094B}' // DEVANAGARI_VOWEL_SIGN_O | |||
stringdef dvsau '{U+094C}' // DEVANAGARI_VOWEL_SIGN_AU | |||
stringdef dsv '{U+094D}' // DEVANAGARI_SIGN_VIRAMA | |||
externals ( stem ) | |||
backwardmode ( | |||
// remove_category_1: matches the longest listed ending. Endings in the first | |||
// group are always deleted. Endings in the second group (KA + vowel sign) are | |||
// left alone when preceded by letter E or vowel sign E (the empty `()` does | |||
// nothing) and are deleted otherwise. | |||
define remove_category_1 as( | |||
[substring] among ( | |||
'{dlm}{dvsaa}{dlr}{dsv}{dlpha}{dlta}' '{dld}{dsv}{dlv}{dvsaa}{dlr}{dvsaa}' '{dls}{dsc}{dlg}{dvsai}' '{dls}{dsa}{dlg}' | |||
'{dls}{dsc}{dlg}' '{dll}{dvsaa}{dli}' '{dll}{dvsaa}{dlii}' '{dlpa}{dlc}{dvsi}' | |||
'{dll}{dvse}' '{dlr}{dlta}' '{dlm}{dvsai}' '{dlm}{dvsaa}' | |||
(delete) | |||
'{dlka}{dvso}' '{dlka}{dvsaa}' '{dlka}{dvsi}' '{dlka}{dvsii}' '{dlka}{dvsai}'(('{dle}' or '{dvse}' ()) or delete) | |||
) | |||
) | |||
// check_category_2: succeeds when the word ends in candrabindu, anusvara or | |||
// vowel sign AI; it only brackets the match and changes nothing. | |||
define check_category_2 as ( | |||
    [substring] among ( '{dsc}'   '{dsa}'   '{dvsai}' ) | |||
) | |||
// remove_category_2: deletes a final candrabindu/anusvara only when one of the | |||
// four listed strings precedes it, and a final vowel sign AI only when it is | |||
// preceded by TA + virama + RA. Only the bracketed final sign is deleted. | |||
define remove_category_2 as ( | |||
[substring] among( | |||
'{dsc}' '{dsa}' ('{dly}{dvsau}' or '{dlc}{dvsau}' or '{dln}{dvsau}' or '{dltha}{dvse}' delete) | |||
'{dvsai}' ('{dlta}{dsv}{dlr}' delete) | |||
) | |||
) | |||
// remove_category_3: deletes the longest ending found in the list below. All | |||
// entries share the single (delete) action; the routine fails when no entry | |||
// matches. | |||
define remove_category_3 as( | |||
[substring] among( | |||
'{dltha}{dvsi}{dli}{dls}{dsv}' '{dlh}{dvsu}{dln}{dvse}{dlc}' '{dlh}{dvsu}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dls}{dsv}' '{dln}{dvse}{dlc}{dln}{dsv}' '{dli}{dle}{dlka}{dvsii}' '{dli}{dle}{dlka}{dvsaa}' '{dli}{dle}{dlka}{dvso}' '{dvsi}{dle}{dlka}{dvsii}' '{dvsi}{dle}{dlka}{dvsaa}' '{dvsi}{dle}{dlka}{dvso}' '{dli}{dlc}{dln}{dsv}' '{dvsi}{dlc}{dln}{dsv}' '{dli}{dlc}{dls}{dsv}' '{dvsi}{dlc}{dls}{dsv}' '{dle}{dlc}{dln}{dsv}' '{dvse}{dlc}{dln}{dsv}' '{dle}{dlc}{dls}{dsv}' '{dvse}{dlc}{dls}{dsv}' '{dlc}{dvsi}{dln}{dsv}' '{dlc}{dvse}{dls}{dsv}' '{dlc}{dsv}{dly}{dvsau}' '{dltha}{dvsi}{dln}{dsv}' '{dltha}{dvsi}{dly}{dvso}' '{dltha}{dvsi}{dly}{dvsau}' '{dltha}{dvsi}{dls}{dsv}' '{dltha}{dsv}{dly}{dvso}' '{dltha}{dsv}{dly}{dvsau}' '{dld}{dvsi}{dly}{dvso}' '{dld}{dvse}{dlkha}{dvsi}' '{dld}{dvse}{dlkha}{dvsii}' '{dll}{dvsaa}{dln}{dsv}' '{dlm}{dvsaa}{dltha}{dvsi}' '{dln}{dvse}{dlka}{dvsai}' '{dln}{dvse}{dlka}{dvsaa}' '{dln}{dvse}{dlka}{dvso}' '{dln}{dvse}{dlc}{dvsau}' '{dlh}{dvso}{dls}{dsv}' '{dli}{dln}{dsv}{dlc}' '{dvsi}{dln}{dsv}{dlc}' '{dln}{dvse}{dlc}{dvsu}' '{dli}{dlc}{dvsau}' '{dvsi}{dlc}{dvsau}' '{dli}{dls}{dsv}' '{dvsi}{dls}{dsv}' '{dvsi}{dly}{dvso}' '{dli}{dly}{dvso}' '{dle}{dlka}{dvsaa}' '{dvse}{dlka}{dvsaa}' '{dle}{dlka}{dvsii}' '{dvse}{dlka}{dvsii}' '{dle}{dlka}{dvsai}' '{dvse}{dlka}{dvsai}' '{dle}{dlka}{dvso}' '{dvse}{dlka}{dvso}' '{dle}{dlc}{dvsu}' '{dvse}{dlc}{dvsu}' '{dle}{dlc}{dvsau}' '{dvse}{dlc}{dvsau}' '{dlc}{dln}{dsv}' '{dlc}{dls}{dsv}' '{dltha}{dvsi}{dle}' '{dlpa}{dlr}{dsv}' '{dlb}{dly}{dvso}' '{dlh}{dlr}{dvsu}' '{dlh}{dlr}{dvsuu}' '{dvsi}{dld}{dvsaa}' '{dli}{dld}{dvsaa}' '{dvsi}{dld}{dvso}' '{dli}{dld}{dvso}' '{dvsi}{dld}{dvsai}' '{dli}{dld}{dvsai}' '{dln}{dvse}{dlc}' '{dli}{dlc}' '{dvsi}{dlc}' '{dle}{dlc}' '{dvse}{dlc}' '{dlc}{dvsu}' '{dlc}{dvse}' '{dlc}{dvsau}' '{dltha}{dvsii}' '{dltha}{dvse}' '{dld}{dvsaa}' '{dld}{dvsii}' '{dld}{dvsai}' '{dld}{dvso}' '{dln}{dvsu}' '{dln}{dvse}' '{dly}{dvso}' '{dly}{dvsau}' '{dlc}' | |||
(delete) | |||
) | |||
) | |||
) | |||
// stem: external entry point; all work happens in backward mode. Category 1 | |||
// is tried once. Then, for as long as remove_category_3 keeps succeeding, | |||
// each round first attempts the category 2 check-and-remove pair. | |||
define stem as ( | |||
    backwards ( | |||
        do remove_category_1 | |||
        do ( | |||
            repeat ( | |||
                do ( check_category_2 and remove_category_2 ) | |||
                remove_category_3 | |||
            ) | |||
        ) | |||
    ) | |||
) |
@@ -13,11 +13,11 @@ groupings ( v s_ending ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef ae hex 'E6' | |||
stringdef ao hex 'E5' | |||
stringdef o/ hex 'F8' | |||
stringdef ae '{U+00E6}' | |||
stringdef ao '{U+00E5}' | |||
stringdef o/ '{U+00F8}' | |||
define v 'aeiouy{ae}{ao}{o/}' | |||
@@ -1,80 +0,0 @@ | |||
routines ( | |||
mark_regions | |||
main_suffix | |||
consonant_pair | |||
other_suffix | |||
) | |||
externals ( stem ) | |||
integers ( p1 x ) | |||
groupings ( v s_ending ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef ae hex '91' | |||
stringdef ao hex '86' | |||
stringdef o/ hex '9B' | |||
define v 'aeiouy{ae}{ao}{o/}' | |||
define s_ending 'bcdfghjlmnoprtvyz' | |||
// mark_regions: sets p1 after the first non-vowel that follows a vowel, but | |||
// never earlier than three characters into the word. Fails when the word is | |||
// shorter than three characters or the vowel/non-vowel pattern is absent. | |||
define mark_regions as ( | |||
    $p1 = limit | |||
    test ( hop 3   setmark x )            // x = position after the third character | |||
    goto v   gopast non-v   setmark p1 | |||
    try ( $p1 < x   $p1 = x )             // raise p1 to x when it falls short | |||
) | |||
backwardmode ( | |||
// main_suffix: with backward movement limited to the p1 mark, finds the | |||
// longest listed ending. First group: deleted. 's': deleted only when preceded | |||
// by an s_ending letter, or by 'k' that itself follows a non-vowel. | |||
// 'erte'/'ert': replaced by 'er'. | |||
define main_suffix as ( | |||
setlimit tomark p1 for ([substring]) | |||
among( | |||
'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' | |||
'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' | |||
'hetens' 'ers' 'ets' 'et' 'het' 'ast' | |||
(delete) | |||
's' | |||
(s_ending or ('k' non-v) delete) | |||
'erte' 'ert' | |||
(<-'er') | |||
) | |||
) | |||
// consonant_pair: when the word ends in 'dt' or 'vt' (searched no further back | |||
// than the p1 mark), removes the last character only. `test` restores the | |||
// cursor to the end of the word, so `next]` brackets exactly one character. | |||
define consonant_pair as ( | |||
test ( | |||
setlimit tomark p1 for ([substring]) | |||
among( | |||
'dt' 'vt' | |||
) | |||
) | |||
next] delete | |||
) | |||
// other_suffix: deletes the longest of the listed endings, searching no | |||
// further back than the p1 mark. | |||
define other_suffix as ( | |||
    setlimit tomark p1 for ( [substring] ) | |||
    among( | |||
        'leg' 'eleg' | |||
        'ig' 'eig' 'lig' 'elig' | |||
        'els' | |||
        'lov' 'elov' 'slov' 'hetslov' | |||
        (delete) | |||
    ) | |||
) | |||
) | |||
// stem: external entry point. p1 is computed first; the three suffix routines | |||
// then run once each, in this order, in backward mode. | |||
define stem as ( | |||
    do mark_regions | |||
    backwards ( do main_suffix   do consonant_pair   do other_suffix ) | |||
) |
@@ -15,20 +15,20 @@ groupings ( v ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef a' hex 'E1' // a-acute | |||
stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico | |||
stringdef e' hex 'E9' // e-acute | |||
stringdef e^ hex 'EA' // e-circumflex | |||
stringdef i' hex 'ED' // i-acute | |||
stringdef o^ hex 'F4' // o-circumflex | |||
stringdef o' hex 'F3' // o-acute | |||
stringdef u' hex 'FA' // u-acute | |||
stringdef c, hex 'E7' // c-cedilla | |||
stringdef a' '{U+00E1}' // a-acute | |||
stringdef a^ '{U+00E2}' // a-circumflex e.g. 'bota^nico | |||
stringdef e' '{U+00E9}' // e-acute | |||
stringdef e^ '{U+00EA}' // e-circumflex | |||
stringdef i' '{U+00ED}' // i-acute | |||
stringdef o^ '{U+00F4}' // o-circumflex | |||
stringdef o' '{U+00F3}' // o-acute | |||
stringdef u' '{U+00FA}' // u-acute | |||
stringdef c, '{U+00E7}' // c-cedilla | |||
stringdef a~ hex 'E3' // a-tilde | |||
stringdef o~ hex 'F5' // o-tilde | |||
stringdef a~ '{U+00E3}' // a-tilde | |||
stringdef o~ '{U+00F5}' // o-tilde | |||
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' | |||
@@ -92,12 +92,12 @@ backwardmode ( | |||
( | |||
R2 delete | |||
) | |||
'log{i'}a' | |||
'log{i'}as' | |||
'logia' | |||
'logias' | |||
( | |||
R2 <- 'log' | |||
) | |||
'uci{o'}n' 'uciones' | |||
'u{c,}a~o' 'u{c,}o~es' | |||
( | |||
R2 <- 'u' | |||
) |
@@ -1,218 +0,0 @@ | |||
routines ( | |||
prelude postlude mark_regions | |||
RV R1 R2 | |||
standard_suffix | |||
verb_suffix | |||
residual_suffix | |||
residual_form | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef a' hex 'A0' // a-acute | |||
stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico | |||
stringdef e' hex '82' // e-acute | |||
stringdef e^ hex '88' // e-circumflex | |||
stringdef i' hex 'A1' // i-acute | |||
stringdef o^ hex '93' // o-circumflex | |||
stringdef o' hex 'A2' // o-acute | |||
stringdef u' hex 'A3' // u-acute | |||
stringdef c, hex '87' // c-cedilla | |||
stringdef a~ hex 'C6' // a-tilde | |||
stringdef o~ hex 'E4' // o-tilde | |||
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' | |||
// prelude: rewrites a-tilde and o-tilde as the two-character sequences 'a~' | |||
// and 'o~'; every other character is stepped over. | |||
define prelude as repeat ( | |||
    [substring] among( | |||
        '{o~}' (<- 'o~') | |||
        '{a~}' (<- 'a~') | |||
        ''     (next) | |||
    ) //or next | |||
) | |||
// mark_regions: sets pV, p1 and p2. Any mark whose pattern is not found stays | |||
// at the end of the word. | |||
define mark_regions as ( | |||
    $pV = limit   $p1 = limit   $p2 = limit   // defaults | |||
    // pV is chosen from the first two letters: | |||
    //   vowel, non-vowel      -> after the next vowel | |||
    //   vowel, vowel          -> after the next non-vowel | |||
    //   non-vowel, non-vowel  -> after the next vowel | |||
    //   non-vowel, vowel      -> after the third letter | |||
    do ( | |||
        ( v (non-v gopast v) or (v gopast non-v) ) | |||
        or | |||
        ( non-v (non-v gopast v) or (v next) ) | |||
        setmark pV | |||
    ) | |||
    // p1: after the first vowel/non-vowel pair; p2: after the next pair | |||
    do ( | |||
        gopast v   gopast non-v   setmark p1 | |||
        gopast v   gopast non-v   setmark p2 | |||
    ) | |||
) | |||
// postlude: turns the two-character sequences 'a~' and 'o~' back into | |||
// a-tilde and o-tilde; every other character is stepped over. | |||
define postlude as repeat ( | |||
    [substring] among( | |||
        'o~' (<- '{o~}') | |||
        'a~' (<- '{a~}') | |||
        ''   (next) | |||
    ) //or next | |||
) | |||
backwardmode ( | |||
// Region tests: each holds while the cursor is at or after the named mark. | |||
define RV as ( $pV <= cursor ) | |||
define R1 as ( $p1 <= cursor ) | |||
define R2 as ( $p2 <= cursor ) | |||
// standard_suffix: matches the longest listed suffix and applies the action of | |||
// its group. Every action begins with a region test (R1, R2 or RV); when that | |||
// test fails the routine fails. The `try` parts strip one further preceding | |||
// suffix when present. Tilde vowels appear here as 'a~' / 'o~' because the | |||
// prelude has already rewritten them. | |||
define standard_suffix as ( | |||
    [substring] among( | |||
        'eza' 'ezas' | |||
        'ico' 'ica' 'icos' 'icas' | |||
        'ismo' 'ismos' | |||
        '{a'}vel' | |||
        '{i'}vel' | |||
        'ista' 'istas' | |||
        'oso' 'osa' 'osos' 'osas' | |||
        'amento' 'amentos' | |||
        'imento' 'imentos' | |||
        'adora' 'ador' 'a{c,}a~o' | |||
        'adoras' 'adores' 'a{c,}o~es' // no -ic test | |||
        'ante' 'antes' '{a^}ncia' // Note 1 | |||
        ( | |||
            R2 delete | |||
        ) | |||
        // Portuguese spelling (no accent); the accented 'log{i'}a' forms | |||
        // previously listed here are the Spanish spelling | |||
        'logia' | |||
        'logias' | |||
        ( | |||
            R2 <- 'log' | |||
        ) | |||
        // Portuguese -u{c,}a~o / -u{c,}o~es; the 'uci{o'}n' 'uciones' | |||
        // entries previously listed here are Spanish endings | |||
        'u{c,}a~o' 'u{c,}o~es' | |||
        ( | |||
            R2 <- 'u' | |||
        ) | |||
        '{e^}ncia' '{e^}ncias' | |||
        ( | |||
            R2 <- 'ente' | |||
        ) | |||
        'amente' | |||
        ( | |||
            R1 delete | |||
            try ( | |||
                // the R2 test and delete run before the matched string's own action | |||
                [substring] R2 delete among( | |||
                    'iv' (['at'] R2 delete) | |||
                    'os' | |||
                    'ic' | |||
                    'ad' | |||
                ) | |||
            ) | |||
        ) | |||
        'mente' | |||
        ( | |||
            R2 delete | |||
            try ( | |||
                [substring] among( | |||
                    'ante' // Note 1 | |||
                    'avel' | |||
                    '{i'}vel' (R2 delete) | |||
                ) | |||
            ) | |||
        ) | |||
        'idade' | |||
        'idades' | |||
        ( | |||
            R2 delete | |||
            try ( | |||
                [substring] among( | |||
                    'abil' | |||
                    'ic' | |||
                    'iv' (R2 delete) | |||
                ) | |||
            ) | |||
        ) | |||
        'iva' 'ivo' | |||
        'ivas' 'ivos' | |||
        ( | |||
            R2 delete | |||
            try ( | |||
                ['at'] R2 delete // but not a further ['ic'] R2 delete | |||
            ) | |||
        ) | |||
        'ira' 'iras' | |||
        ( | |||
            RV 'e' // -eira -eiras usually non-verbal | |||
            <- 'ir' | |||
        ) | |||
    ) | |||
) | |||
// verb_suffix: with backward movement limited to the pV mark, deletes the | |||
// longest verb ending from the list. Tilde vowels are written 'a~' here, | |||
// matching what the prelude produces. | |||
define verb_suffix as setlimit tomark pV for ( | |||
[substring] among( | |||
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' | |||
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' | |||
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' | |||
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' | |||
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' | |||
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' | |||
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' | |||
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' | |||
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' | |||
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' | |||
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' | |||
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' | |||
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' | |||
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' | |||
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' | |||
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' | |||
'ira' 'iras' | |||
(delete) | |||
) | |||
) | |||
// residual_suffix: deletes a final 'os' or a final a/i/o (plain or acute) | |||
// when the ending lies in RV. | |||
define residual_suffix as ( | |||
    [substring] among( | |||
        'a' 'i' 'o' | |||
        '{a'}' '{i'}' '{o'}' | |||
        'os' | |||
        ( RV delete ) | |||
    ) | |||
) | |||
// residual_form: final clean-up. | |||
// - a final e / e-acute / e-circumflex in RV is deleted; the action then goes | |||
// on to require a preceding 'u' after 'g' or 'i' after 'c', lying in RV, | |||
// which is deleted as well. If that second part does not apply the | |||
// routine signals failure, but the first deletion has already been made. | |||
// - a final c-cedilla is replaced by 'c'. | |||
define residual_form as ( | |||
[substring] among( | |||
'e' '{e'}' '{e^}' | |||
( RV delete [('u'] test 'g') or | |||
('i'] test 'c') RV delete ) | |||
'{c,}' (<-'c') | |||
) | |||
) | |||
) | |||
// stem: external entry point. prelude and mark_regions run forwards; the | |||
// suffix steps run in backward mode; postlude restores the tilde vowels. | |||
define stem as ( | |||
do prelude | |||
do mark_regions | |||
backwards ( | |||
do ( | |||
// standard_suffix, else verb_suffix; when either succeeds, additionally | |||
// try deleting a final 'i' preceded by 'c' (in RV). `and` starts both | |||
// parts from the same cursor position. | |||
( ( standard_suffix or verb_suffix ) | |||
and do ( ['i'] test 'c' RV delete ) | |||
) | |||
// residual_suffix is attempted only when neither of the above succeeded | |||
or residual_suffix | |||
) | |||
do residual_form | |||
) | |||
do postlude | |||
) | |||
/* | |||
Note 1: additions of 15 Jun 2005 | |||
*/ |
@@ -20,11 +20,11 @@ stringescapes {} | |||
/* special characters */ | |||
stringdef a^ hex 'E2' // a circumflex | |||
stringdef i^ hex 'EE' // i circumflex | |||
stringdef a+ hex 'E3' // a breve | |||
stringdef s, hex 'BA' // s cedilla | |||
stringdef t, hex 'FE' // t cedilla | |||
stringdef a^ '{U+00E2}' // a circumflex | |||
stringdef i^ '{U+00EE}' // i circumflex | |||
stringdef a+ '{U+0103}' // a breve | |||
stringdef s, '{U+015F}' // s cedilla | |||
stringdef t, '{U+0163}' // t cedilla | |||
define v 'aeiou{a^}{i^}{a+}' | |||
@@ -1,236 +0,0 @@ | |||
routines ( | |||
prelude postlude mark_regions | |||
RV R1 R2 | |||
step_0 | |||
standard_suffix combo_suffix | |||
verb_suffix | |||
vowel_suffix | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v ) | |||
booleans ( standard_suffix_removed ) | |||
stringescapes {} | |||
/* special characters */ | |||
stringdef a^ hex '0E2' // a circumflex | |||
stringdef i^ hex '0EE' // i circumflex | |||
stringdef a+ hex '103' // a breve | |||
stringdef s, hex '15F' // s cedilla | |||
stringdef t, hex '163' // t cedilla | |||
define v 'aeiou{a^}{i^}{a+}' | |||
define prelude as ( | |||
repeat goto ( | |||
v [ ('u' ] v <- 'U') or | |||
('i' ] v <- 'I') | |||
) | |||
) | |||
define mark_regions as ( | |||
$pV = limit | |||
$p1 = limit | |||
$p2 = limit // defaults | |||
do ( | |||
( v (non-v gopast v) or (v gopast non-v) ) | |||
or | |||
( non-v (non-v gopast v) or (v next) ) | |||
setmark pV | |||
) | |||
do ( | |||
gopast v gopast non-v setmark p1 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
define postlude as repeat ( | |||
[substring] among( | |||
'I' (<- 'i') | |||
'U' (<- 'u') | |||
'' (next) | |||
) | |||
) | |||
backwardmode ( | |||
define RV as $pV <= cursor | |||
define R1 as $p1 <= cursor | |||
define R2 as $p2 <= cursor | |||
define step_0 as ( | |||
[substring] R1 among( | |||
'ul' 'ului' | |||
( delete ) | |||
'aua' | |||
( <-'a' ) | |||
'ea' 'ele' 'elor' | |||
( <-'e' ) | |||
'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' | |||
( <-'i') | |||
'ile' | |||
( not 'ab' <- 'i' ) | |||
'atei' | |||
( <- 'at' ) | |||
'a{t,}ie' 'a{t,}ia' | |||
( <- 'a{t,}i' ) | |||
) | |||
) | |||
define combo_suffix as test ( | |||
[substring] R1 ( | |||
among( | |||
/* 'IST'. alternative: include the following | |||
'alism' 'alisme' | |||
'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( | |||
<- 'al' | |||
) | |||
*/ | |||
'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( | |||
<- 'abil' | |||
) | |||
'ibilitate' ( | |||
<- 'ibil' | |||
) | |||
'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( | |||
<- 'iv' | |||
) | |||
'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' | |||
'icator' 'icatori' | |||
'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' | |||
'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( | |||
<- 'ic' | |||
) | |||
'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' | |||
'atoare' 'ator' 'atori' | |||
'{a+}toare' '{a+}tor' '{a+}tori' ( | |||
<- 'at' | |||
) | |||
'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' | |||
'itoare' 'itor' 'itori' ( | |||
<- 'it' | |||
) | |||
) | |||
set standard_suffix_removed | |||
) | |||
) | |||
define standard_suffix as ( | |||
unset standard_suffix_removed | |||
repeat combo_suffix | |||
[substring] R2 ( | |||
among( | |||
// past participle is treated here, rather than | |||
// as a verb ending: | |||
'at' 'ata' 'at{a+}' 'ati' 'ate' | |||
'ut' 'uta' 'ut{a+}' 'uti' 'ute' | |||
'it' 'ita' 'it{a+}' 'iti' 'ite' | |||
'ic' 'ica' 'ice' 'ici' 'ic{a+}' | |||
'abil' 'abila' 'abile' 'abili' 'abil{a+}' | |||
'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' | |||
'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' | |||
'ant' 'anta' 'ante' 'anti' 'ant{a+}' | |||
'ator' 'atori' | |||
'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' | |||
'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( | |||
delete | |||
) | |||
'iune' 'iuni' ( | |||
'{t,}'] <- 't' | |||
) | |||
'ism' 'isme' | |||
'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( | |||
<- 'ist' | |||
/* 'IST'. alternative: remove with <- '' */ | |||
) | |||
) | |||
set standard_suffix_removed | |||
) | |||
) | |||
define verb_suffix as setlimit tomark pV for ( | |||
[substring] among( | |||
// 'long' infinitive: | |||
'are' 'ere' 'ire' '{a^}re' | |||
// gerund: | |||
'ind' '{a^}nd' | |||
'indu' '{a^}ndu' | |||
'eze' | |||
'easc{a+}' | |||
// present: | |||
'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' | |||
'e{s,}te' | |||
'{a+}sc' '{a+}{s,}ti' | |||
'{a+}{s,}te' | |||
// imperfect: | |||
'am' 'ai' 'au' | |||
'eam' 'eai' 'ea' 'ea{t,}i' 'eau' | |||
'iam' 'iai' 'ia' 'ia{t,}i' 'iau' | |||
// past: // (not 'ii') | |||
'ui' | |||
'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' | |||
'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' | |||
'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' | |||
'{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' | |||
// pluperfect: | |||
'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' | |||
'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' | |||
'{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' | |||
'{a^}ser{a+}' | |||
'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' | |||
( non-v or 'u' delete ) | |||
// present: | |||
'{a+}m' 'a{t,}i' | |||
'em' 'e{t,}i' | |||
'im' 'i{t,}i' | |||
'{a^}m' '{a^}{t,}i' | |||
// past: | |||
'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' | |||
'sei' 'se' | |||
// pluperfect: | |||
'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' | |||
(delete) | |||
) | |||
) | |||
define vowel_suffix as ( | |||
[substring] RV among ( | |||
'a' 'e' 'i' 'ie' '{a+}' ( delete ) | |||
) | |||
) | |||
) | |||
define stem as ( | |||
do prelude | |||
do mark_regions | |||
backwards ( | |||
do step_0 | |||
do standard_suffix | |||
do ( standard_suffix_removed or verb_suffix ) | |||
do vowel_suffix | |||
) | |||
do postlude | |||
) | |||
@@ -1,41 +1,41 @@ | |||
stringescapes {} | |||
/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented | |||
in Latin characters following the conventions of the standard Library | |||
of Congress transliteration: */ | |||
stringdef a hex 'C1' | |||
stringdef b hex 'C2' | |||
stringdef v hex 'D7' | |||
stringdef g hex 'C7' | |||
stringdef d hex 'C4' | |||
stringdef e hex 'C5' | |||
stringdef zh hex 'D6' | |||
stringdef z hex 'DA' | |||
stringdef i hex 'C9' | |||
stringdef i` hex 'CA' | |||
stringdef k hex 'CB' | |||
stringdef l hex 'CC' | |||
stringdef m hex 'CD' | |||
stringdef n hex 'CE' | |||
stringdef o hex 'CF' | |||
stringdef p hex 'D0' | |||
stringdef r hex 'D2' | |||
stringdef s hex 'D3' | |||
stringdef t hex 'D4' | |||
stringdef u hex 'D5' | |||
stringdef f hex 'C6' | |||
stringdef kh hex 'C8' | |||
stringdef ts hex 'C3' | |||
stringdef ch hex 'DE' | |||
stringdef sh hex 'DB' | |||
stringdef shch hex 'DD' | |||
stringdef " hex 'DF' | |||
stringdef y hex 'D9' | |||
stringdef ' hex 'D8' | |||
stringdef e` hex 'DC' | |||
stringdef iu hex 'C0' | |||
stringdef ia hex 'D1' | |||
/* the 33 Cyrillic letters represented in ASCII characters following the | |||
* conventions of the standard Library of Congress transliteration: */ | |||
stringdef a '{U+0430}' | |||
stringdef b '{U+0431}' | |||
stringdef v '{U+0432}' | |||
stringdef g '{U+0433}' | |||
stringdef d '{U+0434}' | |||
stringdef e '{U+0435}' | |||
stringdef e" '{U+0451}' | |||
stringdef zh '{U+0436}' | |||
stringdef z '{U+0437}' | |||
stringdef i '{U+0438}' | |||
stringdef i` '{U+0439}' | |||
stringdef k '{U+043A}' | |||
stringdef l '{U+043B}' | |||
stringdef m '{U+043C}' | |||
stringdef n '{U+043D}' | |||
stringdef o '{U+043E}' | |||
stringdef p '{U+043F}' | |||
stringdef r '{U+0440}' | |||
stringdef s '{U+0441}' | |||
stringdef t '{U+0442}' | |||
stringdef u '{U+0443}' | |||
stringdef f '{U+0444}' | |||
stringdef kh '{U+0445}' | |||
stringdef ts '{U+0446}' | |||
stringdef ch '{U+0447}' | |||
stringdef sh '{U+0448}' | |||
stringdef shch '{U+0449}' | |||
stringdef " '{U+044A}' | |||
stringdef y '{U+044B}' | |||
stringdef ' '{U+044C}' | |||
stringdef e` '{U+044D}' | |||
stringdef iu '{U+044E}' | |||
stringdef ia '{U+044F}' | |||
routines ( mark_regions R2 | |||
perfective_gerund | |||
@@ -200,6 +200,10 @@ backwardmode ( | |||
define stem as ( | |||
// Normalise {e"} to {e}. The documentation has long suggested the user | |||
// should do this before calling the stemmer - we now do it for them. | |||
do repeat ( goto (['{e"}']) <- '{e}' ) | |||
do mark_regions | |||
backwards setlimit tomark pV for ( | |||
do ( |
@@ -1,215 +0,0 @@ | |||
stringescapes {} | |||
/* the 32 Cyrillic letters in Unicode */ | |||
stringdef a hex '430' | |||
stringdef b hex '431' | |||
stringdef v hex '432' | |||
stringdef g hex '433' | |||
stringdef d hex '434' | |||
stringdef e hex '435' | |||
stringdef zh hex '436' | |||
stringdef z hex '437' | |||
stringdef i hex '438' | |||
stringdef i` hex '439' | |||
stringdef k hex '43A' | |||
stringdef l hex '43B' | |||
stringdef m hex '43C' | |||
stringdef n hex '43D' | |||
stringdef o hex '43E' | |||
stringdef p hex '43F' | |||
stringdef r hex '440' | |||
stringdef s hex '441' | |||
stringdef t hex '442' | |||
stringdef u hex '443' | |||
stringdef f hex '444' | |||
stringdef kh hex '445' | |||
stringdef ts hex '446' | |||
stringdef ch hex '447' | |||
stringdef sh hex '448' | |||
stringdef shch hex '449' | |||
stringdef " hex '44A' | |||
stringdef y hex '44B' | |||
stringdef ' hex '44C' | |||
stringdef e` hex '44D' | |||
stringdef iu hex '44E' | |||
stringdef ia hex '44F' | |||
routines ( mark_regions R2 | |||
perfective_gerund | |||
adjective | |||
adjectival | |||
reflexive | |||
verb | |||
noun | |||
derivational | |||
tidy_up | |||
) | |||
externals ( stem ) | |||
integers ( pV p2 ) | |||
groupings ( v ) | |||
define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' | |||
define mark_regions as ( | |||
$pV = limit | |||
$p2 = limit | |||
do ( | |||
gopast v setmark pV gopast non-v | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
backwardmode ( | |||
define R2 as $p2 <= cursor | |||
define perfective_gerund as ( | |||
[substring] among ( | |||
'{v}' | |||
'{v}{sh}{i}' | |||
'{v}{sh}{i}{s}{'}' | |||
('{a}' or '{ia}' delete) | |||
'{i}{v}' | |||
'{i}{v}{sh}{i}' | |||
'{i}{v}{sh}{i}{s}{'}' | |||
'{y}{v}' | |||
'{y}{v}{sh}{i}' | |||
'{y}{v}{sh}{i}{s}{'}' | |||
(delete) | |||
) | |||
) | |||
define adjective as ( | |||
[substring] among ( | |||
'{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' | |||
'{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' | |||
'{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' | |||
'{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' | |||
'{ia}{ia}' | |||
// and - | |||
'{o}{iu}' // - which is somewhat archaic | |||
'{e}{iu}' // - soft form of {o}{iu} | |||
(delete) | |||
) | |||
) | |||
define adjectival as ( | |||
adjective | |||
/* of the participle forms, em, vsh, ivsh, yvsh are readily removable. | |||
nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of | |||
errors. Removing im, uem, enn creates too many errors. | |||
*/ | |||
try ( | |||
[substring] among ( | |||
'{e}{m}' // present passive participle | |||
'{n}{n}' // adjective from past passive participle | |||
'{v}{sh}' // past active participle | |||
'{iu}{shch}' '{shch}' // present active participle | |||
('{a}' or '{ia}' delete) | |||
//but not '{i}{m}' '{u}{e}{m}' // present passive participle | |||
//or '{e}{n}{n}' // adjective from past passive participle | |||
'{i}{v}{sh}' '{y}{v}{sh}'// past active participle | |||
'{u}{iu}{shch}' // present active participle | |||
(delete) | |||
) | |||
) | |||
) | |||
define reflexive as ( | |||
[substring] among ( | |||
'{s}{ia}' | |||
'{s}{'}' | |||
(delete) | |||
) | |||
) | |||
define verb as ( | |||
[substring] among ( | |||
'{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' | |||
'{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' | |||
'{n}{y}' '{t}{'}' '{e}{sh}{'}' | |||
'{n}{n}{o}' | |||
('{a}' or '{ia}' delete) | |||
'{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' | |||
'{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' | |||
'{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' | |||
'{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' | |||
'{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' | |||
'{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' | |||
(delete) | |||
/* note the short passive participle tests: | |||
'{n}{a}' '{n}' '{n}{o}' '{n}{y}' | |||
'{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' | |||
*/ | |||
) | |||
) | |||
define noun as ( | |||
[substring] among ( | |||
'{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' | |||
'{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' | |||
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' | |||
'{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' | |||
'{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' | |||
'{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' | |||
(delete) | |||
/* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' | |||
'{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' | |||
omitted - they only occur on 12 words. | |||
*/ | |||
) | |||
) | |||
define derivational as ( | |||
[substring] R2 among ( | |||
'{o}{s}{t}' | |||
'{o}{s}{t}{'}' | |||
(delete) | |||
) | |||
) | |||
define tidy_up as ( | |||
[substring] among ( | |||
'{e}{i`}{sh}' | |||
'{e}{i`}{sh}{e}' // superlative forms | |||
(delete | |||
['{n}'] '{n}' delete | |||
) | |||
'{n}' | |||
('{n}' delete) // e.g. -nno endings | |||
'{'}' | |||
(delete) // with some slight false conflations | |||
) | |||
) | |||
) | |||
define stem as ( | |||
do mark_regions | |||
backwards setlimit tomark pV for ( | |||
do ( | |||
perfective_gerund or | |||
( try reflexive | |||
adjectival or verb or noun | |||
) | |||
) | |||
try([ '{i}' ] delete) | |||
// because noun ending -i{iu} is being treated as verb ending -{iu} | |||
do derivational | |||
do tidy_up | |||
) | |||
) |
@@ -16,15 +16,15 @@ groupings ( v ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
stringdef a' hex 'E1' // a-acute | |||
stringdef e' hex 'E9' // e-acute | |||
stringdef i' hex 'ED' // i-acute | |||
stringdef o' hex 'F3' // o-acute | |||
stringdef u' hex 'FA' // u-acute | |||
stringdef u" hex 'FC' // u-diaeresis | |||
stringdef n~ hex 'F1' // n-tilde | |||
/* special characters */ | |||
stringdef a' '{U+00E1}' // a-acute | |||
stringdef e' '{U+00E9}' // e-acute | |||
stringdef i' '{U+00ED}' // i-acute | |||
stringdef o' '{U+00F3}' // o-acute | |||
stringdef u' '{U+00FA}' // u-acute | |||
stringdef u" '{U+00FC}' // u-diaeresis | |||
stringdef n~ '{U+00F1}' // n-tilde | |||
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' | |||
@@ -1,230 +0,0 @@ | |||
routines ( | |||
postlude mark_regions | |||
RV R1 R2 | |||
attached_pronoun | |||
standard_suffix | |||
y_verb_suffix | |||
verb_suffix | |||
residual_suffix | |||
) | |||
externals ( stem ) | |||
integers ( pV p1 p2 ) | |||
groupings ( v ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef a' hex 'A0' // a-acute | |||
stringdef e' hex '82' // e-acute | |||
stringdef i' hex 'A1' // i-acute | |||
stringdef o' hex 'A2' // o-acute | |||
stringdef u' hex 'A3' // u-acute | |||
stringdef u" hex '81' // u-diaeresis | |||
stringdef n~ hex 'A4' // n-tilde | |||
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' | |||
define mark_regions as ( | |||
$pV = limit | |||
$p1 = limit | |||
$p2 = limit // defaults | |||
do ( | |||
( v (non-v gopast v) or (v gopast non-v) ) | |||
or | |||
( non-v (non-v gopast v) or (v next) ) | |||
setmark pV | |||
) | |||
do ( | |||
gopast v gopast non-v setmark p1 | |||
gopast v gopast non-v setmark p2 | |||
) | |||
) | |||
define postlude as repeat ( | |||
[substring] among( | |||
'{a'}' (<- 'a') | |||
'{e'}' (<- 'e') | |||
'{i'}' (<- 'i') | |||
'{o'}' (<- 'o') | |||
'{u'}' (<- 'u') | |||
// and possibly {u"}->u here, or in prelude | |||
'' (next) | |||
) //or next | |||
) | |||
backwardmode ( | |||
define RV as $pV <= cursor | |||
define R1 as $p1 <= cursor | |||
define R2 as $p2 <= cursor | |||
define attached_pronoun as ( | |||
[substring] among( | |||
'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' | |||
'las' 'les' 'los' 'nos' | |||
) | |||
substring RV among( | |||
'i{e'}ndo' (] <- 'iendo') | |||
'{a'}ndo' (] <- 'ando') | |||
'{a'}r' (] <- 'ar') | |||
'{e'}r' (] <- 'er') | |||
'{i'}r' (] <- 'ir') | |||
'ando' | |||
'iendo' | |||
'ar' 'er' 'ir' | |||
(delete) | |||
'yendo' ('u' delete) | |||
) | |||
) | |||
define standard_suffix as ( | |||
[substring] among( | |||
'anza' 'anzas' | |||
'ico' 'ica' 'icos' 'icas' | |||
'ismo' 'ismos' | |||
'able' 'ables' | |||
'ible' 'ibles' | |||
'ista' 'istas' | |||
'oso' 'osa' 'osos' 'osas' | |||
'amiento' 'amientos' | |||
'imiento' 'imientos' | |||
( | |||
R2 delete | |||
) | |||
'adora' 'ador' 'aci{o'}n' | |||
'adoras' 'adores' 'aciones' | |||
'ante' 'antes' 'ancia' 'ancias'// Note 1 | |||
( | |||
R2 delete | |||
try ( ['ic'] R2 delete ) | |||
) | |||
'log{i'}a' | |||
'log{i'}as' | |||
( | |||
R2 <- 'log' | |||
) | |||
'uci{o'}n' 'uciones' | |||
( | |||
R2 <- 'u' | |||
) | |||
'encia' 'encias' | |||
( | |||
R2 <- 'ente' | |||
) | |||
'amente' | |||
( | |||
R1 delete | |||
try ( | |||
[substring] R2 delete among( | |||
'iv' (['at'] R2 delete) | |||
'os' | |||
'ic' | |||
'ad' | |||
) | |||
) | |||
) | |||
'mente' | |||
( | |||
R2 delete | |||
try ( | |||
[substring] among( | |||
'ante' // Note 1 | |||
'able' | |||
'ible' (R2 delete) | |||
) | |||
) | |||
) | |||
'idad' | |||
'idades' | |||
( | |||
R2 delete | |||
try ( | |||
[substring] among( | |||
'abil' | |||
'ic' | |||
'iv' (R2 delete) | |||
) | |||
) | |||
) | |||
'iva' 'ivo' | |||
'ivas' 'ivos' | |||
( | |||
R2 delete | |||
try ( | |||
['at'] R2 delete // but not a further ['ic'] R2 delete | |||
) | |||
) | |||
) | |||
) | |||
define y_verb_suffix as ( | |||
setlimit tomark pV for ([substring]) among( | |||
'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' | |||
'yas' 'yes' 'yais' 'yamos' | |||
('u' delete) | |||
) | |||
) | |||
define verb_suffix as ( | |||
setlimit tomark pV for ([substring]) among( | |||
'en' 'es' '{e'}is' 'emos' | |||
(try ('u' test 'g') ] delete) | |||
'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' | |||
'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' | |||
'ar{e'}' | |||
'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' | |||
'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' | |||
'er{e'}' | |||
'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' | |||
'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' | |||
'ir{e'}' | |||
'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' | |||
'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' | |||
'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' | |||
'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' | |||
'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' | |||
'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' | |||
'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' | |||
'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' | |||
'{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' | |||
(delete) | |||
) | |||
) | |||
define residual_suffix as ( | |||
[substring] among( | |||
'os' | |||
'a' 'o' '{a'}' '{i'}' '{o'}' | |||
( RV delete ) | |||
'e' '{e'}' | |||
( RV delete try( ['u'] test 'g' RV delete ) ) | |||
) | |||
) | |||
) | |||
define stem as ( | |||
do mark_regions | |||
backwards ( | |||
do attached_pronoun | |||
do ( standard_suffix or | |||
y_verb_suffix or | |||
verb_suffix | |||
) | |||
do residual_suffix | |||
) | |||
do postlude | |||
) | |||
/* | |||
Note 1: additions of 15 Jun 2005 | |||
*/ |
@@ -13,11 +13,11 @@ groupings ( v s_ending ) | |||
stringescapes {} | |||
/* special characters (in ISO Latin I) */ | |||
/* special characters */ | |||
stringdef a" hex 'E4' | |||
stringdef ao hex 'E5' | |||
stringdef o" hex 'F6' | |||
stringdef a" '{U+00E4}' | |||
stringdef ao '{U+00E5}' | |||
stringdef o" '{U+00F6}' | |||
define v 'aeiouy{a"}{ao}{o"}' | |||
@@ -1,72 +0,0 @@ | |||
routines ( | |||
mark_regions | |||
main_suffix | |||
consonant_pair | |||
other_suffix | |||
) | |||
externals ( stem ) | |||
integers ( p1 x ) | |||
groupings ( v s_ending ) | |||
stringescapes {} | |||
/* special characters (in MS-DOS Latin I) */ | |||
stringdef a" hex '84' | |||
stringdef ao hex '86' | |||
stringdef o" hex '94' | |||
define v 'aeiouy{a"}{ao}{o"}' | |||
define s_ending 'bcdfghjklmnoprtvy' | |||
define mark_regions as ( | |||
$p1 = limit | |||
test ( hop 3 setmark x ) | |||
goto v gopast non-v setmark p1 | |||
try ( $p1 < x $p1 = x ) | |||
) | |||
backwardmode ( | |||
define main_suffix as ( | |||
setlimit tomark p1 for ([substring]) | |||
among( | |||
'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' | |||
'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' | |||
'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' | |||
'hetens' 'erns' 'at' 'andet' 'het' 'ast' | |||
(delete) | |||
's' | |||
(s_ending delete) | |||
) | |||
) | |||
define consonant_pair as setlimit tomark p1 for ( | |||
among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') | |||
and ([next] delete) | |||
) | |||
define other_suffix as setlimit tomark p1 for ( | |||
[substring] among( | |||
'lig' 'ig' 'els' (delete) | |||
'l{o"}st' (<-'l{o"}s') | |||
'fullt' (<-'full') | |||
) | |||
) | |||
) | |||
define stem as ( | |||
do mark_regions | |||
backwards ( | |||
do main_suffix | |||
do consonant_pair | |||
do other_suffix | |||
) | |||
) |
@@ -0,0 +1,405 @@ | |||
/* | |||
* Affix stripping stemming algorithm for Tamil | |||
* By Damodharan Rajalingam | |||
*/ | |||
stringescapes {} | |||
/* Aytham */ | |||
stringdef aytham '{U+0B83}' | |||
/* Uyir - independent vowels */ | |||
stringdef a '{U+0B85}' | |||
stringdef aa '{U+0B86}' | |||
stringdef i '{U+0B87}' | |||
stringdef ii '{U+0B88}' | |||
stringdef u '{U+0B89}' | |||
stringdef uu '{U+0B8A}' | |||
stringdef e '{U+0B8E}' | |||
stringdef ee '{U+0B8F}' | |||
stringdef ai '{U+0B90}' | |||
stringdef o '{U+0B92}' | |||
stringdef oo '{U+0B93}' | |||
stringdef au '{U+0B94}' | |||
/* Consonants */ | |||
stringdef ka '{U+0B95}' | |||
stringdef nga '{U+0B99}' | |||
stringdef ca '{U+0B9A}' | |||
stringdef ja '{U+0B9C}' | |||
stringdef nya '{U+0B9E}' | |||
stringdef tta '{U+0B9F}' | |||
stringdef nna '{U+0BA3}' | |||
stringdef ta '{U+0BA4}' | |||
// tha is a second name for the same code point as ta (U+0BA4). | |||
stringdef tha '{U+0BA4}' | |||
stringdef na '{U+0BA8}' | |||
stringdef nnna '{U+0BA9}' | |||
stringdef pa '{U+0BAA}' | |||
stringdef ma '{U+0BAE}' | |||
stringdef ya '{U+0BAF}' | |||
stringdef ra '{U+0BB0}' | |||
stringdef rra '{U+0BB1}' | |||
stringdef la '{U+0BB2}' | |||
stringdef lla '{U+0BB3}' | |||
stringdef llla '{U+0BB4}' | |||
// zha is a second name for the same code point as llla (U+0BB4). | |||
stringdef zha '{U+0BB4}' | |||
stringdef va '{U+0BB5}' | |||
/* Vatamozi - borrowed */ | |||
stringdef sha '{U+0BB6}' | |||
stringdef ssa '{U+0BB7}' | |||
stringdef sa '{U+0BB8}' | |||
stringdef ha '{U+0BB9}' | |||
/* Dependent vowel signs (kombu etc.) */ | |||
stringdef vs_aa '{U+0BBE}' | |||
stringdef vs_i '{U+0BBF}' | |||
stringdef vs_ii '{U+0BC0}' | |||
stringdef vs_u '{U+0BC1}' | |||
stringdef vs_uu '{U+0BC2}' | |||
stringdef vs_e '{U+0BC6}' | |||
stringdef vs_ee '{U+0BC7}' | |||
stringdef vs_ai '{U+0BC8}' | |||
stringdef vs_o '{U+0BCA}' | |||
stringdef vs_oo '{U+0BCB}' | |||
stringdef vs_au '{U+0BCC}' | |||
/* Pulli */ | |||
stringdef pulli '{U+0BCD}' | |||
/* AU length mark */ | |||
stringdef au_lmark '{U+0BD7}' | |||
// Internal routines of the Tamil stemmer; each one is defined further down. | |||
routines ( | |||
remove_plural_suffix | |||
remove_question_suffixes | |||
remove_question_prefixes | |||
remove_pronoun_prefixes | |||
remove_command_suffixes | |||
remove_um | |||
remove_vetrumai_urupukal | |||
fix_va_start | |||
fix_ending | |||
fix_endings | |||
remove_tense_suffix | |||
remove_tense_suffixes | |||
remove_common_word_endings | |||
has_min_length | |||
) | |||
// stem is the only routine declared as external. | |||
externals ( stem ) | |||
// found_a_match: cleared at the start of each remove_* routine and set when | |||
//   that routine changes the word. | |||
// found_vetrumai_urupu: set by remove_vetrumai_urupukal and read by fix_ending. | |||
booleans ( | |||
found_a_match | |||
found_vetrumai_urupu | |||
) | |||
// has_min_length: guard used by the suffix routines; signals t only while | |||
// `len` of the current word is greater than 4. | |||
define has_min_length as $(len > 4) | |||
// fix_va_start: rewrite va + vowel sign (oo, o, u or uu) found at the cursor | |||
// as the corresponding independent vowel. The four cases are tried in the | |||
// order written; the routine signals f when none of them applies. | |||
define fix_va_start as ( | |||
(try '{va}{vs_oo}' and [ '{va}{vs_oo}' ] <- '{oo}' ) or | |||
(try '{va}{vs_o}' and [ '{va}{vs_o}' ] <- '{o}' ) or | |||
(try '{va}{vs_u}' and [ '{va}{vs_u}' ] <- '{u}' ) or | |||
(try '{va}{vs_uu}' and [ '{va}{vs_uu}' ] <- '{uu}' ) | |||
) | |||
// fix_endings: keep applying fix_ending until it signals f. The surrounding | |||
// `do` means fix_endings itself always signals t. | |||
define fix_endings as do repeat fix_ending | |||
// remove_question_prefixes: at the cursor, delete the sequence | |||
// e + <one of the ten listed consonants> + pulli, then give fix_va_start a | |||
// chance to repair what now starts the word. Signals f (without running | |||
// fix_va_start) when the prefix is not present. | |||
define remove_question_prefixes as ( | |||
[ ('{e}' ) among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete | |||
do fix_va_start | |||
) | |||
// fix_ending: repair the end of the word after a suffix has been stripped. | |||
// Gives signal t if an ending was fixed, signal f otherwise. | |||
// | |||
// Nothing is attempted unless `len` is greater than 3. The alternatives below | |||
// are joined with `or`, so they are tried strictly in the order written and | |||
// only the first one that succeeds is applied; callers that want every | |||
// applicable fix use fix_endings, which repeats this routine. | |||
define fix_ending as ( | |||
$(len > 3) | |||
backwards ( | |||
( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete ) | |||
or | |||
( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete ) | |||
or | |||
( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' ) | |||
or | |||
( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' ) | |||
or | |||
// ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' ) | |||
( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' ) | |||
or | |||
( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' ) | |||
or | |||
// Only considered when remove_vetrumai_urupukal has set found_vetrumai_urupu. | |||
// (A stray second `]` that used to follow the replacement has been removed: | |||
// the slice is already closed before `<-`.) | |||
( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' ) | |||
or | |||
( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' ) | |||
or | |||
( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) | |||
or | |||
// NOTE(review): this string is already the first option of an earlier | |||
// alternative above, so this branch cannot match when it is reached. | |||
( [ '{vs_u}{ka}{pulli}' ] <- '{pulli}' ) | |||
or | |||
( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete ) | |||
or | |||
( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' ) | |||
or | |||
( [ among('{va}' '{ya}' '{va}{pulli}') ] delete ) | |||
or | |||
( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete ) | |||
or | |||
( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' ) | |||
or | |||
( [ '{nga}{pulli}' ] delete ) | |||
or | |||
( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete ) | |||
) | |||
) | |||
// remove_pronoun_prefixes: at the cursor, delete the sequence | |||
// <a, i or u> + <one of the ten listed consonants> + pulli. | |||
// found_a_match is cleared first and set only after a successful deletion; | |||
// fix_va_start is then given a chance to repair what now starts the word. | |||
define remove_pronoun_prefixes as ( | |||
unset found_a_match | |||
[ among('{a}' '{i}' '{u}') among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}') '{pulli}' ] delete | |||
(set found_a_match) | |||
do fix_va_start | |||
) | |||
// remove_plural_suffix: strip an ending that finishes in ka + lla + pulli. | |||
// Working backwards from the end of the word, the three longer forms are | |||
// replaced by a shorter ending and the bare form is deleted; the first | |||
// alternative that matches wins. found_a_match is set only on success. | |||
// Unlike most sibling routines, this one does not check has_min_length. | |||
define remove_plural_suffix as ( | |||
unset found_a_match | |||
backwards ( | |||
( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' ) or | |||
( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or | |||
( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or | |||
( [ '{ka}{lla}{pulli}' ] delete ) | |||
(set found_a_match) | |||
) | |||
) | |||
// remove_question_suffixes: for words passing has_min_length, replace a final | |||
// vowel sign oo, ee or aa by pulli and record the change in found_a_match. | |||
// The replacement is wrapped in `do`, so fix_endings runs afterwards whether | |||
// or not a vowel sign was found. | |||
define remove_question_suffixes as ( | |||
has_min_length | |||
unset found_a_match | |||
backwards ( | |||
do ( | |||
[ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}' | |||
(set found_a_match) | |||
) | |||
) | |||
do fix_endings | |||
) | |||
// remove_command_suffixes: for words passing has_min_length, delete a final | |||
// pa + vs_i or va + vs_i and set found_a_match. Signals f when the word is | |||
// too short or neither ending is present. | |||
define remove_command_suffixes as ( | |||
has_min_length | |||
unset found_a_match | |||
backwards ( | |||
[ among('{pa}{vs_i}' '{va}{vs_i}') ] delete | |||
(set found_a_match) | |||
) | |||
) | |||
// remove_um: for words passing has_min_length, replace a final | |||
// vs_u + ma + pulli by pulli and set found_a_match. | |||
// On success fix_ending is applied once (not repeated via fix_endings). | |||
define remove_um as ( | |||
unset found_a_match | |||
has_min_length | |||
backwards ( [ '{vs_u}{ma}{pulli}' ] <- '{pulli}' | |||
(set found_a_match) | |||
) | |||
do fix_ending | |||
) | |||
// remove_common_word_endings: strip frequently attached trailing words. | |||
// Two groups are tried in order, working backwards from the end of the word: | |||
//   1. endings that are replaced by pulli; | |||
//   2. endings that are deleted outright. | |||
// Either way found_a_match is set, and fix_endings then tidies the result. | |||
// Signals f (leaving the word alone) when has_min_length fails or no ending | |||
// in either group matches. | |||
define remove_common_word_endings as ( | |||
// These are not suffixes actually but are | |||
// some words that are attached to other words | |||
// but can be removed for stemming | |||
unset found_a_match | |||
has_min_length | |||
backwards ( | |||
// Group 1: alternatives are tried in the order written; first match wins. | |||
test ( [ '{vs_u}{tta}{nnna}{pulli}' or | |||
'{vs_i}{la}{pulli}{la}{vs_ai}' or | |||
'{vs_i}{tta}{ma}{pulli}' or | |||
'{vs_i}{nnna}{pulli}{rra}{vs_i}' or | |||
'{vs_aa}{ka}{vs_i}' or | |||
'{vs_aa}{ka}{vs_i}{ya}' or | |||
'{vs_e}{nnna}{pulli}{rra}{vs_u}' or | |||
'{vs_u}{lla}{pulli}{lla}' or | |||
'{vs_u}{tta}{vs_ai}{ya}' or | |||
'{vs_u}{tta}{vs_ai}' or | |||
'{vs_e}{nnna}{vs_u}{ma}{pulli}' or | |||
('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or | |||
'{vs_e}{nnna}' or | |||
// NOTE(review): the next string repeats an entry listed earlier in this | |||
// chain, so it cannot match when it is reached. | |||
'{vs_aa}{ka}{vs_i}' ] <- '{pulli}' | |||
(set found_a_match) | |||
) | |||
or | |||
// Group 2: longest matching entry of the among list is deleted. | |||
test ( [ among('{pa}{tta}{vs_u}' | |||
'{pa}{tta}{pulli}{tta}' | |||
'{pa}{tta}{pulli}{tta}{vs_u}' | |||
'{pa}{tta}{pulli}{tta}{ta}{vs_u}' | |||
'{pa}{tta}{pulli}{tta}{nna}' | |||
'{ka}{vs_u}{ra}{vs_i}{ya}' | |||
'{pa}{rra}{pulli}{rra}{vs_i}' | |||
'{va}{vs_i}{tta}{vs_u}' | |||
'{va}{vs_i}{tta}{pulli}{tta}{vs_u}' | |||
'{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}' | |||
'{pa}{tta}{vs_i}' | |||
'{ta}{vs_aa}{nnna}' | |||
'{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}') | |||
] delete | |||
(set found_a_match) | |||
) | |||
) | |||
do fix_endings | |||
) | |||
// remove_vetrumai_urupukal: strip one of the listed endings (Tamil case | |||
// markers, going by the routine name - confirm with a Tamil speaker). | |||
// Five groups are tried in order, working backwards from the end of the word; | |||
// the first group that matches is applied: | |||
//   1. nnna + vs_ai                      -> deleted | |||
//   2. endings built around vs_ai        -> replaced by pulli | |||
//   3. a longer list of endings          -> replaced by pulli | |||
//   4. a list of endings                 -> deleted | |||
//   5. a final vs_ii                     -> shortened to vs_i | |||
// On success both found_a_match and found_vetrumai_urupu are set, a trailing | |||
// vs_i + nnna + pulli is additionally reduced to pulli, and fix_endings runs. | |||
// Signals f when has_min_length fails or no group matches. | |||
define remove_vetrumai_urupukal as ( | |||
unset found_a_match | |||
unset found_vetrumai_urupu | |||
has_min_length | |||
backwards ( | |||
( | |||
// Group 1 | |||
test ( ['{nnna}{vs_ai}'] delete ) | |||
or | |||
// Group 2 | |||
test ([ ( '{vs_i}{nnna}{vs_ai}' or | |||
'{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or | |||
( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}'))) | |||
] <- '{pulli}' | |||
) | |||
or | |||
// Group 3 | |||
test ( [ | |||
'{vs_o}{tta}{vs_u}' or | |||
'{vs_oo}{tta}{vs_u}' or | |||
'{vs_i}{la}{pulli}' or | |||
'{vs_i}{rra}{pulli}' or | |||
('{vs_i}{nnna}{pulli}' (test not '{ma}')) or | |||
'{vs_i}{nnna}{pulli}{rra}{vs_u}' or | |||
'{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or | |||
'{va}{vs_i}{tta}' or | |||
// This ending is only accepted when `len` is at least 7. | |||
($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or | |||
'{vs_aa}{la}{pulli}' or | |||
'{vs_u}{tta}{vs_ai}' or | |||
'{vs_aa}{ma}{la}{pulli}' or | |||
('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or | |||
'{vs_u}{lla}{pulli}' | |||
] <- '{pulli}' | |||
) | |||
or | |||
// Group 4 | |||
test ( [ | |||
'{ka}{nna}{pulli}' or | |||
'{ma}{vs_u}{nnna}{pulli}' or | |||
'{ma}{vs_ee}{la}{pulli}' or | |||
'{ma}{vs_ee}{rra}{pulli}' or | |||
'{ka}{vs_ii}{llla}{pulli}' or | |||
'{pa}{vs_i}{nnna}{pulli}' or | |||
('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) | |||
] delete | |||
) | |||
or | |||
// Group 5 | |||
test ([ '{vs_ii}' ] <- '{vs_i}') | |||
) | |||
(set found_a_match) | |||
(set found_vetrumai_urupu) | |||
do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' ) | |||
) | |||
do fix_endings | |||
) | |||
// remove_tense_suffixes: peel tense suffixes off one layer at a time. | |||
// found_a_match is primed to true, then remove_tense_suffix is invoked again | |||
// and again for as long as the previous invocation left found_a_match set. | |||
define remove_tense_suffixes as ( | |||
set found_a_match | |||
repeat ( | |||
found_a_match | |||
do remove_tense_suffix | |||
) | |||
) | |||
// remove_tense_suffix: strip a single tense ending from the word. | |||
// found_a_match is cleared on entry and set by whichever step changes the | |||
// word; remove_tense_suffixes relies on that flag to decide whether to call | |||
// this routine again. Signals f immediately when has_min_length fails. | |||
// | |||
// Working backwards from the end of the word: | |||
//   step 1 (inside `do`) tries four groups in order and applies the first | |||
//          that matches: two among-listed endings deleted, a chain of endings | |||
//          deleted, a chain of endings replaced by pulli, and finally | |||
//          ka+vs_u / ta+vs_u deleted when pulli is next to it; | |||
//   step 2 (inside `do`) deletes one of six among-listed endings. | |||
// fix_endings then tidies the result. | |||
define remove_tense_suffix as ( | |||
unset found_a_match | |||
has_min_length | |||
backwards ( | |||
do ( | |||
test ( [among( | |||
'{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}' | |||
'{pa}{tta}{vs_u}' | |||
)] delete | |||
(set found_a_match) | |||
) | |||
or | |||
// Endings deleted outright; tried in the order written. | |||
test ( [ | |||
'{ma}{vs_aa}{ra}{pulli}' or | |||
'{ma}{vs_i}{nnna}{pulli}' or | |||
'{nnna}{nnna}{pulli}' or | |||
'{nnna}{vs_aa}{nnna}{pulli}' or | |||
'{nnna}{vs_aa}{lla}{pulli}' or | |||
'{nnna}{vs_aa}{ra}{pulli}' or | |||
('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or | |||
'{nnna}{lla}{pulli}' or | |||
'{va}{lla}{pulli}' or | |||
'{nnna}{ra}{pulli}' or | |||
'{va}{ra}{pulli}' or | |||
'{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or | |||
'{pa}{nnna}{pulli}' or | |||
'{pa}{lla}{pulli}' or | |||
'{pa}{ra}{pulli}' or | |||
('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or | |||
'{vs_i}{rra}{pulli}{rra}{vs_u}' or | |||
'{pa}{ma}{pulli}' or | |||
'{nnna}{ma}{pulli}' or | |||
'{ta}{vs_u}{ma}{pulli}' or | |||
'{rra}{vs_u}{ma}{pulli}' or | |||
'{ka}{vs_u}{ma}{pulli}' or | |||
'{nnna}{vs_e}{nnna}{pulli}' or | |||
'{nnna}{vs_ai}' or | |||
'{va}{vs_ai}' | |||
] delete | |||
(set found_a_match) | |||
) | |||
or | |||
// Endings replaced by pulli. | |||
// NOTE(review): '{ka}{vs_u}{ma}{pulli}', '{ta}{vs_u}{ma}{pulli}', | |||
// '{rra}{vs_u}{ma}{pulli}' and '{nnna}{vs_e}{nnna}{pulli}' are also listed | |||
// in the delete group above, which is tried first, so those four entries | |||
// cannot match here. | |||
test ( [ | |||
('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or | |||
'{vs_aa}{lla}{pulli}' or | |||
'{vs_aa}{ra}{pulli}' or | |||
'{vs_ee}{nnna}{pulli}' or | |||
'{vs_aa}' or | |||
'{vs_aa}{ma}{pulli}' or | |||
'{vs_e}{ma}{pulli}' or | |||
'{vs_ee}{ma}{pulli}' or | |||
'{vs_oo}{ma}{pulli}' or | |||
'{ka}{vs_u}{ma}{pulli}' or | |||
'{ta}{vs_u}{ma}{pulli}' or | |||
'{tta}{vs_u}{ma}{pulli}' or | |||
'{rra}{vs_u}{ma}{pulli}' or | |||
'{vs_aa}{ya}{pulli}' or | |||
'{nnna}{vs_e}{nnna}{pulli}' or | |||
'{nnna}{vs_i}{ra}{pulli}' or | |||
'{vs_ii}{ra}{pulli}' or | |||
'{vs_ii}{ya}{ra}{pulli}' | |||
] <- '{pulli}' | |||
(set found_a_match) | |||
) | |||
or | |||
test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete | |||
(set found_a_match) | |||
) | |||
) | |||
// Step 2 runs whether or not step 1 changed anything. | |||
do ([among( | |||
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}' | |||
'{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}' | |||
'{ka}{vs_i}{nnna}{pulli}{rra}' | |||
'{ka}{vs_i}{nnna}{pulli}{rra}{pulli}' | |||
'{ka}{vs_i}{rra}' | |||
'{ka}{vs_i}{rra}{pulli}' | |||
)] delete | |||
(set found_a_match) | |||
) | |||
) | |||
do fix_endings | |||
) | |||
// stem: externally visible entry point of the Tamil stemmer. | |||
// After an initial fix_ending pass, the word must satisfy has_min_length; | |||
// otherwise stem signals f here and none of the later steps run. The remaining | |||
// steps run in the fixed order below, each wrapped in `do` so that a step | |||
// which signals f does not stop the ones after it. | |||
define stem as ( | |||
unset found_vetrumai_urupu | |||
do fix_ending | |||
has_min_length | |||
// Prefixes first ... | |||
do remove_question_prefixes | |||
do remove_pronoun_prefixes | |||
// ... then the suffix strippers. | |||
do remove_question_suffixes | |||
do remove_um | |||
do remove_common_word_endings | |||
do remove_vetrumai_urupukal | |||
do remove_plural_suffix | |||
do remove_command_suffixes | |||
do remove_tense_suffixes | |||
) | |||
@@ -2,7 +2,7 @@ | |||
* author: Evren (Kapusuz) Çilden | |||
* email: evren.kapusuz at gmail.com | |||
* version: 1.0 (15.01.2007) | |||
* stems nominal verb suffixes | |||
* stems nominal inflections | |||
@@ -10,13 +10,13 @@ | |||
* (y,n,s,U) context check | |||
* vowel harmony check | |||
* last consonant check and conversion (b, c, d, ğ to p, ç, t, k) | |||
* The stemming algorithm is based on the paper "An Affix Stripping | |||
* Morphological Analyzer for Turkish" by Gülşen Eryiğit and | |||
* Eşref Adalı (Proceedings of the IAESTED International Conference | |||
* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, | |||
* Innsbruck, Austria | |||
* Turkish is an agglutinative language and has a very rich morphological | |||
* structure. In Turkish, you can form many different words from a single stem | |||
* by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means | |||
@@ -59,14 +59,14 @@ routines ( | |||
mark_yken // nominal verb suffix | |||
mark_ymUs_ // nominal verb suffix | |||
mark_ysA // nominal verb suffix | |||
mark_suffix_with_optional_y_consonant | |||
mark_suffix_with_optional_U_vowel | |||
mark_suffix_with_optional_n_consonant | |||
mark_suffix_with_optional_s_consonant | |||
more_than_one_syllable_word | |||
post_process_last_consonants | |||
postlude | |||
@@ -75,34 +75,32 @@ routines ( | |||
stem_suffix_chain_before_ki | |||
) | |||
/* Special characters in Unicode Latin-1 and Latin Extended-A */ | |||
stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA | |||
stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE | |||
stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT | |||
stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS | |||
stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA | |||
stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS | |||
stringescapes { } | |||
stringescapes { } | |||
integers ( strlen ) // length of a string | |||
/* Special characters in Unicode Latin-1 and Latin Extended-A */ | |||
stringdef c, '{U+00E7}' // LATIN SMALL LETTER C WITH CEDILLA | |||
stringdef g~ '{U+011F}' // LATIN SMALL LETTER G WITH BREVE | |||
stringdef i' '{U+0131}' // LATIN SMALL LETTER I WITHOUT DOT | |||
stringdef o" '{U+00F6}' // LATIN SMALL LETTER O WITH DIAERESIS | |||
stringdef s, '{U+015F}' // LATIN SMALL LETTER S WITH CEDILLA | |||
stringdef u" '{U+00FC}' // LATIN SMALL LETTER U WITH DIAERESIS | |||
booleans ( continue_stemming_noun_suffixes ) | |||
groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) | |||
groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) | |||
define vowel 'ae{i'}io{o"}u{u"}' | |||
define vowel 'ae{i'}io{o"}u{u"}' | |||
define U '{i'}iu{u"}' | |||
// the vowel grouping definitions below are used for checking vowel harmony | |||
define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' | |||
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' | |||
define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' | |||
define vowel4 'ei' // vowels that can end with suffixes containing 'i' | |||
define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' | |||
define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' | |||
define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' | |||
define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' | |||
define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' | |||
define vowel4 'ei' // vowels that can end with suffixes containing 'i' | |||
define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' | |||
define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' | |||
externals ( stem ) | |||
externals ( stem ) | |||
backwardmode ( | |||
// checks vowel harmony for possible suffixes, | |||
@@ -124,165 +122,165 @@ backwardmode ( | |||
) | |||
) | |||
) | |||
// if the last consonant before suffix is vowel and n then advance and delete | |||
// if the last consonant before suffix is non vowel and n do nothing | |||
// if the last consonant before suffix is not n then only delete the suffix | |||
// assumption: slice beginning is set correctly | |||
define mark_suffix_with_optional_n_consonant as ( | |||
((test 'n') next (test vowel)) | |||
('n' (test vowel)) | |||
or | |||
((not(test 'n')) test(next (test vowel))) | |||
((not(test 'n')) test(next vowel)) | |||
) | |||
// if the last consonant before suffix is vowel and s then advance and delete | |||
// if the last consonant before suffix is non vowel and s do nothing | |||
// if the last consonant before suffix is not s then only delete the suffix | |||
// assumption: slice beginning is set correctly | |||
define mark_suffix_with_optional_s_consonant as ( | |||
((test 's') next (test vowel)) | |||
('s' (test vowel)) | |||
or | |||
((not(test 's')) test(next (test vowel))) | |||
((not(test 's')) test(next vowel)) | |||
) | |||
// if the last consonant before suffix is vowel and y then advance and delete | |||
// if the last consonant before suffix is non vowel and y do nothing | |||
// if the last consonant before suffix is not y then only delete the suffix | |||
// assumption: slice beginning is set correctly | |||
define mark_suffix_with_optional_y_consonant as ( | |||
((test 'y') next (test vowel)) | |||
('y' (test vowel)) | |||
or | |||
((not(test 'y')) test(next (test vowel))) | |||
((not(test 'y')) test(next vowel)) | |||
) | |||
define mark_suffix_with_optional_U_vowel as ( | |||
((test U) next (test non-vowel)) | |||
(U (test non-vowel)) | |||
or | |||
((not(test U)) test(next (test non-vowel))) | |||
((not(test U)) test(next non-vowel)) | |||
) | |||
define mark_possessives as ( | |||
among ('m{i'}z' 'miz' 'muz' 'm{u"}z' | |||
'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') | |||
(mark_suffix_with_optional_U_vowel) | |||
) | |||
define mark_sU as ( | |||
check_vowel_harmony | |||
U | |||
(mark_suffix_with_optional_s_consonant) | |||
) | |||
define mark_lArI as ( | |||
among ('leri' 'lar{i'}') | |||
) | |||
define mark_yU as ( | |||
check_vowel_harmony | |||
U | |||
(mark_suffix_with_optional_y_consonant) | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_nU as ( | |||
check_vowel_harmony | |||
among ('n{i'}' 'ni' 'nu' 'n{u"}') | |||
among ('n{i'}' 'ni' 'nu' 'n{u"}') | |||
) | |||
define mark_nUn as ( | |||
check_vowel_harmony | |||
among ('{i'}n' 'in' 'un' '{u"}n') | |||
among ('{i'}n' 'in' 'un' '{u"}n') | |||
(mark_suffix_with_optional_n_consonant) | |||
) | |||
define mark_yA as ( | |||
check_vowel_harmony | |||
among('a' 'e') | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_nA as ( | |||
check_vowel_harmony | |||
among('na' 'ne') | |||
) | |||
define mark_DA as ( | |||
check_vowel_harmony | |||
among('da' 'de' 'ta' 'te') | |||
) | |||
define mark_ndA as ( | |||
check_vowel_harmony | |||
among('nda' 'nde') | |||
) | |||
define mark_DAn as ( | |||
check_vowel_harmony | |||
among('dan' 'den' 'tan' 'ten') | |||
) | |||
define mark_ndAn as ( | |||
check_vowel_harmony | |||
among('ndan' 'nden') | |||
) | |||
define mark_ylA as ( | |||
check_vowel_harmony | |||
among('la' 'le') | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_ki as ( | |||
'ki' | |||
) | |||
define mark_ncA as ( | |||
check_vowel_harmony | |||
among('ca' 'ce') | |||
among('ca' 'ce') | |||
(mark_suffix_with_optional_n_consonant) | |||
) | |||
define mark_yUm as ( | |||
check_vowel_harmony | |||
among ('{i'}m' 'im' 'um' '{u"}m') | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_sUn as ( | |||
check_vowel_harmony | |||
among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) | |||
) | |||
define mark_yUz as ( | |||
check_vowel_harmony | |||
among ('{i'}z' 'iz' 'uz' '{u"}z') | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_sUnUz as ( | |||
among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') | |||
) | |||
define mark_lAr as ( | |||
check_vowel_harmony | |||
among ('ler' 'lar') | |||
) | |||
define mark_nUz as ( | |||
check_vowel_harmony | |||
among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') | |||
) | |||
define mark_DUr as ( | |||
check_vowel_harmony | |||
among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') | |||
) | |||
define mark_cAsInA as ( | |||
among ('cas{i'}na' 'cesine') | |||
) | |||
define mark_yDU as ( | |||
check_vowel_harmony | |||
among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' | |||
@@ -292,24 +290,24 @@ backwardmode ( | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
// does not fully obey vowel harmony | |||
// does not fully obey vowel harmony | |||
define mark_ysA as ( | |||
among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_ymUs_ as ( | |||
check_vowel_harmony | |||
among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}') | |||
among ('m{i'}{s,}' 'mi{s,}' 'mu{s,}' 'm{u"}{s,}') | |||
(mark_suffix_with_optional_y_consonant) | |||
) | |||
define mark_yken as ( | |||
'ken' (mark_suffix_with_optional_y_consonant) | |||
) | |||
define stem_nominal_verb_suffixes as ( | |||
[ | |||
[ | |||
set continue_stemming_noun_suffixes | |||
(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) | |||
or | |||
@@ -327,7 +325,7 @@ backwardmode ( | |||
(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) | |||
]delete | |||
) | |||
// stems noun suffix chains ending with -ki | |||
define stem_suffix_chain_before_ki as ( | |||
[ | |||
@@ -337,7 +335,7 @@ backwardmode ( | |||
(mark_lAr] delete try(stem_suffix_chain_before_ki)) | |||
or | |||
(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) | |||
)) | |||
or | |||
(mark_nUn] delete try([ | |||
@@ -348,7 +346,7 @@ backwardmode ( | |||
(stem_suffix_chain_before_ki) | |||
)) | |||
or | |||
(mark_ndA ( | |||
(mark_ndA ( | |||
(mark_lArI] delete) | |||
or | |||
((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) | |||
@@ -357,7 +355,7 @@ backwardmode ( | |||
)) | |||
) | |||
) | |||
define stem_noun_suffixes as ( | |||
([mark_lAr] delete try(stem_suffix_chain_before_ki)) | |||
or | |||
@@ -373,24 +371,24 @@ backwardmode ( | |||
or | |||
([(mark_ndA or mark_nA) | |||
( | |||
(mark_lArI] delete) | |||
or | |||
(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) | |||
or | |||
(stem_suffix_chain_before_ki) | |||
) | |||
(mark_lArI] delete) | |||
or | |||
(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) | |||
or | |||
(stem_suffix_chain_before_ki) | |||
) | |||
) | |||
or | |||
([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) | |||
or | |||
( [mark_DAn] delete try ([ | |||
( | |||
(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) | |||
or | |||
(mark_lAr] delete try(stem_suffix_chain_before_ki)) | |||
or | |||
(stem_suffix_chain_before_ki) | |||
)) | |||
(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) | |||
or | |||
(mark_lAr] delete try(stem_suffix_chain_before_ki)) | |||
or | |||
(stem_suffix_chain_before_ki) | |||
)) | |||
) | |||
or | |||
([mark_nUn or mark_ylA] delete | |||
@@ -404,18 +402,18 @@ backwardmode ( | |||
) | |||
or | |||
([mark_lArI] delete) | |||
or | |||
or | |||
(stem_suffix_chain_before_ki) | |||
or | |||
([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) | |||
or | |||
([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) | |||
) | |||
define post_process_last_consonants as ( | |||
define post_process_last_consonants as ( | |||
[substring] among ( | |||
'b' (<- 'p') | |||
'c' (<- '{c.}') | |||
'c' (<- '{c,}') | |||
'd' (<- 't') | |||
'{g~}' (<- 'k') | |||
) | |||
@@ -424,7 +422,7 @@ backwardmode ( | |||
// after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed | |||
// like in 'kedim' -> 'ked' | |||
// Turkish words don't usually end with 'd' or 'g' | |||
// some very well known words are ignored (like 'ad' 'soyad' | |||
// some very well known words are ignored (like 'ad' 'soyad' | |||
// appends U to stems ending with d or g, decides which vowel to add | |||
// based on the last vowel in the stem | |||
define append_U_to_stems_ending_with_d_or_g as ( | |||
@@ -437,7 +435,10 @@ backwardmode ( | |||
or | |||
(test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') | |||
) | |||
define is_reserved_word as ( | |||
'ad' try 'soy' atlimit | |||
) | |||
) | |||
// Tests if there are more than one syllables | |||
@@ -446,18 +447,12 @@ define more_than_one_syllable_word as ( | |||
test (atleast 2 (gopast vowel)) | |||
) | |||
define is_reserved_word as ( | |||
test(gopast 'ad' ($strlen = 2) ($strlen == limit)) | |||
or | |||
test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) | |||
) | |||
define postlude as ( | |||
not(is_reserved_word) | |||
backwards ( | |||
not(is_reserved_word) | |||
do append_U_to_stems_ending_with_d_or_g | |||
do post_process_last_consonants | |||
) | |||
) | |||
@@ -469,9 +464,7 @@ define stem as ( | |||
continue_stemming_noun_suffixes | |||
do stem_noun_suffixes | |||
) | |||
postlude | |||
) | |||
) | |||
@@ -0,0 +1,98 @@ | |||
// ISO-8859-2 character mappings. | |||
stringdef U+00A0 hex 'A0' | |||
stringdef U+0104 hex 'A1' | |||
stringdef U+02D8 hex 'A2' | |||
stringdef U+0141 hex 'A3' | |||
stringdef U+00A4 hex 'A4' | |||
stringdef U+013D hex 'A5' | |||
stringdef U+015A hex 'A6' | |||
stringdef U+00A7 hex 'A7' | |||
stringdef U+00A8 hex 'A8' | |||
stringdef U+0160 hex 'A9' | |||
stringdef U+015E hex 'AA' | |||
stringdef U+0164 hex 'AB' | |||
stringdef U+0179 hex 'AC' | |||
stringdef U+00AD hex 'AD' | |||
stringdef U+017D hex 'AE' | |||
stringdef U+017B hex 'AF' | |||
stringdef U+00B0 hex 'B0' | |||
stringdef U+0105 hex 'B1' | |||
stringdef U+02DB hex 'B2' | |||
stringdef U+0142 hex 'B3' | |||
stringdef U+00B4 hex 'B4' | |||
stringdef U+013E hex 'B5' | |||
stringdef U+015B hex 'B6' | |||
stringdef U+02C7 hex 'B7' | |||
stringdef U+00B8 hex 'B8' | |||
stringdef U+0161 hex 'B9' | |||
stringdef U+015F hex 'BA' | |||
stringdef U+0165 hex 'BB' | |||
stringdef U+017A hex 'BC' | |||
stringdef U+02DD hex 'BD' | |||
stringdef U+017E hex 'BE' | |||
stringdef U+017C hex 'BF' | |||
stringdef U+0154 hex 'C0' | |||
stringdef U+00C1 hex 'C1' | |||
stringdef U+00C2 hex 'C2' | |||
stringdef U+0102 hex 'C3' | |||
stringdef U+00C4 hex 'C4' | |||
stringdef U+0139 hex 'C5' | |||
stringdef U+0106 hex 'C6' | |||
stringdef U+00C7 hex 'C7' | |||
stringdef U+010C hex 'C8' | |||
stringdef U+00C9 hex 'C9' | |||
stringdef U+0118 hex 'CA' | |||
stringdef U+00CB hex 'CB' | |||
stringdef U+011A hex 'CC' | |||
stringdef U+00CD hex 'CD' | |||
stringdef U+00CE hex 'CE' | |||
stringdef U+010E hex 'CF' | |||
stringdef U+0110 hex 'D0' | |||
stringdef U+0143 hex 'D1' | |||
stringdef U+0147 hex 'D2' | |||
stringdef U+00D3 hex 'D3' | |||
stringdef U+00D4 hex 'D4' | |||
stringdef U+0150 hex 'D5' | |||
stringdef U+00D6 hex 'D6' | |||
stringdef U+00D7 hex 'D7' | |||
stringdef U+0158 hex 'D8' | |||
stringdef U+016E hex 'D9' | |||
stringdef U+00DA hex 'DA' | |||
stringdef U+0170 hex 'DB' | |||
stringdef U+00DC hex 'DC' | |||
stringdef U+00DD hex 'DD' | |||
stringdef U+0162 hex 'DE' | |||
stringdef U+00DF hex 'DF' | |||
stringdef U+0155 hex 'E0' | |||
stringdef U+00E1 hex 'E1' | |||
stringdef U+00E2 hex 'E2' | |||
stringdef U+0103 hex 'E3' | |||
stringdef U+00E4 hex 'E4' | |||
stringdef U+013A hex 'E5' | |||
stringdef U+0107 hex 'E6' | |||
stringdef U+00E7 hex 'E7' | |||
stringdef U+010D hex 'E8' | |||
stringdef U+00E9 hex 'E9' | |||
stringdef U+0119 hex 'EA' | |||
stringdef U+00EB hex 'EB' | |||
stringdef U+011B hex 'EC' | |||
stringdef U+00ED hex 'ED' | |||
stringdef U+00EE hex 'EE' | |||
stringdef U+010F hex 'EF' | |||
stringdef U+0111 hex 'F0' | |||
stringdef U+0144 hex 'F1' | |||
stringdef U+0148 hex 'F2' | |||
stringdef U+00F3 hex 'F3' | |||
stringdef U+00F4 hex 'F4' | |||
stringdef U+0151 hex 'F5' | |||
stringdef U+00F6 hex 'F6' | |||
stringdef U+00F7 hex 'F7' | |||
stringdef U+0159 hex 'F8' | |||
stringdef U+016F hex 'F9' | |||
stringdef U+00FA hex 'FA' | |||
stringdef U+0171 hex 'FB' | |||
stringdef U+00FC hex 'FC' | |||
stringdef U+00FD hex 'FD' | |||
stringdef U+0163 hex 'FE' | |||
stringdef U+02D9 hex 'FF' |
@@ -0,0 +1,74 @@ | |||
// KOI8-R character mappings. | |||
stringdef U+00A0 hex '9A' | |||
stringdef U+00A9 hex 'BF' | |||
stringdef U+00B0 hex '9C' | |||
stringdef U+00B2 hex '9D' | |||
stringdef U+00B7 hex '9E' | |||
stringdef U+00F7 hex '9F' | |||
stringdef U+0401 hex 'B3' | |||
stringdef U+0410 hex 'E1' | |||
stringdef U+0411 hex 'E2' | |||
stringdef U+0412 hex 'F7' | |||
stringdef U+0413 hex 'E7' | |||
stringdef U+0414 hex 'E4' | |||
stringdef U+0415 hex 'E5' | |||
stringdef U+0416 hex 'F6' | |||
stringdef U+0417 hex 'FA' | |||
stringdef U+0418 hex 'E9' | |||
stringdef U+0419 hex 'EA' | |||
stringdef U+041A hex 'EB' | |||
stringdef U+041B hex 'EC' | |||
stringdef U+041C hex 'ED' | |||
stringdef U+041D hex 'EE' | |||
stringdef U+041E hex 'EF' | |||
stringdef U+041F hex 'F0' | |||
stringdef U+0420 hex 'F2' | |||
stringdef U+0421 hex 'F3' | |||
stringdef U+0422 hex 'F4' | |||
stringdef U+0423 hex 'F5' | |||
stringdef U+0424 hex 'E6' | |||
stringdef U+0425 hex 'E8' | |||
stringdef U+0426 hex 'E3' | |||
stringdef U+0427 hex 'FE' | |||
stringdef U+0428 hex 'FB' | |||
stringdef U+0429 hex 'FD' | |||
stringdef U+042A hex 'FF' | |||
stringdef U+042B hex 'F9' | |||
stringdef U+042C hex 'F8' | |||
stringdef U+042D hex 'FC' | |||
stringdef U+042E hex 'E0' | |||
stringdef U+042F hex 'F1' | |||
stringdef U+0430 hex 'C1' | |||
stringdef U+0431 hex 'C2' | |||
stringdef U+0432 hex 'D7' | |||
stringdef U+0433 hex 'C7' | |||
stringdef U+0434 hex 'C4' | |||
stringdef U+0435 hex 'C5' | |||
stringdef U+0436 hex 'D6' | |||
stringdef U+0437 hex 'DA' | |||
stringdef U+0438 hex 'C9' | |||
stringdef U+0439 hex 'CA' | |||
stringdef U+043A hex 'CB' | |||
stringdef U+043B hex 'CC' | |||
stringdef U+043C hex 'CD' | |||
stringdef U+043D hex 'CE' | |||
stringdef U+043E hex 'CF' | |||
stringdef U+043F hex 'D0' | |||
stringdef U+0440 hex 'D2' | |||
stringdef U+0441 hex 'D3' | |||
stringdef U+0442 hex 'D4' | |||
stringdef U+0443 hex 'D5' | |||
stringdef U+0444 hex 'C6' | |||
stringdef U+0445 hex 'C8' | |||
stringdef U+0446 hex 'C3' | |||
stringdef U+0447 hex 'DE' | |||
stringdef U+0448 hex 'DB' | |||
stringdef U+0449 hex 'DD' | |||
stringdef U+044A hex 'DF' | |||
stringdef U+044B hex 'D9' | |||
stringdef U+044C hex 'D8' | |||
stringdef U+044D hex 'DC' | |||
stringdef U+044E hex 'C0' | |||
stringdef U+044F hex 'D1' | |||
stringdef U+0451 hex 'A3' |
@@ -0,0 +1,130 @@ | |||
// Code page 850 (MSDOS Latin 1) character mappings. | |||
stringdef U+00A0 hex 'FF' | |||
stringdef U+00A1 hex 'AD' | |||
stringdef U+00A2 hex 'BD' | |||
stringdef U+00A3 hex '9C' | |||
stringdef U+00A4 hex 'CF' | |||
stringdef U+00A5 hex 'BE' | |||
stringdef U+00A6 hex 'DD' | |||
stringdef U+00A7 hex 'F5' | |||
stringdef U+00A8 hex 'F9' | |||
stringdef U+00A9 hex 'B8' | |||
stringdef U+00AA hex 'A6' | |||
stringdef U+00AB hex 'AE' | |||
stringdef U+00AC hex 'AA' | |||
stringdef U+00AD hex 'F0' | |||
stringdef U+00AE hex 'A9' | |||
stringdef U+00AF hex 'EE' | |||
stringdef U+00B0 hex 'F8' | |||
stringdef U+00B1 hex 'F1' | |||
stringdef U+00B2 hex 'FD' | |||
stringdef U+00B3 hex 'FC' | |||
stringdef U+00B4 hex 'EF' | |||
stringdef U+00B5 hex 'E6' | |||
stringdef U+00B6 hex 'F4' | |||
stringdef U+00B7 hex 'FA' | |||
stringdef U+00B8 hex 'F7' | |||
stringdef U+00B9 hex 'FB' | |||
stringdef U+00BA hex 'A7' | |||
stringdef U+00BB hex 'AF' | |||
stringdef U+00BC hex 'AC' | |||
stringdef U+00BD hex 'AB' | |||
stringdef U+00BE hex 'F3' | |||
stringdef U+00BF hex 'A8' | |||
stringdef U+00C0 hex 'B7' | |||
stringdef U+00C1 hex 'B5' | |||
stringdef U+00C2 hex 'B6' | |||
stringdef U+00C3 hex 'C7' | |||
stringdef U+00C4 hex '8E' | |||
stringdef U+00C5 hex '8F' | |||
stringdef U+00C6 hex '92' | |||
stringdef U+00C7 hex '80' | |||
stringdef U+00C8 hex 'D4' | |||
stringdef U+00C9 hex '90' | |||
stringdef U+00CA hex 'D2' | |||
stringdef U+00CB hex 'D3' | |||
stringdef U+00CC hex 'DE' | |||
stringdef U+00CD hex 'D6' | |||
stringdef U+00CE hex 'D7' | |||
stringdef U+00CF hex 'D8' | |||
stringdef U+00D0 hex 'D1' | |||
stringdef U+00D1 hex 'A5' | |||
stringdef U+00D2 hex 'E3' | |||
stringdef U+00D3 hex 'E0' | |||
stringdef U+00D4 hex 'E2' | |||
stringdef U+00D5 hex 'E5' | |||
stringdef U+00D6 hex '99' | |||
stringdef U+00D7 hex '9E' | |||
stringdef U+00D8 hex '9D' | |||
stringdef U+00D9 hex 'EB' | |||
stringdef U+00DA hex 'E9' | |||
stringdef U+00DB hex 'EA' | |||
stringdef U+00DC hex '9A' | |||
stringdef U+00DD hex 'ED' | |||
stringdef U+00DE hex 'E8' | |||
stringdef U+00DF hex 'E1' | |||
stringdef U+00E0 hex '85' | |||
stringdef U+00E1 hex 'A0' | |||
stringdef U+00E2 hex '83' | |||
stringdef U+00E3 hex 'C6' | |||
stringdef U+00E4 hex '84' | |||
stringdef U+00E5 hex '86' | |||
stringdef U+00E6 hex '91' | |||
stringdef U+00E7 hex '87' | |||
stringdef U+00E8 hex '8A' | |||
stringdef U+00E9 hex '82' | |||
stringdef U+00EA hex '88' | |||
stringdef U+00EB hex '89' | |||
stringdef U+00EC hex '8D' | |||
stringdef U+00ED hex 'A1' | |||
stringdef U+00EE hex '8C' | |||
stringdef U+00EF hex '8B' | |||
stringdef U+00F0 hex 'D0' | |||
stringdef U+00F1 hex 'A4' | |||
stringdef U+00F2 hex '95' | |||
stringdef U+00F3 hex 'A2' | |||
stringdef U+00F4 hex '93' | |||
stringdef U+00F5 hex 'E4' | |||
stringdef U+00F6 hex '94' | |||
stringdef U+00F7 hex 'F6' | |||
stringdef U+00F8 hex '9B' | |||
stringdef U+00F9 hex '97' | |||
stringdef U+00FA hex 'A3' | |||
stringdef U+00FB hex '96' | |||
stringdef U+00FC hex '81' | |||
stringdef U+00FD hex 'EC' | |||
stringdef U+00FE hex 'E7' | |||
stringdef U+00FF hex '98' | |||
stringdef U+0131 hex 'D5' | |||
stringdef U+0192 hex '9F' | |||
stringdef U+2017 hex 'F2' | |||
stringdef U+2500 hex 'C4' | |||
stringdef U+2502 hex 'B3' | |||
stringdef U+250C hex 'DA' | |||
stringdef U+2510 hex 'BF' | |||
stringdef U+2514 hex 'C0' | |||
stringdef U+2518 hex 'D9' | |||
stringdef U+251C hex 'C3' | |||
stringdef U+2524 hex 'B4' | |||
stringdef U+252C hex 'C2' | |||
stringdef U+2534 hex 'C1' | |||
stringdef U+253C hex 'C5' | |||
stringdef U+2550 hex 'CD' | |||
stringdef U+2551 hex 'BA' | |||
stringdef U+2554 hex 'C9' | |||
stringdef U+2557 hex 'BB' | |||
stringdef U+255A hex 'C8' | |||
stringdef U+255D hex 'BC' | |||
stringdef U+2560 hex 'CC' | |||
stringdef U+2563 hex 'B9' | |||
stringdef U+2566 hex 'CB' | |||
stringdef U+2569 hex 'CA' | |||
stringdef U+256C hex 'CE' | |||
stringdef U+2580 hex 'DF' | |||
stringdef U+2584 hex 'DC' | |||
stringdef U+2588 hex 'DB' | |||
stringdef U+2591 hex 'B0' | |||
stringdef U+2592 hex 'B1' | |||
stringdef U+2593 hex 'B2' | |||
stringdef U+25A0 hex 'FE' |
@@ -1,48 +1,86 @@ | |||
#include <ctype.h> /* for toupper etc */ | |||
#include <stdio.h> /* for fprintf etc */ | |||
#include <stdlib.h> /* for free etc */ | |||
#include <string.h> /* for strlen */ | |||
#include <string.h> /* for strcmp */ | |||
#include "header.h" | |||
#define DEFAULT_PACKAGE "org.tartarus.snowball.ext" | |||
#define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram" | |||
#define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among" | |||
#define DEFAULT_STRING_CLASS "java.lang.StringBuilder" | |||
#define DEFAULT_JAVA_PACKAGE "org.tartarus.snowball.ext" | |||
#define DEFAULT_JAVA_BASE_CLASS "org.tartarus.snowball.SnowballProgram" | |||
#define DEFAULT_JAVA_AMONG_CLASS "org.tartarus.snowball.Among" | |||
#define DEFAULT_JAVA_STRING_CLASS "java.lang.StringBuilder" | |||
#define DEFAULT_GO_PACKAGE "snowball" | |||
#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go" | |||
#define DEFAULT_CS_NAMESPACE "Snowball" | |||
#define DEFAULT_CS_BASE_CLASS "Stemmer" | |||
#define DEFAULT_CS_AMONG_CLASS "Among" | |||
#define DEFAULT_CS_STRING_CLASS "StringBuilder" | |||
#define DEFAULT_JS_BASE_CLASS "BaseStemmer" | |||
#define DEFAULT_PYTHON_BASE_CLASS "BaseStemmer" | |||
static int eq(const char * s1, const char * s2) { | |||
int s1_len = strlen(s1); | |||
int s2_len = strlen(s2); | |||
return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0; | |||
return strcmp(s1, s2) == 0; | |||
} | |||
static void print_arglist(void) { | |||
fprintf(stderr, "Usage: snowball <file> [options]\n\n" | |||
"options are: [-o[utput] file]\n" | |||
" [-s[yntax]]\n" | |||
static void print_arglist(int exit_code) { | |||
FILE * f = exit_code ? stderr : stdout; | |||
fprintf(f, "Usage: snowball SOURCE_FILE... [OPTIONS]\n\n" | |||
"Supported options:\n" | |||
" -o[utput] file\n" | |||
" -s[yntax]\n" | |||
" -comments\n" | |||
#ifndef DISABLE_JAVA | |||
" [-j[ava]]\n" | |||
" -j[ava]\n" | |||
#endif | |||
" [-c++]\n" | |||
" [-w[idechars]]\n" | |||
" [-u[tf8]]\n" | |||
" [-n[ame] class name]\n" | |||
" [-ep[refix] string]\n" | |||
" [-vp[refix] string]\n" | |||
" [-i[nclude] directory]\n" | |||
" [-r[untime] path to runtime headers]\n" | |||
#ifndef DISABLE_JAVA | |||
" [-p[arentclassname] fully qualified parent class name]\n" | |||
" [-P[ackage] package name for stemmers]\n" | |||
" [-S[tringclass] StringBuffer-compatible class]\n" | |||
" [-a[mongclass] fully qualified name of the Among class]\n" | |||
#ifndef DISABLE_CSHARP | |||
" -cs[harp]\n" | |||
#endif | |||
" -c++\n" | |||
#ifndef DISABLE_PASCAL | |||
" -pascal\n" | |||
#endif | |||
#ifndef DISABLE_PYTHON | |||
" -py[thon]\n" | |||
#endif | |||
#ifndef DISABLE_JS | |||
" -js\n" | |||
#endif | |||
#ifndef DISABLE_RUST | |||
" -rust\n" | |||
#endif | |||
#ifndef DISABLE_GO | |||
" -go\n" | |||
#endif | |||
" -w[idechars]\n" | |||
" -u[tf8]\n" | |||
" -n[ame] class name\n" | |||
" -ep[refix] string\n" | |||
" -vp[refix] string\n" | |||
" -i[nclude] directory\n" | |||
" -r[untime] path to runtime headers\n" | |||
" -p[arentclassname] fully qualified parent class name\n" | |||
#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) | |||
" -P[ackage] package name for stemmers\n" | |||
" -S[tringclass] StringBuffer-compatible class\n" | |||
" -a[mongclass] fully qualified name of the Among class\n" | |||
#endif | |||
#ifndef DISABLE_GO | |||
" -gop[ackage] Go package name for stemmers\n" | |||
" -gor[untime] Go snowball runtime package\n" | |||
#endif | |||
" --help display this help and exit\n" | |||
" --version output version information and exit\n" | |||
); | |||
exit(1); | |||
exit(exit_code); | |||
} | |||
static void check_lim(int i, int argc) { | |||
if (i >= argc) { | |||
fprintf(stderr, "argument list is one short\n"); | |||
print_arglist(); | |||
print_arglist(1); | |||
} | |||
} | |||
@@ -57,35 +95,47 @@ static FILE * get_output(symbol * b) { | |||
return output; | |||
} | |||
static void read_options(struct options * o, int argc, char * argv[]) { | |||
static int read_options(struct options * o, int argc, char * argv[]) { | |||
char * s; | |||
int i = 2; | |||
int i = 1; | |||
int new_argc = 1; | |||
/* Note down the last option used to specify an explicit encoding so | |||
* we can warn we ignored it for languages with a fixed encoding. | |||
*/ | |||
const char * encoding_opt = NULL; | |||
/* set defaults: */ | |||
o->output_file = 0; | |||
o->syntax_tree = false; | |||
o->externals_prefix = ""; | |||
o->comments = false; | |||
o->externals_prefix = NULL; | |||
o->variables_prefix = 0; | |||
o->runtime_path = 0; | |||
o->parent_class_name = DEFAULT_BASE_CLASS; | |||
o->string_class = DEFAULT_STRING_CLASS; | |||
o->among_class = DEFAULT_AMONG_CLASS; | |||
o->package = DEFAULT_PACKAGE; | |||
o->name = ""; | |||
o->parent_class_name = NULL; | |||
o->string_class = NULL; | |||
o->among_class = NULL; | |||
o->package = NULL; | |||
o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; | |||
o->name = NULL; | |||
o->make_lang = LANG_C; | |||
o->widechars = false; | |||
o->includes = 0; | |||
o->includes_end = 0; | |||
o->utf8 = false; | |||
o->encoding = ENC_SINGLEBYTE; | |||
/* read options: */ | |||
repeat { | |||
if (i >= argc) break; | |||
while (i < argc) { | |||
s = argv[i++]; | |||
{ if (eq(s, "-o") || eq(s, "-output")) { | |||
check_lim(i, argc); | |||
if (s[0] != '-') { | |||
/* Non-option argument - shuffle down. */ | |||
argv[new_argc++] = s; | |||
continue; | |||
} | |||
{ | |||
if (eq(s, "-o") || eq(s, "-output")) { | |||
check_lim(i, argc); | |||
o->output_file = argv[i++]; | |||
continue; | |||
} | |||
@@ -94,10 +144,33 @@ static void read_options(struct options * o, int argc, char * argv[]) { | |||
o->name = argv[i++]; | |||
continue; | |||
} | |||
#ifndef DISABLE_JS | |||
if (eq(s, "-js")) { | |||
o->make_lang = LANG_JAVASCRIPT; | |||
continue; | |||
} | |||
#endif | |||
#ifndef DISABLE_RUST | |||
if (eq(s, "-rust")) { | |||
o->make_lang = LANG_RUST; | |||
continue; | |||
} | |||
#endif | |||
#ifndef DISABLE_GO | |||
if (eq(s, "-go")) { | |||
o->make_lang = LANG_GO; | |||
continue; | |||
} | |||
#endif | |||
#ifndef DISABLE_JAVA | |||
if (eq(s, "-j") || eq(s, "-java")) { | |||
o->make_lang = LANG_JAVA; | |||
o->widechars = true; | |||
continue; | |||
} | |||
#endif | |||
#ifndef DISABLE_CSHARP | |||
if (eq(s, "-cs") || eq(s, "-csharp")) { | |||
o->make_lang = LANG_CSHARP; | |||
continue; | |||
} | |||
#endif | |||
@@ -105,15 +178,31 @@ static void read_options(struct options * o, int argc, char * argv[]) { | |||
o->make_lang = LANG_CPLUSPLUS; | |||
continue; | |||
} | |||
#ifndef DISABLE_PASCAL | |||
if (eq(s, "-pascal")) { | |||
o->make_lang = LANG_PASCAL; | |||
continue; | |||
} | |||
#endif | |||
#ifndef DISABLE_PYTHON | |||
if (eq(s, "-py") || eq(s, "-python")) { | |||
o->make_lang = LANG_PYTHON; | |||
continue; | |||
} | |||
#endif | |||
if (eq(s, "-w") || eq(s, "-widechars")) { | |||
o->widechars = true; | |||
o->utf8 = false; | |||
encoding_opt = s; | |||
o->encoding = ENC_WIDECHARS; | |||
continue; | |||
} | |||
if (eq(s, "-s") || eq(s, "-syntax")) { | |||
o->syntax_tree = true; | |||
continue; | |||
} | |||
if (eq(s, "-comments")) { | |||
o->comments = true; | |||
continue; | |||
} | |||
if (eq(s, "-ep") || eq(s, "-eprefix")) { | |||
check_lim(i, argc); | |||
o->externals_prefix = argv[i++]; | |||
@@ -145,16 +234,16 @@ static void read_options(struct options * o, int argc, char * argv[]) { | |||
continue; | |||
} | |||
if (eq(s, "-u") || eq(s, "-utf8")) { | |||
o->utf8 = true; | |||
o->widechars = false; | |||
encoding_opt = s; | |||
o->encoding = ENC_UTF8; | |||
continue; | |||
} | |||
#ifndef DISABLE_JAVA | |||
if (eq(s, "-p") || eq(s, "-parentclassname")) { | |||
check_lim(i, argc); | |||
o->parent_class_name = argv[i++]; | |||
continue; | |||
} | |||
#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) | |||
if (eq(s, "-P") || eq(s, "-Package")) { | |||
check_lim(i, argc); | |||
o->package = argv[i++]; | |||
@@ -171,44 +260,216 @@ static void read_options(struct options * o, int argc, char * argv[]) { | |||
continue; | |||
} | |||
#endif | |||
#ifndef DISABLE_GO | |||
if (eq(s, "-gop") || eq(s, "-gopackage")) { | |||
check_lim(i, argc); | |||
o->package = argv[i++]; | |||
continue; | |||
} | |||
if (eq(s, "-gor") || eq(s, "-goruntime")) { | |||
check_lim(i, argc); | |||
o->go_snowball_runtime = argv[i++]; | |||
continue; | |||
} | |||
#endif | |||
if (eq(s, "--help")) { | |||
print_arglist(0); | |||
} | |||
if (eq(s, "--version")) { | |||
printf("Snowball compiler version " SNOWBALL_VERSION "\n"); | |||
exit(0); | |||
} | |||
fprintf(stderr, "'%s' misplaced\n", s); | |||
print_arglist(); | |||
print_arglist(1); | |||
} | |||
} | |||
if (new_argc == 1) { | |||
fprintf(stderr, "no source files specified\n"); | |||
print_arglist(1); | |||
} | |||
argv[new_argc] = NULL; | |||
/* Set language-dependent defaults. */ | |||
switch (o->make_lang) { | |||
case LANG_C: | |||
case LANG_CPLUSPLUS: | |||
encoding_opt = NULL; | |||
break; | |||
case LANG_CSHARP: | |||
o->encoding = ENC_WIDECHARS; | |||
if (!o->parent_class_name) | |||
o->parent_class_name = DEFAULT_CS_BASE_CLASS; | |||
if (!o->string_class) | |||
o->string_class = DEFAULT_CS_STRING_CLASS; | |||
if (!o->among_class) | |||
o->among_class = DEFAULT_CS_AMONG_CLASS; | |||
if (!o->package) | |||
o->package = DEFAULT_CS_NAMESPACE; | |||
break; | |||
case LANG_GO: | |||
o->encoding = ENC_UTF8; | |||
if (!o->package) | |||
o->package = DEFAULT_GO_PACKAGE; | |||
break; | |||
case LANG_JAVA: | |||
o->encoding = ENC_WIDECHARS; | |||
if (!o->parent_class_name) | |||
o->parent_class_name = DEFAULT_JAVA_BASE_CLASS; | |||
if (!o->string_class) | |||
o->string_class = DEFAULT_JAVA_STRING_CLASS; | |||
if (!o->among_class) | |||
o->among_class = DEFAULT_JAVA_AMONG_CLASS; | |||
if (!o->package) | |||
o->package = DEFAULT_JAVA_PACKAGE; | |||
break; | |||
case LANG_JAVASCRIPT: | |||
o->encoding = ENC_WIDECHARS; | |||
if (!o->parent_class_name) | |||
o->parent_class_name = DEFAULT_JS_BASE_CLASS; | |||
break; | |||
case LANG_PYTHON: | |||
o->encoding = ENC_WIDECHARS; | |||
if (!o->parent_class_name) | |||
o->parent_class_name = DEFAULT_PYTHON_BASE_CLASS; | |||
break; | |||
case LANG_RUST: | |||
o->encoding = ENC_UTF8; | |||
break; | |||
default: | |||
break; | |||
} | |||
if (encoding_opt) { | |||
fprintf(stderr, "warning: %s only meaningful for C and C++\n", | |||
encoding_opt); | |||
} | |||
if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { | |||
if (o->runtime_path) { | |||
fprintf(stderr, "warning: -r/-runtime only meaningful for C and C++\n"); | |||
} | |||
if (o->externals_prefix) { | |||
fprintf(stderr, "warning: -ep/-eprefix only meaningful for C and C++\n"); | |||
} | |||
} | |||
if (!o->externals_prefix) o->externals_prefix = ""; | |||
if (!o->name && o->output_file) { | |||
/* Default class name to basename of output_file - this is the standard | |||
* convention for at least Java and C#. | |||
*/ | |||
const char * slash = strrchr(o->output_file, '/'); | |||
size_t len; | |||
const char * leaf = (slash == NULL) ? o->output_file : slash + 1; | |||
slash = strrchr(leaf, '\\'); | |||
if (slash != NULL) leaf = slash + 1; | |||
{ | |||
const char * dot = strchr(leaf, '.'); | |||
len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); | |||
} | |||
{ | |||
char * new_name = malloc(len + 1); | |||
switch (o->make_lang) { | |||
case LANG_CSHARP: | |||
case LANG_PASCAL: | |||
/* Upper case initial letter. */ | |||
memcpy(new_name, leaf, len); | |||
new_name[0] = toupper(new_name[0]); | |||
break; | |||
case LANG_JAVASCRIPT: | |||
case LANG_PYTHON: { | |||
/* Upper case initial letter and change each | |||
* underscore+letter or hyphen+letter to an upper case | |||
* letter. | |||
*/ | |||
size_t i, j = 0; | |||
int uc_next = true; | |||
for (i = 0; i != len; ++i) { | |||
unsigned char ch = leaf[i]; | |||
if (ch == '_' || ch == '-') { | |||
uc_next = true; | |||
} else { | |||
if (uc_next) { | |||
new_name[j] = toupper(ch); | |||
uc_next = false; | |||
} else { | |||
new_name[j] = ch; | |||
} | |||
++j; | |||
} | |||
} | |||
len = j; | |||
break; | |||
} | |||
default: | |||
/* Just copy. */ | |||
memcpy(new_name, leaf, len); | |||
break; | |||
} | |||
new_name[len] = '\0'; | |||
o->name = new_name; | |||
} | |||
} | |||
return new_argc; | |||
} | |||
extern int main(int argc, char * argv[]) { | |||
int i; | |||
NEW(options, o); | |||
if (argc == 1) print_arglist(); | |||
read_options(o, argc, argv); | |||
argc = read_options(o, argc, argv); | |||
{ | |||
symbol * filename = add_s_to_b(0, argv[1]); | |||
char * file; | |||
symbol * u = get_input(filename, &file); | |||
char * file = argv[1]; | |||
symbol * u = get_input(file); | |||
if (u == 0) { | |||
fprintf(stderr, "Can't open input %s\n", argv[1]); | |||
fprintf(stderr, "Can't open input %s\n", file); | |||
exit(1); | |||
} | |||
{ | |||
struct tokeniser * t = create_tokeniser(u, file); | |||
struct analyser * a = create_analyser(t); | |||
t->widechars = o->widechars; | |||
struct input ** next_input_ptr = &(t->next); | |||
a->encoding = t->encoding = o->encoding; | |||
t->includes = o->includes; | |||
a->utf8 = t->utf8 = o->utf8; | |||
/* If multiple source files are specified, set up the others to be | |||
* read after the first in order, using the same mechanism as | |||
* 'get' uses. */ | |||
for (i = 2; i != argc; ++i) { | |||
NEW(input, q); | |||
file = argv[i]; | |||
u = get_input(file); | |||
if (u == 0) { | |||
fprintf(stderr, "Can't open input %s\n", file); | |||
exit(1); | |||
} | |||
q->p = u; | |||
q->c = 0; | |||
q->file = file; | |||
q->file_needs_freeing = false; | |||
q->line_number = 1; | |||
*next_input_ptr = q; | |||
next_input_ptr = &(q->next); | |||
} | |||
*next_input_ptr = NULL; | |||
read_program(a); | |||
if (t->error_count > 0) exit(1); | |||
if (o->syntax_tree) print_program(a); | |||
close_tokeniser(t); | |||
unless (o->syntax_tree) { | |||
if (!o->syntax_tree) { | |||
struct generator * g; | |||
char * s = o->output_file; | |||
unless (s) { | |||
const char * s = o->output_file; | |||
if (!s) { | |||
fprintf(stderr, "Please include the -o option\n"); | |||
print_arglist(); | |||
exit(1); | |||
print_arglist(1); | |||
} | |||
g = create_generator(a, o); | |||
if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".h"); | |||
@@ -217,41 +478,96 @@ extern int main(int argc, char * argv[]) { | |||
if (o->make_lang == LANG_CPLUSPLUS) { | |||
b = add_s_to_b(b, "c"); | |||
} | |||
o->output_c = get_output(b); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
g = create_generator_c(a, o); | |||
generate_program_c(g); | |||
close_generator_c(g); | |||
fclose(o->output_c); | |||
fclose(o->output_src); | |||
fclose(o->output_h); | |||
} | |||
#ifndef DISABLE_JAVA | |||
if (o->make_lang == LANG_JAVA) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".java"); | |||
o->output_java = get_output(b); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
g = create_generator_java(a, o); | |||
generate_program_java(g); | |||
close_generator_java(g); | |||
fclose(o->output_java); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
#ifndef DISABLE_PASCAL | |||
if (o->make_lang == LANG_PASCAL) { | |||
symbol *b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".pas"); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
generate_program_pascal(g); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
#ifndef DISABLE_PYTHON | |||
if (o->make_lang == LANG_PYTHON) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".py"); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
generate_program_python(g); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
#ifndef DISABLE_JS | |||
if (o->make_lang == LANG_JAVASCRIPT) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".js"); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
generate_program_js(g); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
#ifndef DISABLE_CSHARP | |||
if (o->make_lang == LANG_CSHARP) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".cs"); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
generate_program_csharp(g); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
#ifndef DISABLE_RUST | |||
if (o->make_lang == LANG_RUST) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".rs"); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
generate_program_rust(g); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
#ifndef DISABLE_GO | |||
if (o->make_lang == LANG_GO) { | |||
symbol * b = add_s_to_b(0, s); | |||
b = add_s_to_b(b, ".go"); | |||
o->output_src = get_output(b); | |||
lose_b(b); | |||
generate_program_go(g); | |||
fclose(o->output_src); | |||
} | |||
#endif | |||
close_generator(g); | |||
} | |||
close_analyser(a); | |||
} | |||
lose_b(u); | |||
lose_b(filename); | |||
} | |||
{ struct include * p = o->includes; | |||
until (p == 0) | |||
{ struct include * q = p->next; | |||
while (p) { | |||
struct include * q = p->next; | |||
lose_b(p->b); FREE(p); p = q; | |||
} | |||
} | |||
FREE(o); | |||
unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count); | |||
if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); | |||
return 0; | |||
} | |||
@@ -1,49 +1,56 @@ | |||
#include <stdio.h> | |||
#define SNOWBALL_VERSION "2.0.0" | |||
typedef unsigned char byte; | |||
typedef unsigned short symbol; | |||
#define true 1 | |||
#define false 0 | |||
#define repeat while(true) | |||
#define unless(C) if(!(C)) | |||
#define until(C) while(!(C)) | |||
#define MALLOC check_malloc | |||
#define FREE check_free | |||
#define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) | |||
#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * n) | |||
#define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * (n)) | |||
#define STARTSIZE 10 | |||
#define SIZE(p) ((int *)(p))[-1] | |||
#define CAPACITY(p) ((int *)(p))[-2] | |||
extern symbol * create_b(int n); | |||
extern void report_b(FILE * out, symbol * p); | |||
extern void report_b(FILE * out, const symbol * p); | |||
extern void lose_b(symbol * p); | |||
extern symbol * increase_capacity(symbol * p, int n); | |||
extern symbol * move_to_b(symbol * p, int n, symbol * q); | |||
extern symbol * add_to_b(symbol * p, int n, symbol * q); | |||
extern symbol * copy_b(symbol * p); | |||
extern char * b_to_s(symbol * p); | |||
extern symbol * move_to_b(symbol * p, int n, const symbol * q); | |||
extern symbol * add_to_b(symbol * p, int n, const symbol * q); | |||
extern symbol * copy_b(const symbol * p); | |||
extern char * b_to_s(const symbol * p); | |||
extern symbol * add_s_to_b(symbol * p, const char * s); | |||
#define MOVE_TO_B(B, LIT) \ | |||
move_to_b(B, sizeof(LIT) / sizeof(LIT[0]), LIT) | |||
struct str; /* defined in space.c */ | |||
extern struct str * str_new(void); | |||
extern void str_delete(struct str * str); | |||
extern void str_append(struct str * str, struct str * add); | |||
extern void str_append(struct str * str, const struct str * add); | |||
extern void str_append_ch(struct str * str, char add); | |||
extern void str_append_b(struct str * str, symbol * q); | |||
extern void str_append_b(struct str * str, const symbol * q); | |||
extern void str_append_b_tail(struct str * str, const symbol * q, int skip); | |||
extern void str_append_string(struct str * str, const char * s); | |||
extern void str_append_int(struct str * str, int i); | |||
extern void str_clear(struct str * str); | |||
extern void str_assign(struct str * str, char * s); | |||
extern struct str * str_copy(struct str * old); | |||
extern symbol * str_data(struct str * str); | |||
extern int str_len(struct str * str); | |||
extern void str_assign(struct str * str, const char * s); | |||
extern struct str * str_copy(const struct str * old); | |||
extern symbol * str_data(const struct str * str); | |||
extern int str_len(const struct str * str); | |||
extern int str_back(const struct str *str); | |||
extern int get_utf8(const symbol * p, int * slot); | |||
extern int put_utf8(int ch, symbol * p); | |||
extern void output_str(FILE * outfile, struct str * str); | |||
typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc; | |||
struct m_pair { | |||
@@ -60,6 +67,7 @@ struct input { | |||
symbol * p; | |||
int c; | |||
char * file; | |||
int file_needs_freeing; | |||
int line_number; | |||
}; | |||
@@ -71,6 +79,28 @@ struct include { | |||
}; | |||
enum token_codes { | |||
#include "syswords2.h" | |||
c_mathassign, | |||
c_name, | |||
c_number, | |||
c_literalstring, | |||
c_neg, | |||
c_call, | |||
c_grouping, | |||
c_booltest, | |||
NUM_TOKEN_CODES | |||
}; | |||
enum uplus_modes { | |||
UPLUS_NONE, | |||
UPLUS_DEFINED, | |||
UPLUS_UNICODE | |||
}; | |||
/* struct input must be a prefix of struct tokeniser. */ | |||
struct tokeniser { | |||
@@ -78,6 +108,7 @@ struct tokeniser { | |||
symbol * p; | |||
int c; | |||
char * file; | |||
int file_needs_freeing; | |||
int line_number; | |||
symbol * b; | |||
symbol * b2; | |||
@@ -90,34 +121,28 @@ struct tokeniser { | |||
int token; | |||
int previous_token; | |||
byte token_held; | |||
byte widechars; | |||
byte utf8; | |||
enc encoding; | |||
int omission; | |||
struct include * includes; | |||
/* Mode in which U+ has been used: | |||
* UPLUS_NONE - not used yet | |||
* UPLUS_DEFINED - stringdef U+xxxx .... | |||
* UPLUS_UNICODE - {U+xxxx} used with implicit meaning | |||
*/ | |||
int uplusmode; | |||
char token_disabled[NUM_TOKEN_CODES]; | |||
}; | |||
extern symbol * get_input(symbol * p, char ** p_file); | |||
extern symbol * get_input(const char * filename); | |||
extern struct tokeniser * create_tokeniser(symbol * b, char * file); | |||
extern int read_token(struct tokeniser * t); | |||
extern const char * name_of_token(int code); | |||
extern void disable_token(struct tokeniser * t, int code); | |||
extern void close_tokeniser(struct tokeniser * t); | |||
enum token_codes { | |||
#include "syswords2.h" | |||
c_mathassign, | |||
c_name, | |||
c_number, | |||
c_literalstring, | |||
c_neg, | |||
c_call, | |||
c_grouping, | |||
c_booltest | |||
}; | |||
extern int space_count; | |||
extern void * check_malloc(int n); | |||
extern void check_free(void * p); | |||
@@ -134,7 +159,13 @@ struct name { | |||
int count; /* 0, 1, 2 for each type */ | |||
struct grouping * grouping; /* for grouping names */ | |||
byte referenced; | |||
byte used; | |||
byte used_in_among; /* Function used in among? */ | |||
byte value_used; /* (For variables) is its value ever used? */ | |||
byte initialised; /* (For variables) is it ever initialised? */ | |||
byte used_in_definition; /* (grouping) used in grouping definition? */ | |||
struct node * used; /* First use, or NULL if not used */ | |||
struct name * local_to; /* Local to one routine/external */ | |||
int declaration_line_number;/* Line number of declaration */ | |||
}; | |||
@@ -149,9 +180,10 @@ struct amongvec { | |||
symbol * b; /* the string giving the case */ | |||
int size; /* - and its size */ | |||
struct node * p; /* the corresponding command */ | |||
struct node * action; /* the corresponding action */ | |||
int i; /* the amongvec index of the longest substring of b */ | |||
int result; /* the numeric result for the case */ | |||
int line_number; /* for diagnostics and stable sorting */ | |||
struct name * function; | |||
}; | |||
@@ -163,19 +195,22 @@ struct among { | |||
int number; /* amongs are numbered 0, 1, 2 ... */ | |||
int literalstring_count; /* in this among */ | |||
int command_count; /* in this among */ | |||
int nocommand_count; /* number of "no command" entries in this among */ | |||
int function_count; /* in this among */ | |||
int amongvar_needed; /* do we need to set among_var? */ | |||
struct node * starter; /* i.e. among( (starter) 'string' ... ) */ | |||
struct node * substring; /* i.e. substring ... among ( ... ) */ | |||
struct node ** commands; /* array with command_count entries */ | |||
}; | |||
struct grouping { | |||
struct grouping * next; | |||
int number; /* groupings are numbered 0, 1, 2 ... */ | |||
symbol * b; /* the characters of this group */ | |||
int largest_ch; /* character with max code */ | |||
int smallest_ch; /* character with min code */ | |||
byte no_gaps; /* not used in generator.c after 11/5/05 */ | |||
struct name * name; /* so g->name->grouping == g */ | |||
int line_number; | |||
}; | |||
struct node { | |||
@@ -234,7 +269,8 @@ struct analyser { | |||
struct grouping * groupings; | |||
struct grouping * groupings_end; | |||
struct node * substring; /* pending 'substring' in current routine definition */ | |||
byte utf8; | |||
enc encoding; | |||
byte int_limits_used; /* are maxint or minint used? */ | |||
}; | |||
enum analyser_modes { | |||
@@ -259,16 +295,23 @@ struct generator { | |||
struct str * outbuf; /* temporary str to store output */ | |||
struct str * declarations; /* str storing variable declarations */ | |||
int next_label; | |||
#ifndef DISABLE_PYTHON | |||
int max_label; | |||
#endif | |||
int margin; | |||
const char * failure_string; /* String to output in case of a failure. */ | |||
#ifndef DISABLE_JAVA | |||
struct str * failure_str; /* This is used by the java generator instead of failure_string */ | |||
/* if > 0, keep_count to restore in case of a failure; | |||
* if < 0, the negated keep_count for the limit to restore in case of | |||
* failure. */ | |||
int failure_keep_count; | |||
#if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP) | |||
struct str * failure_str; /* This is used by some generators instead of failure_keep_count */ | |||
#endif | |||
int label_used; /* Keep track of whether the failure label is used. */ | |||
int failure_label; | |||
int debug_count; | |||
int copy_from_count; /* count of calls to copy_from() */ | |||
const char * S[10]; /* strings */ | |||
symbol * B[10]; /* blocks */ | |||
@@ -277,48 +320,92 @@ struct generator { | |||
symbol * L[5]; /* literals, used in formatted write */ | |||
int line_count; /* counts number of lines output */ | |||
int line_labelled; /* in ANSI C, will need extra ';' if it is a block end */ | |||
int line_labelled; /* in ISO C, will need extra ';' if it is a block end */ | |||
int literalstring_count; | |||
int keep_count; /* used to number keep/restore pairs to avoid compiler warnings | |||
about shadowed variables */ | |||
}; | |||
/* Special values for failure_label in struct generator. */ | |||
enum special_labels { | |||
x_return = -1 | |||
}; | |||
struct options { | |||
/* for the command line: */ | |||
char * output_file; | |||
char * name; | |||
FILE * output_c; | |||
const char * output_file; | |||
const char * name; | |||
FILE * output_src; | |||
FILE * output_h; | |||
#ifndef DISABLE_JAVA | |||
FILE * output_java; | |||
#endif | |||
byte syntax_tree; | |||
byte widechars; | |||
enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang; | |||
char * externals_prefix; | |||
char * variables_prefix; | |||
char * runtime_path; | |||
char * parent_class_name; | |||
char * package; | |||
char * string_class; | |||
char * among_class; | |||
byte comments; | |||
enc encoding; | |||
enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PASCAL, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang; | |||
const char * externals_prefix; | |||
const char * variables_prefix; | |||
const char * runtime_path; | |||
const char * parent_class_name; | |||
const char * package; | |||
const char * go_snowball_runtime; | |||
const char * string_class; | |||
const char * among_class; | |||
struct include * includes; | |||
struct include * includes_end; | |||
byte utf8; | |||
}; | |||
/* Generator for C code. */ | |||
extern struct generator * create_generator_c(struct analyser * a, struct options * o); | |||
extern void close_generator_c(struct generator * g); | |||
/* Generator functions common to several backends. */ | |||
extern struct generator * create_generator(struct analyser * a, struct options * o); | |||
extern void close_generator(struct generator * g); | |||
extern void write_char(struct generator * g, int ch); | |||
extern void write_newline(struct generator * g); | |||
extern void write_string(struct generator * g, const char * s); | |||
extern void write_int(struct generator * g, int i); | |||
extern void write_b(struct generator * g, symbol * b); | |||
extern void write_str(struct generator * g, struct str * str); | |||
extern void write_comment_content(struct generator * g, struct node * p); | |||
extern void write_generated_comment_content(struct generator * g); | |||
extern void write_start_comment(struct generator * g, | |||
const char * comment_start, | |||
const char * comment_end); | |||
extern int K_needed(struct generator * g, struct node * p); | |||
extern int repeat_restore(struct generator * g, struct node * p); | |||
/* Generator for C code. */ | |||
extern void generate_program_c(struct generator * g); | |||
#ifndef DISABLE_JAVA | |||
/* Generator for Java code. */ | |||
extern struct generator * create_generator_java(struct analyser * a, struct options * o); | |||
extern void close_generator_java(struct generator * g); | |||
extern void generate_program_java(struct generator * g); | |||
#endif | |||
#ifndef DISABLE_CSHARP | |||
/* Generator for C# code. */ | |||
extern void generate_program_csharp(struct generator * g); | |||
#endif | |||
#ifndef DISABLE_PASCAL | |||
extern void generate_program_pascal(struct generator * g); | |||
#endif | |||
#ifndef DISABLE_PYTHON | |||
/* Generator for Python code. */ | |||
extern void generate_program_python(struct generator * g); | |||
#endif | |||
#ifndef DISABLE_JS | |||
extern void generate_program_js(struct generator * g); | |||
#endif | |||
#ifndef DISABLE_RUST | |||
extern void generate_program_rust(struct generator * g); | |||
#endif | |||
#ifndef DISABLE_GO | |||
extern void generate_program_go(struct generator * g); | |||
#endif |
@@ -57,9 +57,19 @@ extern symbol * create_b(int n) { | |||
return p; | |||
} | |||
extern void report_b(FILE * out, symbol * p) { | |||
extern void report_b(FILE * out, const symbol * p) { | |||
int i; | |||
for (i = 0; i < SIZE(p); i++) fprintf(out, "%c", p[i]); | |||
for (i = 0; i < SIZE(p); i++) { | |||
if (p[i] > 255) { | |||
printf("In report_b, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); | |||
exit(1); | |||
} | |||
putc(p[i], out); | |||
} | |||
} | |||
extern void output_str(FILE * outfile, struct str * str) { | |||
report_b(outfile, str_data(str)); | |||
} | |||
extern void lose_b(symbol * p) { | |||
@@ -74,19 +84,19 @@ extern symbol * increase_capacity(symbol * p, int n) { | |||
lose_b(p); return q; | |||
} | |||
extern symbol * move_to_b(symbol * p, int n, symbol * q) { | |||
extern symbol * move_to_b(symbol * p, int n, const symbol * q) { | |||
int x = n - CAPACITY(p); | |||
if (x > 0) p = increase_capacity(p, x); | |||
memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p; | |||
} | |||
extern symbol * add_to_b(symbol * p, int n, symbol * q) { | |||
extern symbol * add_to_b(symbol * p, int n, const symbol * q) { | |||
int x = SIZE(p) + n - CAPACITY(p); | |||
if (x > 0) p = increase_capacity(p, x); | |||
memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; | |||
} | |||
extern symbol * copy_b(symbol * p) { | |||
extern symbol * copy_b(const symbol * p) { | |||
int n = SIZE(p); | |||
symbol * q = create_b(n); | |||
move_to_b(q, n, p); | |||
@@ -97,7 +107,7 @@ int space_count = 0; | |||
extern void * check_malloc(int n) { | |||
space_count++; | |||
return calloc(1, n); | |||
return malloc(n); | |||
} | |||
extern void check_free(void * p) { | |||
@@ -107,18 +117,18 @@ extern void check_free(void * p) { | |||
/* To convert a block to a zero terminated string: */ | |||
extern char * b_to_s(symbol * p) { | |||
extern char * b_to_s(const symbol * p) { | |||
int n = SIZE(p); | |||
char * s = (char *)calloc(1, n + 1); | |||
char * s = (char *)malloc(n + 1); | |||
{ | |||
int i; | |||
for (i = 0; i < n; i++) { | |||
if (p[i] > 255) { | |||
printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); | |||
exit(1); | |||
} | |||
s[i] = (char)p[i]; | |||
} | |||
if (p[i] > 255) { | |||
printf("In b_to_s, can't convert p[%d] to char because it's 0x%02x\n", i, (int)p[i]); | |||
exit(1); | |||
} | |||
s[i] = (char)p[i]; | |||
} | |||
} | |||
s[n] = 0; | |||
return s; | |||
@@ -153,9 +163,9 @@ struct str { | |||
}; | |||
/* Create a new string. */ | |||
extern struct str * str_new() { | |||
extern struct str * str_new(void) { | |||
struct str * output = (struct str *) calloc(1, sizeof(struct str)); | |||
struct str * output = (struct str *) malloc(sizeof(struct str)); | |||
output->data = create_b(0); | |||
return output; | |||
} | |||
@@ -168,7 +178,7 @@ extern void str_delete(struct str * str) { | |||
} | |||
/* Append a str to this str. */ | |||
extern void str_append(struct str * str, struct str * add) { | |||
extern void str_append(struct str * str, const struct str * add) { | |||
symbol * q = add->data; | |||
str->data = add_to_b(str->data, SIZE(q), q); | |||
@@ -183,12 +193,19 @@ extern void str_append_ch(struct str * str, char add) { | |||
} | |||
/* Append a low level block to a str. */ | |||
extern void str_append_b(struct str * str, symbol * q) { | |||
extern void str_append_b(struct str * str, const symbol * q) { | |||
str->data = add_to_b(str->data, SIZE(q), q); | |||
} | |||
/* Append a (char *, null teminated) string to a str. */ | |||
/* Append the tail of a low level block to a str. */ | |||
extern void str_append_b_tail(struct str * str, const symbol * q, int skip) { | |||
if (skip < 0 || skip >= SIZE(q)) return; | |||
str->data = add_to_b(str->data, SIZE(q) - skip, q + skip); | |||
} | |||
/* Append a (char *, null terminated) string to a str. */ | |||
extern void str_append_string(struct str * str, const char * s) { | |||
str->data = add_s_to_b(str->data, s); | |||
@@ -209,14 +226,14 @@ extern void str_clear(struct str * str) { | |||
} | |||
/* Set a string */ | |||
extern void str_assign(struct str * str, char * s) { | |||
extern void str_assign(struct str * str, const char * s) { | |||
str_clear(str); | |||
str_append_string(str, s); | |||
} | |||
/* Copy a string. */ | |||
extern struct str * str_copy(struct str * old) { | |||
extern struct str * str_copy(const struct str * old) { | |||
struct str * newstr = str_new(); | |||
str_append(newstr, old); | |||
@@ -224,17 +241,25 @@ extern struct str * str_copy(struct str * old) { | |||
} | |||
/* Get the data stored in this str. */ | |||
extern symbol * str_data(struct str * str) { | |||
extern symbol * str_data(const struct str * str) { | |||
return str->data; | |||
} | |||
/* Get the length of the str. */ | |||
extern int str_len(struct str * str) { | |||
extern int str_len(const struct str * str) { | |||
return SIZE(str->data); | |||
} | |||
/* Get the last character of the str. | |||
* | |||
* Or -1 if the string is empty. | |||
*/ | |||
extern int str_back(const struct str *str) { | |||
return SIZE(str->data) ? str->data[SIZE(str->data) - 1] : -1; | |||
} | |||
extern int get_utf8(const symbol * p, int * slot) { | |||
int b0, b1; | |||
b0 = *p++; | |||
@@ -260,4 +285,3 @@ extern int put_utf8(int ch, symbol * p) { | |||
p[1] = ((ch >> 6) & 0x3F) | 0x80; | |||
p[2] = (ch & 0x3F) | 0x80; return 3; | |||
} | |||
@@ -1,5 +1,5 @@ | |||
static const struct system_word vocab[80+1] = { | |||
{ 0, (const byte *)"", 80+1}, | |||
static const struct system_word vocab[82+1] = { | |||
{ 0, (const byte *)"", 82+1}, | |||
{ 1, (const byte *)"$", c_dollar }, | |||
{ 1, (const byte *)"(", c_bra }, | |||
@@ -36,6 +36,7 @@ static const struct system_word vocab[80+1] = { | |||
{ 3, (const byte *)"get", c_get }, | |||
{ 3, (const byte *)"hex", c_hex }, | |||
{ 3, (const byte *)"hop", c_hop }, | |||
{ 3, (const byte *)"len", c_len }, | |||
{ 3, (const byte *)"non", c_non }, | |||
{ 3, (const byte *)"not", c_not }, | |||
{ 3, (const byte *)"set", c_set }, | |||
@@ -49,6 +50,7 @@ static const struct system_word vocab[80+1] = { | |||
{ 4, (const byte *)"true", c_true }, | |||
{ 5, (const byte *)"among", c_among }, | |||
{ 5, (const byte *)"false", c_false }, | |||
{ 5, (const byte *)"lenof", c_lenof }, | |||
{ 5, (const byte *)"limit", c_limit }, | |||
{ 5, (const byte *)"unset", c_unset }, | |||
{ 6, (const byte *)"atmark", c_atmark }, |
@@ -4,8 +4,8 @@ | |||
c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, | |||
c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get, | |||
c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert, | |||
c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls, | |||
c_maxint, c_minint, c_minus, c_minusassign, c_multiply, | |||
c_integers, c_ket, c_le, c_leftslice, c_len, c_lenof, c_limit, c_loop, | |||
c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply, | |||
c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus, | |||
c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, | |||
c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, |
@@ -16,57 +16,57 @@ struct system_word { | |||
#include "syswords.h" | |||
static int smaller(int a, int b) { return a < b ? a : b; } | |||
#define INITIAL_INPUT_BUFFER_SIZE 8192 | |||
static int hex_to_num(int ch); | |||
extern symbol * get_input(symbol * p, char ** p_file) { | |||
static int smaller(int a, int b) { return a < b ? a : b; } | |||
char * s = b_to_s(p); | |||
extern symbol * get_input(const char * filename) { | |||
FILE * input = fopen(filename, "r"); | |||
if (input == 0) { return 0; } | |||
{ | |||
FILE * input = fopen(s, "r"); | |||
if (input == 0) { free(s); return 0; } | |||
*p_file = s; | |||
{ | |||
symbol * u = create_b(STARTSIZE); | |||
int size = 0; | |||
repeat | |||
{ int ch = getc(input); | |||
if (ch == EOF) break; | |||
if (size >= CAPACITY(u)) u = increase_capacity(u, size/2); | |||
u[size++] = ch; | |||
} | |||
fclose(input); | |||
SIZE(u) = size; return u; | |||
symbol * u = create_b(INITIAL_INPUT_BUFFER_SIZE); | |||
int size = 0; | |||
while (true) { | |||
int ch = getc(input); | |||
if (ch == EOF) break; | |||
if (size >= CAPACITY(u)) u = increase_capacity(u, size); | |||
u[size++] = ch; | |||
} | |||
fclose(input); | |||
SIZE(u) = size; | |||
return u; | |||
} | |||
} | |||
static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) { | |||
static void error(struct tokeniser * t, const char * s1, int n, symbol * p, const char * s2) { | |||
if (t->error_count == 20) { fprintf(stderr, "... etc\n"); exit(1); } | |||
fprintf(stderr, "%s:%d: ", t->file, t->line_number); | |||
unless (s1 == 0) fprintf(stderr, "%s", s1); | |||
unless (p == 0) { | |||
if (s1) fprintf(stderr, "%s", s1); | |||
if (p) { | |||
int i; | |||
for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); | |||
} | |||
unless (s2 == 0) fprintf(stderr, "%s", s2); | |||
if (s2) fprintf(stderr, "%s", s2); | |||
fprintf(stderr, "\n"); | |||
t->error_count++; | |||
} | |||
static void error1(struct tokeniser * t, char * s) { | |||
static void error1(struct tokeniser * t, const char * s) { | |||
error(t, s, 0,0, 0); | |||
} | |||
static void error2(struct tokeniser * t, char * s) { | |||
static void error2(struct tokeniser * t, const char * s) { | |||
error(t, "unexpected end of text after ", 0,0, s); | |||
} | |||
static int compare_words(int m, symbol * p, int n, const byte * q) { | |||
unless (m == n) return m - n; | |||
if (m != n) return m - n; | |||
{ | |||
int i; for (i = 0; i < n; i++) { | |||
int diff = p[i] - q[i]; | |||
unless (diff == 0) return diff; | |||
if (diff) return diff; | |||
} | |||
} | |||
return 0; | |||
@@ -74,14 +74,13 @@ static int compare_words(int m, symbol * p, int n, const byte * q) { | |||
static int find_word(int n, symbol * p) { | |||
int i = 0; int j = vocab->code; | |||
repeat { | |||
do { | |||
int k = i + (j - i)/2; | |||
const struct system_word * w = vocab + k; | |||
int diff = compare_words(n, p, w->s_size, w->s); | |||
if (diff == 0) return w->code; | |||
if (diff < 0) j = k; else i = k; | |||
if (j - i == 1) break; | |||
} | |||
} while (j - i != 1); | |||
return -1; | |||
} | |||
@@ -91,7 +90,7 @@ static int get_number(int n, symbol * p) { | |||
return x; | |||
} | |||
static int eq_s(struct tokeniser * t, char * s) { | |||
static int eq_s(struct tokeniser * t, const char * s) { | |||
int l = strlen(s); | |||
if (SIZE(t->p) - t->c < l) return false; | |||
{ | |||
@@ -103,64 +102,141 @@ static int eq_s(struct tokeniser * t, char * s) { | |||
static int white_space(struct tokeniser * t, int ch) { | |||
switch (ch) { | |||
case '\n': t->line_number++; | |||
case '\n': | |||
t->line_number++; | |||
/* fall through */ | |||
case '\r': | |||
case '\t': | |||
case ' ': return true; | |||
case ' ': | |||
return true; | |||
} | |||
return false; | |||
} | |||
static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) { | |||
struct m_pair * q = t->m_pairs; | |||
repeat { | |||
if (q == 0) return 0; | |||
{ | |||
symbol * name = q->name; | |||
if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; | |||
} | |||
q = q->next; | |||
struct m_pair * q; | |||
for (q = t->m_pairs; q; q = q->next) { | |||
symbol * name = q->name; | |||
if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; | |||
} | |||
return 0; | |||
} | |||
static int read_literal_string(struct tokeniser * t, int c) { | |||
symbol * p = t->p; | |||
int ch; | |||
SIZE(t->b) = 0; | |||
repeat { | |||
while (true) { | |||
if (c >= SIZE(p)) { error2(t, "'"); return c; } | |||
ch = p[c]; | |||
if (ch == '\n') { error1(t, "string not terminated"); return c; } | |||
c++; | |||
if (ch == t->m_start) { | |||
/* Inside insert characters. */ | |||
int c0 = c; | |||
int newlines = false; /* no newlines as yet */ | |||
int black_found = false; /* no printing chars as yet */ | |||
repeat { | |||
while (true) { | |||
if (c >= SIZE(p)) { error2(t, "'"); return c; } | |||
ch = p[c]; c++; | |||
if (ch == t->m_end) break; | |||
unless (white_space(t, ch)) black_found = true; | |||
if (!white_space(t, ch)) black_found = true; | |||
if (ch == '\n') newlines = true; | |||
if (newlines && black_found) { | |||
error1(t, "string not terminated"); | |||
return c; | |||
} | |||
} | |||
unless (newlines) { | |||
if (!newlines) { | |||
int n = c - c0 - 1; /* macro size */ | |||
int firstch = p[c0]; | |||
symbol * q = find_in_m(t, n, p + c0); | |||
if (q == 0) { | |||
if (n == 1 && (firstch == '\'' || firstch == t->m_start)) | |||
t->b = add_to_b(t->b, 1, p + c0); | |||
else | |||
else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') { | |||
int codepoint = 0; | |||
int x; | |||
if (t->uplusmode == UPLUS_DEFINED) { | |||
/* See if found with xxxx upper-cased. */ | |||
symbol * uc = create_b(n); | |||
int i; | |||
for (i = 0; i != n; ++i) { | |||
uc[i] = toupper(p[c0 + i]); | |||
} | |||
q = find_in_m(t, n, uc); | |||
lose_b(uc); | |||
if (q != 0) { | |||
t->b = add_to_b(t->b, SIZE(q), q); | |||
continue; | |||
} | |||
error1(t, "Some U+xxxx stringdefs seen but not this one"); | |||
} else { | |||
t->uplusmode = UPLUS_UNICODE; | |||
} | |||
for (x = c0 + 2; x != c - 1; ++x) { | |||
int hex = hex_to_num(p[x]); | |||
if (hex < 0) { | |||
error1(t, "Bad hex digit following U+"); | |||
break; | |||
} | |||
codepoint = (codepoint << 4) | hex; | |||
} | |||
if (t->encoding == ENC_UTF8) { | |||
if (codepoint < 0 || codepoint > 0x01ffff) { | |||
error1(t, "character values exceed 0x01ffff"); | |||
} | |||
/* Ensure there's enough space for a max length | |||
* UTF-8 sequence. */ | |||
if (CAPACITY(t->b) < SIZE(t->b) + 3) { | |||
t->b = increase_capacity(t->b, 3); | |||
} | |||
SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b)); | |||
} else { | |||
symbol sym; | |||
if (t->encoding == ENC_SINGLEBYTE) { | |||
/* Only ISO-8859-1 is handled this way - for | |||
* other single-byte character sets you need | |||
* stringdef all the U+xxxx codes you use | |||
* like - e.g.: | |||
* | |||
* stringdef U+0171 hex 'FB' | |||
*/ | |||
if (codepoint < 0 || codepoint > 0xff) { | |||
error1(t, "character values exceed 256"); | |||
} | |||
} else { | |||
if (codepoint < 0 || codepoint > 0xffff) { | |||
error1(t, "character values exceed 64K"); | |||
} | |||
} | |||
sym = codepoint; | |||
t->b = add_to_b(t->b, 1, &sym); | |||
} | |||
} else | |||
error(t, "string macro '", n, p + c0, "' undeclared"); | |||
} else | |||
t->b = add_to_b(t->b, SIZE(q), q); | |||
} | |||
} else { | |||
if (ch == '\'') return c; | |||
if (ch < 0 || ch >= 0x80) { | |||
if (t->encoding != ENC_WIDECHARS) { | |||
/* We don't really want people using non-ASCII literal | |||
* strings, but historically it's worked for single-byte | |||
* and UTF-8 if the source encoding matches what the | |||
* generated stemmer works in and it seems unfair to just | |||
* suddenly make this a hard error. | |||
*/ | |||
fprintf(stderr, | |||
"%s:%d: warning: Non-ASCII literal strings aren't " | |||
"portable - use stringdef instead\n", | |||
t->file, t->line_number); | |||
} else { | |||
error1(t, "Non-ASCII literal strings aren't " | |||
"portable - use stringdef instead"); | |||
} | |||
} | |||
t->b = add_to_b(t->b, 1, p + c - 1); | |||
} | |||
} | |||
@@ -171,7 +247,7 @@ static int next_token(struct tokeniser * t) { | |||
int c = t->c; | |||
int ch; | |||
int code = -1; | |||
repeat { | |||
while (true) { | |||
if (c >= SIZE(p)) { t->c = c; return -1; } | |||
ch = p[c]; | |||
if (white_space(t, ch)) { c++; continue; } | |||
@@ -179,7 +255,7 @@ static int next_token(struct tokeniser * t) { | |||
int c0 = c; | |||
while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; | |||
code = find_word(c - c0, p + c0); | |||
if (code < 0) { | |||
if (code < 0 || t->token_disabled[code]) { | |||
t->b = move_to_b(t->b, c - c0, p + c0); | |||
code = c_name; | |||
} | |||
@@ -218,10 +294,9 @@ static int next_char(struct tokeniser * t) { | |||
} | |||
static int next_real_char(struct tokeniser * t) { | |||
repeat { | |||
while (true) { | |||
int ch = next_char(t); | |||
if (white_space(t, ch)) continue; | |||
return ch; | |||
if (!white_space(t, ch)) return ch; | |||
} | |||
} | |||
@@ -230,7 +305,7 @@ static void read_chars(struct tokeniser * t) { | |||
if (ch < 0) { error2(t, "stringdef"); return; } | |||
{ | |||
int c0 = t->c-1; | |||
repeat { | |||
while (true) { | |||
ch = next_char(t); | |||
if (white_space(t, ch) || ch < 0) break; | |||
} | |||
@@ -246,19 +321,20 @@ static int decimal_to_num(int ch) { | |||
static int hex_to_num(int ch) { | |||
if ('0' <= ch && ch <= '9') return ch - '0'; | |||
if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; | |||
if ('A' <= ch && ch <= 'F') return ch - 'A' + 10; | |||
return -1; | |||
} | |||
static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { | |||
int c = 0; int d = 0; | |||
repeat { | |||
while (true) { | |||
while (c < SIZE(p) && p[c] == ' ') c++; | |||
if (c == SIZE(p)) break; | |||
{ | |||
int number = 0; | |||
repeat { | |||
while (c != SIZE(p)) { | |||
int ch = p[c]; | |||
if (c == SIZE(p) || ch == ' ') break; | |||
if (ch == ' ') break; | |||
if (base == 10) { | |||
ch = decimal_to_num(ch); | |||
if (ch < 0) { | |||
@@ -266,7 +342,7 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { | |||
return; | |||
} | |||
} else { | |||
ch = hex_to_num(tolower(ch)); | |||
ch = hex_to_num(ch); | |||
if (ch < 0) { | |||
error1(t, "hex string contains non-hex characters"); | |||
return; | |||
@@ -275,18 +351,18 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { | |||
number = base * number + ch; | |||
c++; | |||
} | |||
if (t->widechars || t->utf8) { | |||
unless (0 <= number && number <= 0xffff) { | |||
error1(t, "character values exceed 64K"); | |||
if (t->encoding == ENC_SINGLEBYTE) { | |||
if (number < 0 || number > 0xff) { | |||
error1(t, "character values exceed 256"); | |||
return; | |||
} | |||
} else { | |||
unless (0 <= number && number <= 0xff) { | |||
error1(t, "character values exceed 256"); | |||
if (number < 0 || number > 0xffff) { | |||
error1(t, "character values exceed 64K"); | |||
return; | |||
} | |||
} | |||
if (t->utf8) | |||
if (t->encoding == ENC_UTF8) | |||
d += put_utf8(number, p + d); | |||
else | |||
p[d++] = number; | |||
@@ -300,104 +376,118 @@ extern int read_token(struct tokeniser * t) { | |||
int held = t->token_held; | |||
t->token_held = false; | |||
if (held) return t->token; | |||
repeat { | |||
while (true) { | |||
int code = next_token(t); | |||
switch (code) { | |||
case c_comment1: /* slash-slash comment */ | |||
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; | |||
continue; | |||
while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; | |||
continue; | |||
case c_comment2: /* slash-star comment */ | |||
repeat { | |||
if (t->c >= SIZE(p)) { | |||
error1(t, "/* comment not terminated"); | |||
t->token = -1; | |||
return -1; | |||
} | |||
if (p[t->c] == '\n') t->line_number++; | |||
if (eq_s(t, "*/")) break; | |||
t->c++; | |||
} | |||
continue; | |||
case c_stringescapes: | |||
{ | |||
int ch1 = next_real_char(t); | |||
int ch2 = next_real_char(t); | |||
if (ch2 < 0) | |||
{ error2(t, "stringescapes"); continue; } | |||
if (ch1 == '\'') | |||
{ error1(t, "first stringescape cannot be '"); continue; } | |||
t->m_start = ch1; | |||
t->m_end = ch2; | |||
} | |||
continue; | |||
case c_stringdef: | |||
{ | |||
int base = 0; | |||
read_chars(t); | |||
code = read_token(t); | |||
if (code == c_hex) { base = 16; code = read_token(t); } else | |||
if (code == c_decimal) { base = 10; code = read_token(t); } | |||
unless (code == c_literalstring) | |||
{ error1(t, "string omitted after stringdef"); continue; } | |||
if (base > 0) convert_numeric_string(t, t->b, base); | |||
{ NEW(m_pair, q); | |||
q->next = t->m_pairs; | |||
q->name = copy_b(t->b2); | |||
q->value = copy_b(t->b); | |||
t->m_pairs = q; | |||
} | |||
} | |||
continue; | |||
while (true) { | |||
if (t->c >= SIZE(p)) { | |||
error1(t, "/* comment not terminated"); | |||
t->token = -1; | |||
return -1; | |||
} | |||
if (p[t->c] == '\n') t->line_number++; | |||
if (eq_s(t, "*/")) break; | |||
t->c++; | |||
} | |||
continue; | |||
case c_stringescapes: { | |||
int ch1 = next_real_char(t); | |||
int ch2 = next_real_char(t); | |||
if (ch2 < 0) { | |||
error2(t, "stringescapes"); | |||
continue; | |||
} | |||
if (ch1 == '\'') { | |||
error1(t, "first stringescape cannot be '"); | |||
continue; | |||
} | |||
t->m_start = ch1; | |||
t->m_end = ch2; | |||
continue; | |||
} | |||
case c_stringdef: { | |||
int base = 0; | |||
read_chars(t); | |||
code = read_token(t); | |||
if (code == c_hex) { base = 16; code = read_token(t); } else | |||
if (code == c_decimal) { base = 10; code = read_token(t); } | |||
if (code != c_literalstring) { | |||
error1(t, "string omitted after stringdef"); | |||
continue; | |||
} | |||
if (base > 0) convert_numeric_string(t, t->b, base); | |||
{ NEW(m_pair, q); | |||
q->next = t->m_pairs; | |||
q->name = copy_b(t->b2); | |||
q->value = copy_b(t->b); | |||
t->m_pairs = q; | |||
if (t->uplusmode != UPLUS_DEFINED && | |||
(SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) { | |||
if (t->uplusmode == UPLUS_UNICODE) { | |||
error1(t, "U+xxxx already used with implicit meaning"); | |||
} else { | |||
t->uplusmode = UPLUS_DEFINED; | |||
} | |||
} | |||
} | |||
continue; | |||
} | |||
case c_get: | |||
code = read_token(t); | |||
unless (code == c_literalstring) { | |||
error1(t, "string omitted after get"); continue; | |||
} | |||
t->get_depth++; | |||
if (t->get_depth > 10) { | |||
fprintf(stderr, "get directives go 10 deep. Looping?\n"); | |||
exit(1); | |||
} | |||
{ | |||
char * file; | |||
NEW(input, q); | |||
symbol * u = get_input(t->b, &file); | |||
if (u == 0) { | |||
struct include * r = t->includes; | |||
until (r == 0) { | |||
symbol * b = copy_b(r->b); | |||
b = add_to_b(b, SIZE(t->b), t->b); | |||
u = get_input(b, &file); | |||
lose_b(b); | |||
unless (u == 0) break; | |||
r = r->next; | |||
} | |||
} | |||
if (u == 0) { | |||
error(t, "Can't get '", SIZE(t->b), t->b, "'"); | |||
exit(1); | |||
} | |||
memmove(q, t, sizeof(struct input)); | |||
t->next = q; | |||
t->p = u; | |||
t->c = 0; | |||
t->file = file; | |||
t->line_number = 1; | |||
} | |||
p = t->p; | |||
continue; | |||
code = read_token(t); | |||
if (code != c_literalstring) { | |||
error1(t, "string omitted after get"); continue; | |||
} | |||
t->get_depth++; | |||
if (t->get_depth > 10) { | |||
fprintf(stderr, "get directives go 10 deep. Looping?\n"); | |||
exit(1); | |||
} | |||
{ | |||
NEW(input, q); | |||
char * file = b_to_s(t->b); | |||
symbol * u = get_input(file); | |||
if (u == 0) { | |||
struct include * r; | |||
for (r = t->includes; r; r = r->next) { | |||
symbol * b = copy_b(r->b); | |||
b = add_to_b(b, SIZE(t->b), t->b); | |||
free(file); | |||
file = b_to_s(b); | |||
u = get_input(file); | |||
lose_b(b); | |||
if (u != 0) break; | |||
} | |||
} | |||
if (u == 0) { | |||
error(t, "Can't get '", SIZE(t->b), t->b, "'"); | |||
exit(1); | |||
} | |||
memmove(q, t, sizeof(struct input)); | |||
t->next = q; | |||
t->p = u; | |||
t->c = 0; | |||
t->file = file; | |||
t->file_needs_freeing = true; | |||
t->line_number = 1; | |||
} | |||
p = t->p; | |||
continue; | |||
case -1: | |||
unless (t->next == 0) { | |||
lose_b(p); | |||
{ | |||
struct input * q = t->next; | |||
memmove(t, q, sizeof(struct input)); p = t->p; | |||
FREE(q); | |||
} | |||
t->get_depth--; | |||
continue; | |||
} | |||
/* drop through */ | |||
if (t->next) { | |||
lose_b(p); | |||
{ | |||
struct input * q = t->next; | |||
memmove(t, q, sizeof(struct input)); p = t->p; | |||
FREE(q); | |||
} | |||
t->get_depth--; | |||
continue; | |||
} | |||
/* fall through */ | |||
default: | |||
t->previous_token = t->token; | |||
t->token = code; | |||
@@ -425,12 +515,17 @@ extern const char * name_of_token(int code) { | |||
} | |||
} | |||
extern void disable_token(struct tokeniser * t, int code) { | |||
t->token_disabled[code] = 1; | |||
} | |||
extern struct tokeniser * create_tokeniser(symbol * p, char * file) { | |||
NEW(tokeniser, t); | |||
t->next = 0; | |||
t->p = p; | |||
t->c = 0; | |||
t->file = file; | |||
t->file_needs_freeing = false; | |||
t->line_number = 1; | |||
t->b = create_b(0); | |||
t->b2 = create_b(0); | |||
@@ -441,6 +536,8 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) { | |||
t->token_held = false; | |||
t->token = -2; | |||
t->previous_token = -2; | |||
t->uplusmode = UPLUS_NONE; | |||
memset(t->token_disabled, 0, sizeof(t->token_disabled)); | |||
return t; | |||
} | |||
@@ -449,7 +546,7 @@ extern void close_tokeniser(struct tokeniser * t) { | |||
lose_b(t->b2); | |||
{ | |||
struct m_pair * q = t->m_pairs; | |||
until (q == 0) { | |||
while (q) { | |||
struct m_pair * q_next = q->next; | |||
lose_b(q->name); | |||
lose_b(q->value); | |||
@@ -459,12 +556,12 @@ extern void close_tokeniser(struct tokeniser * t) { | |||
} | |||
{ | |||
struct input * q = t->next; | |||
until (q == 0) { | |||
while (q) { | |||
struct input * q_next = q->next; | |||
FREE(q); | |||
q = q_next; | |||
} | |||
} | |||
free(t->file); | |||
if (t->file_needs_freeing) free(t->file); | |||
FREE(t); | |||
} |
@@ -1,209 +0,0 @@ | |||
/* This is a simple program which uses libstemmer to provide a command | |||
* line interface for stemming using any of the algorithms provided. | |||
*/ | |||
#include <stdio.h> | |||
#include <stdlib.h> /* for malloc, free */ | |||
#include <string.h> /* for memmove */ | |||
#include <ctype.h> /* for isupper, tolower */ | |||
#include "libstemmer.h" | |||
const char * progname; | |||
static int pretty = 1; | |||
static void | |||
stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out) | |||
{ | |||
#define INC 10 | |||
int lim = INC; | |||
sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol)); | |||
while(1) { | |||
int ch = getc(f_in); | |||
if (ch == EOF) { | |||
free(b); return; | |||
} | |||
{ | |||
int i = 0; | |||
int inlen = 0; | |||
while(1) { | |||
if (ch == '\n' || ch == EOF) break; | |||
if (i == lim) { | |||
sb_symbol * newb; | |||
newb = (sb_symbol *) | |||
realloc(b, (lim + INC) * sizeof(sb_symbol)); | |||
if (newb == 0) goto error; | |||
b = newb; | |||
lim = lim + INC; | |||
} | |||
/* Update count of utf-8 characters. */ | |||
if (ch < 0x80 || ch > 0xBF) inlen += 1; | |||
/* force lower case: */ | |||
if (isupper(ch)) ch = tolower(ch); | |||
b[i] = ch; | |||
i++; | |||
ch = getc(f_in); | |||
} | |||
{ | |||
const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i); | |||
if (stemmed == NULL) | |||
{ | |||
fprintf(stderr, "Out of memory"); | |||
exit(1); | |||
} | |||
else | |||
{ | |||
if (pretty == 1) { | |||
fwrite(b, i, 1, f_out); | |||
fputs(" -> ", f_out); | |||
} else if (pretty == 2) { | |||
fwrite(b, i, 1, f_out); | |||
if (sb_stemmer_length(stemmer) > 0) { | |||
int j; | |||
if (inlen < 30) { | |||
for (j = 30 - inlen; j > 0; j--) | |||
fputs(" ", f_out); | |||
} else { | |||
fputs("\n", f_out); | |||
for (j = 30; j > 0; j--) | |||
fputs(" ", f_out); | |||
} | |||
} | |||
} | |||
fputs((const char *)stemmed, f_out); | |||
putc('\n', f_out); | |||
} | |||
} | |||
} | |||
} | |||
error: | |||
if (b != 0) free(b); | |||
return; | |||
} | |||
/** Display the command line syntax, and then exit. | |||
* @param n The value to exit with. | |||
*/ | |||
static void | |||
usage(int n) | |||
{ | |||
printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n" | |||
"\n" | |||
"The input file consists of a list of words to be stemmed, one per\n" | |||
"line. Words should be in lower case, but (for English) A-Z letters\n" | |||
"are mapped to their a-z equivalents anyway. If omitted, stdin is\n" | |||
"used.\n" | |||
"\n" | |||
"If -c is given, the argument is the character encoding of the input\n" | |||
"and output files. If it is omitted, the UTF-8 encoding is used.\n" | |||
"\n" | |||
"If -p is given the output file consists of each word of the input\n" | |||
"file followed by \"->\" followed by its stemmed equivalent.\n" | |||
"If -p2 is given the output file is a two column layout containing\n" | |||
"the input words in the first column and the stemmed equivalents in\n" | |||
"the second column.\n" | |||
"Otherwise, the output file consists of the stemmed words, one per\n" | |||
"line.\n" | |||
"\n" | |||
"-h displays this help\n", | |||
progname); | |||
exit(n); | |||
} | |||
int | |||
main(int argc, char * argv[]) | |||
{ | |||
char * in = 0; | |||
char * out = 0; | |||
FILE * f_in; | |||
FILE * f_out; | |||
struct sb_stemmer * stemmer; | |||
char * language = "english"; | |||
char * charenc = NULL; | |||
char * s; | |||
int i = 1; | |||
pretty = 0; | |||
progname = argv[0]; | |||
while(i < argc) { | |||
s = argv[i++]; | |||
if (s[0] == '-') { | |||
if (strcmp(s, "-o") == 0) { | |||
if (i >= argc) { | |||
fprintf(stderr, "%s requires an argument\n", s); | |||
exit(1); | |||
} | |||
out = argv[i++]; | |||
} else if (strcmp(s, "-i") == 0) { | |||
if (i >= argc) { | |||
fprintf(stderr, "%s requires an argument\n", s); | |||
exit(1); | |||
} | |||
in = argv[i++]; | |||
} else if (strcmp(s, "-l") == 0) { | |||
if (i >= argc) { | |||
fprintf(stderr, "%s requires an argument\n", s); | |||
exit(1); | |||
} | |||
language = argv[i++]; | |||
} else if (strcmp(s, "-c") == 0) { | |||
if (i >= argc) { | |||
fprintf(stderr, "%s requires an argument\n", s); | |||
exit(1); | |||
} | |||
charenc = argv[i++]; | |||
} else if (strcmp(s, "-p2") == 0) { | |||
pretty = 2; | |||
} else if (strcmp(s, "-p") == 0) { | |||
pretty = 1; | |||
} else if (strcmp(s, "-h") == 0) { | |||
usage(0); | |||
} else { | |||
fprintf(stderr, "option %s unknown\n", s); | |||
usage(1); | |||
} | |||
} else { | |||
fprintf(stderr, "unexpected parameter %s\n", s); | |||
usage(1); | |||
} | |||
} | |||
/* prepare the files */ | |||
f_in = (in == 0) ? stdin : fopen(in, "r"); | |||
if (f_in == 0) { | |||
fprintf(stderr, "file %s not found\n", in); | |||
exit(1); | |||
} | |||
f_out = (out == 0) ? stdout : fopen(out, "w"); | |||
if (f_out == 0) { | |||
fprintf(stderr, "file %s cannot be opened\n", out); | |||
exit(1); | |||
} | |||
/* do the stemming process: */ | |||
stemmer = sb_stemmer_new(language, charenc); | |||
if (stemmer == 0) { | |||
if (charenc == NULL) { | |||
fprintf(stderr, "language `%s' not available for stemming\n", language); | |||
exit(1); | |||
} else { | |||
fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc); | |||
exit(1); | |||
} | |||
} | |||
stem_file(stemmer, f_in, f_out); | |||
sb_stemmer_delete(stemmer); | |||
if (in != 0) (void) fclose(f_in); | |||
if (out != 0) (void) fclose(f_out); | |||
return 0; | |||
} | |||
@@ -32,9 +32,9 @@ const char ** sb_stemmer_list(void); | |||
* | |||
* @param charenc The character encoding. NULL may be passed as | |||
* this value, in which case UTF-8 encoding will be assumed. Otherwise, | |||
* the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1), | |||
* "CP850" (ie, MS-DOS Latin 1) or "KOI8_R" (Russian). Note that | |||
* case is significant in this parameter. | |||
* the argument may be one of "UTF_8", "ISO_8859_1" (i.e. Latin 1), | |||
* "ISO_8859_2" (i.e. Latin 2) or "KOI8_R" (Russian). Note that case is | |||
* significant in this parameter. | |||
* | |||
* @return NULL if the specified algorithm is not recognised, or the | |||
* algorithm is not available for the requested encoding. Otherwise, | |||
@@ -66,7 +66,7 @@ void sb_stemmer_delete(struct sb_stemmer * stemmer); | |||
* If an out-of-memory error occurs, this will return NULL. | |||
*/ | |||
const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, | |||
const sb_symbol * word, int size); | |||
const sb_symbol * word, int size); | |||
/** Get the length of the result of the last stemmed word. | |||
* This should not be called before sb_stemmer_stem() has been called. | |||
@@ -76,4 +76,3 @@ int sb_stemmer_length(struct sb_stemmer * stemmer); | |||
#ifdef __cplusplus | |||
} | |||
#endif | |||
@@ -1,31 +0,0 @@ | |||
package org.tartarus.snowball; | |||
import java.lang.reflect.Method; | |||
public class Among { | |||
public Among (String s, int substring_i, int result, | |||
String methodname, SnowballProgram methodobject) { | |||
this.s_size = s.length(); | |||
this.s = s.toCharArray(); | |||
this.substring_i = substring_i; | |||
this.result = result; | |||
this.methodobject = methodobject; | |||
if (methodname.length() == 0) { | |||
this.method = null; | |||
} else { | |||
try { | |||
this.method = methodobject.getClass(). | |||
getDeclaredMethod(methodname, new Class[0]); | |||
} catch (NoSuchMethodException e) { | |||
throw new RuntimeException(e); | |||
} | |||
} | |||
} | |||
public final int s_size; /* search string */ | |||
public final char[] s; /* search string */ | |||
public final int substring_i; /* index to longest matching substring */ | |||
public final int result; /* result of the lookup */ | |||
public final Method method; /* method to use if substring matches */ | |||
public final SnowballProgram methodobject; /* object to invoke method on */ | |||
}; |
@@ -1,432 +0,0 @@ | |||
package org.tartarus.snowball; | |||
import java.lang.reflect.InvocationTargetException; | |||
public class SnowballProgram { | |||
/**
 * Create a program with a fresh buffer holding the empty string.
 */
protected SnowballProgram()
{
    this.current = new StringBuffer();
    this.setCurrent("");
}
/** | |||
* Set the current string. | |||
*/ | |||
public void setCurrent(String value) | |||
{ | |||
current.replace(0, current.length(), value); | |||
cursor = 0; | |||
limit = current.length(); | |||
limit_backward = 0; | |||
bra = cursor; | |||
ket = limit; | |||
} | |||
/** | |||
* Get the current string. | |||
*/ | |||
public String getCurrent() | |||
{ | |||
String result = current.toString(); | |||
// Make a new StringBuffer. If we reuse the old one, and a user of | |||
// the library keeps a reference to the buffer returned (for example, | |||
// by converting it to a String in a way which doesn't force a copy), | |||
// the buffer size will not decrease, and we will risk wasting a large | |||
// amount of memory. | |||
// Thanks to Wolfram Esser for spotting this problem. | |||
current = new StringBuffer(); | |||
return result; | |||
} | |||
// current string | |||
protected StringBuffer current; | |||
protected int cursor; | |||
protected int limit; | |||
protected int limit_backward; | |||
protected int bra; | |||
protected int ket; | |||
protected void copy_from(SnowballProgram other) | |||
{ | |||
current = other.current; | |||
cursor = other.cursor; | |||
limit = other.limit; | |||
limit_backward = other.limit_backward; | |||
bra = other.bra; | |||
ket = other.ket; | |||
} | |||
protected boolean in_grouping(char [] s, int min, int max) | |||
{ | |||
if (cursor >= limit) return false; | |||
char ch = current.charAt(cursor); | |||
if (ch > max || ch < min) return false; | |||
ch -= min; | |||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; | |||
cursor++; | |||
return true; | |||
} | |||
protected boolean in_grouping_b(char [] s, int min, int max) | |||
{ | |||
if (cursor <= limit_backward) return false; | |||
char ch = current.charAt(cursor - 1); | |||
if (ch > max || ch < min) return false; | |||
ch -= min; | |||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; | |||
cursor--; | |||
return true; | |||
} | |||
protected boolean out_grouping(char [] s, int min, int max) | |||
{ | |||
if (cursor >= limit) return false; | |||
char ch = current.charAt(cursor); | |||
if (ch > max || ch < min) { | |||
cursor++; | |||
return true; | |||
} | |||
ch -= min; | |||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { | |||
cursor ++; | |||
return true; | |||
} | |||
return false; | |||
} | |||
protected boolean out_grouping_b(char [] s, int min, int max) | |||
{ | |||
if (cursor <= limit_backward) return false; | |||
char ch = current.charAt(cursor - 1); | |||
if (ch > max || ch < min) { | |||
cursor--; | |||
return true; | |||
} | |||
ch -= min; | |||
if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { | |||
cursor--; | |||
return true; | |||
} | |||
return false; | |||
} | |||
protected boolean in_range(int min, int max) | |||
{ | |||
if (cursor >= limit) return false; | |||
char ch = current.charAt(cursor); | |||
if (ch > max || ch < min) return false; | |||
cursor++; | |||
return true; | |||
} | |||
protected boolean in_range_b(int min, int max) | |||
{ | |||
if (cursor <= limit_backward) return false; | |||
char ch = current.charAt(cursor - 1); | |||
if (ch > max || ch < min) return false; | |||
cursor--; | |||
return true; | |||
} | |||
protected boolean out_range(int min, int max) | |||
{ | |||
if (cursor >= limit) return false; | |||
char ch = current.charAt(cursor); | |||
if (!(ch > max || ch < min)) return false; | |||
cursor++; | |||
return true; | |||
} | |||
protected boolean out_range_b(int min, int max) | |||
{ | |||
if (cursor <= limit_backward) return false; | |||
char ch = current.charAt(cursor - 1); | |||
if(!(ch > max || ch < min)) return false; | |||
cursor--; | |||
return true; | |||
} | |||
protected boolean eq_s(int s_size, String s) | |||
{ | |||
if (limit - cursor < s_size) return false; | |||
int i; | |||
for (i = 0; i != s_size; i++) { | |||
if (current.charAt(cursor + i) != s.charAt(i)) return false; | |||
} | |||
cursor += s_size; | |||
return true; | |||
} | |||
protected boolean eq_s_b(int s_size, String s) | |||
{ | |||
if (cursor - limit_backward < s_size) return false; | |||
int i; | |||
for (i = 0; i != s_size; i++) { | |||
if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false; | |||
} | |||
cursor -= s_size; | |||
return true; | |||
} | |||
protected boolean eq_v(CharSequence s) | |||
{ | |||
return eq_s(s.length(), s.toString()); | |||
} | |||
protected boolean eq_v_b(CharSequence s) | |||
{ return eq_s_b(s.length(), s.toString()); | |||
} | |||
protected int find_among(Among v[], int v_size) | |||
{ | |||
int i = 0; | |||
int j = v_size; | |||
int c = cursor; | |||
int l = limit; | |||
int common_i = 0; | |||
int common_j = 0; | |||
boolean first_key_inspected = false; | |||
while(true) { | |||
int k = i + ((j - i) >> 1); | |||
int diff = 0; | |||
int common = common_i < common_j ? common_i : common_j; // smaller | |||
Among w = v[k]; | |||
int i2; | |||
for (i2 = common; i2 < w.s_size; i2++) { | |||
if (c + common == l) { | |||
diff = -1; | |||
break; | |||
} | |||
diff = current.charAt(c + common) - w.s[i2]; | |||
if (diff != 0) break; | |||
common++; | |||
} | |||
if (diff < 0) { | |||
j = k; | |||
common_j = common; | |||
} else { | |||
i = k; | |||
common_i = common; | |||
} | |||
if (j - i <= 1) { | |||
if (i > 0) break; // v->s has been inspected | |||
if (j == i) break; // only one item in v | |||
// - but now we need to go round once more to get | |||
// v->s inspected. This looks messy, but is actually | |||
// the optimal approach. | |||
if (first_key_inspected) break; | |||
first_key_inspected = true; | |||
} | |||
} | |||
while(true) { | |||
Among w = v[i]; | |||
if (common_i >= w.s_size) { | |||
cursor = c + w.s_size; | |||
if (w.method == null) return w.result; | |||
boolean res; | |||
try { | |||
Object resobj = w.method.invoke(w.methodobject, | |||
new Object[0]); | |||
res = resobj.toString().equals("true"); | |||
} catch (InvocationTargetException e) { | |||
res = false; | |||
// FIXME - debug message | |||
} catch (IllegalAccessException e) { | |||
res = false; | |||
// FIXME - debug message | |||
} | |||
cursor = c + w.s_size; | |||
if (res) return w.result; | |||
} | |||
i = w.substring_i; | |||
if (i < 0) return 0; | |||
} | |||
} | |||
// find_among_b is for backwards processing. Same comments apply | |||
protected int find_among_b(Among v[], int v_size) | |||
{ | |||
int i = 0; | |||
int j = v_size; | |||
int c = cursor; | |||
int lb = limit_backward; | |||
int common_i = 0; | |||
int common_j = 0; | |||
boolean first_key_inspected = false; | |||
while(true) { | |||
int k = i + ((j - i) >> 1); | |||
int diff = 0; | |||
int common = common_i < common_j ? common_i : common_j; | |||
Among w = v[k]; | |||
int i2; | |||
for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) { | |||
if (c - common == lb) { | |||
diff = -1; | |||
break; | |||
} | |||
diff = current.charAt(c - 1 - common) - w.s[i2]; | |||
if (diff != 0) break; | |||
common++; | |||
} | |||
if (diff < 0) { | |||
j = k; | |||
common_j = common; | |||
} else { | |||
i = k; | |||
common_i = common; | |||
} | |||
if (j - i <= 1) { | |||
if (i > 0) break; | |||
if (j == i) break; | |||
if (first_key_inspected) break; | |||
first_key_inspected = true; | |||
} | |||
} | |||
while(true) { | |||
Among w = v[i]; | |||
if (common_i >= w.s_size) { | |||
cursor = c - w.s_size; | |||
if (w.method == null) return w.result; | |||
boolean res; | |||
try { | |||
Object resobj = w.method.invoke(w.methodobject, | |||
new Object[0]); | |||
res = resobj.toString().equals("true"); | |||
} catch (InvocationTargetException e) { | |||
res = false; | |||
// FIXME - debug message | |||
} catch (IllegalAccessException e) { | |||
res = false; | |||
// FIXME - debug message | |||
} | |||
cursor = c - w.s_size; | |||
if (res) return w.result; | |||
} | |||
i = w.substring_i; | |||
if (i < 0) return 0; | |||
} | |||
} | |||
/* to replace chars between c_bra and c_ket in current by the | |||
* chars in s. | |||
*/ | |||
protected int replace_s(int c_bra, int c_ket, String s) | |||
{ | |||
int adjustment = s.length() - (c_ket - c_bra); | |||
current.replace(c_bra, c_ket, s); | |||
limit += adjustment; | |||
if (cursor >= c_ket) cursor += adjustment; | |||
else if (cursor > c_bra) cursor = c_bra; | |||
return adjustment; | |||
} | |||
// Sanity-checks the slice markers: expects 0 <= bra <= ket <= limit <= current.length().
// On violation it only prints a message to System.err; it neither throws nor
// returns a status, so the caller carries on with the faulty slice.
protected void slice_check() | |||
{ | |||
if (bra < 0 || | |||
bra > ket || | |||
ket > limit || | |||
limit > current.length()) // this line could be removed | |||
{ | |||
System.err.println("faulty slice operation"); | |||
// FIXME: report error somehow. | |||
// The block below is the C runtime's handling, kept for reference only.
/* | |||
fprintf(stderr, "faulty slice operation:\n"); | |||
debug(z, -1, 0); | |||
exit(1); | |||
*/ | |||
} | |||
} | |||
// Replaces the current slice [bra, ket) with s, after running slice_check().
// The length change returned by replace_s() is discarded here, and bra/ket
// themselves are not modified by this method.
protected void slice_from(String s) | |||
{ | |||
slice_check(); | |||
replace_s(bra, ket, s); | |||
} | |||
// CharSequence overload: converts s with toString() and delegates to
// slice_from(String).
protected void slice_from(CharSequence s) | |||
{ | |||
slice_from(s.toString()); | |||
} | |||
// Deletes the current slice by replacing it with the empty string.
protected void slice_del() | |||
{ | |||
slice_from(""); | |||
} | |||
// Replaces [c_bra, c_ket) with s via replace_s() and then shifts the slice
// markers: bra and ket are each moved by the length change when the edit
// starts at or before them.
protected void insert(int c_bra, int c_ket, String s) | |||
{ | |||
int adjustment = replace_s(c_bra, c_ket, s); | |||
if (c_bra <= bra) bra += adjustment; | |||
if (c_bra <= ket) ket += adjustment; | |||
} | |||
// CharSequence overload: converts s with toString() and delegates to
// insert(int, int, String).
protected void insert(int c_bra, int c_ket, CharSequence s) | |||
{ | |||
insert(c_bra, c_ket, s.toString()); | |||
} | |||
/* Copy the slice into the supplied StringBuffer */ | |||
// The whole previous content of s is overwritten with current[bra, ket);
// the same buffer instance is returned.
protected StringBuffer slice_to(StringBuffer s) | |||
{ | |||
slice_check(); | |||
// NOTE(review): `len` is computed but never read in this method.
int len = ket - bra; | |||
s.replace(0, s.length(), current.substring(bra, ket)); | |||
return s; | |||
} | |||
/* Copy the slice into the supplied StringBuilder */ | |||
// The whole previous content of s is overwritten with current[bra, ket);
// the same builder instance is returned.
protected StringBuilder slice_to(StringBuilder s) | |||
{ | |||
slice_check(); | |||
// NOTE(review): `len` is computed but never read in this method.
int len = ket - bra; | |||
s.replace(0, s.length(), current.substring(bra, ket)); | |||
return s; | |||
} | |||
// Overwrites the whole content of s with current[0, limit) and returns s.
// Unlike slice_to(), no slice_check() is performed and bra/ket are not used.
protected StringBuffer assign_to(StringBuffer s) | |||
{ | |||
s.replace(0, s.length(), current.substring(0, limit)); | |||
return s; | |||
} | |||
// Overwrites the whole content of s with current[0, limit) and returns s.
// Unlike slice_to(), no slice_check() is performed and bra/ket are not used.
protected StringBuilder assign_to(StringBuilder s) | |||
{ | |||
s.replace(0, s.length(), current.substring(0, limit)); | |||
return s; | |||
} | |||
/* | |||
extern void debug(struct SN_env * z, int number, int line_count) | |||
{ int i; | |||
int limit = SIZE(z->p); | |||
//if (number >= 0) printf("%3d (line %4d): '", number, line_count); | |||
if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); | |||
for (i = 0; i <= limit; i++) | |||
{ if (z->lb == i) printf("{"); | |||
if (z->bra == i) printf("["); | |||
if (z->c == i) printf("|"); | |||
if (z->ket == i) printf("]"); | |||
if (z->l == i) printf("}"); | |||
if (i < limit) | |||
{ int ch = z->p[i]; | |||
if (ch == 0) ch = '#'; | |||
printf("%c", ch); | |||
} | |||
} | |||
printf("'\n"); | |||
} | |||
*/ | |||
}; |
@@ -1,7 +0,0 @@ | |||
package org.tartarus.snowball; | |||
import java.lang.reflect.InvocationTargetException; | |||
// Abstract base for stemmer classes: extends SnowballProgram and adds a single
// abstract entry point, stem().
// NOTE(review): the InvocationTargetException import above this class is not
// used by anything visible in the class body.
public abstract class SnowballStemmer extends SnowballProgram { | |||
// Runs the stemmer. The meaning of the boolean result is not visible here —
// presumably success/failure of the algorithm; confirm against a concrete stemmer.
public abstract boolean stem(); | |||
}; |
@@ -1,77 +0,0 @@ | |||
package org.tartarus.snowball; | |||
import java.lang.reflect.Method; | |||
import java.io.Reader; | |||
import java.io.Writer; | |||
import java.io.BufferedReader; | |||
import java.io.BufferedWriter; | |||
import java.io.FileInputStream; | |||
import java.io.InputStreamReader; | |||
import java.io.OutputStreamWriter; | |||
import java.io.OutputStream; | |||
import java.io.FileOutputStream; | |||
// Command-line driver: loads a stemmer class by name, reads an input file,
// and writes one stemmed word per line.
//
// Arguments (see usage()): <algorithm> <input file> [-o <output file>]
// A fifth argument, when present, is parsed as a repeat count for stem().
public class TestApp { | |||
// Prints the command-line synopsis to System.err.
private static void usage() | |||
{ | |||
System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]"); | |||
} | |||
public static void main(String [] args) throws Throwable { | |||
if (args.length < 2) { | |||
usage(); | |||
return; | |||
} | |||
// The stemmer class is resolved reflectively as
// org.tartarus.snowball.ext.<algorithm>Stemmer and built with its no-arg constructor.
Class stemClass = Class.forName("org.tartarus.snowball.ext." + | |||
args[0] + "Stemmer"); | |||
SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance(); | |||
// NOTE(review): no charset is passed, so the platform default encoding is used
// for reading (and for writing below).
Reader reader; | |||
reader = new InputStreamReader(new FileInputStream(args[1])); | |||
reader = new BufferedReader(reader); | |||
// Accumulates the characters of the word currently being read.
StringBuffer input = new StringBuffer(); | |||
// Output goes to the file after "-o" when given, otherwise to System.out.
// Any other third argument is treated as a usage error.
OutputStream outstream; | |||
if (args.length > 2) { | |||
if (args.length >= 4 && args[2].equals("-o")) { | |||
outstream = new FileOutputStream(args[3]); | |||
} else { | |||
usage(); | |||
return; | |||
} | |||
} else { | |||
outstream = System.out; | |||
} | |||
Writer output = new OutputStreamWriter(outstream); | |||
output = new BufferedWriter(output); | |||
// Number of times stem() is called per word; defaults to 1.
int repeat = 1; | |||
if (args.length > 4) { | |||
repeat = Integer.parseInt(args[4]); | |||
} | |||
// NOTE(review): emptyArgs is never read in this method.
Object [] emptyArgs = new Object[0]; | |||
int character; | |||
// Words are maximal runs of non-whitespace characters, lower-cased as they are
// collected; each whitespace character flushes the pending word (if any).
while ((character = reader.read()) != -1) { | |||
char ch = (char) character; | |||
if (Character.isWhitespace((char) ch)) { | |||
if (input.length() > 0) { | |||
stemmer.setCurrent(input.toString()); | |||
// `i != 0` with a decrementing counter: a negative repeat would not terminate
// until the int wraps around.
for (int i = repeat; i != 0; i--) { | |||
stemmer.stem(); | |||
} | |||
output.write(stemmer.getCurrent()); | |||
output.write('\n'); | |||
input.delete(0, input.length()); | |||
} | |||
} else { | |||
input.append(Character.toLowerCase(ch)); | |||
} | |||
} | |||
// NOTE(review): a final word not followed by whitespace is still in `input`
// at this point and is never stemmed or written. The reader and writer are
// flushed/abandoned rather than closed.
output.flush(); | |||
} | |||
} |
@@ -22,10 +22,10 @@ sb_stemmer_list(void) | |||
static stemmer_encoding_t | |||
sb_getenc(const char * charenc) | |||
{ | |||
struct stemmer_encoding * encoding; | |||
const struct stemmer_encoding * encoding; | |||
if (charenc == NULL) return ENC_UTF_8; | |||
for (encoding = encodings; encoding->name != 0; encoding++) { | |||
if (strcmp(encoding->name, charenc) == 0) break; | |||
if (strcmp(encoding->name, charenc) == 0) break; | |||
} | |||
if (encoding->name == NULL) return ENC_UNKNOWN; | |||
return encoding->enc; | |||
@@ -35,14 +35,14 @@ extern struct sb_stemmer * | |||
sb_stemmer_new(const char * algorithm, const char * charenc) | |||
{ | |||
stemmer_encoding_t enc; | |||
struct stemmer_modules * module; | |||
const struct stemmer_modules * module; | |||
struct sb_stemmer * stemmer; | |||
enc = sb_getenc(charenc); | |||
if (enc == ENC_UNKNOWN) return NULL; | |||
for (module = modules; module->name != 0; module++) { | |||
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; | |||
if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; | |||
} | |||
if (module->name == NULL) return NULL; | |||
@@ -67,9 +67,10 @@ void | |||
sb_stemmer_delete(struct sb_stemmer * stemmer) | |||
{ | |||
if (stemmer == 0) return; | |||
if (stemmer->close == 0) return; | |||
stemmer->close(stemmer->env); | |||
stemmer->close = 0; | |||
if (stemmer->close) { | |||
stemmer->close(stemmer->env); | |||
stemmer->close = 0; | |||
} | |||
free(stemmer); | |||
} | |||
@@ -1,10 +1,12 @@ | |||
#!/usr/bin/perl -w | |||
#!/usr/bin/env perl | |||
use strict; | |||
use 5.006; | |||
use warnings; | |||
my $progname = $0; | |||
if (scalar @ARGV < 4 || scalar @ARGV > 5) { | |||
print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<extn>]\n"; | |||
print "Usage: $progname <outfile> <C source directory> <modules description file> <source list file> [<enc>]\n"; | |||
exit 1; | |||
} | |||
@@ -12,9 +14,11 @@ my $outname = shift(@ARGV); | |||
my $c_src_dir = shift(@ARGV); | |||
my $descfile = shift(@ARGV); | |||
my $srclistfile = shift(@ARGV); | |||
my $enc_only; | |||
my $extn = ''; | |||
if (@ARGV) { | |||
$extn = '_'.shift(@ARGV); | |||
$enc_only = shift(@ARGV); | |||
$extn = '_'.$enc_only; | |||
} | |||
my %aliases = (); | |||
@@ -27,6 +31,14 @@ sub addalgenc($$) { | |||
my $alg = shift(); | |||
my $enc = shift(); | |||
if (defined $enc_only) { | |||
my $norm_enc = lc $enc; | |||
$norm_enc =~ s/_//g; | |||
if ($norm_enc ne $enc_only) { | |||
return; | |||
} | |||
} | |||
if (defined $algorithm_encs{$alg}) { | |||
my $hashref = $algorithm_encs{$alg}; | |||
$$hashref{$enc}=1; | |||
@@ -42,7 +54,7 @@ sub readinput() | |||
{ | |||
open DESCFILE, $descfile; | |||
my $line; | |||
while($line = <DESCFILE>) | |||
while ($line = <DESCFILE>) | |||
{ | |||
next if $line =~ m/^\s*#/; | |||
next if $line =~ m/^\s*$/; | |||
@@ -123,7 +135,7 @@ struct stemmer_encoding { | |||
const char * name; | |||
stemmer_encoding_t enc; | |||
}; | |||
static struct stemmer_encoding encodings[] = { | |||
static const struct stemmer_encoding encodings[] = { | |||
EOS | |||
for $enc (sort keys %encs) { | |||
print OUT " {\"${enc}\", ENC_${enc}},\n"; | |||
@@ -139,7 +151,7 @@ struct stemmer_modules { | |||
void (*close)(struct SN_env *); | |||
int (*stem)(struct SN_env *); | |||
}; | |||
static struct stemmer_modules modules[] = { | |||
static const struct stemmer_modules modules[] = { | |||
EOS | |||
for $lang (sort keys %aliases) { | |||
@@ -162,7 +174,6 @@ static const char * algorithm_names[] = { | |||
EOS | |||
for $lang (@algorithms) { | |||
my $l = $aliases{$lang}; | |||
print OUT " \"$lang\", \n"; | |||
} | |||
@@ -9,27 +9,35 @@ | |||
# List all the main algorithms for each language, in UTF-8, and also with | |||
# the most commonly used encoding. | |||
danish UTF_8,ISO_8859_1 danish,da,dan | |||
dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld | |||
english UTF_8,ISO_8859_1 english,en,eng | |||
finnish UTF_8,ISO_8859_1 finnish,fi,fin | |||
french UTF_8,ISO_8859_1 french,fr,fre,fra | |||
german UTF_8,ISO_8859_1 german,de,ger,deu | |||
hungarian UTF_8,ISO_8859_2 hungarian,hu,hun | |||
italian UTF_8,ISO_8859_1 italian,it,ita | |||
norwegian UTF_8,ISO_8859_1 norwegian,no,nor | |||
portuguese UTF_8,ISO_8859_1 portuguese,pt,por | |||
romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron | |||
russian UTF_8,KOI8_R russian,ru,rus | |||
spanish UTF_8,ISO_8859_1 spanish,es,esl,spa | |||
swedish UTF_8,ISO_8859_1 swedish,sv,swe | |||
arabic UTF_8 arabic,ar,ara | |||
danish UTF_8 danish,da,dan | |||
dutch UTF_8 dutch,nl,dut,nld | |||
english UTF_8 english,en,eng | |||
finnish UTF_8 finnish,fi,fin | |||
french UTF_8 french,fr,fre,fra | |||
german UTF_8 german,de,ger,deu | |||
greek UTF_8 greek,el,gre,ell | |||
hindi UTF_8 hindi,hi,hin | |||
hungarian UTF_8 hungarian,hu,hun | |||
indonesian UTF_8 indonesian,id,ind | |||
italian UTF_8 italian,it,ita | |||
lithuanian UTF_8 lithuanian,lt,lit | |||
nepali UTF_8 nepali,ne,nep | |||
norwegian UTF_8 norwegian,no,nor | |||
portuguese UTF_8 portuguese,pt,por | |||
romanian UTF_8 romanian,ro,rum,ron | |||
russian UTF_8 russian,ru,rus | |||
serbian UTF_8 serbian,sr,srp | |||
spanish UTF_8 spanish,es,esl,spa | |||
swedish UTF_8 swedish,sv,swe | |||
tamil UTF_8 tamil,ta,tam | |||
turkish UTF_8 turkish,tr,tur | |||
# Also include the traditional porter algorithm for english. | |||
# The porter algorithm is included in the libstemmer distribution to assist | |||
# with backwards compatibility, but for new systems the english algorithm | |||
# should be used in preference. | |||
porter UTF_8,ISO_8859_1 porter | |||
porter UTF_8 porter english | |||
# Some other stemmers in the snowball project are not included in the standard | |||
# distribution. To compile a libstemmer with them in, add them to this list, | |||
@@ -39,12 +47,12 @@ porter UTF_8,ISO_8859_1 porter | |||
# algorithms are: | |||
# | |||
# german2 - This is a slight modification of the german stemmer. | |||
#german2 UTF_8,ISO_8859_1 german2 | |||
#german2 UTF_8,ISO_8859_1 german2 german | |||
# | |||
# kraaij_pohlmann - This is a different dutch stemmer. | |||
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann | |||
#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann dutch | |||
# | |||
# lovins - This is an english stemmer, but fairly outdated, and | |||
# only really applicable to a restricted type of input text | |||
# (keywords in academic publications). | |||
#lovins UTF_8,ISO_8859_1 lovins | |||
#lovins UTF_8,ISO_8859_1 lovins english |
@@ -2,7 +2,7 @@ | |||
#include <stdlib.h> /* for calloc, free */ | |||
#include "header.h" | |||
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) | |||
extern struct SN_env * SN_create_env(int S_size, int I_size) | |||
{ | |||
struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); | |||
if (z == NULL) return NULL; | |||
@@ -27,12 +27,6 @@ extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) | |||
if (z->I == NULL) goto error; | |||
} | |||
if (B_size) | |||
{ | |||
z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char)); | |||
if (z->B == NULL) goto error; | |||
} | |||
return z; | |||
error: | |||
SN_close_env(z, S_size); | |||
@@ -52,7 +46,6 @@ extern void SN_close_env(struct SN_env * z, int S_size) | |||
free(z->S); | |||
} | |||
free(z->I); | |||
free(z->B); | |||
if (z->p) lose_s(z->p); | |||
free(z); | |||
} | |||
@@ -63,4 +56,3 @@ extern int SN_set_current(struct SN_env * z, int size, const symbol * s) | |||
z->c = 0; | |||
return err; | |||
} | |||
@@ -16,11 +16,17 @@ struct SN_env { | |||
int c; int l; int lb; int bra; int ket; | |||
symbol * * S; | |||
int * I; | |||
unsigned char * B; | |||
}; | |||
extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
extern struct SN_env * SN_create_env(int S_size, int I_size); | |||
extern void SN_close_env(struct SN_env * z, int S_size); | |||
extern int SN_set_current(struct SN_env * z, int size, const symbol * s); | |||
#ifdef __cplusplus | |||
} | |||
#endif |
@@ -54,5 +54,6 @@ extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); | |||
extern symbol * slice_to(struct SN_env * z, symbol * p); | |||
extern symbol * assign_to(struct SN_env * z, symbol * p); | |||
extern void debug(struct SN_env * z, int number, int line_count); | |||
extern int len_utf8(const symbol * p); | |||
extern void debug(struct SN_env * z, int number, int line_count); |
@@ -5,8 +5,6 @@ | |||
#include "header.h" | |||
#define unless(C) if(!(C)) | |||
#define CREATE_SIZE 1 | |||
extern symbol * create_s(void) { | |||
@@ -15,7 +13,7 @@ extern symbol * create_s(void) { | |||
if (mem == NULL) return NULL; | |||
p = (symbol *) (HEAD + (char *) mem); | |||
CAPACITY(p) = CREATE_SIZE; | |||
SET_SIZE(p, CREATE_SIZE); | |||
SET_SIZE(p, 0); | |||
return p; | |||
} | |||
@@ -27,7 +25,7 @@ extern void lose_s(symbol * p) { | |||
/* | |||
new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c | |||
if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new | |||
position, or 0 on failure. | |||
position, or -1 on failure. | |||
-- used to implement hop and next in the utf8 case. | |||
*/ | |||
@@ -66,77 +64,95 @@ extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) { | |||
/* Code for character groupings: utf8 cases */ | |||
/* Decodes one UTF-8 sequence starting at p[c], not reading at or past index l.
 * Stores the decoded value in *slot and returns the number of bytes it covers
 * (1 to 4), or 0 when c >= l. A sequence cut short by l is decoded from the
 * bytes that are available.
 * NOTE(review): this span is a patch hunk rendered without +/- markers, so
 * lines from before and after the change appear next to each other. */
static int get_utf8(const symbol * p, int c, int l, int * slot) { | |||
int b0, b1; | |||
int b0, b1, b2; | |||
if (c >= l) return 0; | |||
b0 = p[c++]; | |||
if (b0 < 0xC0 || c == l) { /* 1100 0000 */ | |||
* slot = b0; return 1; | |||
*slot = b0; | |||
return 1; | |||
} | |||
b1 = p[c++]; | |||
b1 = p[c++] & 0x3F; | |||
if (b0 < 0xE0 || c == l) { /* 1110 0000 */ | |||
* slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; | |||
*slot = (b0 & 0x1F) << 6 | b1; | |||
return 2; | |||
} | |||
b2 = p[c++] & 0x3F; | |||
if (b0 < 0xF0 || c == l) { /* 1111 0000 */ | |||
*slot = (b0 & 0xF) << 12 | b1 << 6 | b2; | |||
return 3; | |||
} | |||
* slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3; | |||
/* NOTE(review): a 4-byte lead byte has the form 1111 0xxx, so its payload is
 * the low three bits (mask 0x7). The mask 0xE used below drops bit 0 of the
 * lead byte, which would mis-decode code points whose bit 18 is set —
 * confirm whether 0x7 was intended. */
*slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F); | |||
return 4; | |||
} | |||
/* Backward counterpart of get_utf8: decodes the UTF-8 sequence that ends just
 * before index c, not reading below index lb. Stores the decoded value in
 * *slot and returns the number of bytes it covers (1 to 4), or 0 when c <= lb.
 * NOTE(review): this span is a patch hunk rendered without +/- markers, so
 * lines from before and after the change appear next to each other. */
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { | |||
int b0, b1; | |||
int a, b; | |||
if (c <= lb) return 0; | |||
b0 = p[--c]; | |||
if (b0 < 0x80 || c == lb) { /* 1000 0000 */ | |||
* slot = b0; return 1; | |||
b = p[--c]; | |||
if (b < 0x80 || c == lb) { /* 1000 0000 */ | |||
*slot = b; | |||
return 1; | |||
} | |||
b1 = p[--c]; | |||
if (b1 >= 0xC0 || c == lb) { /* 1100 0000 */ | |||
* slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2; | |||
/* `a` accumulates the low-order payload bits of the trailing bytes seen so far. */
a = b & 0x3F; | |||
b = p[--c]; | |||
if (b >= 0xC0 || c == lb) { /* 1100 0000 */ | |||
*slot = (b & 0x1F) << 6 | a; | |||
return 2; | |||
} | |||
* slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3; | |||
a |= (b & 0x3F) << 6; | |||
b = p[--c]; | |||
if (b >= 0xE0 || c == lb) { /* 1110 0000 */ | |||
*slot = (b & 0xF) << 12 | a; | |||
return 3; | |||
} | |||
/* NOTE(review): a 4-byte lead byte has the form 1111 0xxx, so its payload is
 * the low three bits (mask 0x7). The mask 0xE used below drops bit 0 of the
 * lead byte — confirm whether 0x7 was intended. */
*slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a; | |||
return 4; | |||
} | |||
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
int w = get_utf8(z->p, z->c, z->l, & ch); | |||
unless (w) return -1; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return w; | |||
z->c += w; | |||
int ch; | |||
int w = get_utf8(z->p, z->c, z->l, & ch); | |||
if (!w) return -1; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return w; | |||
z->c += w; | |||
} while (repeat); | |||
return 0; | |||
} | |||
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
int w = get_b_utf8(z->p, z->c, z->lb, & ch); | |||
unless (w) return -1; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return w; | |||
z->c -= w; | |||
int ch; | |||
int w = get_b_utf8(z->p, z->c, z->lb, & ch); | |||
if (!w) return -1; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return w; | |||
z->c -= w; | |||
} while (repeat); | |||
return 0; | |||
} | |||
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
int w = get_utf8(z->p, z->c, z->l, & ch); | |||
unless (w) return -1; | |||
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return w; | |||
z->c += w; | |||
int ch; | |||
int w = get_utf8(z->p, z->c, z->l, & ch); | |||
if (!w) return -1; | |||
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) | |||
return w; | |||
z->c += w; | |||
} while (repeat); | |||
return 0; | |||
} | |||
extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
int w = get_b_utf8(z->p, z->c, z->lb, & ch); | |||
unless (w) return -1; | |||
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return w; | |||
z->c -= w; | |||
int ch; | |||
int w = get_b_utf8(z->p, z->c, z->lb, & ch); | |||
if (!w) return -1; | |||
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) | |||
return w; | |||
z->c -= w; | |||
} while (repeat); | |||
return 0; | |||
} | |||
@@ -145,48 +161,48 @@ extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, | |||
extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
if (z->c >= z->l) return -1; | |||
ch = z->p[z->c]; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return 1; | |||
z->c++; | |||
int ch; | |||
if (z->c >= z->l) return -1; | |||
ch = z->p[z->c]; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return 1; | |||
z->c++; | |||
} while (repeat); | |||
return 0; | |||
} | |||
extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
if (z->c <= z->lb) return -1; | |||
ch = z->p[z->c - 1]; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return 1; | |||
z->c--; | |||
int ch; | |||
if (z->c <= z->lb) return -1; | |||
ch = z->p[z->c - 1]; | |||
if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return 1; | |||
z->c--; | |||
} while (repeat); | |||
return 0; | |||
} | |||
extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
if (z->c >= z->l) return -1; | |||
ch = z->p[z->c]; | |||
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return 1; | |||
z->c++; | |||
int ch; | |||
if (z->c >= z->l) return -1; | |||
ch = z->p[z->c]; | |||
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) | |||
return 1; | |||
z->c++; | |||
} while (repeat); | |||
return 0; | |||
} | |||
extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { | |||
do { | |||
int ch; | |||
if (z->c <= z->lb) return -1; | |||
ch = z->p[z->c - 1]; | |||
unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) | |||
return 1; | |||
z->c--; | |||
int ch; | |||
if (z->c <= z->lb) return -1; | |||
ch = z->p[z->c - 1]; | |||
if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) | |||
return 1; | |||
z->c--; | |||
} while (repeat); | |||
return 0; | |||
} | |||
@@ -215,7 +231,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { | |||
int j = v_size; | |||
int c = z->c; int l = z->l; | |||
symbol * q = z->p + c; | |||
const symbol * q = z->p + c; | |||
const struct among * w; | |||
@@ -224,7 +240,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { | |||
int first_key_inspected = 0; | |||
while(1) { | |||
while (1) { | |||
int k = i + ((j - i) >> 1); | |||
int diff = 0; | |||
int common = common_i < common_j ? common_i : common_j; /* smaller */ | |||
@@ -237,8 +253,13 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { | |||
common++; | |||
} | |||
} | |||
if (diff < 0) { j = k; common_j = common; } | |||
else { i = k; common_i = common; } | |||
if (diff < 0) { | |||
j = k; | |||
common_j = common; | |||
} else { | |||
i = k; | |||
common_i = common; | |||
} | |||
if (j - i <= 1) { | |||
if (i > 0) break; /* v->s has been inspected */ | |||
if (j == i) break; /* only one item in v */ | |||
@@ -251,7 +272,7 @@ extern int find_among(struct SN_env * z, const struct among * v, int v_size) { | |||
first_key_inspected = 1; | |||
} | |||
} | |||
while(1) { | |||
while (1) { | |||
w = v + i; | |||
if (common_i >= w->s_size) { | |||
z->c = c + w->s_size; | |||
@@ -275,7 +296,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { | |||
int j = v_size; | |||
int c = z->c; int lb = z->lb; | |||
symbol * q = z->p + c - 1; | |||
const symbol * q = z->p + c - 1; | |||
const struct among * w; | |||
@@ -284,7 +305,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { | |||
int first_key_inspected = 0; | |||
while(1) { | |||
while (1) { | |||
int k = i + ((j - i) >> 1); | |||
int diff = 0; | |||
int common = common_i < common_j ? common_i : common_j; | |||
@@ -306,7 +327,7 @@ extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) { | |||
first_key_inspected = 1; | |||
} | |||
} | |||
while(1) { | |||
while (1) { | |||
w = v + i; | |||
if (common_i >= w->s_size) { | |||
z->c = c - w->s_size; | |||
@@ -367,11 +388,10 @@ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const | |||
z->l += adjustment; | |||
if (z->c >= c_ket) | |||
z->c += adjustment; | |||
else | |||
if (z->c > c_bra) | |||
z->c = c_bra; | |||
else if (z->c > c_bra) | |||
z->c = c_bra; | |||
} | |||
unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); | |||
if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); | |||
if (adjptr != NULL) | |||
*adjptr = adjustment; | |||
return 0; | |||
@@ -417,12 +437,7 @@ extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbo | |||
} | |||
extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { | |||
int adjustment; | |||
if (replace_s(z, bra, ket, SIZE(p), p, &adjustment)) | |||
return -1; | |||
if (bra <= z->bra) z->bra += adjustment; | |||
if (bra <= z->ket) z->ket += adjustment; | |||
return 0; | |||
return insert_s(z, bra, ket, SIZE(p), p); | |||
} | |||
extern symbol * slice_to(struct SN_env * z, symbol * p) { | |||
@@ -455,6 +470,16 @@ extern symbol * assign_to(struct SN_env * z, symbol * p) { | |||
return p; | |||
} | |||
/* Counts the bytes of p that are not UTF-8 continuation bytes, i.e. every byte
 * outside the range 0x80..0xBF. For well-formed UTF-8 that equals the number of
 * encoded characters. The byte count comes from SIZE(p), a macro defined
 * outside this span. */
extern int len_utf8(const symbol * p) { | |||
int size = SIZE(p); | |||
int len = 0; | |||
while (size--) { | |||
symbol b = *p++; | |||
/* Lead bytes are >= 0xC0 and single-byte characters are < 0x80. */
if (b >= 0xC0 || b < 0x80) ++len; | |||
} | |||
return len; | |||
} | |||
#if 0 | |||
extern void debug(struct SN_env * z, int number, int line_count) { | |||
int i; |