diff options
-rw-r--r-- | contrib/snowball/libstemmer/modules.txt | 50 | ||||
-rw-r--r-- | contrib/snowball/libstemmer/modules_utf8.txt | 49 |
2 files changed, 99 insertions, 0 deletions
diff --git a/contrib/snowball/libstemmer/modules.txt b/contrib/snowball/libstemmer/modules.txt new file mode 100644 index 000000000..d237c1d8d --- /dev/null +++ b/contrib/snowball/libstemmer/modules.txt @@ -0,0 +1,50 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8, and also with +# the most commonly used encoding. + +danish UTF_8,ISO_8859_1 danish,da,dan +dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld +english UTF_8,ISO_8859_1 english,en,eng +finnish UTF_8,ISO_8859_1 finnish,fi,fin +french UTF_8,ISO_8859_1 french,fr,fre,fra +german UTF_8,ISO_8859_1 german,de,ger,deu +hungarian UTF_8,ISO_8859_2 hungarian,hu,hun +italian UTF_8,ISO_8859_1 italian,it,ita +norwegian UTF_8,ISO_8859_1 norwegian,no,nor +portuguese UTF_8,ISO_8859_1 portuguese,pt,por +romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron +russian UTF_8,KOI8_R russian,ru,rus +spanish UTF_8,ISO_8859_1 spanish,es,esl,spa +swedish UTF_8,ISO_8859_1 swedish,sv,swe +turkish UTF_8 turkish,tr,tur + +# Also include the traditional porter algorithm for english. +# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. +porter UTF_8,ISO_8859_1 porter + +# Some other stemmers in the snowball project are not included in the standard +# distribution. To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. (You will need a full source checkout for +# this.) They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is not fully supported. 
These +# algorithms are: +# +# german2 - This is a slight modification of the german stemmer. +#german2 UTF_8,ISO_8859_1 german2 +# +# kraaij_pohlmann - This is a different dutch stemmer. +#kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8,ISO_8859_1 lovins diff --git a/contrib/snowball/libstemmer/modules_utf8.txt b/contrib/snowball/libstemmer/modules_utf8.txt new file mode 100644 index 000000000..60a0e1d0c --- /dev/null +++ b/contrib/snowball/libstemmer/modules_utf8.txt @@ -0,0 +1,49 @@ +# This file contains a list of stemmers to include in the distribution. +# The format is a set of space separated lines - on each line: +# First item is name of stemmer. +# Second item is comma separated list of character sets. +# Third item is comma separated list of names to refer to the stemmer by. +# +# Lines starting with a #, or blank lines, are ignored. + +# List all the main algorithms for each language, in UTF-8. + +danish UTF_8 danish,da,dan +dutch UTF_8 dutch,nl,dut,nld +english UTF_8 english,en,eng +finnish UTF_8 finnish,fi,fin +french UTF_8 french,fr,fre,fra +german UTF_8 german,de,ger,deu +hungarian UTF_8 hungarian,hu,hun +italian UTF_8 italian,it,ita +norwegian UTF_8 norwegian,no,nor +portuguese UTF_8 portuguese,pt,por +romanian UTF_8 romanian,ro,rum,ron +russian UTF_8 russian,ru,rus +spanish UTF_8 spanish,es,esl,spa +swedish UTF_8 swedish,sv,swe +turkish UTF_8 turkish,tr,tur + +# Also include the traditional porter algorithm for english. +# The porter algorithm is included in the libstemmer distribution to assist +# with backwards compatibility, but for new systems the english algorithm +# should be used in preference. +porter UTF_8 porter + +# Some other stemmers in the snowball project are not included in the standard +# distribution. 
To compile a libstemmer with them in, add them to this list, +# and regenerate the distribution. (You will need a full source checkout for +# this.) They are included in the snowball website as curiosities, but are not +# intended for general use, and use of them is not fully supported. These +# algorithms are: +# +# german2 - This is a slight modification of the german stemmer. +#german2 UTF_8 german2 +# +# kraaij_pohlmann - This is a different dutch stemmer. +#kraaij_pohlmann UTF_8 kraaij_pohlmann +# +# lovins - This is an english stemmer, but fairly outdated, and +# only really applicable to a restricted type of input text +# (keywords in academic publications). +#lovins UTF_8 lovins |