summary refs log tree commit diff stats
path: root/contrib/snowball/algorithms
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-12-31 17:38:02 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-12-31 17:38:02 +0000
commit 2375dba898b481837879940dfdcf3ea85248fe01 (patch)
tree cced5fb680e9a362d1de25630bff537865d38365 /contrib/snowball/algorithms
parent 1543c98d38ffb84a1e405081436d0a25bee713a6 (diff)
download rspamd-2375dba898b481837879940dfdcf3ea85248fe01.tar.gz
rspamd-2375dba898b481837879940dfdcf3ea85248fe01.zip
Remove bloody submodules.
Diffstat (limited to 'contrib/snowball/algorithms')
-rw-r--r-- contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl 91
-rw-r--r-- contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl 91
-rw-r--r-- contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl 164
-rw-r--r-- contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl 164
-rw-r--r-- contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl 229
-rw-r--r-- contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl 196
-rw-r--r-- contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl 248
-rw-r--r-- contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl 239
-rw-r--r-- contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl 139
-rw-r--r-- contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl 139
-rw-r--r-- contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl 145
-rw-r--r-- contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl 241
-rw-r--r-- contrib/snowball/algorithms/hungarian/stem_Unicode.sbl 241
-rw-r--r-- contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl 195
-rw-r--r-- contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl 195
-rw-r--r-- contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl 245
-rw-r--r-- contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl 208
-rw-r--r-- contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl 80
-rw-r--r-- contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl 80
-rw-r--r-- contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl 139
-rw-r--r-- contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl 218
-rw-r--r-- contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl 218
-rw-r--r-- contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl 236
-rw-r--r-- contrib/snowball/algorithms/romanian/stem_Unicode.sbl 236
-rw-r--r-- contrib/snowball/algorithms/russian/stem_KOI8_R.sbl 217
-rw-r--r-- contrib/snowball/algorithms/russian/stem_Unicode.sbl 215
-rw-r--r-- contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl 230
-rw-r--r-- contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl 230
-rw-r--r-- contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl 72
-rw-r--r-- contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl 72
-rw-r--r-- contrib/snowball/algorithms/turkish/stem_Unicode.sbl 477
31 files changed, 5890 insertions, 0 deletions
diff --git a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..0a8190a08
--- /dev/null
+++ b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl
@@ -0,0 +1,91 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+ undouble
+)
+
+externals ( stem )
+
+strings ( ch )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef ae hex 'E6' // {ae}: ae ligature
+stringdef ao hex 'E5' // {ao}: a with ring above
+stringdef o/ hex 'F8' // {o/}: o with stroke
+
+define v 'aeiouy{ae}{ao}{o/}' // vowels
+
+define s_ending 'abcdfghjklmnoprtvyz{ao}' // letters that must precede a final 's' for main_suffix to remove it
+
+define mark_regions as ( // computes p1, the start of the region in which suffixes may be removed
+ // default: p1 at the end of the word, i.e. an empty region
+ $p1 = limit
+ // x = position 3 characters in; the routine fails here for shorter words
+ test ( hop 3 setmark x )
+ goto v gopast non-v setmark p1 // p1 = just after the first non-vowel that follows a vowel
+ try ( $p1 < x $p1 = x ) // keep at least 3 characters before p1
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+ // main_suffix: remove the longest listed ending that lies after p1
+ define main_suffix as (
+ setlimit tomark p1 for ([substring])
+ among(
+
+ 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
+ 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
+ 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
+ 'erets' 'et' 'eret'
+ (delete)
+ 's'
+ (s_ending delete) // a bare 's' goes only when preceded by an s_ending letter
+ )
+ )
+
+ define consonant_pair as ( // if the word ends (after p1) in gd, dt, gt or kt, drop its last letter
+ test (
+ setlimit tomark p1 for ([substring])
+ among(
+ 'gd' // significant in the call from other_suffix
+ 'dt' 'gt' 'kt'
+ )
+ )
+ next] delete // test put the cursor back at the end, so the slice is just the final letter
+ )
+
+ define other_suffix as (
+ do ( ['st'] 'ig' delete ) // 'igst' loses its 'st'
+ setlimit tomark p1 for ([substring])
+ among(
+ 'ig' 'lig' 'elig' 'els'
+ (delete do consonant_pair) // after deleting, retry the consonant-pair rule
+ 'l{o/}st'
+ (<-'l{o/}s')
+ )
+ )
+ define undouble as ( // removes a final non-vowel (after p1) when the same letter precedes it
+ setlimit tomark p1 for ([non-v] ->ch) // ch = copy of the final non-vowel
+ ch // succeeds only if the text before it ends with that same letter
+ delete
+ )
+)
+
+define stem as ( // external entry point
+ // every step is wrapped in 'do', so a step that fails does not stop the ones after it
+ do mark_regions
+ backwards (
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ do undouble
+ )
+)
diff --git a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..1131a1cb7
--- /dev/null
+++ b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,91 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+ undouble
+)
+
+externals ( stem )
+
+strings ( ch )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef ae hex '91' // {ae}: ae ligature
+stringdef ao hex '86' // {ao}: a with ring above
+stringdef o/ hex '9B' // {o/}: o with stroke
+
+define v 'aeiouy{ae}{ao}{o/}' // vowels
+
+define s_ending 'abcdfghjklmnoprtvyz{ao}' // letters that must precede a final 's' for main_suffix to remove it
+
+define mark_regions as ( // computes p1, the start of the region in which suffixes may be removed
+ // default: p1 at the end of the word, i.e. an empty region
+ $p1 = limit
+ // x = position 3 characters in; the routine fails here for shorter words
+ test ( hop 3 setmark x )
+ goto v gopast non-v setmark p1 // p1 = just after the first non-vowel that follows a vowel
+ try ( $p1 < x $p1 = x ) // keep at least 3 characters before p1
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+ // main_suffix: remove the longest listed ending that lies after p1
+ define main_suffix as (
+ setlimit tomark p1 for ([substring])
+ among(
+
+ 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
+ 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
+ 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
+ 'erets' 'et' 'eret'
+ (delete)
+ 's'
+ (s_ending delete) // a bare 's' goes only when preceded by an s_ending letter
+ )
+ )
+
+ define consonant_pair as ( // if the word ends (after p1) in gd, dt, gt or kt, drop its last letter
+ test (
+ setlimit tomark p1 for ([substring])
+ among(
+ 'gd' // significant in the call from other_suffix
+ 'dt' 'gt' 'kt'
+ )
+ )
+ next] delete // test put the cursor back at the end, so the slice is just the final letter
+ )
+
+ define other_suffix as (
+ do ( ['st'] 'ig' delete ) // 'igst' loses its 'st'
+ setlimit tomark p1 for ([substring])
+ among(
+ 'ig' 'lig' 'elig' 'els'
+ (delete do consonant_pair) // after deleting, retry the consonant-pair rule
+ 'l{o/}st'
+ (<-'l{o/}s')
+ )
+ )
+ define undouble as ( // removes a final non-vowel (after p1) when the same letter precedes it
+ setlimit tomark p1 for ([non-v] ->ch) // ch = copy of the final non-vowel
+ ch // succeeds only if the text before it ends with that same letter
+ delete
+ )
+)
+
+define stem as ( // external entry point
+ // every step is wrapped in 'do', so a step that fails does not stop the ones after it
+ do mark_regions
+ backwards (
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ do undouble
+ )
+)
diff --git a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..f7609f766
--- /dev/null
+++ b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl
@@ -0,0 +1,164 @@
+routines (
+ prelude postlude
+ e_ending
+ en_ending
+ mark_regions
+ R1 R2
+ undouble
+ standard_suffix
+)
+
+externals ( stem )
+
+booleans ( e_found )
+
+integers ( p1 p2 )
+
+groupings ( v v_I v_j )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4' // a-diaeresis
+stringdef e" hex 'EB' // e-diaeresis
+stringdef i" hex 'EF' // i-diaeresis
+stringdef o" hex 'F6' // o-diaeresis
+stringdef u" hex 'FC' // u-diaeresis
+
+stringdef a' hex 'E1' // a-acute
+stringdef e' hex 'E9' // e-acute
+stringdef i' hex 'ED' // i-acute
+stringdef o' hex 'F3' // o-acute
+stringdef u' hex 'FA' // u-acute
+
+stringdef e` hex 'E8' // e-grave
+
+define v 'aeiouy{e`}' // vowels
+define v_I v + 'I' // vowels plus the 'I' marker written by prelude
+define v_j v + 'j' // vowels plus 'j'
+
+define prelude as ( // strips diaeresis/acute accents and rewrites some i/y as upper-case I/Y markers
+ test repeat ( // one pass over the whole word; test puts the cursor back at the start
+ [substring] among(
+ '{a"}' '{a'}'
+ (<- 'a')
+ '{e"}' '{e'}'
+ (<- 'e')
+ '{i"}' '{i'}'
+ (<- 'i')
+ '{o"}' '{o'}'
+ (<- 'o')
+ '{u"}' '{u'}'
+ (<- 'u')
+ '' (next) // any other character: step over it
+ ) //or next
+ )
+ try(['y'] <- 'Y') // word-initial y
+ repeat goto (
+ v [('i'] v <- 'I') or // i between two vowels
+ ('y'] <- 'Y') // y after a vowel
+ )
+)
+
+define mark_regions as ( // computes p1 and p2, the region starts tested by R1 and R2
+ // defaults: both at the end of the word (kept if a scan below fails)
+ $p1 = limit
+ $p2 = limit
+
+ gopast v gopast non-v setmark p1 // just after the first non-vowel that follows a vowel
+ try($p1 < 3 $p1 = 3) // at least 3
+ gopast v gopast non-v setmark p2 // same rule again, continuing from where the first scan stopped
+
+)
+
+define postlude as repeat ( // turns the I and Y markers back into lower case
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'I' (<- 'i')
+ '' (next) // any other character: step over it
+ ) //or next
+
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define undouble as ( // a word ending in kk, dd or tt loses its last letter
+ test among('kk' 'dd' 'tt') [next] delete
+ )
+
+ define e_ending as ( // removes a final 'e' that is in R1 and follows a non-vowel, then undoubles
+ unset e_found
+ ['e'] R1 test non-v delete
+ set e_found // reached only when the 'e' was deleted
+ undouble
+ )
+
+ define en_ending as ( // deletes the slice set by the caller if it is in R1, after a non-vowel and not after 'gem'
+ R1 non-v and not 'gem' delete
+ undouble
+ )
+
+ define standard_suffix as (
+ do ( // plural and genitive-like endings
+ [substring] among(
+ 'heden'
+ ( R1 <- 'heid'
+ )
+ 'en' 'ene'
+ ( en_ending
+ )
+ 's' 'se'
+ ( R1 non-v_j delete
+ )
+ )
+ )
+ do e_ending
+
+ do ( ['heid'] R2 not 'c' delete // -heid in R2, unless preceded by 'c'
+ ['en'] en_ending // then an -en left in front of it
+ )
+
+ do ( // derivational endings
+ [substring] among(
+ 'end' 'ing'
+ ( R2 delete
+ (['ig'] R2 not 'e' delete) or undouble
+ )
+ 'ig'
+ ( R2 not 'e' delete
+ )
+ 'lijk'
+ ( R2 delete e_ending
+ )
+ 'baar'
+ ( R2 delete
+ )
+ 'bar'
+ ( R2 e_found delete // only if the latest e_ending call removed an 'e'
+ )
+ )
+ )
+ do ( // non-vowel + aa/ee/oo/uu + final non-vowel (not I): drop one of the two vowels
+ non-v_I
+ test (
+ among ('aa' 'ee' 'oo' 'uu')
+ non-v
+ )
+ [next] delete
+ )
+ )
+)
+
+define stem as ( // external entry point
+ // every step is wrapped in 'do', so a step that fails does not stop the ones after it
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..15b8718d1
--- /dev/null
+++ b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,164 @@
+routines (
+ prelude postlude
+ e_ending
+ en_ending
+ mark_regions
+ R1 R2
+ undouble
+ standard_suffix
+)
+
+externals ( stem )
+
+booleans ( e_found )
+
+integers ( p1 p2 )
+
+groupings ( v v_I v_j )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a" hex '84' // a-diaeresis
+stringdef e" hex '89' // e-diaeresis
+stringdef i" hex '8B' // i-diaeresis
+stringdef o" hex '94' // o-diaeresis
+stringdef u" hex '81' // u-diaeresis
+
+stringdef a' hex 'A0' // a-acute
+stringdef e' hex '82' // e-acute
+stringdef i' hex 'A1' // i-acute
+stringdef o' hex 'A2' // o-acute
+stringdef u' hex 'A3' // u-acute
+
+stringdef e` hex '8A' // e-grave
+
+define v 'aeiouy{e`}' // vowels
+define v_I v + 'I' // vowels plus the 'I' marker written by prelude
+define v_j v + 'j' // vowels plus 'j'
+
+define prelude as ( // strips diaeresis/acute accents and rewrites some i/y as upper-case I/Y markers
+ test repeat ( // one pass over the whole word; test puts the cursor back at the start
+ [substring] among(
+ '{a"}' '{a'}'
+ (<- 'a')
+ '{e"}' '{e'}'
+ (<- 'e')
+ '{i"}' '{i'}'
+ (<- 'i')
+ '{o"}' '{o'}'
+ (<- 'o')
+ '{u"}' '{u'}'
+ (<- 'u')
+ '' (next) // any other character: step over it
+ ) //or next
+ )
+ try(['y'] <- 'Y') // word-initial y
+ repeat goto (
+ v [('i'] v <- 'I') or // i between two vowels
+ ('y'] <- 'Y') // y after a vowel
+ )
+)
+
+define mark_regions as ( // computes p1 and p2, the region starts tested by R1 and R2
+ // defaults: both at the end of the word (kept if a scan below fails)
+ $p1 = limit
+ $p2 = limit
+
+ gopast v gopast non-v setmark p1 // just after the first non-vowel that follows a vowel
+ try($p1 < 3 $p1 = 3) // at least 3
+ gopast v gopast non-v setmark p2 // same rule again, continuing from where the first scan stopped
+
+)
+
+define postlude as repeat ( // turns the I and Y markers back into lower case
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'I' (<- 'i')
+ '' (next) // any other character: step over it
+ ) //or next
+
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define undouble as ( // a word ending in kk, dd or tt loses its last letter
+ test among('kk' 'dd' 'tt') [next] delete
+ )
+
+ define e_ending as ( // removes a final 'e' that is in R1 and follows a non-vowel, then undoubles
+ unset e_found
+ ['e'] R1 test non-v delete
+ set e_found // reached only when the 'e' was deleted
+ undouble
+ )
+
+ define en_ending as ( // deletes the slice set by the caller if it is in R1, after a non-vowel and not after 'gem'
+ R1 non-v and not 'gem' delete
+ undouble
+ )
+
+ define standard_suffix as (
+ do ( // plural and genitive-like endings
+ [substring] among(
+ 'heden'
+ ( R1 <- 'heid'
+ )
+ 'en' 'ene'
+ ( en_ending
+ )
+ 's' 'se'
+ ( R1 non-v_j delete
+ )
+ )
+ )
+ do e_ending
+
+ do ( ['heid'] R2 not 'c' delete // -heid in R2, unless preceded by 'c'
+ ['en'] en_ending // then an -en left in front of it
+ )
+
+ do ( // derivational endings
+ [substring] among(
+ 'end' 'ing'
+ ( R2 delete
+ (['ig'] R2 not 'e' delete) or undouble
+ )
+ 'ig'
+ ( R2 not 'e' delete
+ )
+ 'lijk'
+ ( R2 delete e_ending
+ )
+ 'baar'
+ ( R2 delete
+ )
+ 'bar'
+ ( R2 e_found delete // only if the latest e_ending call removed an 'e'
+ )
+ )
+ )
+ do ( // non-vowel + aa/ee/oo/uu + final non-vowel (not I): drop one of the two vowels
+ non-v_I
+ test (
+ among ('aa' 'ee' 'oo' 'uu')
+ non-v
+ )
+ [next] delete
+ )
+ )
+)
+
+define stem as ( // external entry point
+ // every step is wrapped in 'do', so a step that fails does not stop the ones after it
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..fe18d7a91
--- /dev/null
+++ b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl
@@ -0,0 +1,229 @@
+integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+ prelude postlude
+ mark_regions
+ shortv
+ R1 R2
+ Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5
+ exception1
+ exception2
+)
+
+externals ( stem )
+
+groupings ( v v_WXY valid_LI )
+
+stringescapes {}
+
+define v 'aeiouy' // vowels
+define v_WXY v + 'wxY' // vowels plus w, x and the 'Y' marker; excluded as final letter by shortv
+
+define valid_LI 'cdeghkmnrt' // letters that must precede 'li' for Step_2 to delete it
+
+define prelude as ( // drops a leading apostrophe and marks some y as 'Y'; Y_found records whether any was marked
+ unset Y_found
+ do ( ['{'}'] delete) // leading apostrophe
+ do ( ['y'] <-'Y' set Y_found) // word-initial y
+ do repeat(goto (v ['y']) <-'Y' set Y_found) // every y directly after a vowel
+)
+
+define mark_regions as ( // computes p1 and p2, the region starts tested by R1 and R2
+ $p1 = limit
+ $p2 = limit // defaults: end of word, kept when the scans below fail
+ do(
+ among ( // words starting with one of these get p1 directly after the prefix
+ 'gener'
+ 'commun' // added May 2005
+ 'arsen' // added Nov 2006 (arsenic/arsenal)
+ // ... extensions possible here ...
+ ) or (gopast v gopast non-v) // otherwise: just after the first non-vowel that follows a vowel
+ setmark p1
+ gopast v gopast non-v setmark p2 // same rule again, continuing from p1
+ )
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define shortv as ( // the text before the cursor ends in a short syllable
+ ( non-v_WXY v non-v ) // non-vowel, vowel, then a non-vowel other than w, x, Y
+ or
+ ( non-v v atlimit ) // or vowel + non-vowel with the vowel at the very start of the word
+ )
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define Step_1a as ( // apostrophe endings, then plural-like -s endings
+ try (
+ [substring] among (
+ '{'}' '{'}s' '{'}s{'}'
+ (delete)
+ )
+ )
+ [substring] among (
+ 'sses' (<-'ss')
+ 'ied' 'ies'
+ ((hop 2 <-'i') or <-'ie') // 'i' if at least two letters precede, else 'ie'
+ 's' (next gopast v delete) // needs a vowel somewhere before the letter preceding the s
+ 'us' 'ss' // matched so the 's' rule does not fire; no action
+ )
+ )
+
+ define Step_1b as ( // -eed/-eedly and -ed/-edly/-ing/-ingly
+ [substring] among (
+ 'eed' 'eedly'
+ (R1 <-'ee')
+ 'ed' 'edly' 'ing' 'ingly'
+ (
+ test gopast v delete // delete only if the remaining stem contains a vowel
+ test substring among(
+ 'at' 'bl' 'iz'
+ (<+ 'e')
+ 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+ // ignoring double c, h, j, k, q, v, w, and x
+ ([next] delete)
+ '' (atmark p1 test shortv <+ 'e') // stem ends exactly at p1 in a short syllable: append 'e'
+ )
+ )
+ )
+ )
+
+ define Step_1c as ( // final y/Y becomes i after a non-vowel that is not the first letter
+ ['y' or 'Y']
+ non-v not atlimit
+ <-'i'
+ )
+
+ define Step_2 as ( // rewrites the longest matching ending, provided it lies in R1
+ [substring] R1 among (
+ 'tional' (<-'tion')
+ 'enci' (<-'ence')
+ 'anci' (<-'ance')
+ 'abli' (<-'able')
+ 'entli' (<-'ent')
+ 'izer' 'ization'
+ (<-'ize')
+ 'ational' 'ation' 'ator'
+ (<-'ate')
+ 'alism' 'aliti' 'alli'
+ (<-'al')
+ 'fulness' (<-'ful')
+ 'ousli' 'ousness'
+ (<-'ous')
+ 'iveness' 'iviti'
+ (<-'ive')
+ 'biliti' 'bli'
+ (<-'ble')
+ 'ogi' ('l' <-'og') // only after 'l'
+ 'fulli' (<-'ful')
+ 'lessli' (<-'less')
+ 'li' (valid_LI delete) // only after a valid_LI letter
+ )
+ )
+
+ define Step_3 as ( // further endings, again only when in R1
+ [substring] R1 among (
+ 'tional' (<- 'tion')
+ 'ational' (<- 'ate')
+ 'alize' (<-'al')
+ 'icate' 'iciti' 'ical'
+ (<-'ic')
+ 'ful' 'ness'
+ (delete)
+ 'ative'
+ (R2 delete) // 'R2' added Dec 2001
+ )
+ )
+
+ define Step_4 as ( // deletes the longest matching ending, provided it lies in R2
+ [substring] R2 among (
+ 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
+ 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
+ (delete)
+ 'ion' ('s' or 't' delete) // only after s or t
+ )
+ )
+
+ define Step_5 as ( // final e / l
+ [substring] among (
+ 'e' (R2 or (R1 not shortv) delete) // in R2, or in R1 when not preceded by a short syllable
+ 'l' (R2 'l' delete) // in R2 and preceded by another 'l'
+ )
+ )
+
+ define exception2 as ( // succeeds when the whole word is one of these forms
+
+ [substring] atlimit among(
+ 'inning' 'outing' 'canning' 'herring' 'earring'
+ 'proceed' 'exceed' 'succeed'
+
+ // ... extensions possible here ...
+
+ )
+ )
+)
+
+define exception1 as ( // whole-word exceptions (atlimit: the match must reach the end of the word)
+
+ [substring] atlimit among(
+
+ /* special changes: */
+
+ 'skis' (<-'ski')
+ 'skies' (<-'sky')
+ 'dying' (<-'die')
+ 'lying' (<-'lie')
+ 'tying' (<-'tie')
+
+ /* special -LY cases */
+
+ 'idly' (<-'idl')
+ 'gently' (<-'gentl')
+ 'ugly' (<-'ugli')
+ 'early' (<-'earli')
+ 'only' (<-'onli')
+ 'singly' (<-'singl')
+
+ // ... extensions possible here ...
+
+ /* invariant forms: */
+
+ 'sky'
+ 'news'
+ 'howe'
+
+ 'atlas' 'cosmos' 'bias' 'andes' // not plural forms
+
+ // ... extensions possible here ...
+ )
+)
+
+define postlude as (Y_found repeat(goto (['Y']) <-'y')) // if prelude marked any Y, turn every Y back into y
+
+define stem as ( // external entry point
+ // the first alternative that succeeds ends the routine
+ exception1 or
+ not hop 3 or ( // words of fewer than 3 characters are left as they are
+ do prelude
+ do mark_regions
+ backwards (
+
+ do Step_1a
+ // a word listed in exception2 skips all remaining steps
+ exception2 or (
+
+ do Step_1b
+ do Step_1c
+
+ do Step_2
+ do Step_3
+ do Step_4
+
+ do Step_5
+ )
+ )
+ do postlude
+ )
+)
diff --git a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..9ac74f292
--- /dev/null
+++ b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl
@@ -0,0 +1,196 @@
+
+/* Finnish stemmer.
+
+ Numbers in square brackets refer to the sections in
+ Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
+ ISBN 0-415-20705-3
+
+*/
+
+routines (
+ mark_regions
+ R2
+ particle_etc possessive
+ LONG VI
+ case_ending
+ i_plural
+ t_plural
+ other_endings
+ tidy
+)
+
+externals ( stem )
+
+integers ( p1 p2 )
+strings ( x )
+booleans ( ending_removed )
+groupings ( AEI V1 V2 particle_end )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4' // a-diaeresis
+stringdef o" hex 'F6' // o-diaeresis
+
+define AEI 'a{a"}ei' // trailing vowels removed by tidy
+define V1 'aeiouy{a"}{o"}' // vowels
+define V2 'aeiou{a"}{o"}' // V1 without 'y'
+define particle_end V1 + 'nt' // letters that must precede a particle for particle_etc to remove it
+
+define mark_regions as ( // computes p1 and p2, the starts of the two suffix regions
+ // defaults: both at the end of the word (kept if a scan below fails)
+ $p1 = limit
+ $p2 = limit
+
+ goto V1 gopast non-V1 setmark p1 // just after the first non-vowel that follows a vowel
+ goto V1 gopast non-V1 setmark p2 // same rule again, continuing from p1
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define particle_etc as ( // removes a particle or the adverb ending 'sti' found after p1
+ setlimit tomark p1 for ([substring])
+ among(
+ 'kin'
+ 'kaan' 'k{a"}{a"}n'
+ 'ko' 'k{o"}'
+ 'han' 'h{a"}n'
+ 'pa' 'p{a"}' // Particles [91]
+ (particle_end)
+ 'sti' // Adverb [87]
+ (R2)
+ )
+ delete
+ )
+ define possessive as ( // [36]
+ setlimit tomark p1 for ([substring])
+ among(
+ 'si'
+ (not 'k' delete) // take 'ksi' as the Translative case
+ 'ni'
+ (delete ['kse'] <- 'ksi') // kseni = ksi + ni
+ 'nsa' 'ns{a"}'
+ 'mme'
+ 'nne'
+ (delete)
+ /* Now for Vn possessives after case endings: [36] */
+ 'an'
+ (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete)
+ '{a"}n'
+ (among('t{a"}' 'ss{a"}' 'st{a"}'
+ 'll{a"}' 'lt{a"}' 'n{a"}') delete)
+ 'en'
+ (among('lle' 'ine') delete)
+ )
+ )
+
+ define LONG as // a doubled (long) vowel
+ among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
+
+ define VI as ('i' V2) // reading backwards: 'i', then a V2 vowel in front of it
+
+ define case_ending as ( // removes a case ending found after p1 and records that in ending_removed
+ setlimit tomark p1 for ([substring])
+ among(
+ 'han' ('a') //-.
+ 'hen' ('e') // |
+ 'hin' ('i') // |
+ 'hon' ('o') // |
+ 'h{a"}n' ('{a"}') // Illative [43]
+ 'h{o"}n' ('{o"}') // |
+ 'siin' VI // |
+ 'seen' LONG //-'
+
+ 'den' VI
+ 'tten' VI // Genitive plurals [34]
+ ()
+ 'n' // Genitive or Illative
+ ( try ( LONG // Illative
+ or 'ie' // Genitive
+ and next ]
+ )
+ /* otherwise Genitive */
+ )
+
+ 'a' '{a"}' //-.
+ (V1 non-V1) // |
+ 'tta' 'tt{a"}' // Partitive [32]
+ ('e') // |
+ 'ta' 't{a"}' //-'
+
+ 'ssa' 'ss{a"}' // Inessive [41]
+ 'sta' 'st{a"}' // Elative [42]
+
+ 'lla' 'll{a"}' // Adessive [44]
+ 'lta' 'lt{a"}' // Ablative [51]
+ 'lle' // Allative [46]
+ 'na' 'n{a"}' // Essive [49]
+ 'ksi' // Translative[50]
+ 'ine' // Comitative [51]
+
+ /* Abessive and Instructive are too rare for
+ inclusion [51] */
+
+ )
+ delete
+ set ending_removed
+ )
+ define other_endings as ( // comparative, superlative and agent endings found after p2
+ setlimit tomark p2 for ([substring])
+ among(
+ 'mpi' 'mpa' 'mp{a"}'
+ 'mmi' 'mma' 'mm{a"}' // Comparative forms [85]
+ (not 'po') //-improves things
+ 'impi' 'impa' 'imp{a"}'
+ 'immi' 'imma' 'imm{a"}' // Superlative forms [86]
+ 'eja' 'ej{a"}' // indicates agent [93.1B]
+ )
+ delete
+ )
+ define i_plural as ( // [26]
+ setlimit tomark p1 for ([substring])
+ among(
+ 'i' 'j'
+ )
+ delete
+ )
+ define t_plural as ( // [26]
+ setlimit tomark p1 for (
+ ['t'] test V1
+ delete
+ )
+ setlimit tomark p2 for ([substring])
+ among(
+ 'mma' (not 'po') //-mmat endings
+ 'imma' //-immat endings
+ )
+ delete
+ )
+ define tidy as ( // final clean-up of the stem
+ setlimit tomark p1 for (
+ do ( LONG and ([next] delete ) ) // undouble vowel
+ do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
+ do ( ['j'] 'o' or 'u' delete )
+ do ( ['o'] 'j' delete )
+ )
+ goto non-V1 [next] -> x x delete // undouble consonant
+ )
+)
+
+define stem as ( // external entry point
+ // every step is wrapped in 'do', so a step that fails does not stop the ones after it
+ do mark_regions
+ unset ending_removed
+ backwards (
+ do particle_etc
+ do possessive
+ do case_ending
+ do other_endings
+ (ending_removed do i_plural) or do t_plural // i_plural only if case_ending removed something, else t_plural
+ do tidy
+ )
+)
+
diff --git a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..e972f227f
--- /dev/null
+++ b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl
@@ -0,0 +1,248 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ i_verb_suffix
+ verb_suffix
+ residual_suffix
+ un_double
+ un_accent
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v keep_with_s )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a^ hex 'E2' // a-circumflex
+stringdef a` hex 'E0' // a-grave
+stringdef c, hex 'E7' // c-cedilla
+
+stringdef e" hex 'EB' // e-diaeresis (rare)
+stringdef e' hex 'E9' // e-acute
+stringdef e^ hex 'EA' // e-circumflex
+stringdef e` hex 'E8' // e-grave
+stringdef i" hex 'EF' // i-diaeresis
+stringdef i^ hex 'EE' // i-circumflex
+stringdef o^ hex 'F4' // o-circumflex
+stringdef u^ hex 'FB' // u-circumflex
+stringdef u` hex 'F9' // u-grave
+
+define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' // vowels, accented forms included
+
+define prelude as repeat goto ( // rewrites some u, i, y as upper-case U, I, Y markers
+
+ ( v [ ('u' ] v <- 'U') or // u between two vowels
+ ('i' ] v <- 'I') or // i between two vowels
+ ('y' ] <- 'Y') // y after a vowel
+ )
+ or
+ ( ['y'] v <- 'Y' ) // y before a vowel
+ or
+ ( 'q' ['u'] <- 'U' ) // u after q
+)
+
+define mark_regions as ( // computes pV, p1 and p2, the region starts tested by RV, R1 and R2
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do (
+ ( v v next ) // word starts with two vowels: pV after the third letter
+ or
+ among ( // this exception list begun Nov 2006
+ 'par' // paris, parie, pari
+ 'col' // colis
+ 'tap' // tapis
+ // extensions possible here
+ )
+ or
+ ( next gopast v ) // otherwise: after the first vowel that is not the first letter
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // same rule again, continuing from p1
+ )
+)
+
+define postlude as repeat ( // turns the I, U, Y markers back into lower case
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ 'Y' (<- 'y')
+ '' (next) // any other character: step over it
+ )
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or after pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define standard_suffix as ( // handles the longest matching noun/adjective/adverb ending
+ [substring] among(
+
+ 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
+ 'ances' 'iqUes' 'ismes' 'ables' 'istes'
+ ( R2 delete )
+ 'atrice' 'ateur' 'ation'
+ 'atrices' 'ateurs' 'ations'
+ ( R2 delete
+ try ( ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'logie'
+ 'logies'
+ ( R2 <- 'log' )
+ 'usion' 'ution'
+ 'usions' 'utions'
+ ( R2 <- 'u' )
+ 'ence'
+ 'ences'
+ ( R2 <- 'ent' )
+ 'ement'
+ 'ements'
+ (
+ RV delete
+ try (
+ [substring] among(
+ 'iv' (R2 delete ['at'] R2 delete)
+ 'eus' ((R2 delete) or (R1<-'eux'))
+ 'abl' 'iqU'
+ (R2 delete)
+ 'i{e`}r' 'I{e`}r' //)
+ (RV <-'i') //)--new 2 Sept 02
+ )
+ )
+ )
+ 'it{e'}'
+ 'it{e'}s'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil' ((R2 delete) or <-'abl')
+ 'ic' ((R2 delete) or <-'iqU')
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'if' 'ive'
+ 'ifs' 'ives'
+ (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'eaux' (<- 'eau')
+ 'aux' (R1 <- 'al')
+ 'euse'
+ 'euses'((R2 delete) or (R1<-'eux'))
+
+ 'issement'
+ 'issements'(R1 non-v delete) // verbal
+
+ // fail(...) below forces entry to verb_suffix. -ment typically
+ // follows the p.p., e.g 'confus{e'}ment'.
+
+ 'amment' (RV fail(<- 'ant'))
+ 'emment' (RV fail(<- 'ent'))
+ 'ment'
+ 'ments' (test(v RV) fail(delete))
+ // v is e,i,u,{e'},I or U
+ )
+ )
+
+ define i_verb_suffix as setlimit tomark pV for ( // -ir verb endings after pV, removed only after a non-vowel
+ [substring] among (
+ '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
+ 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
+ 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
+ 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
+ 'issez' 'issiez' 'issions' 'issons' 'it'
+ (non-v delete)
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // other verb endings after pV
+ [substring] among (
+ 'ions'
+ (R2 delete)
+
+ '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
+ 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
+ 'erons' 'eront' 'ez' 'iez'
+
+ // 'ons' //-best omitted
+
+ (delete)
+
+ '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
+ 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
+ 'assions'
+ (delete
+ try(['e'] delete) // also drop an 'e' left in front of the ending
+ )
+ )
+ )
+
+ define keep_with_s 'aiou{e`}s' // a final 's' is kept after any of these letters
+
+ define residual_suffix as ( // run by stem when none of the suffix routines above succeeded
+ try(['s'] test non-keep_with_s delete)
+ setlimit tomark pV for (
+ [substring] among(
+ 'ion' (R2 's' or 't' delete) // only in R2 and after s or t
+ 'ier' 'i{e`}re'
+ 'Ier' 'I{e`}re' (<-'i')
+ 'e' (delete)
+ '{e"}' ('gu' delete) // only after 'gu'
+ )
+ )
+ )
+
+ define un_double as ( // enn, onn, ett, ell, eill at the end lose their last letter
+ test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
+ )
+
+ define un_accent as ( // e-acute / e-grave followed by one or more final non-vowels becomes plain e
+ atleast 1 non-v
+ [ '{e'}' or '{e`}' ] <-'e'
+ )
+)
+
+define stem as ( // external entry point
+
+ do prelude
+ do mark_regions
+ backwards (
+
+ do (
+ (
+ ( standard_suffix or // first suffix routine to succeed wins
+ i_verb_suffix or
+ verb_suffix
+ )
+ and
+ try( [ ('Y' ] <- 'i' ) or // after a successful removal: final Y -> i, c-cedilla -> c
+ ('{c,}'] <- 'c' )
+ )
+ ) or
+ residual_suffix // used only when all three routines above failed
+ )
+
+ // try(['ent'] RV delete) // is best omitted
+
+ do un_double
+ do un_accent
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..996eba1ab
--- /dev/null
+++ b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,239 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ i_verb_suffix
+ verb_suffix
+ residual_suffix
+ un_double
+ un_accent
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v keep_with_s )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a^ hex '83' // a-circumflex
+stringdef a` hex '85' // a-grave
+stringdef c, hex '87' // c-cedilla
+
+stringdef e" hex '89' // e-diaeresis (rare)
+stringdef e' hex '82' // e-acute
+stringdef e^ hex '88' // e-circumflex
+stringdef e` hex '8A' // e-grave
+stringdef i" hex '8B' // i-diaeresis
+stringdef i^ hex '8C' // i-circumflex
+stringdef o^ hex '93' // o-circumflex
+stringdef u^ hex '96' // u-circumflex
+stringdef u` hex '97' // u-grave
+
+define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' // vowels, accented forms included
+
+define prelude as repeat goto ( // rewrites some u, i, y as upper-case U, I, Y markers
+
+ ( v [ ('u' ] v <- 'U') or // u between two vowels
+ ('i' ] v <- 'I') or // i between two vowels
+ ('y' ] <- 'Y') // y after a vowel
+ )
+ or
+ ( ['y'] v <- 'Y' ) // y before a vowel
+ or
+ ( 'q' ['u'] <- 'U' ) // u after q
+)
+
+define mark_regions as ( // computes pV, p1 and p2, the region starts tested by RV, R1 and R2
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV: after the 3rd letter if the word starts with two vowels or with par/col/tap, else after the first non-initial vowel
+ ( v v next ) or among ( 'par' 'col' 'tap' ) or ( next gopast v ) // par/col/tap list kept in step with the ISO Latin I variant
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // same rule again, continuing from p1
+ )
+)
+
+define postlude as repeat ( // turns the I, U, Y markers back into lower case
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ 'Y' (<- 'y')
+ '' (next) // any other character: step over it
+ )
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or after pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define standard_suffix as ( // handles the longest matching noun/adjective/adverb ending
+ [substring] among(
+
+ 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
+ 'ances' 'iqUes' 'ismes' 'ables' 'istes'
+ ( R2 delete )
+ 'atrice' 'ateur' 'ation'
+ 'atrices' 'ateurs' 'ations'
+ ( R2 delete
+ try ( ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'logie'
+ 'logies'
+ ( R2 <- 'log' )
+ 'usion' 'ution'
+ 'usions' 'utions'
+ ( R2 <- 'u' )
+ 'ence'
+ 'ences'
+ ( R2 <- 'ent' )
+ 'ement'
+ 'ements'
+ (
+ RV delete
+ try (
+ [substring] among(
+ 'iv' (R2 delete ['at'] R2 delete)
+ 'eus' ((R2 delete) or (R1<-'eux'))
+ 'abl' 'iqU'
+ (R2 delete)
+ 'i{e`}r' 'I{e`}r' //)
+ (RV <-'i') //)--new 2 Sept 02
+ )
+ )
+ )
+ 'it{e'}'
+ 'it{e'}s'
+ (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil' ((R2 delete) or <-'abl')
+ 'ic' ((R2 delete) or <-'iqU')
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'if' 'ive'
+ 'ifs' 'ives'
+ (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
+ )
+ 'eaux' (<- 'eau')
+ 'aux' (R1 <- 'al')
+ 'euse'
+ 'euses'((R2 delete) or (R1<-'eux'))
+
+ 'issement'
+ 'issements'(R1 non-v delete) // verbal
+
+ // fail(...) below forces entry to verb_suffix. -ment typically
+ // follows the p.p., e.g 'confus{e'}ment'.
+
+ 'amment' (RV fail(<- 'ant'))
+ 'emment' (RV fail(<- 'ent'))
+ 'ment'
+ 'ments' (test(v RV) fail(delete))
+ // v is e,i,u,{e'},I or U
+ )
+ )
+
+ define i_verb_suffix as setlimit tomark pV for ( // -ir verb endings after pV, removed only after a non-vowel
+ [substring] among (
+ '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
+ 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
+ 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
+ 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
+ 'issez' 'issiez' 'issions' 'issons' 'it'
+ (non-v delete)
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // other verb endings after pV
+ [substring] among (
+ 'ions'
+ (R2 delete)
+
+ '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
+ 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
+ 'erons' 'eront' 'ez' 'iez'
+
+ // 'ons' //-best omitted
+
+ (delete)
+
+ '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
+ 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
+ 'assions'
+ (delete
+ try(['e'] delete) // also drop an 'e' left in front of the ending
+ )
+ )
+ )
+
+ define keep_with_s 'aiou{e`}s' // a final 's' is kept after any of these letters
+
+ define residual_suffix as ( // run by stem when none of the suffix routines above succeeded
+ try(['s'] test non-keep_with_s delete)
+ setlimit tomark pV for (
+ [substring] among(
+ 'ion' (R2 's' or 't' delete) // only in R2 and after s or t
+ 'ier' 'i{e`}re'
+ 'Ier' 'I{e`}re' (<-'i')
+ 'e' (delete)
+ '{e"}' ('gu' delete) // only after 'gu'
+ )
+ )
+ )
+
+ define un_double as ( // enn, onn, ett, ell, eill at the end lose their last letter
+ test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
+ )
+
+ define un_accent as ( // e-acute / e-grave followed by one or more final non-vowels becomes plain e
+ atleast 1 non-v
+ [ '{e'}' or '{e`}' ] <-'e'
+ )
+)
+
+define stem as ( // external entry point
+
+ do prelude
+ do mark_regions
+ backwards (
+
+ do (
+ (
+ ( standard_suffix or // first suffix routine to succeed wins
+ i_verb_suffix or
+ verb_suffix
+ )
+ and
+ try( [ ('Y' ] <- 'i' ) or // after a successful removal: final Y -> i, c-cedilla -> c
+ ('{c,}'] <- 'c' )
+ )
+ ) or
+ residual_suffix // used only when all three routines above failed
+ )
+
+ // try(['ent'] RV delete) // is best omitted
+
+ do un_double
+ do un_accent
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..7069daf0d
--- /dev/null
+++ b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl
@@ -0,0 +1,139 @@
+
+/*
+ Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+ prelude postlude
+ mark_regions
+ R1 R2
+ standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4' // a-umlaut
+stringdef o" hex 'F6' // o-umlaut
+stringdef u" hex 'FC' // u-umlaut
+stringdef ss hex 'DF' // sharp s
+
+define v 'aeiouy{a"}{o"}{u"}' // vowels
+
+define s_ending 'bdfghklmnrt' // letters that must precede a final 's' for it to be removed
+define st_ending s_ending - 'r' // same set without 'r'; required before a final 'st'
+
+define prelude as ( // expands sharp s and rewrites u/y between vowels as U/Y markers
+ // first pass over the whole word; test puts the cursor back at the start
+ test repeat (
+ (
+ ['{ss}'] <- 'ss'
+ ) or next
+ )
+
+ repeat goto (
+ v [('u'] v <- 'U') or // u between two vowels
+ ('y'] v <- 'Y') // y between two vowels
+ )
+)
+
+define mark_regions as ( // computes p1 and p2, the region starts tested by R1 and R2
+ // defaults: both at the end of the word (kept if a step below fails)
+ $p1 = limit
+ $p2 = limit
+
+ test(hop 3 setmark x) // x = position 3 characters in; fails for shorter words
+
+ gopast v gopast non-v setmark p1 // just after the first non-vowel that follows a vowel
+ try($p1 < x $p1 = x) // at least 3
+ gopast v gopast non-v setmark p2 // same rule again, continuing from where the first scan stopped
+
+)
+
+define postlude as repeat ( // lower-cases the U/Y markers and removes umlauts
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'U' (<- 'u')
+ '{a"}' (<- 'a')
+ '{o"}' (<- 'o')
+ '{u"}' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define standard_suffix as ( // three independent passes, each wrapped in 'do'
+ do ( // pass 1: ending must lie in R1
+ [substring] R1 among(
+ 'em' 'ern' 'er'
+ ( delete
+ )
+ 'e' 'en' 'es'
+ ( delete
+ try (['s'] 'nis' delete) // -nisse etc.: also drop the 's' left after 'nis'
+ )
+ 's'
+ ( s_ending delete
+ )
+ )
+ )
+ do ( // pass 2: ending must lie in R1
+ [substring] R1 among(
+ 'en' 'er' 'est'
+ ( delete
+ )
+ 'st'
+ ( st_ending hop 3 delete // needs an st_ending letter with at least 3 more letters before it
+ )
+ )
+ )
+ do ( // pass 3: ending must lie in R2
+ [substring] R2 among(
+ 'end' 'ung'
+ ( delete
+ try (['ig'] not 'e' R2 delete)
+ )
+ 'ig' 'ik' 'isch'
+ ( not 'e' delete
+ )
+ 'lich' 'heit'
+ ( delete
+ try (
+ ['er' or 'en'] R1 delete
+ )
+ )
+ 'keit'
+ ( delete
+ try (
+ [substring] R2 among(
+ 'lich' 'ig'
+ ( delete
+ )
+ )
+ )
+ )
+ )
+ )
+ )
+)
+
+define stem as ( // external entry point; each step is wrapped in 'do', so a failing step does not stop the rest
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..3effb3257
--- /dev/null
+++ b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,139 @@
+
+/*
+ Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+ prelude postlude
+ mark_regions
+ R1 R2
+ standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a" hex '84' // a-umlaut
+stringdef o" hex '94' // o-umlaut
+stringdef u" hex '81' // u-umlaut
+stringdef ss hex 'E1' // sharp s
+
+define v 'aeiouy{a"}{o"}{u"}' // vowels
+
+define s_ending 'bdfghklmnrt' // letters that must precede a final 's' for it to be removed
+define st_ending s_ending - 'r' // same set without 'r'; required before a final 'st'
+
+define prelude as ( // expands sharp s and rewrites u/y between vowels as U/Y markers
+ // first pass over the whole word; test puts the cursor back at the start
+ test repeat (
+ (
+ ['{ss}'] <- 'ss'
+ ) or next
+ )
+
+ repeat goto (
+ v [('u'] v <- 'U') or // u between two vowels
+ ('y'] v <- 'Y') // y between two vowels
+ )
+)
+
+define mark_regions as ( // computes p1 and p2, the region starts tested by R1 and R2
+ // defaults: both at the end of the word (kept if a step below fails)
+ $p1 = limit
+ $p2 = limit
+
+ test(hop 3 setmark x) // x = position 3 characters in; fails for shorter words
+
+ gopast v gopast non-v setmark p1 // just after the first non-vowel that follows a vowel
+ try($p1 < x $p1 = x) // at least 3
+ gopast v gopast non-v setmark p2 // same rule again, continuing from where the first scan stopped
+
+)
+
+define postlude as repeat ( // lower-cases the U/Y markers and removes umlauts
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'U' (<- 'u')
+ '{a"}' (<- 'a')
+ '{o"}' (<- 'o')
+ '{u"}' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode ( // the routines in this section scan from the end of the word towards its start
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or after p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define standard_suffix as ( // three independent passes, each wrapped in 'do'
+ do ( // pass 1: ending must lie in R1
+ [substring] R1 among(
+ 'em' 'ern' 'er'
+ ( delete
+ )
+ 'e' 'en' 'es'
+ ( delete
+ try (['s'] 'nis' delete) // -nisse etc.: also drop the 's' left after 'nis'
+ )
+ 's'
+ ( s_ending delete
+ )
+ )
+ )
+ do ( // pass 2: ending must lie in R1
+ [substring] R1 among(
+ 'en' 'er' 'est'
+ ( delete
+ )
+ 'st'
+ ( st_ending hop 3 delete // needs an st_ending letter with at least 3 more letters before it
+ )
+ )
+ )
+ do ( // pass 3: ending must lie in R2
+ [substring] R2 among(
+ 'end' 'ung'
+ ( delete
+ try (['ig'] not 'e' R2 delete)
+ )
+ 'ig' 'ik' 'isch'
+ ( not 'e' delete
+ )
+ 'lich' 'heit'
+ ( delete
+ try (
+ ['er' or 'en'] R1 delete
+ )
+ )
+ 'keit'
+ ( delete
+ try (
+ [substring] R2 among(
+ 'lich' 'ig'
+ ( delete
+ )
+ )
+ )
+ )
+ )
+ )
+ )
+)
+
+define stem as ( // external entry point; each step is wrapped in 'do', so a failing step does not stop the rest
+ do prelude
+ do mark_regions
+ backwards
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..ce6026a86
--- /dev/null
+++ b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl
@@ -0,0 +1,145 @@
+
+/*
+ Extra rule for -nisse ending added 11 Dec 2009
+*/
+
+routines (
+ prelude postlude
+ mark_regions
+ R1 R2
+ standard_suffix
+)
+
+externals ( stem )
+
+integers ( p1 p2 x )
+
+groupings ( v s_ending st_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef o" hex 'F6'
+stringdef u" hex 'FC'
+stringdef ss hex 'DF'
+
+define v 'aeiouy{a"}{o"}{u"}' // vowels, including the three umlauted letters defined above
+
+define s_ending 'bdfghklmnrt' // letters tested before a final 's' in standard_suffix
+define st_ending s_ending - 'r' // s_ending without 'r'; tested before a final 'st'
+
+define prelude as ( // forward pass that normalises the word before regions are marked
+
+ test repeat goto ( // 'test' puts the cursor back once the loop has finished
+ v [('u'] v <- 'U') or // 'u' between two vowels becomes 'U'
+ ('y'] v <- 'Y') // 'y' between two vowels becomes 'Y'
+ )
+
+ repeat (
+ [substring] among(
+ '{ss}' (<- 'ss')
+ 'ae' (<- '{a"}') // digraph spellings are turned into the umlauted letter
+ 'oe' (<- '{o"}')
+ 'ue' (<- '{u"}')
+ 'qu' (hop 2) // skip 'qu' unchanged
+ '' (next) // any other character: step over it
+ )
+ )
+
+)
+
+define mark_regions as ( // computes p1 and p2, the positions tested by R1 and R2
+
+ $p1 = limit // defaults: both marks at the end of the word
+ $p2 = limit
+
+ test(hop 3 setmark x) // x = position three characters into the word
+
+ gopast v gopast non-v setmark p1 // p1 = just past the first non-vowel that follows a vowel
+ try($p1 < x $p1 = x) // at least 3
+ gopast v gopast non-v setmark p2 // p2 = same search repeated, starting from p1's position
+
+)
+
+define postlude as repeat ( // lowers the 'Y'/'U' markers again and replaces umlauts by plain vowels
+
+ [substring] among(
+ 'Y' (<- 'y')
+ 'U' (<- 'u')
+ '{a"}' (<- 'a')
+ '{o"}' (<- 'o')
+ '{u"}' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond p2
+
+ define standard_suffix as ( // three suffix-removal steps; each is wrapped in 'do'
+ do (
+ [substring] R1 among(
+ 'em' 'ern' 'er'
+ ( delete
+ )
+ 'e' 'en' 'es'
+ ( delete
+ try (['s'] 'nis' delete) // then drop a remaining 's' that is preceded by 'nis'
+ )
+ 's'
+ ( s_ending delete // 's' goes only when preceded by an s_ending letter
+ )
+ )
+ )
+ do (
+ [substring] R1 among(
+ 'en' 'er' 'est'
+ ( delete
+ )
+ 'st'
+ ( st_ending hop 3 delete // needs an st_ending letter plus three more characters before 'st'
+ )
+ )
+ )
+ do (
+ [substring] R2 among(
+ 'end' 'ung'
+ ( delete
+ try (['ig'] not 'e' R2 delete) // also remove a preceding 'ig' in R2 unless 'e' comes before it
+ )
+ 'ig' 'ik' 'isch'
+ ( not 'e' delete
+ )
+ 'lich' 'heit'
+ ( delete
+ try (
+ ['er' or 'en'] R1 delete
+ )
+ )
+ 'keit'
+ ( delete
+ try (
+ [substring] R2 among(
+ 'lich' 'ig'
+ ( delete
+ )
+ )
+ )
+ )
+ )
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+ do prelude
+ do mark_regions
+ backwards // applies to the command on the next line only
+ do standard_suffix
+ do postlude
+)
diff --git a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
new file mode 100644
index 000000000..d1a644931
--- /dev/null
+++ b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl
@@ -0,0 +1,241 @@
+/*
+Hungarian Stemmer
+Removes noun inflections
+*/
+
+routines (
+ mark_regions
+ R1
+ v_ending
+ case
+ case_special
+ case_other
+ plural
+ owned
+ sing_owner
+ plur_owner
+ instrum
+ factive
+ undouble
+ double
+)
+
+externals ( stem )
+
+integers ( p1 )
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in ISO Latin 2) */
+
+stringdef a' hex 'E1' //a-acute
+stringdef e' hex 'E9' //e-acute
+stringdef i' hex 'ED' //i-acute
+stringdef o' hex 'F3' //o-acute
+stringdef o" hex 'F6' //o-umlaut
+stringdef oq hex 'F5' //o-double acute
+stringdef u' hex 'FA' //u-acute
+stringdef u" hex 'FC' //u-umlaut
+stringdef uq hex 'FB' //u-double acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' // vowels: the five plain ones plus the nine accented letters defined above
+
+define mark_regions as ( // computes p1, the position tested by R1
+
+ $p1 = limit // default: mark at the end of the word
+
+ (v goto non-v // word starts with a vowel: move to the first non-vowel ...
+ among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next // ... and step over it (a listed digraph/trigraph counts as one unit)
+ setmark p1)
+ or
+
+ (non-v gopast v setmark p1) // word starts with a non-vowel: p1 is just past the first vowel
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond p1
+
+ define v_ending as ( // a final a-acute / e-acute in R1 becomes plain 'a' / 'e'
+ [substring] R1 among(
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define double as ( // non-consuming test for one of the listed doubled consonants
+ test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
+ 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
+ )
+
+ define undouble as ( // steps over one character, then deletes the single character before it
+ next [hop 1] delete
+ )
+
+ define instrum as( // 'al'/'el' in R1 after a doubled consonant: remove it, then undouble
+ [substring] R1 among(
+ 'al' (double)
+ 'el' (double)
+ )
+ delete
+ undouble
+ )
+
+
+ define case as ( // every listed ending shares one action: delete, then v_ending
+ [substring] R1 among(
+ 'ban' 'ben'
+ 'ba' 'be'
+ 'ra' 're'
+ 'nak' 'nek'
+ 'val' 'vel'
+ 't{o'}l' 't{oq}l'
+ 'r{o'}l' 'r{oq}l'
+ 'b{o'}l' 'b{oq}l'
+ 'hoz' 'hez' 'h{o"}z'
+ 'n{a'}l' 'n{e'}l'
+ 'ig'
+ 'at' 'et' 'ot' '{o"}t'
+ '{e'}rt'
+ 'k{e'}pp' 'k{e'}ppen'
+ 'kor'
+ 'ul' '{u"}l'
+ 'v{a'}' 'v{e'}'
+ 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
+ 'k{e'}nt'
+ 'en' 'on' 'an' '{o"}n'
+ 'n'
+ 't'
+ )
+ delete
+ v_ending
+ )
+
+ define case_special as( // endings that are replaced by a plain vowel instead of being deleted
+ [substring] R1 among(
+ '{e'}n' (<- 'e')
+ '{a'}n' (<- 'a')
+ '{a'}nk{e'}nt' (<- 'a')
+ )
+ )
+
+ define case_other as( // the -stul / -stül family of endings
+ [substring] R1 among(
+ 'astul' 'est{u"}l' (delete)
+ 'stul' 'st{u"}l' (delete)
+ '{a'}stul' (<- 'a')
+ '{e'}st{u"}l' (<- 'e')
+ )
+ )
+
+ define factive as( // final a-acute / e-acute in R1 after a doubled consonant: remove it, then undouble
+ [substring] R1 among(
+ '{a'}' (double)
+ '{e'}' (double)
+ )
+ delete
+ undouble
+ )
+
+ define plural as ( // endings in -k
+ [substring] R1 among(
+ '{a'}k' (<- 'a')
+ '{e'}k' (<- 'e')
+ '{o"}k' (delete)
+ 'ak' (delete)
+ 'ok' (delete)
+ 'ek' (delete)
+ 'k' (delete)
+ )
+ )
+
+ define owned as ( // endings built on e-acute (optionally followed by 'i')
+ [substring] R1 among (
+ 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
+ '{e'}k{e'}' (<- 'e')
+ '{a'}k{e'}' (<- 'a')
+ 'k{e'}' (delete)
+ '{e'}{e'}i' (<- 'e')
+ '{a'}{e'}i' (<- 'a')
+ '{e'}i' (delete)
+ '{e'}{e'}' (<- 'e')
+ '{e'}' (delete)
+ )
+ )
+
+ define sing_owner as ( // endings whose accented-vowel forms are shortened, all others deleted
+ [substring] R1 among(
+ '{u"}nk' 'unk' (delete)
+ '{a'}nk' (<- 'a')
+ '{e'}nk' (<- 'e')
+ 'nk' (delete)
+ '{a'}juk' (<- 'a')
+ '{e'}j{u"}k' (<- 'e')
+ 'juk' 'j{u"}k' (delete)
+ 'uk' '{u"}k' (delete)
+ 'em' 'om' 'am' (delete)
+ '{a'}m' (<- 'a')
+ '{e'}m' (<- 'e')
+ 'm' (delete)
+ 'od' 'ed' 'ad' '{o"}d' (delete)
+ '{a'}d' (<- 'a')
+ '{e'}d' (<- 'e')
+ 'd' (delete)
+ 'ja' 'je' (delete)
+ 'a' 'e' 'o' (delete)
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define plur_owner as ( // endings containing 'i' (-im, -id, -i, -ink, -itok/-itek, -ik)
+ [substring] R1 among(
+ 'jaim' 'jeim' (delete)
+ '{a'}im' (<- 'a')
+ '{e'}im' (<- 'e')
+ 'aim' 'eim' (delete)
+ 'im' (delete)
+ 'jaid' 'jeid' (delete)
+ '{a'}id' (<- 'a')
+ '{e'}id' (<- 'e')
+ 'aid' 'eid' (delete)
+ 'id' (delete)
+ 'jai' 'jei' (delete)
+ '{a'}i' (<- 'a')
+ '{e'}i' (<- 'e')
+ 'ai' 'ei' (delete)
+ 'i' (delete)
+ 'jaink' 'jeink' (delete)
+ 'eink' 'aink' (delete)
+ '{a'}ink' (<- 'a')
+ '{e'}ink' (<- 'e')
+ 'ink' // no action of its own here: it shares the (delete) on the next line
+ 'jaitok' 'jeitek' (delete)
+ 'aitok' 'eitek' (delete)
+ '{a'}itok' (<- 'a')
+ '{e'}itek' (<- 'e')
+ 'itek' (delete)
+ 'jeik' 'jaik' (delete)
+ 'aik' 'eik' (delete)
+ '{a'}ik' (<- 'a')
+ '{e'}ik' (<- 'e')
+ 'ik' (delete)
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+ do mark_regions
+ backwards ( // suffix steps run from the end of the word; each is tried regardless of the others
+ do instrum
+ do case
+ do case_special
+ do case_other
+ do factive
+ do owned
+ do sing_owner
+ do plur_owner
+ do plural
+ )
+)
diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl
new file mode 100644
index 000000000..c812e055c
--- /dev/null
+++ b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl
@@ -0,0 +1,241 @@
+/*
+Hungarian Stemmer
+Removes noun inflections
+*/
+
+routines (
+ mark_regions
+ R1
+ v_ending
+ case
+ case_special
+ case_other
+ plural
+ owned
+ sing_owner
+ plur_owner
+ instrum
+ factive
+ undouble
+ double
+)
+
+externals ( stem )
+
+integers ( p1 )
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in Unicode) */
+
+stringdef a' hex 'E1' //a-acute
+stringdef e' hex 'E9' //e-acute
+stringdef i' hex 'ED' //i-acute
+stringdef o' hex 'F3' //o-acute
+stringdef o" hex 'F6' //o-umlaut
+stringdef oq hex '151' //o-double acute
+stringdef u' hex 'FA' //u-acute
+stringdef u" hex 'FC' //u-umlaut
+stringdef uq hex '171' //u-double acute
+
+define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' // vowels: the five plain ones plus the nine accented letters defined above
+
+define mark_regions as ( // computes p1, the position tested by R1
+
+ $p1 = limit // default: mark at the end of the word
+
+ (v goto non-v // word starts with a vowel: move to the first non-vowel ...
+ among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next // ... and step over it (a listed digraph/trigraph counts as one unit)
+ setmark p1)
+ or
+
+ (non-v gopast v setmark p1) // word starts with a non-vowel: p1 is just past the first vowel
+)
+
+backwardmode (
+
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond p1
+
+ define v_ending as ( // a final a-acute / e-acute in R1 becomes plain 'a' / 'e'
+ [substring] R1 among(
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define double as ( // non-consuming test for one of the listed doubled consonants
+ test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
+ 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
+ )
+
+ define undouble as ( // steps over one character, then deletes the single character before it
+ next [hop 1] delete
+ )
+
+ define instrum as( // 'al'/'el' in R1 after a doubled consonant: remove it, then undouble
+ [substring] R1 among(
+ 'al' (double)
+ 'el' (double)
+ )
+ delete
+ undouble
+ )
+
+
+ define case as ( // every listed ending shares one action: delete, then v_ending
+ [substring] R1 among(
+ 'ban' 'ben'
+ 'ba' 'be'
+ 'ra' 're'
+ 'nak' 'nek'
+ 'val' 'vel'
+ 't{o'}l' 't{oq}l'
+ 'r{o'}l' 'r{oq}l'
+ 'b{o'}l' 'b{oq}l'
+ 'hoz' 'hez' 'h{o"}z'
+ 'n{a'}l' 'n{e'}l'
+ 'ig'
+ 'at' 'et' 'ot' '{o"}t'
+ '{e'}rt'
+ 'k{e'}pp' 'k{e'}ppen'
+ 'kor'
+ 'ul' '{u"}l'
+ 'v{a'}' 'v{e'}'
+ 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
+ 'k{e'}nt'
+ 'en' 'on' 'an' '{o"}n'
+ 'n'
+ 't'
+ )
+ delete
+ v_ending
+ )
+
+ define case_special as( // endings that are replaced by a plain vowel instead of being deleted
+ [substring] R1 among(
+ '{e'}n' (<- 'e')
+ '{a'}n' (<- 'a')
+ '{a'}nk{e'}nt' (<- 'a')
+ )
+ )
+
+ define case_other as( // the -stul / -stül family of endings
+ [substring] R1 among(
+ 'astul' 'est{u"}l' (delete)
+ 'stul' 'st{u"}l' (delete)
+ '{a'}stul' (<- 'a')
+ '{e'}st{u"}l' (<- 'e')
+ )
+ )
+
+ define factive as( // final a-acute / e-acute in R1 after a doubled consonant: remove it, then undouble
+ [substring] R1 among(
+ '{a'}' (double)
+ '{e'}' (double)
+ )
+ delete
+ undouble
+ )
+
+ define plural as ( // endings in -k
+ [substring] R1 among(
+ '{a'}k' (<- 'a')
+ '{e'}k' (<- 'e')
+ '{o"}k' (delete)
+ 'ak' (delete)
+ 'ok' (delete)
+ 'ek' (delete)
+ 'k' (delete)
+ )
+ )
+
+ define owned as ( // endings built on e-acute (optionally followed by 'i')
+ [substring] R1 among (
+ 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
+ '{e'}k{e'}' (<- 'e')
+ '{a'}k{e'}' (<- 'a')
+ 'k{e'}' (delete)
+ '{e'}{e'}i' (<- 'e')
+ '{a'}{e'}i' (<- 'a')
+ '{e'}i' (delete)
+ '{e'}{e'}' (<- 'e')
+ '{e'}' (delete)
+ )
+ )
+
+ define sing_owner as ( // endings whose accented-vowel forms are shortened, all others deleted
+ [substring] R1 among(
+ '{u"}nk' 'unk' (delete)
+ '{a'}nk' (<- 'a')
+ '{e'}nk' (<- 'e')
+ 'nk' (delete)
+ '{a'}juk' (<- 'a')
+ '{e'}j{u"}k' (<- 'e')
+ 'juk' 'j{u"}k' (delete)
+ 'uk' '{u"}k' (delete)
+ 'em' 'om' 'am' (delete)
+ '{a'}m' (<- 'a')
+ '{e'}m' (<- 'e')
+ 'm' (delete)
+ 'od' 'ed' 'ad' '{o"}d' (delete)
+ '{a'}d' (<- 'a')
+ '{e'}d' (<- 'e')
+ 'd' (delete)
+ 'ja' 'je' (delete)
+ 'a' 'e' 'o' (delete)
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ )
+ )
+
+ define plur_owner as ( // endings containing 'i' (-im, -id, -i, -ink, -itok/-itek, -ik)
+ [substring] R1 among(
+ 'jaim' 'jeim' (delete)
+ '{a'}im' (<- 'a')
+ '{e'}im' (<- 'e')
+ 'aim' 'eim' (delete)
+ 'im' (delete)
+ 'jaid' 'jeid' (delete)
+ '{a'}id' (<- 'a')
+ '{e'}id' (<- 'e')
+ 'aid' 'eid' (delete)
+ 'id' (delete)
+ 'jai' 'jei' (delete)
+ '{a'}i' (<- 'a')
+ '{e'}i' (<- 'e')
+ 'ai' 'ei' (delete)
+ 'i' (delete)
+ 'jaink' 'jeink' (delete)
+ 'eink' 'aink' (delete)
+ '{a'}ink' (<- 'a')
+ '{e'}ink' (<- 'e')
+ 'ink' // no action of its own here: it shares the (delete) on the next line
+ 'jaitok' 'jeitek' (delete)
+ 'aitok' 'eitek' (delete)
+ '{a'}itok' (<- 'a')
+ '{e'}itek' (<- 'e')
+ 'itek' (delete)
+ 'jeik' 'jaik' (delete)
+ 'aik' 'eik' (delete)
+ '{a'}ik' (<- 'a')
+ '{e'}ik' (<- 'e')
+ 'ik' (delete)
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+ do mark_regions
+ backwards ( // suffix steps run from the end of the word; each is tried regardless of the others
+ do instrum
+ do case
+ do case_special
+ do case_other
+ do factive
+ do owned
+ do sing_owner
+ do plur_owner
+ do plural
+ )
+)
diff --git a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..8d25cf64f
--- /dev/null
+++ b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl
@@ -0,0 +1,195 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v AEIO CG )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a' hex 'E1'
+stringdef a` hex 'E0'
+stringdef e' hex 'E9'
+stringdef e` hex 'E8'
+stringdef i' hex 'ED'
+stringdef i` hex 'EC'
+stringdef o' hex 'F3'
+stringdef o` hex 'F2'
+stringdef u' hex 'FA'
+stringdef u` hex 'F9'
+
+define v 'aeiou{a`}{e`}{i`}{o`}{u`}' // vowels: plain and grave-accented (acute forms are rewritten to grave in prelude)
+
+define prelude as ( // forward pass that normalises the word before regions are marked
+ test repeat ( // 'test' puts the cursor back once the loop has finished
+ [substring] among(
+ '{a'}' (<- '{a`}') // acute accents are replaced by grave accents
+ '{e'}' (<- '{e`}')
+ '{i'}' (<- '{i`}')
+ '{o'}' (<- '{o`}')
+ '{u'}' (<- '{u`}')
+ 'qu' (<- 'qU') // 'u' after 'q' is upper-cased
+ '' (next) // any other character: step over it
+ )
+ )
+ repeat goto (
+ v [ ('u' ] v <- 'U') or // 'u' between two vowels becomes 'U'
+ ('i' ] v <- 'I') // 'i' between two vowels becomes 'I'
+ )
+)
+
+define mark_regions as ( // computes pV, p1 and p2, the positions tested by RV, R1 and R2
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) ) // first letter is a vowel
+ or
+ ( non-v (non-v gopast v) or (v next) ) // first letter is a non-vowel
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1 = just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2 = same search repeated, starting from p1's position
+ )
+)
+
+define postlude as repeat ( // turns the 'I'/'U' markers back into lower case
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond p2
+
+ define attached_pronoun as ( // handles a pronoun that follows one of the verb endings listed in the second among
+ [substring] among(
+ 'ci' 'gli' 'la' 'le' 'li' 'lo'
+ 'mi' 'ne' 'si' 'ti' 'vi'
+ // the compound forms are:
+ 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
+ 'mela' 'mele' 'meli' 'melo' 'mene'
+ 'tela' 'tele' 'teli' 'telo' 'tene'
+ 'cela' 'cele' 'celi' 'celo' 'cene'
+ 'vela' 'vele' 'veli' 'velo' 'vene'
+ )
+ among( (RV) // (RV) is a condition placed in front of every action of this among
+ 'ando' 'endo' (delete)
+ 'ar' 'er' 'ir' (<- 'e')
+ )
+ )
+
+ define standard_suffix as ( // noun/adjective/adverb suffixes, each guarded by a region test
+ [substring] among(
+
+ 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
+ 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
+ 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
+ 'atrice' 'atrici'
+ 'ante' 'anti' // Note 1
+ ( R2 delete )
+ 'azione' 'azioni' 'atore' 'atori'
+ ( R2 delete
+ try ( ['ic'] R2 delete )
+ )
+ 'logia' 'logie'
+ ( R2 <- 'log' )
+ 'uzione' 'uzioni' 'usione' 'usioni'
+ ( R2 <- 'u' )
+ 'enza' 'enze'
+ ( R2 <- 'ente' )
+ 'amento' 'amenti' 'imento' 'imenti'
+ ( RV delete )
+ 'amente' (
+ R1 delete
+ try (
+ [substring] R2 delete among(
+ 'iv' ( ['at'] R2 delete )
+ 'os' 'ic' 'abil'
+ )
+ )
+ )
+ 'it{a`}' (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil' 'ic' 'iv' (R2 delete)
+ )
+ )
+ )
+ 'ivo' 'ivi' 'iva' 'ive' (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] R2 delete )
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // the search is limited to the part of the word from pV onwards
+ [substring] among(
+ 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
+ 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
+ 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
+ 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
+ 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
+ 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
+ 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
+ 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
+ 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
+ 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
+ 'ono' 'uta' 'ute' 'uti' 'uto'
+
+ 'ar' 'ir' // but 'er' is problematical
+ (delete)
+ )
+ )
+
+ define AEIO 'aeio{a`}{e`}{i`}{o`}' // final vowels removed by vowel_suffix
+ define CG 'cg' // letters that must precede a removable 'h'
+
+ define vowel_suffix as ( // final clean-up of a trailing vowel and of 'h' after c/g
+ try (
+ [AEIO] RV delete
+ ['i'] RV delete
+ )
+ try (
+ ['h'] CG RV delete
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+ do prelude
+ do mark_regions
+ backwards (
+ do attached_pronoun
+ do (standard_suffix or verb_suffix) // verb_suffix is tried only when standard_suffix fails
+ do vowel_suffix
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
+
diff --git a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..b43295c09
--- /dev/null
+++ b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,195 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v AEIO CG )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a' hex 'A0'
+stringdef a` hex '85'
+stringdef e' hex '82'
+stringdef e` hex '8A'
+stringdef i' hex 'A1'
+stringdef i` hex '8D'
+stringdef o' hex 'A2'
+stringdef o` hex '95'
+stringdef u' hex 'A3'
+stringdef u` hex '97'
+
+define v 'aeiou{a`}{e`}{i`}{o`}{u`}' // vowels: plain and grave-accented (acute forms are rewritten to grave in prelude)
+
+define prelude as ( // forward pass that normalises the word before regions are marked
+ test repeat ( // 'test' puts the cursor back once the loop has finished
+ [substring] among(
+ '{a'}' (<- '{a`}') // acute accents are replaced by grave accents
+ '{e'}' (<- '{e`}')
+ '{i'}' (<- '{i`}')
+ '{o'}' (<- '{o`}')
+ '{u'}' (<- '{u`}')
+ 'qu' (<- 'qU') // 'u' after 'q' is upper-cased
+ '' (next) // any other character: step over it
+ )
+ )
+ repeat goto (
+ v [ ('u' ] v <- 'U') or // 'u' between two vowels becomes 'U'
+ ('i' ] v <- 'I') // 'i' between two vowels becomes 'I'
+ )
+)
+
+define mark_regions as ( // computes pV, p1 and p2, the positions tested by RV, R1 and R2
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) ) // first letter is a vowel
+ or
+ ( non-v (non-v gopast v) or (v next) ) // first letter is a non-vowel
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1 = just past the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2 = same search repeated, starting from p1's position
+ )
+)
+
+define postlude as repeat ( // turns the 'I'/'U' markers back into lower case
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // succeeds when the cursor is at or beyond pV
+ define R1 as $p1 <= cursor // succeeds when the cursor is at or beyond p1
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or beyond p2
+
+ define attached_pronoun as ( // handles a pronoun that follows one of the verb endings listed in the second among
+ [substring] among(
+ 'ci' 'gli' 'la' 'le' 'li' 'lo'
+ 'mi' 'ne' 'si' 'ti' 'vi'
+ // the compound forms are:
+ 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
+ 'mela' 'mele' 'meli' 'melo' 'mene'
+ 'tela' 'tele' 'teli' 'telo' 'tene'
+ 'cela' 'cele' 'celi' 'celo' 'cene'
+ 'vela' 'vele' 'veli' 'velo' 'vene'
+ )
+ among( (RV) // (RV) is a condition placed in front of every action of this among
+ 'ando' 'endo' (delete)
+ 'ar' 'er' 'ir' (<- 'e')
+ )
+ )
+
+ define standard_suffix as ( // noun/adjective/adverb suffixes, each guarded by a region test
+ [substring] among(
+
+ 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
+ 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
+ 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
+ 'atrice' 'atrici'
+ 'ante' 'anti' // Note 1
+ ( R2 delete )
+ 'azione' 'azioni' 'atore' 'atori'
+ ( R2 delete
+ try ( ['ic'] R2 delete )
+ )
+ 'logia' 'logie'
+ ( R2 <- 'log' )
+ 'uzione' 'uzioni' 'usione' 'usioni'
+ ( R2 <- 'u' )
+ 'enza' 'enze'
+ ( R2 <- 'ente' )
+ 'amento' 'amenti' 'imento' 'imenti'
+ ( RV delete )
+ 'amente' (
+ R1 delete
+ try (
+ [substring] R2 delete among(
+ 'iv' ( ['at'] R2 delete )
+ 'os' 'ic' 'abil'
+ )
+ )
+ )
+ 'it{a`}' (
+ R2 delete
+ try (
+ [substring] among(
+ 'abil' 'ic' 'iv' (R2 delete)
+ )
+ )
+ )
+ 'ivo' 'ivi' 'iva' 'ive' (
+ R2 delete
+ try ( ['at'] R2 delete ['ic'] R2 delete )
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // the search is limited to the part of the word from pV onwards
+ [substring] among(
+ 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
+ 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
+ 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
+ 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
+ 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
+ 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
+ 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
+ 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
+ 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
+ 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
+ 'ono' 'uta' 'ute' 'uti' 'uto'
+
+ 'ar' 'ir' // but 'er' is problematical
+ (delete)
+ )
+ )
+
+ define AEIO 'aeio{a`}{e`}{i`}{o`}' // final vowels removed by vowel_suffix
+ define CG 'cg' // letters that must precede a removable 'h'
+
+ define vowel_suffix as ( // final clean-up of a trailing vowel and of 'h' after c/g
+ try (
+ [AEIO] RV delete
+ ['i'] RV delete
+ )
+ try (
+ ['h'] CG RV delete
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+ do prelude
+ do mark_regions
+ backwards (
+ do attached_pronoun
+ do (standard_suffix or verb_suffix) // verb_suffix is tried only when standard_suffix fails
+ do vowel_suffix
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
+
diff --git a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..cd79d1280
--- /dev/null
+++ b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl
@@ -0,0 +1,245 @@
+strings ( ch )
+integers ( x p1 p2 )
+booleans ( Y_found stemmed GE_removed )
+
+routines (
+
+ R1 R2
+ C V VX
+ lengthen_V
+ Step_1 Step_2 Step_3 Step_4 Step_7
+ Step_6 Step_1c
+ Lose_prefix
+ Lose_infix
+ measure
+)
+
+externals ( stem )
+
+groupings ( v v_WX AOU AIOU )
+
+stringescapes {}
+
+stringdef ' hex '27' // yuk
+
+define v 'aeiouy' // vowels
+define v_WX v + 'wx' // vowels plus 'w' and 'x'; used negated in lengthen_V
+define AOU 'aou' // vowels that lengthen_V may double
+define AIOU 'aiou' // vowels tested in the 'e' branch of lengthen_V
+
+backwardmode (
+
+ define R1 as (setmark x $x >= p1) // records the cursor in x, then succeeds when x is at or beyond p1
+ define R2 as (setmark x $x >= p2) // same test against p2
+
+ define V as test (v or 'ij') // non-consuming test: a vowel or 'ij' at the cursor
+ define VX as test (next v or 'ij') // same test after stepping over one character
+ define C as test (not 'ij' non-v) // non-consuming test: a non-vowel that is not part of 'ij'
+
+ define lengthen_V as do ( // doubles a vowel: the bracketed slice is copied to ch and inserted again
+ non-v_WX [ (AOU] test (non-v or atlimit)) or
+ ('e'] test (non-v or atlimit
+ not AIOU
+ not (next AIOU non-v)))
+ ->ch insert ch
+ )
+
+ define Step_1 as
+ (
+ [among ( (]) // the leading (]) closes the slice before each action runs
+
+ '{'}s' (delete)
+ 's' (R1 not ('t' R1) C delete)
+ 'ies' (R1 <-'ie')
+ 'es'
+ (('ar' R1 C ] delete lengthen_V) or
+ ('er' R1 C ] delete) or
+ (R1 C <-'e'))
+
+ 'aus' (R1 V <-'au')
+ 'en' (('hed' R1 ] <-'heid') or
+ ('nd' delete) or
+ ('d' R1 C ] delete) or
+ ('i' or 'j' V delete) or
+ (R1 C delete lengthen_V))
+ 'nde' (<-'nd')
+ )
+ )
+
+ define Step_2 as
+ (
+ [among ( (]) // the leading (]) closes the slice before each action runs
+ 'je' (('{'}t' ] delete) or
+ ('et' ] R1 C delete) or
+ ('rnt' ] <-'rn') or
+ ('t' ] R1 VX delete) or
+ ('ink' ] <-'ing') or
+ ('mp' ] <-'m') or
+ ('{'}' ] R1 delete) or
+ (] R1 C delete))
+ 'ge' (R1 <-'g')
+ 'lijke'(R1 <-'lijk')
+ 'ische'(R1 <-'isch')
+ 'de' (R1 C delete)
+ 'te' (R1 <-'t')
+ 'se' (R1 <-'s')
+ 're' (R1 <-'r')
+ 'le' (R1 delete attach 'l' lengthen_V)
+ 'ene' (R1 C delete attach 'en' lengthen_V)
+ 'ieve' (R1 C <-'ief')
+ )
+ )
+
+ define Step_3 as
+ (
+ [among ( (]) // entries without an action share the next action in the list
+ 'atie' (R1 <-'eer')
+ 'iteit' (R1 delete lengthen_V)
+ 'heid'
+ 'sel'
+ 'ster' (R1 delete)
+ 'rder' (<-'r')
+ 'ing'
+ 'isme'
+ 'erij' (R1 delete lengthen_V)
+ 'arij' (R1 C <-'aar')
+ 'fie' (R2 delete attach 'f' lengthen_V)
+ 'gie' (R2 delete attach 'g' lengthen_V)
+ 'tst' (R1 C <-'t')
+ 'dst' (R1 C <-'d')
+ )
+ )
+
+ define Step_4 as
+ ( // two suffix lists; the second is tried only when the first fails
+ ( [among ( (])
+ 'ioneel' (R1 <-'ie')
+ 'atief' (R1 <-'eer')
+ 'baar' (R1 delete)
+ 'naar' (R1 V <-'n')
+ 'laar' (R1 V <-'l')
+ 'raar' (R1 V <-'r')
+ 'tant' (R1 <-'teer')
+ 'lijker'
+ 'lijkst' (R1 <-'lijk')
+ 'achtig'
+ 'achtiger'
+ 'achtigst'(R1 delete)
+ 'eriger'
+ 'erigst'
+ 'erig'
+ 'end' (R1 C delete lengthen_V)
+ )
+ )
+ or
+ ( [among ( (])
+ 'iger'
+ 'igst'
+ 'ig' (R1 C delete lengthen_V)
+ )
+ )
+ )
+
+ define Step_7 as
+ ( // drops a final 't' after k, f or p
+ [among ( (])
+ 'kt' (<-'k')
+ 'ft' (<-'f')
+ 'pt' (<-'p')
+ )
+ )
+
+ define Step_6 as
+ ( // reduces a final doubled consonant to one letter; final 'v'/'z' become 'f'/'s'
+ [among ( (])
+ 'bb' (<-'b')
+ 'cc' (<-'c')
+ 'dd' (<-'d')
+ 'ff' (<-'f')
+ 'gg' (<-'g')
+ 'hh' (<-'h')
+ 'jj' (<-'j')
+ 'kk' (<-'k')
+ 'll' (<-'l')
+ 'mm' (<-'m')
+ 'nn' (<-'n')
+ 'pp' (<-'p')
+ 'qq' (<-'q')
+ 'rr' (<-'r')
+ 'ss' (<-'s')
+ 'tt' (<-'t')
+ 'vv' (<-'v')
+ 'ww' (<-'w')
+ 'xx' (<-'x')
+ 'zz' (<-'z')
+ 'v' (<-'f')
+ 'z' (<-'s')
+ )
+ )
+
+ define Step_1c as
+ ( // removes a final 'd' or 't' in R1 after a consonant, with the exceptions tested below
+ [among ( (] R1 C)
+ 'd' (not ('n' R1) delete)
+ 't' (not ('h' R1) delete)
+ )
+ )
+)
+
+define Lose_prefix as ( // removes a leading 'ge' and records that in GE_removed
+ ['ge'] test hop 3 (goto v goto non-v) // at least 3 characters must follow, and a vowel then a non-vowel must be found
+ set GE_removed
+ delete
+)
+
+define Lose_infix as ( // removes a 'ge' found after the first character and records that in GE_removed
+ next // skip the first character so a word-initial 'ge' is not matched here
+ gopast (['ge']) test hop 3 (goto v goto non-v) // same conditions on the remainder as in Lose_prefix
+ set GE_removed
+ delete
+)
+
+define measure as ( // computes p1 and p2, the positions tested by R1 and R2
+ do (
+ tolimit // defaults: both marks at the end of the word
+ setmark p1
+ setmark p2
+ )
+ do(
+ repeat non-v atleast 1 ('ij' or v) non-v setmark p1 // optional non-vowels, one or more vowels (or 'ij'), then one non-vowel
+ repeat non-v atleast 1 ('ij' or v) non-v setmark p2 // the same pattern again, continuing from p1
+ )
+
+)
+define stem as ( // external entry point declared in 'externals'
+
+ unset Y_found
+ unset stemmed
+ do ( ['y'] <-'Y' set Y_found ) // a word-initial 'y' is upper-cased
+ do repeat(goto (v ['y'])<-'Y' set Y_found ) // so is every 'y' that follows a vowel
+
+ measure
+
+ backwards (
+ do (Step_1 set stemmed ) // 'stemmed' records that at least one of these steps succeeded
+ do (Step_2 set stemmed )
+ do (Step_3 set stemmed )
+ do (Step_4 set stemmed )
+ )
+ unset GE_removed
+ do (Lose_prefix and measure) // after removing 'ge' the marks are recomputed
+ backwards (
+ do (GE_removed Step_1c) // Step_1c runs only when a 'ge' was removed
+ )
+ unset GE_removed
+ do (Lose_infix and measure)
+ backwards (
+ do (GE_removed Step_1c)
+ )
+ backwards (
+ do (Step_7 set stemmed )
+ do (stemmed or GE_removed Step_6) // Step_6 runs only when something was changed earlier
+ )
+ do(Y_found repeat(goto (['Y']) <-'y')) // restore lower-case 'y' if any was upper-cased above
+)
+
diff --git a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..3f69f1572
--- /dev/null
+++ b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl
@@ -0,0 +1,208 @@
+
+stringescapes {}
+
+routines (
+ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
+
+ endings
+
+ undouble respell
+)
+
+externals ( stem )
+
+backwardmode (
+
+ /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
+ a test for a two letter prefix ('test hop 2') is implicitly
+ assumed. Note that 'e' next 'u' corresponds to her u*e because
+ Snowball is scanning backwards. */
+
+ define A as ( hop 2 ) // conditions A-D only require a minimum number of remaining characters
+ define B as ( hop 3 )
+ define C as ( hop 4 )
+ define D as ( hop 5 )
+ define E as ( test hop 2 not 'e' ) // from here on: a length test followed by tests on the preceding letters
+ define F as ( test hop 3 not 'e' )
+ define G as ( test hop 3 'f' )
+ define H as ( test hop 2 't' or 'll' )
+ define I as ( test hop 2 not 'o' not 'e' )
+ define J as ( test hop 2 not 'a' not 'e' )
+ define K as ( test hop 3 'l' or 'i' or ('e' next 'u') )
+ define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )
+ define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )
+ define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) )
+ define O as ( test hop 2 'l' or 'i' )
+ define P as ( test hop 2 not 'c' )
+ define Q as ( test hop 2 test hop 3 not 'l' not 'n' )
+ define R as ( test hop 2 'n' or 'r' )
+ define S as ( test hop 2 'dr' or ('t' not 't') )
+ define T as ( test hop 2 's' or ('t' not 'o') )
+ define U as ( test hop 2 'l' or 'm' or 'n' or 'r' )
+ define V as ( test hop 2 'c' )
+ define W as ( test hop 2 not 's' not 'u' )
+ define X as ( test hop 2 'l' or 'i' or ('e' next 'u') )
+ define Y as ( test hop 2 'in' )
+ define Z as ( test hop 2 not 'f' )
+ define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
+ 'es' 't' ) )
+ define BB as ( test hop 3 not 'met' not 'ryst' )
+ define CC as ( test hop 2 'l' )
+
+
+ /* The system of endings, as given in Appendix A. */
+
+ define endings as ( // each ending is followed by the name of the condition routine attached to it
+ [substring] among(
+ 'alistically' B 'arizability' A 'izationally' B
+
+ 'antialness' A 'arisations' A 'arizations' A 'entialness' A
+
+ 'allically' C 'antaneous' A 'antiality' A 'arisation' A
+ 'arization' A 'ationally' B 'ativeness' A 'eableness' E
+ 'entations' A 'entiality' A 'entialize' A 'entiation' A
+ 'ionalness' A 'istically' A 'itousness' A 'izability' A
+ 'izational' A
+
+ 'ableness' A 'arizable' A 'entation' A 'entially' A
+ 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A
+ 'ionality' A 'ionalize' A 'iousness' A 'izations' A
+ 'lessness' A
+
+ 'ability' A 'aically' A 'alistic' B 'alities' A
+ 'ariness' E 'aristic' A 'arizing' A 'ateness' A
+ 'atingly' A 'ational' B 'atively' A 'ativism' A
+ 'elihood' E 'encible' A 'entally' A 'entials' A
+ 'entiate' A 'entness' A 'fulness' A 'ibility' A
+ 'icalism' A 'icalist' A 'icality' A 'icalize' A
+ 'ication' G 'icianry' A 'ination' A 'ingness' A
+ 'ionally' A 'isation' A 'ishness' A 'istical' A
+ 'iteness' A 'iveness' A 'ivistic' A 'ivities' A
+ 'ization' F 'izement' A 'oidally' A 'ousness' A
+
+ 'aceous' A 'acious' B 'action' G 'alness' A
+ 'ancial' A 'ancies' A 'ancing' B 'ariser' A
+ 'arized' A 'arizer' A 'atable' A 'ations' B
+ 'atives' A 'eature' Z 'efully' A 'encies' A
+ 'encing' A 'ential' A 'enting' C 'entist' A
+ 'eously' A 'ialist' A 'iality' A 'ialize' A
+ 'ically' A 'icance' A 'icians' A 'icists' A
+ 'ifully' A 'ionals' A 'ionate' D 'ioning' A
+ 'ionist' A 'iously' A 'istics' A 'izable' E
+ 'lessly' A 'nesses' A 'oidism' A
+
+ 'acies' A 'acity' A 'aging' B 'aical' A
+ 'alist' A 'alism' B 'ality' A 'alize' A
+ 'allic'BB 'anced' B 'ances' B 'antic' C
+ 'arial' A 'aries' A 'arily' A 'arity' B
+ 'arize' A 'aroid' A 'ately' A 'ating' I
+ 'ation' B 'ative' A 'ators' A 'atory' A
+ 'ature' E 'early' Y 'ehood' A 'eless' A
+ 'elity' A 'ement' A 'enced' A 'ences' A
+ 'eness' E 'ening' E 'ental' A 'ented' C
+ 'ently' A 'fully' A 'ially' A 'icant' A
+ 'ician' A 'icide' A 'icism' A 'icist' A
+ 'icity' A 'idine' I 'iedly' A 'ihood' A
+ 'inate' A 'iness' A 'ingly' B 'inism' J
+ 'inity'CC 'ional' A 'ioned' A 'ished' A
+ 'istic' A 'ities' A 'itous' A 'ively' A
+ 'ivity' A 'izers' F 'izing' F 'oidal' A
+ 'oides' A 'otide' A 'ously' A
+
+ 'able' A 'ably' A 'ages' B 'ally' B
+ 'ance' B 'ancy' B 'ants' B 'aric' A
+ 'arly' K 'ated' I 'ates' A 'atic' B
+ 'ator' A 'ealy' Y 'edly' E 'eful' A
+ 'eity' A 'ence' A 'ency' A 'ened' E
+ 'enly' E 'eous' A 'hood' A 'ials' A
+ 'ians' A 'ible' A 'ibly' A 'ical' A
+ 'ides' L 'iers' A 'iful' A 'ines' M
+ 'ings' N 'ions' B 'ious' A 'isms' B
+ 'ists' A 'itic' H 'ized' F 'izer' F
+ 'less' A 'lily' A 'ness' A 'ogen' A
+ 'ward' A 'wise' A 'ying' B 'yish' A
+
+ 'acy' A 'age' B 'aic' A 'als'BB
+ 'ant' B 'ars' O 'ary' F 'ata' A
+ 'ate' A 'eal' Y 'ear' Y 'ely' E
+ 'ene' E 'ent' C 'ery' E 'ese' A
+ 'ful' A 'ial' A 'ian' A 'ics' A
+ 'ide' L 'ied' A 'ier' A 'ies' P
+ 'ily' A 'ine' M 'ing' N 'ion' Q
+ 'ish' C 'ism' B 'ist' A 'ite'AA
+ 'ity' A 'ium' A 'ive' A 'ize' F
+ 'oid' A 'one' R 'ous' A
+
+ 'ae' A 'al'BB 'ar' X 'as' B
+ 'ed' E 'en' F 'es' E 'ia' A
+ 'ic' A 'is' A 'ly' B 'on' S
+ 'or' T 'um' U 'us' V 'yl' R
+ '{'}s' A 's{'}' A
+
+ 'a' A 'e' A 'i' A 'o' A
+ 's' W 'y' B
+
+ (delete)
+ )
+ )
+
+ /* Undoubling is rule 1 of appendix C. */
+
+ define undouble as ( // when the word ends in one of the listed double letters, delete the last letter
+ test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss'
+ 'tt')
+ [next] delete
+ )
+
+ /* The other appendix C rules can be done together. */
+
+ define respell as ( // rewrites the listed stem endings; some entries carry extra 'not' conditions
+ [substring] among (
+ 'iev' (<-'ief')
+ 'uct' (<-'uc')
+ 'umpt' (<-'um')
+ 'rpt' (<-'rb')
+ 'urs' (<-'ur')
+ 'istr' (<-'ister')
+ 'metr' (<-'meter')
+ 'olv' (<-'olut')
+ 'ul' (not 'a' not 'i' not 'o' <-'l')
+ 'bex' (<-'bic')
+ 'dex' (<-'dic')
+ 'pex' (<-'pic')
+ 'tex' (<-'tic')
+ 'ax' (<-'ac')
+ 'ex' (<-'ec')
+ 'ix' (<-'ic')
+ 'lux' (<-'luc')
+ 'uad' (<-'uas')
+ 'vad' (<-'vas')
+ 'cid' (<-'cis')
+ 'lid' (<-'lis')
+ 'erid' (<-'eris')
+ 'pand' (<-'pans')
+ 'end' (not 's' <-'ens')
+ 'ond' (<-'ons')
+ 'lud' (<-'lus')
+ 'rud' (<-'rus')
+ 'her' (not 'p' not 't' <-'hes')
+ 'mit' (<-'mis')
+ 'ent' (not 'm' <-'ens')
+ /* 'ent' was 'end' in the 1968 paper - a typo. */
+ 'ert' (<-'ers')
+ 'et' (not 'n' <-'es')
+ 'yt' (<-'ys')
+ 'yz' (<-'ys')
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+
+ backwards ( // all three steps work from the end of the word; each is tried regardless of the others
+ do endings
+ do undouble
+ do respell
+ )
+)
+
diff --git a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..94a071653
--- /dev/null
+++ b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl
@@ -0,0 +1,80 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef ae hex 'E6'
+stringdef ao hex 'E5'
+stringdef o/ hex 'F8'
+
+define v 'aeiouy{ae}{ao}{o/}' // vowels, including the three special letters defined above
+
+define s_ending 'bcdfghjlmnoprtvyz' // letters tested before a final 's' in main_suffix
+
+define mark_regions as ( // computes p1, the limit used by the suffix routines
+
+ $p1 = limit // default: mark at the end of the word
+
+ test ( hop 3 setmark x ) // x = position three characters into the word
+ goto v gopast non-v setmark p1 // p1 = just past the first non-vowel found from the first vowel on
+ try ( $p1 < x $p1 = x ) // p1 is moved up to x when it would be smaller
+)
+
+backwardmode (
+
+ define main_suffix as (
+ setlimit tomark p1 for ([substring]) // the suffix search is limited by p1
+ among(
+
+ 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
+ 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
+ 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
+ (delete)
+ 's'
+ (s_ending or ('k' non-v) delete) // 's' goes only after an s_ending letter, or after 'k' preceded by a non-vowel
+ 'erte' 'ert'
+ (<-'er')
+ )
+ )
+
+ define consonant_pair as ( // for a final 'dt' or 'vt' (searched within the p1 limit), delete the last letter
+ test (
+ setlimit tomark p1 for ([substring])
+ among(
+ 'dt' 'vt'
+ )
+ )
+ next] delete
+ )
+
+ define other_suffix as ( // removes one of the listed endings, searched within the p1 limit
+ setlimit tomark p1 for ([substring])
+ among(
+ 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
+ 'hetslov'
+ (delete)
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+
+ do mark_regions
+ backwards ( // the three suffix steps work from the end of the word; each is tried regardless of the others
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..f57483354
--- /dev/null
+++ b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,80 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef ae hex '91'
+stringdef ao hex '86'
+stringdef o/ hex '9B'
+
+define v 'aeiouy{ae}{ao}{o/}' // vowels, including the three special letters defined above
+
+define s_ending 'bcdfghjlmnoprtvyz' // letters tested before a final 's' in main_suffix
+
+define mark_regions as ( // computes p1, the limit used by the suffix routines
+
+ $p1 = limit // default: mark at the end of the word
+
+ test ( hop 3 setmark x ) // x = position three characters into the word
+ goto v gopast non-v setmark p1 // p1 = just past the first non-vowel found from the first vowel on
+ try ( $p1 < x $p1 = x ) // p1 is moved up to x when it would be smaller
+)
+
+backwardmode (
+
+ define main_suffix as (
+ setlimit tomark p1 for ([substring]) // the suffix search is limited by p1
+ among(
+
+ 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar'
+ 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens'
+ 'hetens' 'ers' 'ets' 'et' 'het' 'ast'
+ (delete)
+ 's'
+ (s_ending or ('k' non-v) delete) // 's' goes only after an s_ending letter, or after 'k' preceded by a non-vowel
+ 'erte' 'ert'
+ (<-'er')
+ )
+ )
+
+ define consonant_pair as ( // for a final 'dt' or 'vt' (searched within the p1 limit), delete the last letter
+ test (
+ setlimit tomark p1 for ([substring])
+ among(
+ 'dt' 'vt'
+ )
+ )
+ next] delete
+ )
+
+ define other_suffix as ( // removes one of the listed endings, searched within the p1 limit
+ setlimit tomark p1 for ([substring])
+ among(
+ 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov'
+ 'hetslov'
+ (delete)
+ )
+ )
+)
+
+define stem as ( // external entry point declared in 'externals'
+
+ do mark_regions
+ backwards ( // the three suffix steps work from the end of the word; each is tried regardless of the others
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..9533b7932
--- /dev/null
+++ b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl
@@ -0,0 +1,139 @@
+integers ( p1 p2 )
+booleans ( Y_found )
+
+routines (
+ shortv
+ R1 R2
+ Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
+)
+
+externals ( stem )
+
+groupings ( v v_WXY )
+
+define v 'aeiouy'
+define v_WXY v + 'wxY'
+
+backwardmode (
+
+ define shortv as ( non-v_WXY v non-v ) // reading backwards: a non-vowel other than w, x, Y; then a vowel; then a non-vowel
+ // region tests: succeed when the cursor is at or after the corresponding mark
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define Step_1a as ( // plural endings
+ [substring] among (
+ 'sses' (<-'ss')
+ 'ies' (<-'i')
+ 'ss' () // matched with an empty action, so a word ending in ss is left unchanged
+ 's' (delete)
+ )
+ )
+
+ define Step_1b as ( // endings eed, ed and ing
+ [substring] among (
+ 'eed' (R1 <-'ee') // only when the ending lies in R1
+ 'ed'
+ 'ing' (
+ test gopast v delete // deleted only if a vowel occurs earlier in the word
+ test substring among( // then adjust the end of what remains
+ 'at' 'bl' 'iz'
+ (<+ 'e') // insert e at the cursor
+ 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
+ // ignoring double c, h, j, k, q, v, w, and x
+ ([next] delete) // delete the single character before the cursor
+ '' (atmark p1 test shortv <+ 'e') // fallback: insert e if the cursor is at p1 and shortv matches
+ )
+ )
+ )
+ )
+
+ define Step_1c as ( // final y or Y becomes i, provided a vowel occurs earlier in the word
+ ['y' or 'Y']
+ gopast v
+ <-'i'
+ )
+
+ define Step_2 as ( // rewrites the longest listed ending, but only if that ending lies in R1
+ [substring] R1 among (
+ 'tional' (<-'tion')
+ 'enci' (<-'ence')
+ 'anci' (<-'ance')
+ 'abli' (<-'able')
+ 'entli' (<-'ent')
+ 'eli' (<-'e')
+ 'izer' 'ization'
+ (<-'ize')
+ 'ational' 'ation' 'ator'
+ (<-'ate')
+ 'alli' (<-'al')
+ 'alism' 'aliti'
+ (<-'al')
+ 'fulness' (<-'ful')
+ 'ousli' 'ousness'
+ (<-'ous')
+ 'iveness' 'iviti'
+ (<-'ive')
+ 'biliti' (<-'ble')
+ )
+ )
+
+ define Step_3 as ( // rewrites or deletes the longest listed ending, but only if it lies in R1
+ [substring] R1 among (
+ 'alize' (<-'al')
+ 'icate' 'iciti' 'ical'
+ (<-'ic')
+ 'ative' 'ful' 'ness'
+ (delete)
+ )
+ )
+
+ define Step_4 as ( // deletes the longest listed ending, but only if it lies in R2
+ [substring] R2 among (
+ 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
+ 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
+ (delete)
+ 'ion' ('s' or 't' delete) // ion is deleted only when preceded by s or t
+ )
+ )
+
+ define Step_5a as ( // deletes a final e
+ ['e']
+ R2 or (R1 not shortv) // allowed in R2, or in R1 when shortv does not match before the e
+ delete
+ )
+
+ define Step_5b as ( // deletes a final l
+ ['l']
+ R2 'l' // only when that l lies in R2 and is preceded by another l
+ delete
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+ // rewrite y as Y at the start of the word and directly after a vowel; Y is not in the vowel grouping v
+ unset Y_found
+ do ( ['y'] <-'Y' set Y_found)
+ do repeat(goto (v ['y']) <-'Y' set Y_found)
+ // p1 and p2 default to the end of the word
+ $p1 = limit
+ $p2 = limit
+ do(
+ gopast v gopast non-v setmark p1 // p1: just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+ // the steps run in order from the end of the word; 'do' keeps a failing step from aborting the rest
+ backwards (
+ do Step_1a
+ do Step_1b
+ do Step_1c
+ do Step_2
+ do Step_3
+ do Step_4
+ do Step_5a
+ do Step_5b
+ )
+ // if any Y was introduced above, turn every Y back into y
+ do(Y_found repeat(goto (['Y']) <-'y'))
+
+)
diff --git a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..3e7da08d4
--- /dev/null
+++ b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl
@@ -0,0 +1,218 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ verb_suffix
+ residual_suffix
+ residual_form
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a' hex 'E1' // a-acute
+stringdef a^ hex 'E2' // a-circumflex, e.g. 'bota^nico'
+stringdef e' hex 'E9' // e-acute
+stringdef e^ hex 'EA' // e-circumflex
+stringdef i' hex 'ED' // i-acute
+stringdef o^ hex 'F4' // o-circumflex
+stringdef o' hex 'F3' // o-acute
+stringdef u' hex 'FA' // u-acute
+stringdef c, hex 'E7' // c-cedilla
+
+stringdef a~ hex 'E3' // a-tilde
+stringdef o~ hex 'F5' // o-tilde
+
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
+
+define prelude as repeat ( // rewrites each a-tilde and o-tilde as the two-character forms a~ and o~
+ [substring] among(
+ '{a~}' (<- 'a~')
+ '{o~}' (<- 'o~')
+ '' (next) // any other character: step over it
+ ) //or next
+)
+
+define mark_regions as ( // computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+ // pV depends on the first two letters of the word
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel first: past the next vowel if a non-vowel follows, else past the next non-vowel
+ or
+ ( non-v (non-v gopast v) or (v next) ) // non-vowel first: past the next vowel if a non-vowel follows, else past the third letter
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1: just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // turns the two-character forms a~ and o~ back into a-tilde and o-tilde
+ [substring] among(
+ 'a~' (<- '{a~}')
+ 'o~' (<- '{o~}')
+ '' (next) // any other character: step over it
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // region tests: succeed when the cursor is at or after the mark
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as ( // acts on the longest matching ending; each action applies its own region test
+ [substring] among(
+
+ 'eza' 'ezas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ '{a'}vel'
+ '{i'}vel'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amento' 'amentos'
+ 'imento' 'imentos'
+
+ 'adora' 'ador' 'a{c,}a~o'
+ 'adoras' 'adores' 'a{c,}o~es' // no -ic test
+ 'ante' 'antes' '{a^}ncia' // Note 1
+ (
+ R2 delete
+ )
+ 'logia' // unaccented Portuguese spelling; the accented Spanish form used before never matches Portuguese
+ 'logias'
+ (
+ R2 <- 'log'
+ )
+ 'u{c,}a~o' 'u{c,}o~es' // Portuguese endings in the a~ / o~ form produced by prelude (was a Spanish pair)
+ (
+ R2 <- 'u'
+ )
+ '{e^}ncia' '{e^}ncias'
+ (
+ R2 <- 'ente'
+ )
+ 'amente'
+ (
+ R1 delete
+ try ( // optionally remove a further ending in R2
+ [substring] R2 delete among(
+ 'iv' (['at'] R2 delete)
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete
+ try ( // optionally remove a further ending in R2
+ [substring] among(
+ 'ante' // Note 1
+ 'avel'
+ '{i'}vel' (R2 delete)
+ )
+ )
+ )
+ 'idade'
+ 'idades'
+ (
+ R2 delete
+ try ( // optionally remove a further ending in R2
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ 'ira' 'iras'
+ (
+ RV 'e' // -eira -eiras usually non-verbal
+ <- 'ir'
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // deletes the longest listed verb ending found without going back past pV
+ [substring] among(
+ 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
+ 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
+ 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
+ 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
+ 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
+ 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
+ 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
+ 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
+ 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
+ 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
+ '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
+ '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
+ '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
+ 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
+ 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
+ '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
+
+ 'ira' 'iras'
+ (delete)
+ )
+ )
+
+ define residual_suffix as ( // deletes one of the listed final endings when it lies in RV
+ [substring] among(
+ 'os'
+ 'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete )
+ )
+ )
+
+ define residual_form as ( // final clean-up of the stem
+ [substring] among(
+ 'e' '{e'}' '{e^}'
+ ( RV delete [('u'] test 'g') or // delete the e in RV, then select a u preceded by g ...
+ ('i'] test 'c') RV delete ) // ... or an i preceded by c, and delete it too if in RV
+ '{c,}' (<-'c') // c-cedilla becomes plain c
+ )
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+ do prelude
+ do mark_regions
+ backwards (
+ do (
+ ( ( standard_suffix or verb_suffix ) // verb_suffix is tried only if standard_suffix fails
+ and do ( ['i'] test 'c' RV delete ) // after either succeeds: drop a final i preceded by c, if in RV
+ )
+ or residual_suffix // tried only when neither suffix routine succeeded
+ )
+ do residual_form
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..4d6c85214
--- /dev/null
+++ b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,218 @@
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ standard_suffix
+ verb_suffix
+ residual_suffix
+ residual_form
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a' hex 'A0' // a-acute
+stringdef a^ hex '83' // a-circumflex, e.g. 'bota^nico'
+stringdef e' hex '82' // e-acute
+stringdef e^ hex '88' // e-circumflex
+stringdef i' hex 'A1' // i-acute
+stringdef o^ hex '93' // o-circumflex
+stringdef o' hex 'A2' // o-acute
+stringdef u' hex 'A3' // u-acute
+stringdef c, hex '87' // c-cedilla
+
+stringdef a~ hex 'C6' // a-tilde
+stringdef o~ hex 'E4' // o-tilde
+
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
+
+define prelude as repeat ( // rewrites each a-tilde and o-tilde as the two-character forms a~ and o~
+ [substring] among(
+ '{a~}' (<- 'a~')
+ '{o~}' (<- 'o~')
+ '' (next) // any other character: step over it
+ ) //or next
+)
+
+define mark_regions as ( // computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+ // pV depends on the first two letters of the word
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel first: past the next vowel if a non-vowel follows, else past the next non-vowel
+ or
+ ( non-v (non-v gopast v) or (v next) ) // non-vowel first: past the next vowel if a non-vowel follows, else past the third letter
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1: just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // turns the two-character forms a~ and o~ back into a-tilde and o-tilde
+ [substring] among(
+ 'a~' (<- '{a~}')
+ 'o~' (<- '{o~}')
+ '' (next) // any other character: step over it
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // region tests: succeed when the cursor is at or after the mark
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define standard_suffix as ( // acts on the longest matching ending; each action applies its own region test
+ [substring] among(
+
+ 'eza' 'ezas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ '{a'}vel'
+ '{i'}vel'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amento' 'amentos'
+ 'imento' 'imentos'
+
+ 'adora' 'ador' 'a{c,}a~o'
+ 'adoras' 'adores' 'a{c,}o~es' // no -ic test
+ 'ante' 'antes' '{a^}ncia' // Note 1
+ (
+ R2 delete
+ )
+ 'logia' // unaccented Portuguese spelling; the accented Spanish form used before never matches Portuguese
+ 'logias'
+ (
+ R2 <- 'log'
+ )
+ 'u{c,}a~o' 'u{c,}o~es' // Portuguese endings in the a~ / o~ form produced by prelude (was a Spanish pair)
+ (
+ R2 <- 'u'
+ )
+ '{e^}ncia' '{e^}ncias'
+ (
+ R2 <- 'ente'
+ )
+ 'amente'
+ (
+ R1 delete
+ try ( // optionally remove a further ending in R2
+ [substring] R2 delete among(
+ 'iv' (['at'] R2 delete)
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete
+ try ( // optionally remove a further ending in R2
+ [substring] among(
+ 'ante' // Note 1
+ 'avel'
+ '{i'}vel' (R2 delete)
+ )
+ )
+ )
+ 'idade'
+ 'idades'
+ (
+ R2 delete
+ try ( // optionally remove a further ending in R2
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete)
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ 'ira' 'iras'
+ (
+ RV 'e' // -eira -eiras usually non-verbal
+ <- 'ir'
+ )
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // deletes the longest listed verb ending found without going back past pV
+ [substring] among(
+ 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
+ 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
+ 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
+ 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
+ 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
+ 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
+ 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
+ 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
+ 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
+ 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
+ '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
+ '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
+ '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
+ 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
+ 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
+ '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
+
+ 'ira' 'iras'
+ (delete)
+ )
+ )
+
+ define residual_suffix as ( // deletes one of the listed final endings when it lies in RV
+ [substring] among(
+ 'os'
+ 'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete )
+ )
+ )
+
+ define residual_form as ( // final clean-up of the stem
+ [substring] among(
+ 'e' '{e'}' '{e^}'
+ ( RV delete [('u'] test 'g') or // delete the e in RV, then select a u preceded by g ...
+ ('i'] test 'c') RV delete ) // ... or an i preceded by c, and delete it too if in RV
+ '{c,}' (<-'c') // c-cedilla becomes plain c
+ )
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+ do prelude
+ do mark_regions
+ backwards (
+ do (
+ ( ( standard_suffix or verb_suffix ) // verb_suffix is tried only if standard_suffix fails
+ and do ( ['i'] test 'c' RV delete ) // after either succeeds: drop a final i preceded by c, if in RV
+ )
+ or residual_suffix // tried only when neither suffix routine succeeded
+ )
+ do residual_form
+ )
+ do postlude
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl
new file mode 100644
index 000000000..48a148321
--- /dev/null
+++ b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl
@@ -0,0 +1,236 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ step_0
+ standard_suffix combo_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+booleans ( standard_suffix_removed )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a^ hex 'E2' // a circumflex
+stringdef i^ hex 'EE' // i circumflex
+stringdef a+ hex 'E3' // a breve
+stringdef s, hex 'BA' // s cedilla
+stringdef t, hex 'FE' // t cedilla
+
+define v 'aeiou{a^}{i^}{a+}'
+
+define prelude as ( // rewrites u and i as U and I when they stand between two vowels
+ repeat goto (
+ v [ ('u' ] v <- 'U') or
+ ('i' ] v <- 'I')
+ )
+)
+
+define mark_regions as ( // computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+ // pV depends on the first two letters of the word
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel first: past the next vowel if a non-vowel follows, else past the next non-vowel
+ or
+ ( non-v (non-v gopast v) or (v next) ) // non-vowel first: past the next vowel if a non-vowel follows, else past the third letter
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1: just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // turns every I and U back into i and u
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // region tests: succeed when the cursor is at or after the mark
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define step_0 as ( // rewrites or deletes the longest listed ending, but only if it lies in R1
+ [substring] R1 among(
+ 'ul' 'ului'
+ ( delete )
+ 'aua'
+ ( <-'a' )
+ 'ea' 'ele' 'elor'
+ ( <-'e' )
+ 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
+ ( <-'i')
+ 'ile'
+ ( not 'ab' <- 'i' ) // left alone when preceded by ab
+ 'atei'
+ ( <- 'at' )
+ 'a{t,}ie' 'a{t,}ia'
+ ( <- 'a{t,}i' )
+ )
+ )
+
+ define combo_suffix as test ( // shortens a compound ending lying in R1; 'test' restores the cursor afterwards
+ [substring] R1 (
+ among(
+ /* 'IST'. alternative: include the following
+ 'alism' 'alisme'
+ 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
+ <- 'al'
+ )
+ */
+ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
+ <- 'abil'
+ )
+ 'ibilitate' (
+ <- 'ibil'
+ )
+ 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
+ <- 'iv'
+ )
+ 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
+ 'icator' 'icatori'
+ 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
+ 'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
+ <- 'ic'
+ )
+ 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
+ 'atoare' 'ator' 'atori'
+ '{a+}toare' '{a+}tor' '{a+}tori' (
+ <- 'at'
+ )
+ 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
+ 'itoare' 'itor' 'itori' (
+ <- 'it'
+ )
+ )
+ set standard_suffix_removed // records that a replacement was made
+ )
+ )
+
+ define standard_suffix as ( // applies combo_suffix repeatedly, then handles one ending lying in R2
+ unset standard_suffix_removed
+ repeat combo_suffix
+ [substring] R2 (
+ among(
+
+ // past participle is treated here, rather than
+ // as a verb ending:
+ 'at' 'ata' 'at{a+}' 'ati' 'ate'
+ 'ut' 'uta' 'ut{a+}' 'uti' 'ute'
+ 'it' 'ita' 'it{a+}' 'iti' 'ite'
+
+ 'ic' 'ica' 'ice' 'ici' 'ic{a+}'
+ 'abil' 'abila' 'abile' 'abili' 'abil{a+}'
+ 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
+ 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
+ 'ant' 'anta' 'ante' 'anti' 'ant{a+}'
+ 'ator' 'atori'
+ 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
+ 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
+ delete
+ )
+ 'iune' 'iuni' (
+ '{t,}'] <- 't' // requires a preceding t-cedilla; it and the ending are replaced by t
+ )
+ 'ism' 'isme'
+ 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
+ <- 'ist'
+ /* 'IST'. alternative: remove with <- '' */
+ )
+ )
+ set standard_suffix_removed // records that an ending was handled
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // handles the longest listed verb ending found without going back past pV
+ [substring] among(
+ // 'long' infinitive:
+ 'are' 'ere' 'ire' '{a^}re'
+
+ // gerund:
+ 'ind' '{a^}nd'
+ 'indu' '{a^}ndu'
+
+ 'eze'
+ 'easc{a+}'
+ // present:
+ 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
+ 'e{s,}te'
+ '{a+}sc' '{a+}{s,}ti'
+ '{a+}{s,}te'
+
+ // imperfect:
+ 'am' 'ai' 'au'
+ 'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
+ 'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
+
+ // past: // (not 'ii')
+ 'ui'
+ 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
+ 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
+ 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
+ '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
+
+ // pluperfect:
+ 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
+ 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
+ '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
+ '{a^}ser{a+}'
+ 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
+ // the endings above are deleted only when preceded by a non-vowel or by u
+ ( non-v or 'u' delete )
+
+ // present:
+ '{a+}m' 'a{t,}i'
+ 'em' 'e{t,}i'
+ 'im' 'i{t,}i'
+ '{a^}m' '{a^}{t,}i'
+
+ // past:
+ 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
+ 'sei' 'se'
+
+ // pluperfect:
+ 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
+ (delete)
+ )
+ )
+
+ define vowel_suffix as ( // deletes one of the listed final vowel endings when it lies in RV
+ [substring] RV among (
+ 'a' 'e' 'i' 'ie' '{a+}' ( delete )
+ )
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+ do prelude
+ do mark_regions
+ backwards (
+ do step_0
+ do standard_suffix
+ do ( standard_suffix_removed or verb_suffix ) // verb_suffix runs only if no standard suffix was handled
+ do vowel_suffix
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl
new file mode 100644
index 000000000..09aec6429
--- /dev/null
+++ b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl
@@ -0,0 +1,236 @@
+
+routines (
+ prelude postlude mark_regions
+ RV R1 R2
+ step_0
+ standard_suffix combo_suffix
+ verb_suffix
+ vowel_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+booleans ( standard_suffix_removed )
+
+stringescapes {}
+
+/* special characters */
+
+stringdef a^ hex '0E2' // a circumflex
+stringdef i^ hex '0EE' // i circumflex
+stringdef a+ hex '103' // a breve
+stringdef s, hex '15F' // s cedilla
+stringdef t, hex '163' // t cedilla
+
+define v 'aeiou{a^}{i^}{a+}'
+
+define prelude as ( // rewrites u and i as U and I when they stand between two vowels
+ repeat goto (
+ v [ ('u' ] v <- 'U') or
+ ('i' ] v <- 'I')
+ )
+)
+
+define mark_regions as ( // computes the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
+
+ $pV = limit
+ $p1 = limit
+ $p2 = limit // defaults
+ // pV depends on the first two letters of the word
+ do (
+ ( v (non-v gopast v) or (v gopast non-v) ) // vowel first: past the next vowel if a non-vowel follows, else past the next non-vowel
+ or
+ ( non-v (non-v gopast v) or (v next) ) // non-vowel first: past the next vowel if a non-vowel follows, else past the third letter
+ setmark pV
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1: just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2: the same rule applied again, starting from p1
+ )
+)
+
+define postlude as repeat ( // turns every I and U back into i and u
+
+ [substring] among(
+ 'I' (<- 'i')
+ 'U' (<- 'u')
+ '' (next) // any other character: step over it
+ )
+
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // region tests: succeed when the cursor is at or after the mark
+ define R1 as $p1 <= cursor
+ define R2 as $p2 <= cursor
+
+ define step_0 as ( // rewrites or deletes the longest listed ending, but only if it lies in R1
+ [substring] R1 among(
+ 'ul' 'ului'
+ ( delete )
+ 'aua'
+ ( <-'a' )
+ 'ea' 'ele' 'elor'
+ ( <-'e' )
+ 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
+ ( <-'i')
+ 'ile'
+ ( not 'ab' <- 'i' ) // left alone when preceded by ab
+ 'atei'
+ ( <- 'at' )
+ 'a{t,}ie' 'a{t,}ia'
+ ( <- 'a{t,}i' )
+ )
+ )
+
+ define combo_suffix as test ( // shortens a compound ending lying in R1; 'test' restores the cursor afterwards
+ [substring] R1 (
+ among(
+ /* 'IST'. alternative: include the following
+ 'alism' 'alisme'
+ 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
+ <- 'al'
+ )
+ */
+ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
+ <- 'abil'
+ )
+ 'ibilitate' (
+ <- 'ibil'
+ )
+ 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
+ <- 'iv'
+ )
+ 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
+ 'icator' 'icatori'
+ 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
+ 'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
+ <- 'ic'
+ )
+ 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
+ 'atoare' 'ator' 'atori'
+ '{a+}toare' '{a+}tor' '{a+}tori' (
+ <- 'at'
+ )
+ 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
+ 'itoare' 'itor' 'itori' (
+ <- 'it'
+ )
+ )
+ set standard_suffix_removed // records that a replacement was made
+ )
+ )
+
+ define standard_suffix as ( // applies combo_suffix repeatedly, then handles one ending lying in R2
+ unset standard_suffix_removed
+ repeat combo_suffix
+ [substring] R2 (
+ among(
+
+ // past participle is treated here, rather than
+ // as a verb ending:
+ 'at' 'ata' 'at{a+}' 'ati' 'ate'
+ 'ut' 'uta' 'ut{a+}' 'uti' 'ute'
+ 'it' 'ita' 'it{a+}' 'iti' 'ite'
+
+ 'ic' 'ica' 'ice' 'ici' 'ic{a+}'
+ 'abil' 'abila' 'abile' 'abili' 'abil{a+}'
+ 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
+ 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
+ 'ant' 'anta' 'ante' 'anti' 'ant{a+}'
+ 'ator' 'atori'
+ 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
+ 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
+ delete
+ )
+ 'iune' 'iuni' (
+ '{t,}'] <- 't' // requires a preceding t-cedilla; it and the ending are replaced by t
+ )
+ 'ism' 'isme'
+ 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
+ <- 'ist'
+ /* 'IST'. alternative: remove with <- '' */
+ )
+ )
+ set standard_suffix_removed // records that an ending was handled
+ )
+ )
+
+ define verb_suffix as setlimit tomark pV for ( // handles the longest listed verb ending found without going back past pV
+ [substring] among(
+ // 'long' infinitive:
+ 'are' 'ere' 'ire' '{a^}re'
+
+ // gerund:
+ 'ind' '{a^}nd'
+ 'indu' '{a^}ndu'
+
+ 'eze'
+ 'easc{a+}'
+ // present:
+ 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
+ 'e{s,}te'
+ '{a+}sc' '{a+}{s,}ti'
+ '{a+}{s,}te'
+
+ // imperfect:
+ 'am' 'ai' 'au'
+ 'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
+ 'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
+
+ // past: // (not 'ii')
+ 'ui'
+ 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
+ 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
+ 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
+ '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
+
+ // pluperfect:
+ 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
+ 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
+ '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
+ '{a^}ser{a+}'
+ 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
+ // the endings above are deleted only when preceded by a non-vowel or by u
+ ( non-v or 'u' delete )
+
+ // present:
+ '{a+}m' 'a{t,}i'
+ 'em' 'e{t,}i'
+ 'im' 'i{t,}i'
+ '{a^}m' '{a^}{t,}i'
+
+ // past:
+ 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
+ 'sei' 'se'
+
+ // pluperfect:
+ 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
+ (delete)
+ )
+ )
+
+ define vowel_suffix as ( // deletes one of the listed final vowel endings when it lies in RV
+ [substring] RV among (
+ 'a' 'e' 'i' 'ie' '{a+}' ( delete )
+ )
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+ do prelude
+ do mark_regions
+ backwards (
+ do step_0
+ do standard_suffix
+ do ( standard_suffix_removed or verb_suffix ) // verb_suffix runs only if no standard suffix was handled
+ do vowel_suffix
+ )
+ do postlude
+)
+
diff --git a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl
new file mode 100644
index 000000000..cdacb19c4
--- /dev/null
+++ b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl
@@ -0,0 +1,217 @@
+stringescapes {}
+
+/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented
+ in Latin characters following the conventions of the standard Library
+ of Congress transliteration: */
+
+stringdef a hex 'C1'
+stringdef b hex 'C2'
+stringdef v hex 'D7'
+stringdef g hex 'C7'
+stringdef d hex 'C4'
+stringdef e hex 'C5'
+stringdef zh hex 'D6'
+stringdef z hex 'DA'
+stringdef i hex 'C9'
+stringdef i` hex 'CA'
+stringdef k hex 'CB'
+stringdef l hex 'CC'
+stringdef m hex 'CD'
+stringdef n hex 'CE'
+stringdef o hex 'CF'
+stringdef p hex 'D0'
+stringdef r hex 'D2'
+stringdef s hex 'D3'
+stringdef t hex 'D4'
+stringdef u hex 'D5'
+stringdef f hex 'C6'
+stringdef kh hex 'C8'
+stringdef ts hex 'C3'
+stringdef ch hex 'DE'
+stringdef sh hex 'DB'
+stringdef shch hex 'DD'
+stringdef " hex 'DF'
+stringdef y hex 'D9'
+stringdef ' hex 'D8'
+stringdef e` hex 'DC'
+stringdef iu hex 'C0'
+stringdef ia hex 'D1'
+
+routines ( mark_regions R2
+ perfective_gerund
+ adjective
+ adjectival
+ reflexive
+ verb
+ noun
+ derivational
+ tidy_up
+)
+
+externals ( stem )
+
+integers ( pV p2 )
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
+
+define mark_regions as ( // computes the marks pV and p2
+ // both default to the end of the word
+ $pV = limit
+ $p2 = limit
+ do (
+ gopast v setmark pV gopast non-v // pV: just after the first vowel; then move on past the next non-vowel
+ gopast v gopast non-v setmark p2 // p2: just after the next non-vowel that follows a vowel from there
+ )
+)
+
+backwardmode (
+
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define perfective_gerund as ( // removes the longest listed perfective gerund ending
+ [substring] among (
+ '{v}'
+ '{v}{sh}{i}'
+ '{v}{sh}{i}{s}{'}'
+ ('{a}' or '{ia}' delete) // this group is removed only when preceded by {a} or {ia}
+ '{i}{v}'
+ '{i}{v}{sh}{i}'
+ '{i}{v}{sh}{i}{s}{'}'
+ '{y}{v}'
+ '{y}{v}{sh}{i}'
+ '{y}{v}{sh}{i}{s}{'}'
+ (delete)
+ )
+ )
+
+ define adjective as ( // removes the longest listed adjectival ending
+ [substring] among (
+ '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
+ '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
+ '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
+ '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
+ '{ia}{ia}'
+ // and -
+ '{o}{iu}' // - which is somewhat archaic
+ '{e}{iu}' // - soft form of {o}{iu}
+ (delete)
+ )
+ )
+
+ define adjectival as ( // an adjective ending, optionally preceded by a participle ending
+ adjective
+
+ /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
+ nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
+ errors. Removing im, uem, enn creates too many errors.
+ */
+
+ try (
+ [substring] among (
+ '{e}{m}' // present passive participle
+ '{n}{n}' // adjective from past passive participle
+ '{v}{sh}' // past active participle
+ '{iu}{shch}' '{shch}' // present active participle
+ ('{a}' or '{ia}' delete) // this group is removed only when preceded by {a} or {ia}
+
+ //but not '{i}{m}' '{u}{e}{m}' // present passive participle
+ //or '{e}{n}{n}' // adjective from past passive participle
+
+ '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
+ '{u}{iu}{shch}' // present active participle
+ (delete)
+ )
+ )
+
+ )
+
+ define reflexive as ( // removes a final reflexive ending
+ [substring] among (
+ '{s}{ia}'
+ '{s}{'}'
+ (delete)
+ )
+ )
+
+ define verb as ( // removes the longest listed verb ending
+ [substring] among (
+ '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
+ '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
+ '{n}{y}' '{t}{'}' '{e}{sh}{'}'
+
+ '{n}{n}{o}'
+ ('{a}' or '{ia}' delete) // this group is removed only when preceded by {a} or {ia}
+
+ '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
+ '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
+ '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
+ '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
+ '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
+ '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
+ (delete)
+ /* note the short passive participle tests:
+ '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
+ */
+ )
+ )
+
+ define noun as ( // removes the longest listed noun ending
+ [substring] among (
+ '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
+ '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
+ '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
+ '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
+ '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
+ '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
+ (delete)
+ /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
+ omitted - they only occur on 12 words.
+ */
+ )
+ )
+
+ define derivational as ( // removes a listed derivational ending, but only if it lies in R2
+ [substring] R2 among (
+ '{o}{s}{t}'
+ '{o}{s}{t}{'}'
+ (delete)
+ )
+ )
+
+ define tidy_up as ( // final clean-up of the stem
+ [substring] among (
+
+ '{e}{i`}{sh}'
+ '{e}{i`}{sh}{e}' // superlative forms
+ (delete
+ ['{n}'] '{n}' delete // then, if the word now ends in a double {n}, delete the last one
+ )
+ '{n}'
+ ('{n}' delete) // e.g. -nno endings
+ '{'}'
+ (delete) // with some slight false conflations
+ )
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+
+ do mark_regions
+ backwards setlimit tomark pV for ( // work from the end of the word, never going back past pV
+ do (
+ perfective_gerund or // if no perfective gerund ending is removed ...
+ ( try reflexive // ... optionally remove a reflexive ending,
+ adjectival or verb or noun // then the first of these that succeeds
+ )
+ )
+ try([ '{i}' ] delete)
+ // because noun ending -i{iu} is being treated as verb ending -{iu}
+
+ do derivational
+ do tidy_up
+ )
+)
diff --git a/contrib/snowball/algorithms/russian/stem_Unicode.sbl b/contrib/snowball/algorithms/russian/stem_Unicode.sbl
new file mode 100644
index 000000000..9e1a93f93
--- /dev/null
+++ b/contrib/snowball/algorithms/russian/stem_Unicode.sbl
@@ -0,0 +1,215 @@
+stringescapes {}
+
+/* the 32 Cyrillic letters in Unicode */
+
+stringdef a hex '430'
+stringdef b hex '431'
+stringdef v hex '432'
+stringdef g hex '433'
+stringdef d hex '434'
+stringdef e hex '435'
+stringdef zh hex '436'
+stringdef z hex '437'
+stringdef i hex '438'
+stringdef i` hex '439'
+stringdef k hex '43A'
+stringdef l hex '43B'
+stringdef m hex '43C'
+stringdef n hex '43D'
+stringdef o hex '43E'
+stringdef p hex '43F'
+stringdef r hex '440'
+stringdef s hex '441'
+stringdef t hex '442'
+stringdef u hex '443'
+stringdef f hex '444'
+stringdef kh hex '445'
+stringdef ts hex '446'
+stringdef ch hex '447'
+stringdef sh hex '448'
+stringdef shch hex '449'
+stringdef " hex '44A'
+stringdef y hex '44B'
+stringdef ' hex '44C'
+stringdef e` hex '44D'
+stringdef iu hex '44E'
+stringdef ia hex '44F'
+
+routines ( mark_regions R2
+ perfective_gerund
+ adjective
+ adjectival
+ reflexive
+ verb
+ noun
+ derivational
+ tidy_up
+)
+
+externals ( stem )
+
+integers ( pV p2 )
+
+groupings ( v )
+
+define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
+
+define mark_regions as ( // computes the marks pV and p2
+ // both default to the end of the word
+ $pV = limit
+ $p2 = limit
+ do (
+ gopast v setmark pV gopast non-v // pV: just after the first vowel; then move on past the next non-vowel
+ gopast v gopast non-v setmark p2 // p2: just after the next non-vowel that follows a vowel from there
+ )
+)
+
+backwardmode (
+
+ define R2 as $p2 <= cursor // succeeds when the cursor is at or after p2
+
+ define perfective_gerund as ( // removes the longest listed perfective gerund ending
+ [substring] among (
+ '{v}'
+ '{v}{sh}{i}'
+ '{v}{sh}{i}{s}{'}'
+ ('{a}' or '{ia}' delete) // this group is removed only when preceded by {a} or {ia}
+ '{i}{v}'
+ '{i}{v}{sh}{i}'
+ '{i}{v}{sh}{i}{s}{'}'
+ '{y}{v}'
+ '{y}{v}{sh}{i}'
+ '{y}{v}{sh}{i}{s}{'}'
+ (delete)
+ )
+ )
+
+ define adjective as ( // removes the longest listed adjectival ending
+ [substring] among (
+ '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
+ '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
+ '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
+ '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
+ '{ia}{ia}'
+ // and -
+ '{o}{iu}' // - which is somewhat archaic
+ '{e}{iu}' // - soft form of {o}{iu}
+ (delete)
+ )
+ )
+
+ define adjectival as ( // an adjective ending, optionally preceded by a participle ending
+ adjective
+
+ /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
+ nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
+ errors. Removing im, uem, enn creates too many errors.
+ */
+
+ try (
+ [substring] among (
+ '{e}{m}' // present passive participle
+ '{n}{n}' // adjective from past passive participle
+ '{v}{sh}' // past active participle
+ '{iu}{shch}' '{shch}' // present active participle
+ ('{a}' or '{ia}' delete) // this group is removed only when preceded by {a} or {ia}
+
+ //but not '{i}{m}' '{u}{e}{m}' // present passive participle
+ //or '{e}{n}{n}' // adjective from past passive participle
+
+ '{i}{v}{sh}' '{y}{v}{sh}'// past active participle
+ '{u}{iu}{shch}' // present active participle
+ (delete)
+ )
+ )
+
+ )
+
+ define reflexive as ( // removes a final reflexive ending
+ [substring] among (
+ '{s}{ia}'
+ '{s}{'}'
+ (delete)
+ )
+ )
+
+ define verb as ( // removes the longest listed verb ending
+ [substring] among (
+ '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
+ '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
+ '{n}{y}' '{t}{'}' '{e}{sh}{'}'
+
+ '{n}{n}{o}'
+ ('{a}' or '{ia}' delete) // this group is removed only when preceded by {a} or {ia}
+
+ '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
+ '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
+ '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
+ '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
+ '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
+ '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
+ (delete)
+ /* note the short passive participle tests:
+ '{n}{a}' '{n}' '{n}{o}' '{n}{y}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
+ */
+ )
+ )
+
+ define noun as ( // removes the longest listed noun ending
+ [substring] among (
+ '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
+ '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
+ '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
+ '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
+ '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
+ '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
+ (delete)
+ /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
+ '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
+ omitted - they only occur on 12 words.
+ */
+ )
+ )
+
+ define derivational as ( // removes a listed derivational ending, but only if it lies in R2
+ [substring] R2 among (
+ '{o}{s}{t}'
+ '{o}{s}{t}{'}'
+ (delete)
+ )
+ )
+
+ define tidy_up as ( // final clean-up of the stem
+ [substring] among (
+
+ '{e}{i`}{sh}'
+ '{e}{i`}{sh}{e}' // superlative forms
+ (delete
+ ['{n}'] '{n}' delete // then, if the word now ends in a double {n}, delete the last one
+ )
+ '{n}'
+ ('{n}' delete) // e.g. -nno endings
+ '{'}'
+ (delete) // with some slight false conflations
+ )
+ )
+)
+
+define stem as ( // the routine declared in externals: stems the current word in place
+
+ do mark_regions
+ backwards setlimit tomark pV for ( // work from the end of the word, never going back past pV
+ do (
+ perfective_gerund or // if no perfective gerund ending is removed ...
+ ( try reflexive // ... optionally remove a reflexive ending,
+ adjectival or verb or noun // then the first of these that succeeds
+ )
+ )
+ try([ '{i}' ] delete)
+ // because noun ending -i{iu} is being treated as verb ending -{iu}
+
+ do derivational
+ do tidy_up
+ )
+)
diff --git a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..9dee289cc
--- /dev/null
+++ b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl
@@ -0,0 +1,230 @@
+routines (
+ postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ y_verb_suffix
+ verb_suffix
+ residual_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a' hex 'E1' // a-acute
+stringdef e' hex 'E9' // e-acute
+stringdef i' hex 'ED' // i-acute
+stringdef o' hex 'F3' // o-acute
+stringdef u' hex 'FA' // u-acute
+stringdef u" hex 'FC' // u-diaeresis
+stringdef n~ hex 'F1' // n-tilde
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' // v: the vowel grouping - a e i o u, their acute-accented forms, and u-diaeresis
+
+define mark_regions as ( // mark_regions: set the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
+
+ $pV = limit // all three marks start at the end of the word, so a region stays empty unless it is set below
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV; wrapped in do, so p1/p2 are still computed when no pV position is found
+ ( v (non-v gopast v) or (v gopast non-v) ) // starts with a vowel: 2nd letter consonant -> go past the next vowel; 2nd letter vowel -> go past the next consonant
+ or
+ ( non-v (non-v gopast v) or (v next) ) // starts with a consonant: 2nd letter consonant -> go past the next vowel; 2nd letter vowel -> step over one more character
+ setmark pV // pV = the position reached by whichever case applied
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1 = position just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2 = the same pattern applied again, continuing from p1
+ )
+)
+
+define postlude as repeat ( // postlude: walk forward through the word, replacing each acute-accented vowel with the plain vowel
+ [substring] among( // the character at the cursor becomes the slice when it is one of the entries below
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ '{i'}' (<- 'i')
+ '{o'}' (<- 'o')
+ '{u'}' (<- 'u')
+ // and possibly {u"}->u here, or in prelude
+ '' (next) // the empty string always matches: any other character is stepped over; repeat stops when next fails
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // RV: succeeds when the cursor is at or after mark pV
+ define R1 as $p1 <= cursor // R1: succeeds when the cursor is at or after mark p1
+ define R2 as $p2 <= cursor // R2: succeeds when the cursor is at or after mark p2
+
+ define attached_pronoun as ( // attached_pronoun: remove a pronoun attached to the end of a gerund or infinitive
+ [substring] among( // step 1: the longest pronoun below that ends the word becomes the slice
+ 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
+ 'las' 'les' 'los' 'nos'
+ )
+ substring RV among( // step 2: the pronoun must follow one of these verb endings, and the RV test must succeed
+ 'i{e'}ndo' (] <- 'iendo') // accented ending: ] extends the slice over the ending, so pronoun + ending is replaced by the unaccented ending
+ '{a'}ndo' (] <- 'ando')
+ '{a'}r' (] <- 'ar')
+ '{e'}r' (] <- 'er')
+ '{i'}r' (] <- 'ir')
+ 'ando'
+ 'iendo'
+ 'ar' 'er' 'ir'
+ (delete) // unaccented ending: the slice is still just the pronoun, so only the pronoun is deleted
+ 'yendo' ('u' delete) // 'yendo' must additionally be preceded by 'u' before the pronoun is deleted
+ )
+ )
+
+ define standard_suffix as ( // standard_suffix: remove or rewrite one standard suffix; every action is guarded by an R1 or R2 test
+ [substring] among( // the longest listed suffix that ends the word becomes the slice
+
+ 'anza' 'anzas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ 'able' 'ables'
+ 'ible' 'ibles'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amiento' 'amientos'
+ 'imiento' 'imientos'
+ (
+ R2 delete // delete when the R2 test succeeds
+ )
+ 'adora' 'ador' 'aci{o'}n'
+ 'adoras' 'adores' 'aciones'
+ 'ante' 'antes' 'ancia' 'ancias'// Note 1
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try ( ['ic'] R2 delete ) // ... then, if what is left ends in 'ic' and R2 still succeeds, delete that too
+ )
+ 'log{i'}a'
+ 'log{i'}as'
+ (
+ R2 <- 'log' // replace the suffix with 'log' when the R2 test succeeds
+ )
+ 'uci{o'}n' 'uciones'
+ (
+ R2 <- 'u' // replace the suffix with 'u' when the R2 test succeeds
+ )
+ 'encia' 'encias'
+ (
+ R2 <- 'ente' // replace the suffix with 'ente' when the R2 test succeeds
+ )
+ 'amente'
+ (
+ R1 delete // this suffix is guarded by R1 rather than R2
+ try ( // optional follow-up on what is left
+ [substring] R2 delete among( // a preceding 'iv', 'os', 'ic' or 'ad' is deleted when R2 succeeds
+ 'iv' (['at'] R2 delete) // after 'iv', a preceding 'at' is deleted too when R2 succeeds
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try ( // ... then optionally remove one more suffix
+ [substring] among(
+ 'ante' // Note 1
+ 'able'
+ 'ible' (R2 delete) // shared action for 'ante', 'able' and 'ible'
+ )
+ )
+ )
+ 'idad'
+ 'idades'
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try ( // ... then optionally remove one more suffix
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete) // shared action for 'abil', 'ic' and 'iv'
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ )
+ )
+
+ define y_verb_suffix as ( // y_verb_suffix: remove a verb ending that begins with 'y' when it follows 'u'
+ setlimit tomark pV for ([substring]) among( // the ending is searched for with the limit at pV; the action below runs outside that limit
+ 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
+ 'yas' 'yes' 'yais' 'yamos'
+ ('u' delete) // the ending is deleted only when the letter before it is 'u'
+ )
+ )
+
+ define verb_suffix as ( // verb_suffix: remove one of the other verb endings
+ setlimit tomark pV for ([substring]) among( // the ending is searched for with the limit at pV; the actions run outside that limit
+
+ 'en' 'es' '{e'}is' 'emos'
+ (try ('u' test 'g') ] delete) // if the ending follows 'gu', ] extends the slice over the 'u' so it is deleted together with the ending
+
+ 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
+ 'ar{e'}'
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+ 'er{e'}'
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+ 'ir{e'}'
+
+ 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
+ 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+ 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+ (delete) // shared action for every ending after the first group: plain deletion
+ )
+ )
+
+ define residual_suffix as ( // residual_suffix: remove a leftover final vowel (or 'os') when the RV test succeeds
+ [substring] among(
+ 'os'
+ 'a' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete ) // shared action for 'os' and the vowels on the line above
+ 'e' '{e'}'
+ ( RV delete try( ['u'] test 'g' RV delete ) ) // after deleting, a final 'u' preceded by 'g' is deleted too when RV succeeds
+ )
+ )
+)
+
+define stem as ( // stem: the external entry point; every step is wrapped in do, so a failing step does not stop the rest
+ do mark_regions // set pV, p1 and p2 first
+ backwards ( // the suffix steps work from the end of the word
+ do attached_pronoun
+ do ( standard_suffix or // only the first of the three suffix routines that succeeds is applied
+ y_verb_suffix or
+ verb_suffix
+ )
+ do residual_suffix
+ )
+ do postlude // finally, in forward mode, strip the acute accents
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..db7a46201
--- /dev/null
+++ b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,230 @@
+routines (
+ postlude mark_regions
+ RV R1 R2
+ attached_pronoun
+ standard_suffix
+ y_verb_suffix
+ verb_suffix
+ residual_suffix
+)
+
+externals ( stem )
+
+integers ( pV p1 p2 )
+
+groupings ( v )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a' hex 'A0' // a-acute
+stringdef e' hex '82' // e-acute
+stringdef i' hex 'A1' // i-acute
+stringdef o' hex 'A2' // o-acute
+stringdef u' hex 'A3' // u-acute
+stringdef u" hex '81' // u-diaeresis
+stringdef n~ hex 'A4' // n-tilde
+
+define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' // v: the vowel grouping - a e i o u, their acute-accented forms, and u-diaeresis
+
+define mark_regions as ( // mark_regions: set the marks pV, p1 and p2 that the RV, R1 and R2 tests compare against
+
+ $pV = limit // all three marks start at the end of the word, so a region stays empty unless it is set below
+ $p1 = limit
+ $p2 = limit // defaults
+
+ do ( // pV; wrapped in do, so p1/p2 are still computed when no pV position is found
+ ( v (non-v gopast v) or (v gopast non-v) ) // starts with a vowel: 2nd letter consonant -> go past the next vowel; 2nd letter vowel -> go past the next consonant
+ or
+ ( non-v (non-v gopast v) or (v next) ) // starts with a consonant: 2nd letter consonant -> go past the next vowel; 2nd letter vowel -> step over one more character
+ setmark pV // pV = the position reached by whichever case applied
+ )
+ do (
+ gopast v gopast non-v setmark p1 // p1 = position just after the first non-vowel that follows a vowel
+ gopast v gopast non-v setmark p2 // p2 = the same pattern applied again, continuing from p1
+ )
+)
+
+define postlude as repeat ( // postlude: walk forward through the word, replacing each acute-accented vowel with the plain vowel
+ [substring] among( // the character at the cursor becomes the slice when it is one of the entries below
+ '{a'}' (<- 'a')
+ '{e'}' (<- 'e')
+ '{i'}' (<- 'i')
+ '{o'}' (<- 'o')
+ '{u'}' (<- 'u')
+ // and possibly {u"}->u here, or in prelude
+ '' (next) // the empty string always matches: any other character is stepped over; repeat stops when next fails
+ ) //or next
+)
+
+backwardmode (
+
+ define RV as $pV <= cursor // RV: succeeds when the cursor is at or after mark pV
+ define R1 as $p1 <= cursor // R1: succeeds when the cursor is at or after mark p1
+ define R2 as $p2 <= cursor // R2: succeeds when the cursor is at or after mark p2
+
+ define attached_pronoun as ( // attached_pronoun: remove a pronoun attached to the end of a gerund or infinitive
+ [substring] among( // step 1: the longest pronoun below that ends the word becomes the slice
+ 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo'
+ 'las' 'les' 'los' 'nos'
+ )
+ substring RV among( // step 2: the pronoun must follow one of these verb endings, and the RV test must succeed
+ 'i{e'}ndo' (] <- 'iendo') // accented ending: ] extends the slice over the ending, so pronoun + ending is replaced by the unaccented ending
+ '{a'}ndo' (] <- 'ando')
+ '{a'}r' (] <- 'ar')
+ '{e'}r' (] <- 'er')
+ '{i'}r' (] <- 'ir')
+ 'ando'
+ 'iendo'
+ 'ar' 'er' 'ir'
+ (delete) // unaccented ending: the slice is still just the pronoun, so only the pronoun is deleted
+ 'yendo' ('u' delete) // 'yendo' must additionally be preceded by 'u' before the pronoun is deleted
+ )
+ )
+
+ define standard_suffix as ( // standard_suffix: remove or rewrite one standard suffix; every action is guarded by an R1 or R2 test
+ [substring] among( // the longest listed suffix that ends the word becomes the slice
+
+ 'anza' 'anzas'
+ 'ico' 'ica' 'icos' 'icas'
+ 'ismo' 'ismos'
+ 'able' 'ables'
+ 'ible' 'ibles'
+ 'ista' 'istas'
+ 'oso' 'osa' 'osos' 'osas'
+ 'amiento' 'amientos'
+ 'imiento' 'imientos'
+ (
+ R2 delete // delete when the R2 test succeeds
+ )
+ 'adora' 'ador' 'aci{o'}n'
+ 'adoras' 'adores' 'aciones'
+ 'ante' 'antes' 'ancia' 'ancias'// Note 1
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try ( ['ic'] R2 delete ) // ... then, if what is left ends in 'ic' and R2 still succeeds, delete that too
+ )
+ 'log{i'}a'
+ 'log{i'}as'
+ (
+ R2 <- 'log' // replace the suffix with 'log' when the R2 test succeeds
+ )
+ 'uci{o'}n' 'uciones'
+ (
+ R2 <- 'u' // replace the suffix with 'u' when the R2 test succeeds
+ )
+ 'encia' 'encias'
+ (
+ R2 <- 'ente' // replace the suffix with 'ente' when the R2 test succeeds
+ )
+ 'amente'
+ (
+ R1 delete // this suffix is guarded by R1 rather than R2
+ try ( // optional follow-up on what is left
+ [substring] R2 delete among( // a preceding 'iv', 'os', 'ic' or 'ad' is deleted when R2 succeeds
+ 'iv' (['at'] R2 delete) // after 'iv', a preceding 'at' is deleted too when R2 succeeds
+ 'os'
+ 'ic'
+ 'ad'
+ )
+ )
+ )
+ 'mente'
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try ( // ... then optionally remove one more suffix
+ [substring] among(
+ 'ante' // Note 1
+ 'able'
+ 'ible' (R2 delete) // shared action for 'ante', 'able' and 'ible'
+ )
+ )
+ )
+ 'idad'
+ 'idades'
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try ( // ... then optionally remove one more suffix
+ [substring] among(
+ 'abil'
+ 'ic'
+ 'iv' (R2 delete) // shared action for 'abil', 'ic' and 'iv'
+ )
+ )
+ )
+ 'iva' 'ivo'
+ 'ivas' 'ivos'
+ (
+ R2 delete // delete when the R2 test succeeds ...
+ try (
+ ['at'] R2 delete // but not a further ['ic'] R2 delete
+ )
+ )
+ )
+ )
+
+ define y_verb_suffix as ( // y_verb_suffix: remove a verb ending that begins with 'y' when it follows 'u'
+ setlimit tomark pV for ([substring]) among( // the ending is searched for with the limit at pV; the action below runs outside that limit
+ 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}'
+ 'yas' 'yes' 'yais' 'yamos'
+ ('u' delete) // the ending is deleted only when the letter before it is 'u'
+ )
+ )
+
+ define verb_suffix as ( // verb_suffix: remove one of the other verb endings
+ setlimit tomark pV for ([substring]) among( // the ending is searched for with the limit at pV; the actions run outside that limit
+
+ 'en' 'es' '{e'}is' 'emos'
+ (try ('u' test 'g') ] delete) // if the ending follows 'gu', ] extends the slice over the 'u' so it is deleted together with the ending
+
+ 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais'
+ 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}'
+ 'ar{e'}'
+ 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais'
+ 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}'
+ 'er{e'}'
+ 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais'
+ 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}'
+ 'ir{e'}'
+
+ 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed'
+ 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an'
+ 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado'
+ 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as'
+ 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases'
+ 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais'
+ 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados'
+ 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos'
+ '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos'
+ (delete) // shared action for every ending after the first group: plain deletion
+ )
+ )
+
+ define residual_suffix as ( // residual_suffix: remove a leftover final vowel (or 'os') when the RV test succeeds
+ [substring] among(
+ 'os'
+ 'a' 'o' '{a'}' '{i'}' '{o'}'
+ ( RV delete ) // shared action for 'os' and the vowels on the line above
+ 'e' '{e'}'
+ ( RV delete try( ['u'] test 'g' RV delete ) ) // after deleting, a final 'u' preceded by 'g' is deleted too when RV succeeds
+ )
+ )
+)
+
+define stem as ( // stem: the external entry point; every step is wrapped in do, so a failing step does not stop the rest
+ do mark_regions // set pV, p1 and p2 first
+ backwards ( // the suffix steps work from the end of the word
+ do attached_pronoun
+ do ( standard_suffix or // only the first of the three suffix routines that succeeds is applied
+ y_verb_suffix or
+ verb_suffix
+ )
+ do residual_suffix
+ )
+ do postlude // finally, in forward mode, strip the acute accents
+)
+
+/*
+ Note 1: additions of 15 Jun 2005
+*/
diff --git a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl
new file mode 100644
index 000000000..03ce1e22f
--- /dev/null
+++ b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl
@@ -0,0 +1,72 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in ISO Latin I) */
+
+stringdef a" hex 'E4'
+stringdef ao hex 'E5'
+stringdef o" hex 'F6'
+
+define v 'aeiouy{a"}{ao}{o"}' // v: the vowel grouping - a e i o u y plus the three special characters defined above
+
+define s_ending 'bcdfghjklmnoprtvy' // s_ending: letters that must precede a final 's' for main_suffix to delete it
+
+define mark_regions as ( // mark_regions: set mark p1, the limit used by all three suffix routines
+
+ $p1 = limit // default: p1 at the end of the word
+ test ( hop 3 setmark x ) // x = the position three characters in; test puts the cursor back afterwards
+ goto v gopast non-v setmark p1 // p1 = position just after the first non-vowel that follows a vowel
+ try ( $p1 < x $p1 = x ) // never let p1 fall before x
+)
+
+backwardmode (
+
+ define main_suffix as ( // main_suffix: remove one of the main endings
+ setlimit tomark p1 for ([substring]) // the ending is searched for with the limit at p1
+ among(
+
+ 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
+ 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
+ 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
+ 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
+ (delete) // shared action for every ending listed above
+ 's'
+ (s_ending delete) // a final 's' is deleted only when the letter before it is in s_ending
+ )
+ )
+
+ define consonant_pair as setlimit tomark p1 for ( // consonant_pair: if the word ends in one of the pairs below, drop its last letter
+ among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') // the pair must be found with the limit at p1
+ and ([next] delete) // 'and' restarts from the same cursor position: slice the final character and delete it
+ )
+
+ define other_suffix as setlimit tomark p1 for ( // other_suffix: remove or shorten one further ending, searched for with the limit at p1
+ [substring] among(
+ 'lig' 'ig' 'els' (delete) // these three endings are removed
+ 'l{o"}st' (<-'l{o"}s') // the final 't' is dropped
+ 'fullt' (<-'full') // the final 't' is dropped
+ )
+ )
+)
+
+define stem as ( // stem: the external entry point; each step is wrapped in do, so a failing step does not stop the rest
+
+ do mark_regions // set p1 first
+ backwards ( // the three suffix steps work from the end of the word, in this order
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
new file mode 100644
index 000000000..1631f401a
--- /dev/null
+++ b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl
@@ -0,0 +1,72 @@
+routines (
+ mark_regions
+ main_suffix
+ consonant_pair
+ other_suffix
+)
+
+externals ( stem )
+
+integers ( p1 x )
+
+groupings ( v s_ending )
+
+stringescapes {}
+
+/* special characters (in MS-DOS Latin I) */
+
+stringdef a" hex '84'
+stringdef ao hex '86'
+stringdef o" hex '94'
+
+define v 'aeiouy{a"}{ao}{o"}' // v: the vowel grouping - a e i o u y plus the three special characters defined above
+
+define s_ending 'bcdfghjklmnoprtvy' // s_ending: letters that must precede a final 's' for main_suffix to delete it
+
+define mark_regions as ( // mark_regions: set mark p1, the limit used by all three suffix routines
+
+ $p1 = limit // default: p1 at the end of the word
+ test ( hop 3 setmark x ) // x = the position three characters in; test puts the cursor back afterwards
+ goto v gopast non-v setmark p1 // p1 = position just after the first non-vowel that follows a vowel
+ try ( $p1 < x $p1 = x ) // never let p1 fall before x
+)
+
+backwardmode (
+
+ define main_suffix as ( // main_suffix: remove one of the main endings
+ setlimit tomark p1 for ([substring]) // the ending is searched for with the limit at p1
+ among(
+
+ 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne'
+ 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter'
+ 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens'
+ 'hetens' 'erns' 'at' 'andet' 'het' 'ast'
+ (delete) // shared action for every ending listed above
+ 's'
+ (s_ending delete) // a final 's' is deleted only when the letter before it is in s_ending
+ )
+ )
+
+ define consonant_pair as setlimit tomark p1 for ( // consonant_pair: if the word ends in one of the pairs below, drop its last letter
+ among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') // the pair must be found with the limit at p1
+ and ([next] delete) // 'and' restarts from the same cursor position: slice the final character and delete it
+ )
+
+ define other_suffix as setlimit tomark p1 for ( // other_suffix: remove or shorten one further ending, searched for with the limit at p1
+ [substring] among(
+ 'lig' 'ig' 'els' (delete) // these three endings are removed
+ 'l{o"}st' (<-'l{o"}s') // the final 't' is dropped
+ 'fullt' (<-'full') // the final 't' is dropped
+ )
+ )
+)
+
+define stem as ( // stem: the external entry point; each step is wrapped in do, so a failing step does not stop the rest
+
+ do mark_regions // set p1 first
+ backwards ( // the three suffix steps work from the end of the word, in this order
+ do main_suffix
+ do consonant_pair
+ do other_suffix
+ )
+)
diff --git a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl
new file mode 100644
index 000000000..16c02a51f
--- /dev/null
+++ b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl
@@ -0,0 +1,477 @@
+/* Stemmer for Turkish
+ * author: Evren (Kapusuz) Çilden
+ * email: evren.kapusuz at gmail.com
+ * version: 1.0 (15.01.2007)
+
+
+ * stems nominal verb suffixes
+ * stems nominal inflections
+ * more than one syllable word check
+ * (y,n,s,U) context check
+ * vowel harmony check
+ * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)
+
+ * The stemming algorithm is based on the paper "An Affix Stripping
+ * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
+ * Eşref Adalı (Proceedings of the IASTED International Conference
+ * on Artificial Intelligence and Applications, February 16-18, 2004,
+ * Innsbruck, Austria).
+
+ * Turkish is an agglutinative language and has a very rich morphological
+ * structure. In Turkish, you can form many different words from a single stem
+ * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means
+ * "You had been the doctor of him". The stem of the word is "doktor" and it
+ * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
+ * the append order of suffixes can be clearly described as FSMs.
+ * The paper referenced above defines some FSMs for right to left
+ * morphological analysis. I generated a method for constructing snowball
+ * expressions from right to left FSMs for stemming suffixes.
+*/
+
+routines (
+ append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
+ check_vowel_harmony // tests vowel harmony for suffixes
+ is_reserved_word // tests whether current string is a reserved word ('ad','soyad')
+ mark_cAsInA // nominal verb suffix
+ mark_DA // noun suffix
+ mark_DAn // noun suffix
+ mark_DUr // nominal verb suffix
+ mark_ki // noun suffix
+ mark_lAr // noun suffix, nominal verb suffix
+ mark_lArI // noun suffix
+ mark_nA // noun suffix
+ mark_ncA // noun suffix
+ mark_ndA // noun suffix
+ mark_ndAn // noun suffix
+ mark_nU // noun suffix
+ mark_nUn // noun suffix
+ mark_nUz // nominal verb suffix
+ mark_sU // noun suffix
+ mark_sUn // nominal verb suffix
+ mark_sUnUz // nominal verb suffix
+ mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz,
+ mark_yA // noun suffix
+ mark_ylA // noun suffix
+ mark_yU // noun suffix
+ mark_yUm // nominal verb suffix
+ mark_yUz // nominal verb suffix
+ mark_yDU // nominal verb suffix
+ mark_yken // nominal verb suffix
+ mark_ymUs_ // nominal verb suffix
+ mark_ysA // nominal verb suffix
+
+ mark_suffix_with_optional_y_consonant
+ mark_suffix_with_optional_U_vowel
+ mark_suffix_with_optional_n_consonant
+ mark_suffix_with_optional_s_consonant
+
+ more_than_one_syllable_word
+
+ post_process_last_consonants
+ postlude
+
+ stem_nominal_verb_suffixes
+ stem_noun_suffixes
+ stem_suffix_chain_before_ki
+)
+
+/* Special characters in Unicode Latin-1 and Latin Extended-A */
+stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA
+stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE
+stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT
+stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS
+stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA
+stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS
+
+stringescapes { }
+
+integers ( strlen ) // length of a string
+
+booleans ( continue_stemming_noun_suffixes )
+
+groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)
+
+define vowel 'ae{i'}io{o"}u{u"}' // vowel: the eight vowels a, e, dotless i, i, o, o-diaeresis, u, u-diaeresis
+define U '{i'}iu{u"}' // U: dotless i, i, u, u-diaeresis - the vowels matched by routines with a capital U in their name (e.g. mark_sU, mark_yU)
+
+// the vowel grouping definitions below are used by check_vowel_harmony
+define vowel1 'a{i'}ou' // vowels that may precede a suffix whose vowel is 'a'
+define vowel2 'ei{o"}{u"}' // vowels that may precede a suffix whose vowel is 'e'
+define vowel3 'a{i'}' // vowels that may precede a suffix whose vowel is dotless i
+define vowel4 'ei' // vowels that may precede a suffix whose vowel is 'i'
+define vowel5 'ou' // vowels that may precede a suffix whose vowel is 'o' or 'u'
+define vowel6 '{o"}{u"}' // vowels that may precede a suffix whose vowel is o-diaeresis or u-diaeresis
+
+externals ( stem )
+
+backwardmode (
+ // checks vowel harmony for possible suffixes,
+ // helps to detect whether the candidate for suffix applies to vowel harmony
+ // this rule is added to prevent over stemming
+ define check_vowel_harmony as ( // check_vowel_harmony: succeed only if the last vowel before the cursor has a compatible vowel earlier in the word
+ test // test restores the cursor afterwards, so this routine never moves it
+ (
+ (goto vowel) // if there is a vowel
+ (
+ ('a' goto vowel1) or // consume that vowel, then scan backwards for a member of the paired grouping
+ ('e' goto vowel2) or // (goto skips every character that is not in the grouping)
+ ('{i'}' goto vowel3) or
+ ('i' goto vowel4) or
+ ('o' goto vowel5) or
+ ('{o"}' goto vowel6) or
+ ('u' goto vowel5) or
+ ('{u"}' goto vowel6)
+ )
+ )
+ )
+
+ // if the last consonant before suffix is vowel and n then advance and delete
+ // if the last consonant before suffix is non vowel and n do nothing
+ // if the last consonant before suffix is not n then only delete the suffix
+ // assumption: slice beginning is set correctly
+ define mark_suffix_with_optional_n_consonant as ( // context check for suffixes that may be preceded by a buffer 'n'
+ ((test 'n') next (test vowel)) // preceded by 'n': step over the 'n' and require a vowel before it
+ or
+ ((not(test 'n')) test(next (test vowel))) // not preceded by 'n': leave the cursor where it is, but require a vowel two characters back
+
+ )
+
+ // if the last consonant before suffix is vowel and s then advance and delete
+ // if the last consonant before suffix is non vowel and s do nothing
+ // if the last consonant before suffix is not s then only delete the suffix
+ // assumption: slice beginning is set correctly
+ define mark_suffix_with_optional_s_consonant as ( // context check for suffixes that may be preceded by a buffer 's'
+ ((test 's') next (test vowel)) // preceded by 's': step over the 's' and require a vowel before it
+ or
+ ((not(test 's')) test(next (test vowel))) // not preceded by 's': leave the cursor where it is, but require a vowel two characters back
+ )
+
+ // if the last consonant before suffix is vowel and y then advance and delete
+ // if the last consonant before suffix is non vowel and y do nothing
+ // if the last consonant before suffix is not y then only delete the suffix
+ // assumption: slice beginning is set correctly
+ define mark_suffix_with_optional_y_consonant as ( // context check for suffixes that may be preceded by a buffer 'y'
+ ((test 'y') next (test vowel)) // preceded by 'y': step over the 'y' and require a vowel before it
+ or
+ ((not(test 'y')) test(next (test vowel))) // not preceded by 'y': leave the cursor where it is, but require a vowel two characters back
+ )
+
+ define mark_suffix_with_optional_U_vowel as ( // context check for suffixes that may be preceded by a buffer vowel from U
+ ((test U) next (test non-vowel)) // preceded by a U vowel: step over it and require a non-vowel before it
+ or
+ ((not(test U)) test(next (test non-vowel))) // not preceded by a U vowel: leave the cursor where it is, but require a non-vowel two characters back
+
+ )
+
+ define mark_possessives as ( // mark_possessives: match a possessive ending; no vowel-harmony check is made here
+ among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
+ 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
+ (mark_suffix_with_optional_U_vowel) // then apply the optional-U-vowel context check
+ )
+
+ define mark_sU as ( // mark_sU: match a single U vowel that may be preceded by a buffer 's'
+ check_vowel_harmony
+ U // any one vowel from the U grouping
+ (mark_suffix_with_optional_s_consonant)
+ )
+
+ define mark_lArI as ( // mark_lArI: match 'leri' or its dotless-i counterpart; no vowel-harmony check is made here
+ among ('leri' 'lar{i'}')
+ )
+
+ define mark_yU as ( // mark_yU: match a single U vowel that may be preceded by a buffer 'y'
+ check_vowel_harmony
+ U // any one vowel from the U grouping
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_nU as ( // mark_nU: after the vowel-harmony check, match 'n' followed by a U vowel
+ check_vowel_harmony
+ among ('n{i'}' 'ni' 'nu' 'n{u"}')
+ )
+
+ define mark_nUn as ( // mark_nUn: match a U vowel followed by 'n', optionally preceded by a buffer 'n'
+ check_vowel_harmony
+ among ('{i'}n' 'in' 'un' '{u"}n')
+ (mark_suffix_with_optional_n_consonant)
+ )
+
+ define mark_yA as ( // mark_yA: match 'a' or 'e', optionally preceded by a buffer 'y'
+ check_vowel_harmony
+ among('a' 'e')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_nA as ( // mark_nA: after the vowel-harmony check, match 'na' or 'ne'
+ check_vowel_harmony
+ among('na' 'ne')
+ )
+
+ define mark_DA as ( // mark_DA: after the vowel-harmony check, match 'da', 'de', 'ta' or 'te'
+ check_vowel_harmony
+ among('da' 'de' 'ta' 'te')
+ )
+
+ define mark_ndA as ( // mark_ndA: after the vowel-harmony check, match 'nda' or 'nde'
+ check_vowel_harmony
+ among('nda' 'nde')
+ )
+
+ define mark_DAn as ( // mark_DAn: after the vowel-harmony check, match 'dan', 'den', 'tan' or 'ten'
+ check_vowel_harmony
+ among('dan' 'den' 'tan' 'ten')
+ )
+
+ define mark_ndAn as ( // mark_ndAn: after the vowel-harmony check, match 'ndan' or 'nden'
+ check_vowel_harmony
+ among('ndan' 'nden')
+ )
+
+ define mark_ylA as ( // mark_ylA: match 'la' or 'le', optionally preceded by a buffer 'y'
+ check_vowel_harmony
+ among('la' 'le')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_ki as ( // mark_ki: match the literal ending 'ki'; no vowel-harmony check is made here
+ 'ki'
+ )
+
+ define mark_ncA as ( // mark_ncA: match 'ca' or 'ce', optionally preceded by a buffer 'n'
+ check_vowel_harmony
+ among('ca' 'ce')
+ (mark_suffix_with_optional_n_consonant)
+ )
+
+ define mark_yUm as ( // mark_yUm: match a U vowel followed by 'm', optionally preceded by a buffer 'y'
+ check_vowel_harmony
+ among ('{i'}m' 'im' 'um' '{u"}m')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_sUn as ( // mark_sUn: after the vowel-harmony check, match 's' + U vowel + 'n'
+ check_vowel_harmony
+ among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
+ )
+
+ define mark_yUz as ( // mark_yUz: match a U vowel followed by 'z', optionally preceded by a buffer 'y'
+ check_vowel_harmony
+ among ('{i'}z' 'iz' 'uz' '{u"}z')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_sUnUz as ( // mark_sUnUz: match one of the four listed endings; no vowel-harmony check is made here
+ among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
+ )
+
+ define mark_lAr as ( // mark_lAr: after the vowel-harmony check, match 'ler' or 'lar'
+ check_vowel_harmony
+ among ('ler' 'lar')
+ )
+
+ define mark_nUz as ( // mark_nUz: after the vowel-harmony check, match 'n' + U vowel + 'z'
+ check_vowel_harmony
+ among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
+ )
+
+ define mark_DUr as ( // mark_DUr: after the vowel-harmony check, match 't' or 'd' + U vowel + 'r'
+ check_vowel_harmony
+ among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
+ )
+
+ define mark_cAsInA as ( // mark_cAsInA: match one of the two listed endings; no vowel-harmony check is made here
+ among ('cas{i'}na' 'cesine')
+ )
+
+ define mark_yDU as ( // mark_yDU: match 't' or 'd' + U vowel, alone or followed by 'm', 'n' or 'k'; optionally preceded by a buffer 'y'
+ check_vowel_harmony
+ among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
+ 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
+ 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
+ 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ // does not fully obey vowel harmony
+ define mark_ysA as ( // mark_ysA: match 'sa' or 'se', alone or followed by 'm', 'n' or 'k'; check_vowel_harmony is deliberately not called
+ among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_ymUs_ as ( // mark_ymUs_: match 'm' + U vowel + s-cedilla, optionally preceded by a buffer 'y'
+ check_vowel_harmony
+ among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
+ (mark_suffix_with_optional_y_consonant)
+ )
+
+ define mark_yken as ( // mark_yken: match 'ken', optionally preceded by a buffer 'y'; no vowel-harmony check is made here
+ 'ken' (mark_suffix_with_optional_y_consonant)
+ )
+
+ define stem_nominal_verb_suffixes as ( // stem_nominal_verb_suffixes: recognise and delete a chain of nominal verb suffixes at the end of the word
+ [ // open the slice at the current position (the end of the word)
+ set continue_stemming_noun_suffixes // default: noun-suffix stemming may follow; only the mark_lAr branch below clears this
+ (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) // alternative 1: a single suffix
+ or
+ (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) // alternative 2: 'or true' makes the middle suffix optional
+ or
+ (
+ mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) // alternative 3: delete the lAr ending, then optionally open a new slice over one more suffix
+ unset continue_stemming_noun_suffixes // a matched lAr ending turns off the later noun-suffix step
+ )
+ or
+ (mark_nUz (mark_yDU or mark_ysA)) // alternative 4
+ or
+ ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) // alternative 5: delete a person ending, then optionally slice a preceding mUs ending
+ or
+ (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) // alternative 6
+ ]delete // close the most recently opened slice at the cursor and delete it
+ )
+
+ // stems noun suffix chains ending with -ki
+ define stem_suffix_chain_before_ki as ( // recursive: handles a 'ki' ending together with the suffix chain that precedes it
+ [ // open the slice at the current position
+ mark_ki // the chain must end in 'ki'
+ (
+ (mark_DA] delete try([ // 'ki' preceded by a DA ending: delete both, then optionally continue
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
+ or
+ (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+
+ ))
+ or
+ (mark_nUn] delete try([ // 'ki' preceded by an nUn ending: delete both, then optionally continue
+ (mark_lArI] delete)
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
+ or
+ (mark_ndA ( // 'ki' preceded by an ndA ending: a further suffix is required here (no try)
+ (mark_lArI] delete)
+ or
+ ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
+ )
+ )
+
+ define stem_noun_suffixes as ( // stem_noun_suffixes: delete a chain of noun suffixes; the alternatives are tried in order and the first that succeeds wins
+ ([mark_lAr] delete try(stem_suffix_chain_before_ki)) // a lAr ending, optionally followed by a 'ki' chain
+ or
+ ([mark_ncA] delete // an ncA ending, then optionally one of the three continuations below
+ try(
+ ([mark_lArI] delete)
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ ([mark_lAr] delete stem_suffix_chain_before_ki)
+ )
+ )
+ or
+ ([(mark_ndA or mark_nA) // an ndA or nA ending, which must be followed by one of the continuations below
+ (
+ (mark_lArI] delete)
+ or
+ (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ )
+ )
+ or
+ ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) // an ndAn or nU ending followed by sU (deleted) or lArI (matched only, no delete here)
+ or
+ ( [mark_DAn] delete try ([ // a DAn ending, then optionally one of the three continuations below
+ (
+ (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ (mark_lAr] delete try(stem_suffix_chain_before_ki))
+ or
+ (stem_suffix_chain_before_ki)
+ ))
+ )
+ or
+ ([mark_nUn or mark_ylA] delete // an nUn or ylA ending, then optionally one of the three continuations below
+ try(
+ ([mark_lAr] delete stem_suffix_chain_before_ki)
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
+ or
+ stem_suffix_chain_before_ki
+ )
+ )
+ or
+ ([mark_lArI] delete) // a lArI ending on its own
+ or
+ (stem_suffix_chain_before_ki) // a bare 'ki' chain
+ or
+ ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) // a DA, yU or yA ending, optionally followed by possessive and/or lAr and a 'ki' chain
+ or
+ ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) // a possessive or sU ending, optionally followed by lAr and a 'ki' chain
+ )
+
+ define post_process_last_consonants as ( // post_process_last_consonants: rewrite the final consonant of the stem
+ [substring] among ( // the final character becomes the slice when it is one of the four below
+ 'b' (<- 'p')
+ 'c' (<- '{c.}') // c -> c-cedilla
+ 'd' (<- 't')
+ '{g~}' (<- 'k') // g-breve -> k
+ )
+ )
+
+ // After stemming, a word ending in 'd' or 'g' has most probably lost its final U
+ // through overstemming, as in 'kedim' -> 'ked'.
+ // Turkish words do not usually end with 'd' or 'g'.
+ // Some very well known exceptions are left alone (like 'ad' and 'soyad').
+ // This routine appends a U vowel to stems ending with d or g, choosing the vowel
+ // according to the last vowel in the stem.
+ define append_U_to_stems_ending_with_d_or_g as ( // insert a U vowel at the cursor when the word ends in 'd' or 'g'
+ test('d' or 'g') // the word must end in 'd' or 'g'; test leaves the cursor at the end
+ (test((goto vowel) 'a' or '{i'}') <+ '{i'}') // last vowel is 'a' or dotless i -> insert dotless i
+ or
+ (test((goto vowel) 'e' or 'i') <+ 'i') // last vowel is 'e' or 'i' -> insert 'i'
+ or
+ (test((goto vowel) 'o' or 'u') <+ 'u') // last vowel is 'o' or 'u' -> insert 'u'
+ or
+ (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') // last vowel is o-diaeresis or u-diaeresis -> insert u-diaeresis
+ )
+
+)
+
+// Tests whether the word has more than one syllable.
+// In Turkish, each vowel indicates a distinct syllable.
+define more_than_one_syllable_word as ( // succeeds when at least two vowels can be found scanning forward
+ test (atleast 2 (gopast vowel)) // test restores the cursor, so the scan does not move it
+)
+
+define is_reserved_word as ( // is_reserved_word: succeeds for the words 'ad' and 'soyad', which postlude must leave unchanged
+ test(gopast 'ad' ($strlen = 2) ($strlen == limit)) // the word contains 'ad' and limit equals 2 - presumably limit is the word length here, so the word is exactly 'ad'
+ or
+ test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) // same check for 'soyad' with a limit of 5
+)
+
+define postlude as ( // postlude: final adjustments to the stem, skipped for reserved words
+ not(is_reserved_word) // fail, doing nothing, when the stem is 'ad' or 'soyad'
+ backwards ( // both adjustments work on the end of the word; do keeps going if one of them fails
+ do append_U_to_stems_ending_with_d_or_g
+ do post_process_last_consonants
+
+ )
+)
+
+define stem as ( // stem: the external entry point
+ (more_than_one_syllable_word) // words with fewer than two vowels are not stemmed at all
+ (
+ backwards ( // suffix removal works from the end of the word
+ do stem_nominal_verb_suffixes
+ continue_stemming_noun_suffixes // boolean test: if the flag was cleared above, the remaining steps (including postlude) are not run
+ do stem_noun_suffixes
+ )
+
+ postlude
+ )
+)
+
+