diff options
Diffstat (limited to 'contrib/snowball/algorithms')
31 files changed, 5890 insertions, 0 deletions
diff --git a/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..0a8190a08 --- /dev/null +++ b/contrib/snowball/algorithms/danish/stem_ISO_8859_1.sbl @@ -0,0 +1,91 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix + undouble +) + +externals ( stem ) + +strings ( ch ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef ae hex 'E6' +stringdef ao hex 'E5' +stringdef o/ hex 'F8' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'abcdfghjklmnoprtvyz{ao}' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' + 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' + 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' + 'erets' 'et' 'eret' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'gd' // significant in the call from other_suffix + 'dt' 'gt' 'kt' + ) + ) + next] delete + ) + + define other_suffix as ( + do ( ['st'] 'ig' delete ) + setlimit tomark p1 for ([substring]) + among( + 'ig' 'lig' 'elig' 'els' + (delete do consonant_pair) + 'l{o/}st' + (<-'l{o/}s') + ) + ) + define undouble as ( + setlimit tomark p1 for ([non-v] ->ch) + ch + delete + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + do undouble + ) +) diff --git a/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..1131a1cb7 --- /dev/null +++ b/contrib/snowball/algorithms/danish/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,91 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix + undouble +) + +externals ( stem ) + +strings ( ch ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef ae hex '91' +stringdef ao hex '86' +stringdef o/ hex '9B' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'abcdfghjklmnoprtvyz{ao}' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' + 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' + 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' + 'erets' 'et' 'eret' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'gd' // significant in the call from other_suffix + 'dt' 'gt' 'kt' + ) + ) + next] delete + ) + + define other_suffix as ( + do ( ['st'] 'ig' delete ) + setlimit tomark p1 for ([substring]) + among( + 'ig' 'lig' 'elig' 'els' + (delete do consonant_pair) + 'l{o/}st' + (<-'l{o/}s') + ) + ) + define undouble as ( + setlimit tomark p1 for ([non-v] ->ch) + ch + delete + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + do undouble + ) +) diff --git a/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..f7609f766 --- /dev/null +++ b/contrib/snowball/algorithms/dutch/stem_ISO_8859_1.sbl @@ -0,0 +1,164 @@ +routines ( + prelude postlude + e_ending + en_ending + mark_regions + R1 R2 + undouble + standard_suffix +) + +externals ( stem ) + +booleans ( e_found ) + +integers ( p1 p2 ) + +groupings ( v v_I v_j ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef e" hex 'EB' +stringdef i" hex 'EF' +stringdef o" hex 'F6' +stringdef u" hex 'FC' + +stringdef a' hex 'E1' +stringdef e' hex 'E9' +stringdef i' hex 'ED' +stringdef o' hex 'F3' +stringdef u' hex 'FA' + +stringdef e` hex 'E8' + +define v 'aeiouy{e`}' +define v_I v + 'I' +define v_j v + 'j' + +define prelude as ( + test repeat ( + [substring] among( + '{a"}' '{a'}' + (<- 'a') + '{e"}' '{e'}' + (<- 'e') + '{i"}' '{i'}' + (<- 'i') + '{o"}' '{o'}' + (<- 'o') + '{u"}' '{u'}' + (<- 'u') + '' (next) + ) //or next + ) + try(['y'] <- 'Y') + repeat goto ( + v [('i'] v <- 'I') or + ('y'] <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + gopast v gopast non-v setmark p1 + try($p1 < 3 $p1 = 3) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'I' (<- 'i') + '' (next) + ) //or next + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define undouble as ( + test among('kk' 'dd' 'tt') [next] delete + ) + + define e_ending as ( + unset e_found + ['e'] R1 test non-v delete + set e_found + undouble + ) + + define en_ending as ( + R1 non-v and not 'gem' delete + undouble + ) + + define standard_suffix as ( + do ( + [substring] among( + 'heden' + ( R1 <- 'heid' + ) + 'en' 'ene' + ( en_ending + ) + 's' 'se' + ( R1 non-v_j delete + ) + ) + ) + do e_ending + + do ( ['heid'] R2 not 'c' delete + ['en'] en_ending + ) + + do ( + [substring] among( + 'end' 'ing' + ( R2 delete + (['ig'] R2 not 'e' delete) or undouble + ) + 'ig' + ( R2 not 'e' delete + ) + 'lijk' + ( R2 delete e_ending + ) + 'baar' + ( R2 delete + ) + 'bar' + ( R2 e_found delete + ) + ) + ) + do ( + non-v_I + test ( + among ('aa' 'ee' 'oo' 'uu') + non-v + ) + [next] delete + ) + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..15b8718d1 --- /dev/null +++ b/contrib/snowball/algorithms/dutch/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,164 @@ +routines ( + prelude postlude + e_ending + en_ending + mark_regions + R1 R2 + undouble + standard_suffix +) + +externals ( stem ) + +booleans ( e_found ) + +integers ( p1 p2 ) + +groupings ( v v_I v_j ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a" hex '84' +stringdef e" hex '89' +stringdef i" hex '8B' +stringdef o" hex '94' +stringdef u" hex '81' + +stringdef a' hex 'A0' +stringdef e' hex '82' +stringdef i' hex 'A1' +stringdef o' hex 'A2' +stringdef u' hex 'A3' + +stringdef e` hex '8A' + +define v 'aeiouy{e`}' +define v_I v + 'I' +define v_j v + 'j' + +define prelude as ( + test repeat ( + [substring] among( + '{a"}' '{a'}' + (<- 'a') + '{e"}' '{e'}' + (<- 'e') + '{i"}' '{i'}' + (<- 'i') + '{o"}' '{o'}' + (<- 'o') + '{u"}' '{u'}' + (<- 'u') + '' (next) + ) //or next + ) + try(['y'] <- 'Y') + repeat goto ( + v [('i'] v <- 'I') or + ('y'] <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + gopast v gopast non-v setmark p1 + try($p1 < 3 $p1 = 3) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'I' (<- 'i') + '' (next) + ) //or next + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define undouble as ( + test among('kk' 'dd' 'tt') [next] delete + ) + + define e_ending as ( + unset e_found + ['e'] R1 test non-v delete + set e_found + undouble + ) + + define en_ending as ( + R1 non-v and not 'gem' delete + undouble + ) + + define standard_suffix as ( + do ( + [substring] among( + 'heden' + ( R1 <- 'heid' + ) + 'en' 'ene' + ( en_ending + ) + 's' 'se' + ( R1 non-v_j delete + ) + ) + ) + do e_ending + + do ( ['heid'] R2 not 'c' delete + ['en'] en_ending + ) + + do ( + [substring] among( + 'end' 'ing' + ( R2 delete + (['ig'] R2 not 'e' delete) or undouble + ) + 'ig' + ( R2 not 'e' delete + ) + 'lijk' + ( R2 delete e_ending + ) + 'baar' + ( R2 delete + ) + 'bar' + ( R2 e_found delete + ) + ) + ) + do ( + non-v_I + test ( + among ('aa' 'ee' 'oo' 'uu') + non-v + ) + [next] delete + ) + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..fe18d7a91 --- /dev/null +++ b/contrib/snowball/algorithms/english/stem_ISO_8859_1.sbl @@ -0,0 +1,229 @@ +integers ( p1 p2 ) +booleans ( Y_found ) + +routines ( + prelude postlude + mark_regions + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 + exception1 + exception2 +) + +externals ( stem ) + +groupings ( v v_WXY valid_LI ) + +stringescapes {} + +define v 'aeiouy' +define v_WXY v + 'wxY' + +define valid_LI 'cdeghkmnrt' + +define prelude as ( + unset Y_found + do ( ['{'}'] delete) + do ( ['y'] <-'Y' set Y_found) + do repeat(goto (v ['y']) <-'Y' set Y_found) +) + +define mark_regions as ( + $p1 = limit + $p2 = limit + do( + among ( + 'gener' + 'commun' // added May 2005 + 'arsen' // added Nov 2006 (arsenic/arsenal) + // ... extensions possible here ... + ) or (gopast v gopast non-v) + setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define shortv as ( + ( non-v_WXY v non-v ) + or + ( non-v v atlimit ) + ) + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define Step_1a as ( + try ( + [substring] among ( + '{'}' '{'}s' '{'}s{'}' + (delete) + ) + ) + [substring] among ( + 'sses' (<-'ss') + 'ied' 'ies' + ((hop 2 <-'i') or <-'ie') + 's' (next gopast v delete) + 'us' 'ss' + ) + ) + + define Step_1b as ( + [substring] among ( + 'eed' 'eedly' + (R1 <-'ee') + 'ed' 'edly' 'ing' 'ingly' + ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) + + define Step_1c as ( + ['y' or 'Y'] + non-v not atlimit + <-'i' + ) + + define Step_2 as ( + [substring] R1 among ( + 'tional' (<-'tion') + 'enci' (<-'ence') + 'anci' (<-'ance') + 'abli' (<-'able') + 'entli' (<-'ent') + 'izer' 'ization' + (<-'ize') + 'ational' 'ation' 'ator' + (<-'ate') + 'alism' 'aliti' 'alli' + (<-'al') + 'fulness' (<-'ful') + 'ousli' 'ousness' + (<-'ous') + 'iveness' 'iviti' + (<-'ive') + 'biliti' 'bli' + (<-'ble') + 'ogi' ('l' <-'og') + 'fulli' (<-'ful') + 'lessli' (<-'less') + 'li' (valid_LI delete) + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'tional' (<- 'tion') + 'ational' (<- 'ate') + 'alize' (<-'al') + 'icate' 'iciti' 'ical' + (<-'ic') + 'ful' 'ness' + (delete) + 'ative' + (R2 delete) // 'R2' added Dec 2001 + ) + ) + + define Step_4 as ( + [substring] R2 among ( + 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' + 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' + (delete) + 'ion' ('s' or 't' delete) + ) + ) + + define Step_5 as ( + [substring] among ( + 'e' (R2 or (R1 not shortv) delete) + 'l' (R2 'l' delete) + ) + ) + + define exception2 as ( + + [substring] atlimit among( + 'inning' 'outing' 'canning' 'herring' 'earring' + 'proceed' 'exceed' 'succeed' + + // ... extensions possible here ... + + ) + ) +) + +define exception1 as ( + + [substring] atlimit among( + + /* special changes: */ + + 'skis' (<-'ski') + 'skies' (<-'sky') + 'dying' (<-'die') + 'lying' (<-'lie') + 'tying' (<-'tie') + + /* special -LY cases */ + + 'idly' (<-'idl') + 'gently' (<-'gentl') + 'ugly' (<-'ugli') + 'early' (<-'earli') + 'only' (<-'onli') + 'singly' (<-'singl') + + // ... extensions possible here ... + + /* invariant forms: */ + + 'sky' + 'news' + 'howe' + + 'atlas' 'cosmos' 'bias' 'andes' // not plural forms + + // ... extensions possible here ... + ) +) + +define postlude as (Y_found repeat(goto (['Y']) <-'y')) + +define stem as ( + + exception1 or + not hop 3 or ( + do prelude + do mark_regions + backwards ( + + do Step_1a + + exception2 or ( + + do Step_1b + do Step_1c + + do Step_2 + do Step_3 + do Step_4 + + do Step_5 + ) + ) + do postlude + ) +) diff --git a/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..9ac74f292 --- /dev/null +++ b/contrib/snowball/algorithms/finnish/stem_ISO_8859_1.sbl @@ -0,0 +1,196 @@ + +/* Finnish stemmer. + + Numbers in square brackets refer to the sections in + Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999 + ISBN 0-415-20705-3 + +*/ + +routines ( + mark_regions + R2 + particle_etc possessive + LONG VI + case_ending + i_plural + t_plural + other_endings + tidy +) + +externals ( stem ) + +integers ( p1 p2 ) +strings ( x ) +booleans ( ending_removed ) +groupings ( AEI V1 V2 particle_end ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef o" hex 'F6' + +define AEI 'a{a"}ei' +define V1 'aeiouy{a"}{o"}' +define V2 'aeiou{a"}{o"}' +define particle_end V1 + 'nt' + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + goto V1 gopast non-V1 setmark p1 + goto V1 gopast non-V1 setmark p2 +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define particle_etc as ( + setlimit tomark p1 for ([substring]) + among( + 'kin' + 'kaan' 'k{a"}{a"}n' + 'ko' 'k{o"}' + 'han' 'h{a"}n' + 'pa' 'p{a"}' // Particles [91] + (particle_end) + 'sti' // Adverb [87] + (R2) + ) + delete + ) + define possessive as ( // [36] + setlimit tomark p1 for ([substring]) + among( + 'si' + (not 'k' delete) // take 'ksi' as the Comitative case + 'ni' + (delete ['kse'] <- 'ksi') // kseni = ksi + ni + 'nsa' 'ns{a"}' + 'mme' + 'nne' + (delete) + /* Now for Vn possessives after case endings: [36] */ + 'an' + (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) + '{a"}n' + (among('t{a"}' 'ss{a"}' 'st{a"}' + 'll{a"}' 'lt{a"}' 'n{a"}') delete) + 'en' + (among('lle' 'ine') delete) + ) + ) + + define LONG as + among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') + + define VI as ('i' V2) + + define case_ending as ( + setlimit tomark p1 for ([substring]) + among( + 'han' ('a') //-. + 'hen' ('e') // | + 'hin' ('i') // | + 'hon' ('o') // | + 'h{a"}n' ('{a"}') // Illative [43] + 'h{o"}n' ('{o"}') // | + 'siin' VI // | + 'seen' LONG //-' + + 'den' VI + 'tten' VI // Genitive plurals [34] + () + 'n' // Genitive or Illative + ( try ( LONG // Illative + or 'ie' // Genitive + and next ] + ) + /* otherwise Genitive */ + ) + + 'a' '{a"}' //-. + (V1 non-V1) // | + 'tta' 'tt{a"}' // Partitive [32] + ('e') // | + 'ta' 't{a"}' //-' + + 'ssa' 'ss{a"}' // Inessive [41] + 'sta' 'st{a"}' // Elative [42] + + 'lla' 'll{a"}' // Adessive [44] + 'lta' 'lt{a"}' // Ablative [51] + 'lle' // Allative [46] + 'na' 'n{a"}' // Essive [49] + 'ksi' // Translative[50] + 'ine' // Comitative [51] + + /* Abessive and Instructive are too rare for + inclusion [51] */ + + ) + delete + set ending_removed + ) + define other_endings as ( + setlimit tomark p2 for ([substring]) + among( + 'mpi' 'mpa' 'mp{a"}' + 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] + (not 'po') //-improves things + 'impi' 'impa' 'imp{a"}' + 'immi' 'imma' 'imm{a"}' // Superlative forms [86] + 'eja' 'ej{a"}' // indicates agent [93.1B] + ) + delete + ) + define i_plural as ( // [26] + setlimit tomark p1 for ([substring]) + among( + 'i' 'j' + ) + delete + ) + define t_plural as ( // [26] + setlimit tomark p1 for ( + ['t'] test V1 + delete + ) + setlimit tomark p2 for ([substring]) + among( + 'mma' (not 'po') //-mmat endings + 'imma' //-immat endings + ) + delete + ) + define tidy as ( + setlimit tomark p1 for ( + do ( LONG and ([next] delete ) ) // undouble vowel + do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i + do ( ['j'] 'o' or 'u' delete ) + do ( ['o'] 'j' delete ) + ) + goto non-V1 [next] -> x x delete // undouble consonant + ) +) + +define stem as ( + + do mark_regions + unset ending_removed + backwards ( + do particle_etc + do possessive + do case_ending + do other_endings + (ending_removed do i_plural) or do t_plural + do tidy + ) +) + diff --git a/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..e972f227f --- /dev/null +++ b/contrib/snowball/algorithms/french/stem_ISO_8859_1.sbl @@ -0,0 +1,248 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + i_verb_suffix + verb_suffix + residual_suffix + un_double + un_accent +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v keep_with_s ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a^ hex 'E2' // a-circumflex +stringdef a` hex 'E0' // a-grave +stringdef c, hex 'E7' // c-cedilla + +stringdef e" hex 'EB' // e-diaeresis (rare) +stringdef e' hex 'E9' // e-acute +stringdef e^ hex 'EA' // e-circumflex +stringdef e` hex 'E8' // e-grave +stringdef i" hex 'EF' // i-diaeresis +stringdef i^ hex 'EE' // i-circumflex +stringdef o^ hex 'F4' // o-circumflex +stringdef u^ hex 'FB' // u-circumflex +stringdef u` hex 'F9' // u-grave + +define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' + +define prelude as repeat goto ( + + ( v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') or + ('y' ] <- 'Y') + ) + or + ( ['y'] v <- 'Y' ) + or + ( 'q' ['u'] <- 'U' ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v v next ) + or + among ( // this exception list begun Nov 2006 + 'par' // paris, parie, pari + 'col' // colis + 'tap' // tapis + // extensions possible here + ) + or + ( next gopast v ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + 'Y' (<- 'y') + '' (next) + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' + 'ances' 'iqUes' 'ismes' 'ables' 'istes' + ( R2 delete ) + 'atrice' 'ateur' 'ation' + 'atrices' 'ateurs' 'ations' + ( R2 delete + try ( ['ic'] (R2 delete) or <-'iqU' ) + ) + 'logie' + 'logies' + ( R2 <- 'log' ) + 'usion' 'ution' + 'usions' 'utions' + ( R2 <- 'u' ) + 'ence' + 'ences' + ( R2 <- 'ent' ) + 'ement' + 'ements' + ( + RV delete + try ( + [substring] among( + 'iv' (R2 delete ['at'] R2 delete) + 'eus' ((R2 delete) or (R1<-'eux')) + 'abl' 'iqU' + (R2 delete) + 'i{e`}r' 'I{e`}r' //) + (RV <-'i') //)--new 2 Sept 02 + ) + ) + ) + 'it{e'}' + 'it{e'}s' + ( + R2 delete + try ( + [substring] among( + 'abil' ((R2 delete) or <-'abl') + 'ic' ((R2 delete) or <-'iqU') + 'iv' (R2 delete) + ) + ) + ) + 'if' 'ive' + 'ifs' 'ives' + ( + R2 delete + try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) + ) + 'eaux' (<- 'eau') + 'aux' (R1 <- 'al') + 'euse' + 'euses'((R2 delete) or (R1<-'eux')) + + 'issement' + 'issements'(R1 non-v delete) // verbal + + // fail(...) below forces entry to verb_suffix. -ment typically + // follows the p.p., e.g 'confus{e'}ment'. + + 'amment' (RV fail(<- 'ant')) + 'emment' (RV fail(<- 'ent')) + 'ment' + 'ments' (test(v RV) fail(delete)) + // v is e,i,u,{e'},I or U + ) + ) + + define i_verb_suffix as setlimit tomark pV for ( + [substring] among ( + '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' + 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' + 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' + 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' + 'issez' 'issiez' 'issions' 'issons' 'it' + (non-v delete) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among ( + 'ions' + (R2 delete) + + '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' + 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' + 'erons' 'eront' 'ez' 'iez' + + // 'ons' //-best omitted + + (delete) + + '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' + 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' + 'assions' + (delete + try(['e'] delete) + ) + ) + ) + + define keep_with_s 'aiou{e`}s' + + define residual_suffix as ( + try(['s'] test non-keep_with_s delete) + setlimit tomark pV for ( + [substring] among( + 'ion' (R2 's' or 't' delete) + 'ier' 'i{e`}re' + 'Ier' 'I{e`}re' (<-'i') + 'e' (delete) + '{e"}' ('gu' delete) + ) + ) + ) + + define un_double as ( + test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete + ) + + define un_accent as ( + atleast 1 non-v + [ '{e'}' or '{e`}' ] <-'e' + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards ( + + do ( + ( + ( standard_suffix or + i_verb_suffix or + verb_suffix + ) + and + try( [ ('Y' ] <- 'i' ) or + ('{c,}'] <- 'c' ) + ) + ) or + residual_suffix + ) + + // try(['ent'] RV delete) // is best omitted + + do un_double + do un_accent + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..996eba1ab --- /dev/null +++ b/contrib/snowball/algorithms/french/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,239 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + i_verb_suffix + verb_suffix + residual_suffix + un_double + un_accent +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v keep_with_s ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a^ hex '83' // a-circumflex +stringdef a` hex '85' // a-grave +stringdef c, hex '87' // c-cedilla + +stringdef e" hex '89' // e-diaeresis (rare) +stringdef e' hex '82' // e-acute +stringdef e^ hex '88' // e-circumflex +stringdef e` hex '8A' // e-grave +stringdef i" hex '8B' // i-diaeresis +stringdef i^ hex '8C' // i-circumflex +stringdef o^ hex '93' // o-circumflex +stringdef u^ hex '96' // u-circumflex +stringdef u` hex '97' // u-grave + +define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' + +define prelude as repeat goto ( + + ( v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') or + ('y' ] <- 'Y') + ) + or + ( ['y'] v <- 'Y' ) + or + ( 'q' ['u'] <- 'U' ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v v next ) or ( next gopast v ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + 'Y' (<- 'y') + '' (next) + ) +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' + 'ances' 'iqUes' 'ismes' 'ables' 'istes' + ( R2 delete ) + 'atrice' 'ateur' 'ation' + 'atrices' 'ateurs' 'ations' + ( R2 delete + try ( ['ic'] (R2 delete) or <-'iqU' ) + ) + 'logie' + 'logies' + ( R2 <- 'log' ) + 'usion' 'ution' + 'usions' 'utions' + ( R2 <- 'u' ) + 'ence' + 'ences' + ( R2 <- 'ent' ) + 'ement' + 'ements' + ( + RV delete + try ( + [substring] among( + 'iv' (R2 delete ['at'] R2 delete) + 'eus' ((R2 delete) or (R1<-'eux')) + 'abl' 'iqU' + (R2 delete) + 'i{e`}r' 'I{e`}r' //) + (RV <-'i') //)--new 2 Sept 02 + ) + ) + ) + 'it{e'}' + 'it{e'}s' + ( + R2 delete + try ( + [substring] among( + 'abil' ((R2 delete) or <-'abl') + 'ic' ((R2 delete) or <-'iqU') + 'iv' (R2 delete) + ) + ) + ) + 'if' 'ive' + 'ifs' 'ives' + ( + R2 delete + try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) + ) + 'eaux' (<- 'eau') + 'aux' (R1 <- 'al') + 'euse' + 'euses'((R2 delete) or (R1<-'eux')) + + 'issement' + 'issements'(R1 non-v delete) // verbal + + // fail(...) below forces entry to verb_suffix. -ment typically + // follows the p.p., e.g 'confus{e'}ment'. + + 'amment' (RV fail(<- 'ant')) + 'emment' (RV fail(<- 'ent')) + 'ment' + 'ments' (test(v RV) fail(delete)) + // v is e,i,u,{e'},I or U + ) + ) + + define i_verb_suffix as setlimit tomark pV for ( + [substring] among ( + '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' + 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' + 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' + 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' + 'issez' 'issiez' 'issions' 'issons' 'it' + (non-v delete) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among ( + 'ions' + (R2 delete) + + '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' + 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' + 'erons' 'eront' 'ez' 'iez' + + // 'ons' //-best omitted + + (delete) + + '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' + 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' + 'assions' + (delete + try(['e'] delete) + ) + ) + ) + + define keep_with_s 'aiou{e`}s' + + define residual_suffix as ( + try(['s'] test non-keep_with_s delete) + setlimit tomark pV for ( + [substring] among( + 'ion' (R2 's' or 't' delete) + 'ier' 'i{e`}re' + 'Ier' 'I{e`}re' (<-'i') + 'e' (delete) + '{e"}' ('gu' delete) + ) + ) + ) + + define un_double as ( + test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete + ) + + define un_accent as ( + atleast 1 non-v + [ '{e'}' or '{e`}' ] <-'e' + ) +) + +define stem as ( + + do prelude + do mark_regions + backwards ( + + do ( + ( + ( standard_suffix or + i_verb_suffix or + verb_suffix + ) + and + try( [ ('Y' ] <- 'i' ) or + ('{c,}'] <- 'c' ) + ) + ) or + residual_suffix + ) + + // try(['ent'] RV delete) // is best omitted + + do un_double + do un_accent + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..7069daf0d --- /dev/null +++ b/contrib/snowball/algorithms/german/stem_ISO_8859_1.sbl @@ -0,0 +1,139 @@ + +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef o" hex 'F6' +stringdef u" hex 'FC' +stringdef ss hex 'DF' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat ( + ( + ['{ss}'] <- 'ss' + ) or next + ) + + repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..3effb3257 --- /dev/null +++ b/contrib/snowball/algorithms/german/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,139 @@ + +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a" hex '84' +stringdef o" hex '94' +stringdef u" hex '81' +stringdef ss hex 'E1' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat ( + ( + ['{ss}'] <- 'ss' + ) or next + ) + + repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..ce6026a86 --- /dev/null +++ b/contrib/snowball/algorithms/german2/stem_ISO_8859_1.sbl @@ -0,0 +1,145 @@ + +/* + Extra rule for -nisse ending added 11 Dec 2009 +*/ + +routines ( + prelude postlude + mark_regions + R1 R2 + standard_suffix +) + +externals ( stem ) + +integers ( p1 p2 x ) + +groupings ( v s_ending st_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef o" hex 'F6' +stringdef u" hex 'FC' +stringdef ss hex 'DF' + +define v 'aeiouy{a"}{o"}{u"}' + +define s_ending 'bdfghklmnrt' +define st_ending s_ending - 'r' + +define prelude as ( + + test repeat goto ( + v [('u'] v <- 'U') or + ('y'] v <- 'Y') + ) + + repeat ( + [substring] among( + '{ss}' (<- 'ss') + 'ae' (<- '{a"}') + 'oe' (<- '{o"}') + 'ue' (<- '{u"}') + 'qu' (hop 2) + '' (next) + ) + ) + +) + +define mark_regions as ( + + $p1 = limit + $p2 = limit + + test(hop 3 setmark x) + + gopast v gopast non-v setmark p1 + try($p1 < x $p1 = x) // at least 3 + gopast v gopast non-v setmark p2 + +) + +define postlude as repeat ( + + [substring] among( + 'Y' (<- 'y') + 'U' (<- 'u') + '{a"}' (<- 'a') + '{o"}' (<- 'o') + '{u"}' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + do ( + [substring] R1 among( + 'em' 'ern' 'er' + ( delete + ) + 'e' 'en' 'es' + ( delete + try (['s'] 'nis' delete) + ) + 's' + ( s_ending delete + ) + ) + ) + do ( + [substring] R1 among( + 'en' 'er' 'est' + ( delete + ) + 'st' + ( st_ending hop 3 delete + ) + ) + ) + do ( + [substring] R2 among( + 'end' 'ung' + ( delete + try (['ig'] not 'e' R2 delete) + ) + 'ig' 'ik' 'isch' + ( not 'e' delete + ) + 'lich' 'heit' + ( delete + try ( + ['er' or 'en'] R1 delete + ) + ) + 'keit' + ( delete + try ( + [substring] R2 among( + 'lich' 'ig' + ( delete + ) + ) + ) + ) + ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards + do standard_suffix + do postlude +) diff --git a/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl new file mode 100644 index 000000000..d1a644931 --- /dev/null +++ b/contrib/snowball/algorithms/hungarian/stem_ISO_8859_2.sbl @@ -0,0 +1,241 @@ +/* +Hungarian Stemmer +Removes noun inflections +*/ + +routines ( + mark_regions + R1 + v_ending + case + case_special + case_other + plural + owned + sing_owner + plur_owner + instrum + factive + undouble + double +) + +externals ( stem ) + +integers ( p1 ) +groupings ( v ) + +stringescapes {} + +/* special characters (in ISO Latin 2) */ + +stringdef a' hex 'E1' //a-acute +stringdef e' hex 'E9' //e-acute +stringdef i' hex 'ED' //i-acute +stringdef o' hex 'F3' //o-acute +stringdef o" hex 'F6' //o-umlaut +stringdef oq hex 'F5' //o-double acute +stringdef u' hex 'FA' //u-acute +stringdef u" hex 'FC' //u-umlaut +stringdef uq hex 'FB' //u-double acute + +define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' + +define mark_regions as ( + + $p1 = limit + + (v goto non-v + among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next + setmark p1) + or + + (non-v gopast v setmark p1) +) + +backwardmode ( + + define R1 as $p1 <= cursor + + define v_ending as ( + [substring] R1 among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define double as ( + test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' + 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') + ) + + define undouble as ( + next [hop 1] delete + ) + + define instrum as( + [substring] R1 among( + 'al' (double) + 'el' (double) + ) + delete + undouble + ) + + + define case as ( + [substring] R1 among( + 'ban' 'ben' + 'ba' 'be' + 'ra' 're' + 'nak' 'nek' + 'val' 'vel' + 't{o'}l' 't{oq}l' + 'r{o'}l' 'r{oq}l' + 'b{o'}l' 'b{oq}l' + 'hoz' 'hez' 'h{o"}z' + 'n{a'}l' 'n{e'}l' + 'ig' + 'at' 'et' 'ot' '{o"}t' + '{e'}rt' + 'k{e'}pp' 'k{e'}ppen' + 'kor' + 'ul' '{u"}l' + 'v{a'}' 'v{e'}' + 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' + 'k{e'}nt' + 'en' 'on' 'an' '{o"}n' + 'n' + 't' + ) + delete + v_ending + ) + + define case_special as( + [substring] R1 among( + '{e'}n' (<- 'e') + '{a'}n' (<- 'a') + '{a'}nk{e'}nt' (<- 'a') + ) + ) + + define case_other as( + [substring] R1 among( + 'astul' 'est{u"}l' (delete) + 'stul' 'st{u"}l' (delete) + '{a'}stul' (<- 'a') + '{e'}st{u"}l' (<- 'e') + ) + ) + + define factive as( + [substring] R1 among( + '{a'}' (double) + '{e'}' (double) + ) + delete + undouble + ) + + define plural as ( + [substring] R1 among( + '{a'}k' (<- 'a') + '{e'}k' (<- 'e') + '{o"}k' (delete) + 'ak' (delete) + 'ok' (delete) + 'ek' (delete) + 'k' (delete) + ) + ) + + define owned as ( + [substring] R1 among ( + 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) + '{e'}k{e'}' (<- 'e') + '{a'}k{e'}' (<- 'a') + 'k{e'}' (delete) + '{e'}{e'}i' (<- 'e') + '{a'}{e'}i' (<- 'a') + '{e'}i' (delete) + '{e'}{e'}' (<- 'e') + '{e'}' (delete) + ) + ) + + define sing_owner as ( + [substring] R1 among( + '{u"}nk' 'unk' (delete) + '{a'}nk' (<- 'a') + '{e'}nk' (<- 'e') + 'nk' (delete) + '{a'}juk' (<- 'a') + '{e'}j{u"}k' (<- 'e') + 'juk' 'j{u"}k' (delete) + 'uk' '{u"}k' (delete) + 'em' 'om' 'am' (delete) + '{a'}m' (<- 'a') + '{e'}m' (<- 'e') + 'm' (delete) + 'od' 'ed' 'ad' '{o"}d' (delete) + '{a'}d' (<- 'a') + '{e'}d' (<- 'e') + 'd' (delete) + 'ja' 'je' (delete) + 'a' 'e' 'o' (delete) + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define plur_owner as ( + [substring] R1 among( + 'jaim' 'jeim' (delete) + '{a'}im' (<- 'a') + '{e'}im' (<- 'e') + 'aim' 'eim' (delete) + 'im' (delete) + 'jaid' 'jeid' (delete) + '{a'}id' (<- 'a') + '{e'}id' (<- 'e') + 'aid' 'eid' (delete) + 'id' (delete) + 'jai' 'jei' (delete) + '{a'}i' (<- 'a') + '{e'}i' (<- 'e') + 'ai' 'ei' (delete) + 'i' (delete) + 'jaink' 'jeink' (delete) + 'eink' 'aink' (delete) + '{a'}ink' (<- 'a') + '{e'}ink' (<- 'e') + 'ink' + 'jaitok' 'jeitek' (delete) + 'aitok' 'eitek' (delete) + '{a'}itok' (<- 'a') + '{e'}itek' (<- 'e') + 'itek' (delete) + 'jeik' 'jaik' (delete) + 'aik' 'eik' (delete) + '{a'}ik' (<- 'a') + '{e'}ik' (<- 'e') + 'ik' (delete) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do instrum + do case + do case_special + do case_other + do factive + do owned + do sing_owner + do plur_owner + do plural + ) +) diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl new file mode 100644 index 000000000..c812e055c --- /dev/null +++ b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl @@ -0,0 +1,241 @@ +/* +Hungarian Stemmer +Removes noun inflections +*/ + +routines ( + mark_regions + R1 + v_ending + case + case_special + case_other + plural + owned + sing_owner + plur_owner + instrum + factive + undouble + double +) + +externals ( stem ) + +integers ( p1 ) +groupings ( v ) + +stringescapes {} + +/* special characters (in Unicode) */ + +stringdef a' hex 'E1' //a-acute +stringdef e' hex 'E9' //e-acute +stringdef i' hex 'ED' //i-acute +stringdef o' hex 'F3' //o-acute +stringdef o" hex 'F6' //o-umlaut +stringdef oq hex '151' //o-double acute +stringdef u' hex 'FA' //u-acute +stringdef u" hex 'FC' //u-umlaut +stringdef uq hex '171' //u-double acute + +define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' + +define mark_regions as ( + + $p1 = limit + + (v goto non-v + among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next + setmark p1) + or + + (non-v gopast v setmark p1) +) + +backwardmode ( + + define R1 as $p1 <= cursor + + define v_ending as ( + [substring] R1 among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define double as ( + test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' + 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') + ) + + define undouble as ( + next [hop 1] delete + ) + + define instrum as( + [substring] R1 among( + 'al' (double) + 'el' (double) + ) + delete + undouble + ) + + + define case as ( + [substring] R1 among( + 'ban' 'ben' + 'ba' 'be' + 'ra' 're' + 'nak' 'nek' + 'val' 'vel' + 't{o'}l' 't{oq}l' + 'r{o'}l' 'r{oq}l' + 'b{o'}l' 'b{oq}l' + 'hoz' 'hez' 'h{o"}z' + 'n{a'}l' 'n{e'}l' + 'ig' + 'at' 'et' 'ot' '{o"}t' + '{e'}rt' + 'k{e'}pp' 'k{e'}ppen' + 'kor' + 'ul' '{u"}l' + 'v{a'}' 'v{e'}' + 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' + 'k{e'}nt' + 'en' 'on' 'an' '{o"}n' + 'n' + 't' + ) + delete + v_ending + ) + + define case_special as( + [substring] R1 among( + '{e'}n' (<- 'e') + '{a'}n' (<- 'a') + '{a'}nk{e'}nt' (<- 'a') + ) + ) + + define case_other as( + [substring] R1 among( + 'astul' 'est{u"}l' (delete) + 'stul' 'st{u"}l' (delete) + '{a'}stul' (<- 'a') + '{e'}st{u"}l' (<- 'e') + ) + ) + + define factive as( + [substring] R1 among( + '{a'}' (double) + '{e'}' (double) + ) + delete + undouble + ) + + define plural as ( + [substring] R1 among( + '{a'}k' (<- 'a') + '{e'}k' (<- 'e') + '{o"}k' (delete) + 'ak' (delete) + 'ok' (delete) + 'ek' (delete) + 'k' (delete) + ) + ) + + define owned as ( + [substring] R1 among ( + 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) + '{e'}k{e'}' (<- 'e') + '{a'}k{e'}' (<- 'a') + 'k{e'}' (delete) + '{e'}{e'}i' (<- 'e') + '{a'}{e'}i' (<- 'a') + '{e'}i' (delete) + '{e'}{e'}' (<- 'e') + '{e'}' (delete) + ) + ) + + define sing_owner as ( + [substring] R1 among( + '{u"}nk' 'unk' (delete) + '{a'}nk' (<- 'a') + '{e'}nk' (<- 'e') + 'nk' (delete) + '{a'}juk' (<- 'a') + '{e'}j{u"}k' (<- 'e') + 'juk' 'j{u"}k' (delete) + 'uk' '{u"}k' (delete) + 'em' 'om' 'am' (delete) + '{a'}m' (<- 'a') + '{e'}m' (<- 'e') + 'm' (delete) + 'od' 'ed' 'ad' '{o"}d' (delete) + '{a'}d' (<- 'a') + '{e'}d' (<- 'e') + 'd' (delete) + 'ja' 'je' (delete) + 'a' 'e' 'o' (delete) + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define plur_owner as ( + [substring] R1 among( + 'jaim' 'jeim' (delete) + '{a'}im' (<- 'a') + '{e'}im' (<- 'e') + 'aim' 'eim' (delete) + 'im' (delete) + 'jaid' 'jeid' (delete) + '{a'}id' (<- 'a') + '{e'}id' (<- 'e') + 'aid' 'eid' (delete) + 'id' (delete) + 'jai' 'jei' (delete) + '{a'}i' (<- 'a') + '{e'}i' (<- 'e') + 'ai' 'ei' (delete) + 'i' (delete) + 'jaink' 'jeink' (delete) + 'eink' 'aink' (delete) + '{a'}ink' (<- 'a') + '{e'}ink' (<- 'e') + 'ink' + 'jaitok' 'jeitek' (delete) + 'aitok' 'eitek' (delete) + '{a'}itok' (<- 'a') + '{e'}itek' (<- 'e') + 'itek' (delete) + 'jeik' 'jaik' (delete) + 'aik' 'eik' (delete) + '{a'}ik' (<- 'a') + '{e'}ik' (<- 'e') + 'ik' (delete) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do instrum + do case + do case_special + do case_other + do factive + do owned + do sing_owner + do plur_owner + do plural + ) +) diff --git a/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..8d25cf64f --- /dev/null +++ b/contrib/snowball/algorithms/italian/stem_ISO_8859_1.sbl @@ -0,0 +1,195 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v AEIO CG ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a' hex 'E1' +stringdef a` hex 'E0' +stringdef e' hex 'E9' +stringdef e` hex 'E8' +stringdef i' hex 'ED' +stringdef i` hex 'EC' +stringdef o' hex 'F3' +stringdef o` hex 'F2' +stringdef u' hex 'FA' +stringdef u` hex 'F9' + +define v 'aeiou{a`}{e`}{i`}{o`}{u`}' + +define prelude as ( + test repeat ( + [substring] among( + '{a'}' (<- '{a`}') + '{e'}' (<- '{e`}') + '{i'}' (<- '{i`}') + '{o'}' (<- '{o`}') + '{u'}' (<- '{u`}') + 'qu' (<- 'qU') + '' (next) + ) + ) + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'ci' 'gli' 'la' 'le' 'li' 'lo' + 'mi' 'ne' 'si' 'ti' 'vi' + // the compound forms are: + 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' + 'mela' 'mele' 'meli' 'melo' 'mene' + 'tela' 'tele' 'teli' 'telo' 'tene' + 'cela' 'cele' 'celi' 'celo' 'cene' + 'vela' 'vele' 'veli' 'velo' 'vene' + ) + among( (RV) + 'ando' 'endo' (delete) + 'ar' 'er' 'ir' (<- 'e') + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' + 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' + 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' + 'atrice' 'atrici' + 'ante' 'anti' // Note 1 + ( R2 delete ) + 'azione' 'azioni' 'atore' 'atori' + ( R2 delete + try ( ['ic'] R2 delete ) + ) + 'logia' 'logie' + ( R2 <- 'log' ) + 'uzione' 'uzioni' 'usione' 'usioni' + ( R2 <- 'u' ) + 'enza' 'enze' + ( R2 <- 'ente' ) + 'amento' 'amenti' 'imento' 'imenti' + ( RV delete ) + 'amente' ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' ( ['at'] R2 delete ) + 'os' 'ic' 'abil' + ) + ) + ) + 'it{a`}' ( + R2 delete + try ( + [substring] among( + 'abil' 'ic' 'iv' (R2 delete) + ) + ) + ) + 'ivo' 'ivi' 'iva' 'ive' ( + R2 delete + try ( ['at'] R2 delete ['ic'] R2 delete ) + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' + 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' + 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' + 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' + 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' + 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' + 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' + 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' + 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' + 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' + 'ono' 'uta' 'ute' 'uti' 'uto' + + 'ar' 'ir' // but 'er' is problematical + (delete) + ) + ) + + define AEIO 'aeio{a`}{e`}{i`}{o`}' + define CG 'cg' + + define vowel_suffix as ( + try ( + [AEIO] RV delete + ['i'] RV delete + ) + try ( + ['h'] CG RV delete + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do attached_pronoun + do (standard_suffix or verb_suffix) + do vowel_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ + diff --git a/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..b43295c09 --- /dev/null +++ b/contrib/snowball/algorithms/italian/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,195 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v AEIO CG ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a' hex 'A0' +stringdef a` hex '85' +stringdef e' hex '82' +stringdef e` hex '8A' +stringdef i' hex 'A1' +stringdef i` hex '8D' +stringdef o' hex 'A2' +stringdef o` hex '95' +stringdef u' hex 'A3' +stringdef u` hex '97' + +define v 'aeiou{a`}{e`}{i`}{o`}{u`}' + +define prelude as ( + test repeat ( + [substring] among( + '{a'}' (<- '{a`}') + '{e'}' (<- '{e`}') + '{i'}' (<- '{i`}') + '{o'}' (<- '{o`}') + '{u'}' (<- '{u`}') + 'qu' (<- 'qU') + '' (next) + ) + ) + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'ci' 'gli' 'la' 'le' 'li' 'lo' + 'mi' 'ne' 'si' 'ti' 'vi' + // the compound forms are: + 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' + 'mela' 'mele' 'meli' 'melo' 'mene' + 'tela' 'tele' 'teli' 'telo' 'tene' + 'cela' 'cele' 'celi' 'celo' 'cene' + 'vela' 'vele' 'veli' 'velo' 'vene' + ) + among( (RV) + 'ando' 'endo' (delete) + 'ar' 'er' 'ir' (<- 'e') + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' + 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' + 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' + 'atrice' 'atrici' + 'ante' 'anti' // Note 1 + ( R2 delete ) + 'azione' 'azioni' 'atore' 'atori' + ( R2 delete + try ( ['ic'] R2 delete ) + ) + 'logia' 'logie' + ( R2 <- 'log' ) + 'uzione' 'uzioni' 'usione' 'usioni' + ( R2 <- 'u' ) + 'enza' 'enze' + ( R2 <- 'ente' ) + 'amento' 'amenti' 'imento' 'imenti' + ( RV delete ) + 'amente' ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' ( ['at'] R2 delete ) + 'os' 'ic' 'abil' + ) + ) + ) + 'it{a`}' ( + R2 delete + try ( + [substring] among( + 'abil' 'ic' 'iv' (R2 delete) + ) + ) + ) + 'ivo' 'ivi' 'iva' 'ive' ( + R2 delete + try ( ['at'] R2 delete ['ic'] R2 delete ) + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' + 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' + 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' + 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' + 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' + 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' + 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' + 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' + 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' + 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' + 'ono' 'uta' 'ute' 'uti' 'uto' + + 'ar' 'ir' // but 'er' is problematical + (delete) + ) + ) + + define AEIO 'aeio{a`}{e`}{i`}{o`}' + define CG 'cg' + + define vowel_suffix as ( + try ( + [AEIO] RV delete + ['i'] RV delete + ) + try ( + ['h'] CG RV delete + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do attached_pronoun + do (standard_suffix or verb_suffix) + do vowel_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ + diff --git a/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..cd79d1280 --- /dev/null +++ b/contrib/snowball/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl @@ -0,0 +1,245 @@ +strings ( ch ) +integers ( x p1 p2 ) +booleans ( Y_found stemmed GE_removed ) + +routines ( + + R1 R2 + C V VX + lengthen_V + Step_1 Step_2 Step_3 Step_4 Step_7 + Step_6 Step_1c + Lose_prefix + Lose_infix + measure +) + +externals ( stem ) + +groupings ( v v_WX AOU AIOU ) + +stringescapes {} + +stringdef ' hex '27' // yuk + +define v 'aeiouy' +define v_WX v + 'wx' +define AOU 'aou' +define AIOU 'aiou' + +backwardmode ( + + define R1 as (setmark x $x >= p1) + define R2 as (setmark x $x >= p2) + + define V as test (v or 'ij') + define VX as test (next v or 'ij') + define C as test (not 'ij' non-v) + + define lengthen_V as do ( + non-v_WX [ (AOU] test (non-v or atlimit)) or + ('e'] test (non-v or atlimit + not AIOU + not (next AIOU non-v))) + ->ch insert ch + ) + + define Step_1 as + ( + [among ( (]) + + '{'}s' (delete) + 's' (R1 not ('t' R1) C delete) + 'ies' (R1 <-'ie') + 'es' + (('ar' R1 C ] delete lengthen_V) or + ('er' R1 C ] delete) or + (R1 C <-'e')) + + 'aus' (R1 V <-'au') + 'en' (('hed' R1 ] <-'heid') or + ('nd' delete) or + ('d' R1 C ] delete) or + ('i' or 'j' V delete) or + (R1 C delete lengthen_V)) + 'nde' (<-'nd') + ) + ) + + define Step_2 as + ( + [among ( (]) + 'je' (('{'}t' ] delete) or + ('et' ] R1 C delete) or + ('rnt' ] <-'rn') or + ('t' ] R1 VX delete) or + ('ink' ] <-'ing') or + ('mp' ] <-'m') or + ('{'}' ] R1 delete) or + (] R1 C delete)) + 'ge' (R1 <-'g') + 'lijke'(R1 <-'lijk') + 'ische'(R1 <-'isch') + 'de' (R1 C delete) + 'te' (R1 <-'t') + 'se' (R1 <-'s') + 're' (R1 <-'r') + 'le' (R1 delete attach 'l' lengthen_V) + 'ene' (R1 C delete attach 'en' lengthen_V) + 'ieve' (R1 C <-'ief') + ) + ) + + define Step_3 as + ( + [among ( (]) + 'atie' (R1 <-'eer') + 'iteit' (R1 delete lengthen_V) + 'heid' + 'sel' + 'ster' (R1 delete) + 'rder' (<-'r') + 'ing' + 'isme' + 'erij' (R1 delete lengthen_V) + 'arij' (R1 C <-'aar') + 'fie' (R2 delete attach 'f' lengthen_V) + 'gie' (R2 delete attach 'g' lengthen_V) + 'tst' (R1 C <-'t') + 'dst' (R1 C <-'d') + ) + ) + + define Step_4 as + ( + ( [among ( (]) + 'ioneel' (R1 <-'ie') + 'atief' (R1 <-'eer') + 'baar' (R1 delete) + 'naar' (R1 V <-'n') + 'laar' (R1 V <-'l') + 'raar' (R1 V <-'r') + 'tant' (R1 <-'teer') + 'lijker' + 'lijkst' (R1 <-'lijk') + 'achtig' + 'achtiger' + 'achtigst'(R1 delete) + 'eriger' + 'erigst' + 'erig' + 'end' (R1 C delete lengthen_V) + ) + ) + or + ( [among ( (]) + 'iger' + 'igst' + 'ig' (R1 C delete lengthen_V) + ) + ) + ) + + define Step_7 as + ( + [among ( (]) + 'kt' (<-'k') + 'ft' (<-'f') + 'pt' (<-'p') + ) + ) + + define Step_6 as + ( + [among ( (]) + 'bb' (<-'b') + 'cc' (<-'c') + 'dd' (<-'d') + 'ff' (<-'f') + 'gg' (<-'g') + 'hh' (<-'h') + 'jj' (<-'j') + 'kk' (<-'k') + 'll' (<-'l') + 'mm' (<-'m') + 'nn' (<-'n') + 'pp' (<-'p') + 'qq' (<-'q') + 'rr' (<-'r') + 'ss' (<-'s') + 'tt' (<-'t') + 'vv' (<-'v') + 'ww' (<-'w') + 'xx' (<-'x') + 'zz' (<-'z') + 'v' (<-'f') + 'z' (<-'s') + ) + ) + + define Step_1c as + ( + [among ( (] R1 C) + 'd' (not ('n' R1) delete) + 't' (not ('h' R1) delete) + ) + ) +) + +define Lose_prefix as ( + ['ge'] test hop 3 (goto v goto non-v) + set GE_removed + delete +) + +define Lose_infix as ( + next + gopast (['ge']) test hop 3 (goto v goto non-v) + set GE_removed + delete +) + +define measure as ( + do ( + tolimit + setmark p1 + setmark p2 + ) + do( + repeat non-v atleast 1 ('ij' or v) non-v setmark p1 + repeat non-v atleast 1 ('ij' or v) non-v setmark p2 + ) + +) +define stem as ( + + unset Y_found + unset stemmed + do ( ['y'] <-'Y' set Y_found ) + do repeat(goto (v ['y'])<-'Y' set Y_found ) + + measure + + backwards ( + do (Step_1 set stemmed ) + do (Step_2 set stemmed ) + do (Step_3 set stemmed ) + do (Step_4 set stemmed ) + ) + unset GE_removed + do (Lose_prefix and measure) + backwards ( + do (GE_removed Step_1c) + ) + unset GE_removed + do (Lose_infix and measure) + backwards ( + do (GE_removed Step_1c) + ) + backwards ( + do (Step_7 set stemmed ) + do (stemmed or GE_removed Step_6) + ) + do(Y_found repeat(goto (['Y']) <-'y')) +) + diff --git a/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..3f69f1572 --- /dev/null +++ b/contrib/snowball/algorithms/lovins/stem_ISO_8859_1.sbl @@ -0,0 +1,208 @@ + +stringescapes {} + +routines ( + A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC + + endings + + undouble respell +) + +externals ( stem ) + +backwardmode ( + + /* Lovins' conditions A, B ... CC, as given in her Appendix B, where + a test for a two letter prefix ('test hop 2') is implicitly + assumed. Note that 'e' next 'u' corresponds to her u*e because + Snowball is scanning backwards. */ + + define A as ( hop 2 ) + define B as ( hop 3 ) + define C as ( hop 4 ) + define D as ( hop 5 ) + define E as ( test hop 2 not 'e' ) + define F as ( test hop 3 not 'e' ) + define G as ( test hop 3 'f' ) + define H as ( test hop 2 't' or 'll' ) + define I as ( test hop 2 not 'o' not 'e' ) + define J as ( test hop 2 not 'a' not 'e' ) + define K as ( test hop 3 'l' or 'i' or ('e' next 'u') ) + define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') ) + define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' ) + define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) ) + define O as ( test hop 2 'l' or 'i' ) + define P as ( test hop 2 not 'c' ) + define Q as ( test hop 2 test hop 3 not 'l' not 'n' ) + define R as ( test hop 2 'n' or 'r' ) + define S as ( test hop 2 'dr' or ('t' not 't') ) + define T as ( test hop 2 's' or ('t' not 'o') ) + define U as ( test hop 2 'l' or 'm' or 'n' or 'r' ) + define V as ( test hop 2 'c' ) + define W as ( test hop 2 not 's' not 'u' ) + define X as ( test hop 2 'l' or 'i' or ('e' next 'u') ) + define Y as ( test hop 2 'in' ) + define Z as ( test hop 2 not 'f' ) + define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or' + 'es' 't' ) ) + define BB as ( test hop 3 not 'met' not 'ryst' ) + define CC as ( test hop 2 'l' ) + + + /* The system of endings, as given in Appendix A. */ + + define endings as ( + [substring] among( + 'alistically' B 'arizability' A 'izationally' B + + 'antialness' A 'arisations' A 'arizations' A 'entialness' A + + 'allically' C 'antaneous' A 'antiality' A 'arisation' A + 'arization' A 'ationally' B 'ativeness' A 'eableness' E + 'entations' A 'entiality' A 'entialize' A 'entiation' A + 'ionalness' A 'istically' A 'itousness' A 'izability' A + 'izational' A + + 'ableness' A 'arizable' A 'entation' A 'entially' A + 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A + 'ionality' A 'ionalize' A 'iousness' A 'izations' A + 'lessness' A + + 'ability' A 'aically' A 'alistic' B 'alities' A + 'ariness' E 'aristic' A 'arizing' A 'ateness' A + 'atingly' A 'ational' B 'atively' A 'ativism' A + 'elihood' E 'encible' A 'entally' A 'entials' A + 'entiate' A 'entness' A 'fulness' A 'ibility' A + 'icalism' A 'icalist' A 'icality' A 'icalize' A + 'ication' G 'icianry' A 'ination' A 'ingness' A + 'ionally' A 'isation' A 'ishness' A 'istical' A + 'iteness' A 'iveness' A 'ivistic' A 'ivities' A + 'ization' F 'izement' A 'oidally' A 'ousness' A + + 'aceous' A 'acious' B 'action' G 'alness' A + 'ancial' A 'ancies' A 'ancing' B 'ariser' A + 'arized' A 'arizer' A 'atable' A 'ations' B + 'atives' A 'eature' Z 'efully' A 'encies' A + 'encing' A 'ential' A 'enting' C 'entist' A + 'eously' A 'ialist' A 'iality' A 'ialize' A + 'ically' A 'icance' A 'icians' A 'icists' A + 'ifully' A 'ionals' A 'ionate' D 'ioning' A + 'ionist' A 'iously' A 'istics' A 'izable' E + 'lessly' A 'nesses' A 'oidism' A + + 'acies' A 'acity' A 'aging' B 'aical' A + 'alist' A 'alism' B 'ality' A 'alize' A + 'allic'BB 'anced' B 'ances' B 'antic' C + 'arial' A 'aries' A 'arily' A 'arity' B + 'arize' A 'aroid' A 'ately' A 'ating' I + 'ation' B 'ative' A 'ators' A 'atory' A + 'ature' E 'early' Y 'ehood' A 'eless' A + 'elity' A 'ement' A 'enced' A 'ences' A + 'eness' E 'ening' E 'ental' A 'ented' C + 'ently' A 'fully' A 'ially' A 'icant' A + 'ician' A 'icide' A 'icism' A 'icist' A + 'icity' A 'idine' I 'iedly' A 'ihood' A + 'inate' A 'iness' A 'ingly' B 'inism' J + 'inity'CC 'ional' A 'ioned' A 'ished' A + 'istic' A 'ities' A 'itous' A 'ively' A + 'ivity' A 'izers' F 'izing' F 'oidal' A + 'oides' A 'otide' A 'ously' A + + 'able' A 'ably' A 'ages' B 'ally' B + 'ance' B 'ancy' B 'ants' B 'aric' A + 'arly' K 'ated' I 'ates' A 'atic' B + 'ator' A 'ealy' Y 'edly' E 'eful' A + 'eity' A 'ence' A 'ency' A 'ened' E + 'enly' E 'eous' A 'hood' A 'ials' A + 'ians' A 'ible' A 'ibly' A 'ical' A + 'ides' L 'iers' A 'iful' A 'ines' M + 'ings' N 'ions' B 'ious' A 'isms' B + 'ists' A 'itic' H 'ized' F 'izer' F + 'less' A 'lily' A 'ness' A 'ogen' A + 'ward' A 'wise' A 'ying' B 'yish' A + + 'acy' A 'age' B 'aic' A 'als'BB + 'ant' B 'ars' O 'ary' F 'ata' A + 'ate' A 'eal' Y 'ear' Y 'ely' E + 'ene' E 'ent' C 'ery' E 'ese' A + 'ful' A 'ial' A 'ian' A 'ics' A + 'ide' L 'ied' A 'ier' A 'ies' P + 'ily' A 'ine' M 'ing' N 'ion' Q + 'ish' C 'ism' B 'ist' A 'ite'AA + 'ity' A 'ium' A 'ive' A 'ize' F + 'oid' A 'one' R 'ous' A + + 'ae' A 'al'BB 'ar' X 'as' B + 'ed' E 'en' F 'es' E 'ia' A + 'ic' A 'is' A 'ly' B 'on' S + 'or' T 'um' U 'us' V 'yl' R + '{'}s' A 's{'}' A + + 'a' A 'e' A 'i' A 'o' A + 's' W 'y' B + + (delete) + ) + ) + + /* Undoubling is rule 1 of appendix C. */ + + define undouble as ( + test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' + 'tt') + [next] delete + ) + + /* The other appendix C rules can be done together. */ + + define respell as ( + [substring] among ( + 'iev' (<-'ief') + 'uct' (<-'uc') + 'umpt' (<-'um') + 'rpt' (<-'rb') + 'urs' (<-'ur') + 'istr' (<-'ister') + 'metr' (<-'meter') + 'olv' (<-'olut') + 'ul' (not 'a' not 'i' not 'o' <-'l') + 'bex' (<-'bic') + 'dex' (<-'dic') + 'pex' (<-'pic') + 'tex' (<-'tic') + 'ax' (<-'ac') + 'ex' (<-'ec') + 'ix' (<-'ic') + 'lux' (<-'luc') + 'uad' (<-'uas') + 'vad' (<-'vas') + 'cid' (<-'cis') + 'lid' (<-'lis') + 'erid' (<-'eris') + 'pand' (<-'pans') + 'end' (not 's' <-'ens') + 'ond' (<-'ons') + 'lud' (<-'lus') + 'rud' (<-'rus') + 'her' (not 'p' not 't' <-'hes') + 'mit' (<-'mis') + 'ent' (not 'm' <-'ens') + /* 'ent' was 'end' in the 1968 paper - a typo. */ + 'ert' (<-'ers') + 'et' (not 'n' <-'es') + 'yt' (<-'ys') + 'yz' (<-'ys') + ) + ) +) + +define stem as ( + + backwards ( + do endings + do undouble + do respell + ) +) + diff --git a/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..94a071653 --- /dev/null +++ b/contrib/snowball/algorithms/norwegian/stem_ISO_8859_1.sbl @@ -0,0 +1,80 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef ae hex 'E6' +stringdef ao hex 'E5' +stringdef o/ hex 'F8' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'bcdfghjlmnoprtvyz' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' + 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' + 'hetens' 'ers' 'ets' 'et' 'het' 'ast' + (delete) + 's' + (s_ending or ('k' non-v) delete) + 'erte' 'ert' + (<-'er') + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'dt' 'vt' + ) + ) + next] delete + ) + + define other_suffix as ( + setlimit tomark p1 for ([substring]) + among( + 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' + 'hetslov' + (delete) + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..f57483354 --- /dev/null +++ b/contrib/snowball/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,80 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef ae hex '91' +stringdef ao hex '86' +stringdef o/ hex '9B' + +define v 'aeiouy{ae}{ao}{o/}' + +define s_ending 'bcdfghjlmnoprtvyz' + +define mark_regions as ( + + $p1 = limit + + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' + 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' + 'hetens' 'ers' 'ets' 'et' 'het' 'ast' + (delete) + 's' + (s_ending or ('k' non-v) delete) + 'erte' 'ert' + (<-'er') + ) + ) + + define consonant_pair as ( + test ( + setlimit tomark p1 for ([substring]) + among( + 'dt' 'vt' + ) + ) + next] delete + ) + + define other_suffix as ( + setlimit tomark p1 for ([substring]) + among( + 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' + 'hetslov' + (delete) + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..9533b7932 --- /dev/null +++ b/contrib/snowball/algorithms/porter/stem_ISO_8859_1.sbl @@ -0,0 +1,139 @@ +integers ( p1 p2 ) +booleans ( Y_found ) + +routines ( + shortv + R1 R2 + Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b +) + +externals ( stem ) + +groupings ( v v_WXY ) + +define v 'aeiouy' +define v_WXY v + 'wxY' + +backwardmode ( + + define shortv as ( non-v_WXY v non-v ) + + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define Step_1a as ( + [substring] among ( + 'sses' (<-'ss') + 'ies' (<-'i') + 'ss' () + 's' (delete) + ) + ) + + define Step_1b as ( + [substring] among ( + 'eed' (R1 <-'ee') + 'ed' + 'ing' ( + test gopast v delete + test substring among( + 'at' 'bl' 'iz' + (<+ 'e') + 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' + // ignoring double c, h, j, k, q, v, w, and x + ([next] delete) + '' (atmark p1 test shortv <+ 'e') + ) + ) + ) + ) + + define Step_1c as ( + ['y' or 'Y'] + gopast v + <-'i' + ) + + define Step_2 as ( + [substring] R1 among ( + 'tional' (<-'tion') + 'enci' (<-'ence') + 'anci' (<-'ance') + 'abli' (<-'able') + 'entli' (<-'ent') + 'eli' (<-'e') + 'izer' 'ization' + (<-'ize') + 'ational' 'ation' 'ator' + (<-'ate') + 'alli' (<-'al') + 'alism' 'aliti' + (<-'al') + 'fulness' (<-'ful') + 'ousli' 'ousness' + (<-'ous') + 'iveness' 'iviti' + (<-'ive') + 'biliti' (<-'ble') + ) + ) + + define Step_3 as ( + [substring] R1 among ( + 'alize' (<-'al') + 'icate' 'iciti' 'ical' + (<-'ic') + 'ative' 'ful' 'ness' + (delete) + ) + ) + + define Step_4 as ( + [substring] R2 among ( + 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' + 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' + (delete) + 'ion' ('s' or 't' delete) + ) + ) + + define Step_5a as ( + ['e'] + R2 or (R1 not shortv) + delete + ) + + define Step_5b as ( + ['l'] + R2 'l' + delete + ) +) + +define stem as ( + + unset Y_found + do ( ['y'] <-'Y' set Y_found) + do repeat(goto (v ['y']) <-'Y' set Y_found) + + $p1 = limit + $p2 = limit + do( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) + + backwards ( + do Step_1a + do Step_1b + do Step_1c + do Step_2 + do Step_3 + do Step_4 + do Step_5a + do Step_5b + ) + + do(Y_found repeat(goto (['Y']) <-'y')) + +) diff --git a/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..3e7da08d4 --- /dev/null +++ b/contrib/snowball/algorithms/portuguese/stem_ISO_8859_1.sbl @@ -0,0 +1,218 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + verb_suffix + residual_suffix + residual_form +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a' hex 'E1' // a-acute +stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico +stringdef e' hex 'E9' // e-acute +stringdef e^ hex 'EA' // e-circumflex +stringdef i' hex 'ED' // i-acute +stringdef o^ hex 'F4' // o-circumflex +stringdef o' hex 'F3' // o-acute +stringdef u' hex 'FA' // u-acute +stringdef c, hex 'E7' // c-cedilla + +stringdef a~ hex 'E3' // a-tilde +stringdef o~ hex 'F5' // o-tilde + + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' + +define prelude as repeat ( + [substring] among( + '{a~}' (<- 'a~') + '{o~}' (<- 'o~') + '' (next) + ) //or next +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + 'a~' (<- '{a~}') + 'o~' (<- '{o~}') + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'eza' 'ezas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + '{a'}vel' + '{i'}vel' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amento' 'amentos' + 'imento' 'imentos' + + 'adora' 'ador' 'a{c,}a~o' + 'adoras' 'adores' 'a{c,}o~es' // no -ic test + 'ante' 'antes' '{a^}ncia' // Note 1 + ( + R2 delete + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + '{e^}ncia' '{e^}ncias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'avel' + '{i'}vel' (R2 delete) + ) + ) + ) + 'idade' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + 'ira' 'iras' + ( + RV 'e' // -eira -eiras usually non-verbal + <- 'ir' + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' + 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' + 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' + 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' + 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' + 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' + 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' + 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' + 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' + 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' + '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' + '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' + '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' + 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' + 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' + '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' + + 'ira' 'iras' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + ) + ) + + define residual_form as ( + [substring] among( + 'e' '{e'}' '{e^}' + ( RV delete [('u'] test 'g') or + ('i'] test 'c') RV delete ) + '{c,}' (<-'c') + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do ( + ( ( standard_suffix or verb_suffix ) + and do ( ['i'] test 'c' RV delete ) + ) + or residual_suffix + ) + do residual_form + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..4d6c85214 --- /dev/null +++ b/contrib/snowball/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,218 @@ +routines ( + prelude postlude mark_regions + RV R1 R2 + standard_suffix + verb_suffix + residual_suffix + residual_form +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a' hex 'A0' // a-acute +stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico +stringdef e' hex '82' // e-acute +stringdef e^ hex '88' // e-circumflex +stringdef i' hex 'A1' // i-acute +stringdef o^ hex '93' // o-circumflex +stringdef o' hex 'A2' // o-acute +stringdef u' hex 'A3' // u-acute +stringdef c, hex '87' // c-cedilla + +stringdef a~ hex 'C6' // a-tilde +stringdef o~ hex 'E4' // o-tilde + + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' + +define prelude as repeat ( + [substring] among( + '{a~}' (<- 'a~') + '{o~}' (<- 'o~') + '' (next) + ) //or next +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + 'a~' (<- '{a~}') + 'o~' (<- '{o~}') + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define standard_suffix as ( + [substring] among( + + 'eza' 'ezas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + '{a'}vel' + '{i'}vel' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amento' 'amentos' + 'imento' 'imentos' + + 'adora' 'ador' 'a{c,}a~o' + 'adoras' 'adores' 'a{c,}o~es' // no -ic test + 'ante' 'antes' '{a^}ncia' // Note 1 + ( + R2 delete + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + '{e^}ncia' '{e^}ncias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'avel' + '{i'}vel' (R2 delete) + ) + ) + ) + 'idade' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + 'ira' 'iras' + ( + RV 'e' // -eira -eiras usually non-verbal + <- 'ir' + ) + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' + 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste' + 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' + 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' + 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' + 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' + 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' + 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' + 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' + 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' + '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' + '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' + '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' + 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' + 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' + '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' + + 'ira' 'iras' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + ) + ) + + define residual_form as ( + [substring] among( + 'e' '{e'}' '{e^}' + ( RV delete [('u'] test 'g') or + ('i'] test 'c') RV delete ) + '{c,}' (<-'c') + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do ( + ( ( standard_suffix or verb_suffix ) + and do ( ['i'] test 'c' RV delete ) + ) + or residual_suffix + ) + do residual_form + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl new file mode 100644 index 000000000..48a148321 --- /dev/null +++ b/contrib/snowball/algorithms/romanian/stem_ISO_8859_2.sbl @@ -0,0 +1,236 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + step_0 + standard_suffix combo_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +booleans ( standard_suffix_removed ) + +stringescapes {} + +/* special characters */ + +stringdef a^ hex 'E2' // a circumflex +stringdef i^ hex 'EE' // i circumflex +stringdef a+ hex 'E3' // a breve +stringdef s, hex 'BA' // s cedilla +stringdef t, hex 'FE' // t cedilla + +define v 'aeiou{a^}{i^}{a+}' + +define prelude as ( + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define step_0 as ( + [substring] R1 among( + 'ul' 'ului' + ( delete ) + 'aua' + ( <-'a' ) + 'ea' 'ele' 'elor' + ( <-'e' ) + 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' + ( <-'i') + 'ile' + ( not 'ab' <- 'i' ) + 'atei' + ( <- 'at' ) + 'a{t,}ie' 'a{t,}ia' + ( <- 'a{t,}i' ) + ) + ) + + define combo_suffix as test ( + [substring] R1 ( + among( + /* 'IST'. alternative: include the following + 'alism' 'alisme' + 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( + <- 'al' + ) + */ + 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( + <- 'abil' + ) + 'ibilitate' ( + <- 'ibil' + ) + 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( + <- 'iv' + ) + 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' + 'icator' 'icatori' + 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' + 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( + <- 'ic' + ) + 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' + 'atoare' 'ator' 'atori' + '{a+}toare' '{a+}tor' '{a+}tori' ( + <- 'at' + ) + 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' + 'itoare' 'itor' 'itori' ( + <- 'it' + ) + ) + set standard_suffix_removed + ) + ) + + define standard_suffix as ( + unset standard_suffix_removed + repeat combo_suffix + [substring] R2 ( + among( + + // past participle is treated here, rather than + // as a verb ending: + 'at' 'ata' 'at{a+}' 'ati' 'ate' + 'ut' 'uta' 'ut{a+}' 'uti' 'ute' + 'it' 'ita' 'it{a+}' 'iti' 'ite' + + 'ic' 'ica' 'ice' 'ici' 'ic{a+}' + 'abil' 'abila' 'abile' 'abili' 'abil{a+}' + 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' + 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' + 'ant' 'anta' 'ante' 'anti' 'ant{a+}' + 'ator' 'atori' + 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' + 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( + delete + ) + 'iune' 'iuni' ( + '{t,}'] <- 't' + ) + 'ism' 'isme' + 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( + <- 'ist' + /* 'IST'. alternative: remove with <- '' */ + ) + ) + set standard_suffix_removed + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + // 'long' infinitive: + 'are' 'ere' 'ire' '{a^}re' + + // gerund: + 'ind' '{a^}nd' + 'indu' '{a^}ndu' + + 'eze' + 'easc{a+}' + // present: + 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' + 'e{s,}te' + '{a+}sc' '{a+}{s,}ti' + '{a+}{s,}te' + + // imperfect: + 'am' 'ai' 'au' + 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' + 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' + + // past: // (not 'ii') + 'ui' + 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' + 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' + 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' + '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' + + // pluferfect: + 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' + 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' + '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' + '{a^}ser{a+}' + 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' + + ( non-v or 'u' delete ) + + // present: + '{a+}m' 'a{t,}i' + 'em' 'e{t,}i' + 'im' 'i{t,}i' + '{a^}m' '{a^}{t,}i' + + // past: + 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' + 'sei' 'se' + + // pluperfect: + 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' + (delete) + ) + ) + + define vowel_suffix as ( + [substring] RV among ( + 'a' 'e' 'i' 'ie' '{a+}' ( delete ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do step_0 + do standard_suffix + do ( standard_suffix_removed or verb_suffix ) + do vowel_suffix + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/romanian/stem_Unicode.sbl b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl new file mode 100644 index 000000000..09aec6429 --- /dev/null +++ b/contrib/snowball/algorithms/romanian/stem_Unicode.sbl @@ -0,0 +1,236 @@ + +routines ( + prelude postlude mark_regions + RV R1 R2 + step_0 + standard_suffix combo_suffix + verb_suffix + vowel_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +booleans ( standard_suffix_removed ) + +stringescapes {} + +/* special characters */ + +stringdef a^ hex '0E2' // a circumflex +stringdef i^ hex '0EE' // i circumflex +stringdef a+ hex '103' // a breve +stringdef s, hex '15F' // s cedilla +stringdef t, hex '163' // t cedilla + +define v 'aeiou{a^}{i^}{a+}' + +define prelude as ( + repeat goto ( + v [ ('u' ] v <- 'U') or + ('i' ] v <- 'I') + ) +) + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + + [substring] among( + 'I' (<- 'i') + 'U' (<- 'u') + '' (next) + ) + +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define step_0 as ( + [substring] R1 among( + 'ul' 'ului' + ( delete ) + 'aua' + ( <-'a' ) + 'ea' 'ele' 'elor' + ( <-'e' ) + 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' + ( <-'i') + 'ile' + ( not 'ab' <- 'i' ) + 'atei' + ( <- 'at' ) + 'a{t,}ie' 'a{t,}ia' + ( <- 'a{t,}i' ) + ) + ) + + define combo_suffix as test ( + [substring] R1 ( + among( + /* 'IST'. alternative: include the following + 'alism' 'alisme' + 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( + <- 'al' + ) + */ + 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( + <- 'abil' + ) + 'ibilitate' ( + <- 'ibil' + ) + 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( + <- 'iv' + ) + 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' + 'icator' 'icatori' + 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' + 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( + <- 'ic' + ) + 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' + 'atoare' 'ator' 'atori' + '{a+}toare' '{a+}tor' '{a+}tori' ( + <- 'at' + ) + 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' + 'itoare' 'itor' 'itori' ( + <- 'it' + ) + ) + set standard_suffix_removed + ) + ) + + define standard_suffix as ( + unset standard_suffix_removed + repeat combo_suffix + [substring] R2 ( + among( + + // past participle is treated here, rather than + // as a verb ending: + 'at' 'ata' 'at{a+}' 'ati' 'ate' + 'ut' 'uta' 'ut{a+}' 'uti' 'ute' + 'it' 'ita' 'it{a+}' 'iti' 'ite' + + 'ic' 'ica' 'ice' 'ici' 'ic{a+}' + 'abil' 'abila' 'abile' 'abili' 'abil{a+}' + 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' + 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' + 'ant' 'anta' 'ante' 'anti' 'ant{a+}' + 'ator' 'atori' + 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' + 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( + delete + ) + 'iune' 'iuni' ( + '{t,}'] <- 't' + ) + 'ism' 'isme' + 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( + <- 'ist' + /* 'IST'. alternative: remove with <- '' */ + ) + ) + set standard_suffix_removed + ) + ) + + define verb_suffix as setlimit tomark pV for ( + [substring] among( + // 'long' infinitive: + 'are' 'ere' 'ire' '{a^}re' + + // gerund: + 'ind' '{a^}nd' + 'indu' '{a^}ndu' + + 'eze' + 'easc{a+}' + // present: + 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' + 'e{s,}te' + '{a+}sc' '{a+}{s,}ti' + '{a+}{s,}te' + + // imperfect: + 'am' 'ai' 'au' + 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' + 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' + + // past: // (not 'ii') + 'ui' + 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' + 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' + 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' + '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' + + // pluferfect: + 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' + 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' + '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' + '{a^}ser{a+}' + 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' + + ( non-v or 'u' delete ) + + // present: + '{a+}m' 'a{t,}i' + 'em' 'e{t,}i' + 'im' 'i{t,}i' + '{a^}m' '{a^}{t,}i' + + // past: + 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' + 'sei' 'se' + + // pluperfect: + 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' + (delete) + ) + ) + + define vowel_suffix as ( + [substring] RV among ( + 'a' 'e' 'i' 'ie' '{a+}' ( delete ) + ) + ) +) + +define stem as ( + do prelude + do mark_regions + backwards ( + do step_0 + do standard_suffix + do ( standard_suffix_removed or verb_suffix ) + do vowel_suffix + ) + do postlude +) + diff --git a/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl new file mode 100644 index 000000000..cdacb19c4 --- /dev/null +++ b/contrib/snowball/algorithms/russian/stem_KOI8_R.sbl @@ -0,0 +1,217 @@ +stringescapes {} + +/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented + in Latin characters following the conventions of the standard Library + of Congress transliteration: */ + +stringdef a hex 'C1' +stringdef b hex 'C2' +stringdef v hex 'D7' +stringdef g hex 'C7' +stringdef d hex 'C4' +stringdef e hex 'C5' +stringdef zh hex 'D6' +stringdef z hex 'DA' +stringdef i hex 'C9' +stringdef i` hex 'CA' +stringdef k hex 'CB' +stringdef l hex 'CC' +stringdef m hex 'CD' +stringdef n hex 'CE' +stringdef o hex 'CF' +stringdef p hex 'D0' +stringdef r hex 'D2' +stringdef s hex 'D3' +stringdef t hex 'D4' +stringdef u hex 'D5' +stringdef f hex 'C6' +stringdef kh hex 'C8' +stringdef ts hex 'C3' +stringdef ch hex 'DE' +stringdef sh hex 'DB' +stringdef shch hex 'DD' +stringdef " hex 'DF' +stringdef y hex 'D9' +stringdef ' hex 'D8' +stringdef e` hex 'DC' +stringdef iu hex 'C0' +stringdef ia hex 'D1' + +routines ( mark_regions R2 + perfective_gerund + adjective + adjectival + reflexive + verb + noun + derivational + tidy_up +) + +externals ( stem ) + +integers ( pV p2 ) + +groupings ( v ) + +define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' + +define mark_regions as ( + + $pV = limit + $p2 = limit + do ( + gopast v setmark pV gopast non-v + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define perfective_gerund as ( + [substring] among ( + '{v}' + '{v}{sh}{i}' + '{v}{sh}{i}{s}{'}' + ('{a}' or '{ia}' delete) + '{i}{v}' + '{i}{v}{sh}{i}' + '{i}{v}{sh}{i}{s}{'}' + '{y}{v}' + '{y}{v}{sh}{i}' + '{y}{v}{sh}{i}{s}{'}' + (delete) + ) + ) + + define adjective as ( + [substring] among ( + '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' + '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' + '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' + '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' + '{ia}{ia}' + // and - + '{o}{iu}' // - which is somewhat archaic + '{e}{iu}' // - soft form of {o}{iu} + (delete) + ) + ) + + define adjectival as ( + adjective + + /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. + nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of + errors. Removing im, uem, enn creates too many errors. + */ + + try ( + [substring] among ( + '{e}{m}' // present passive participle + '{n}{n}' // adjective from past passive participle + '{v}{sh}' // past active participle + '{iu}{shch}' '{shch}' // present active participle + ('{a}' or '{ia}' delete) + + //but not '{i}{m}' '{u}{e}{m}' // present passive participle + //or '{e}{n}{n}' // adjective from past passive participle + + '{i}{v}{sh}' '{y}{v}{sh}'// past active participle + '{u}{iu}{shch}' // present active participle + (delete) + ) + ) + + ) + + define reflexive as ( + [substring] among ( + '{s}{ia}' + '{s}{'}' + (delete) + ) + ) + + define verb as ( + [substring] among ( + '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' + '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' + '{n}{y}' '{t}{'}' '{e}{sh}{'}' + + '{n}{n}{o}' + ('{a}' or '{ia}' delete) + + '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' + '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' + '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' + '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' + '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' + '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' + (delete) + /* note the short passive participle tests: + '{n}{a}' '{n}' '{n}{o}' '{n}{y}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' + */ + ) + ) + + define noun as ( + [substring] among ( + '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' + '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' + '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' + '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' + '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' + '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' + (delete) + /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' + omitted - they only occur on 12 words. + */ + ) + ) + + define derivational as ( + [substring] R2 among ( + '{o}{s}{t}' + '{o}{s}{t}{'}' + (delete) + ) + ) + + define tidy_up as ( + [substring] among ( + + '{e}{i`}{sh}' + '{e}{i`}{sh}{e}' // superlative forms + (delete + ['{n}'] '{n}' delete + ) + '{n}' + ('{n}' delete) // e.g. -nno endings + '{'}' + (delete) // with some slight false conflations + ) + ) +) + +define stem as ( + + do mark_regions + backwards setlimit tomark pV for ( + do ( + perfective_gerund or + ( try reflexive + adjectival or verb or noun + ) + ) + try([ '{i}' ] delete) + // because noun ending -i{iu} is being treated as verb ending -{iu} + + do derivational + do tidy_up + ) +) diff --git a/contrib/snowball/algorithms/russian/stem_Unicode.sbl b/contrib/snowball/algorithms/russian/stem_Unicode.sbl new file mode 100644 index 000000000..9e1a93f93 --- /dev/null +++ b/contrib/snowball/algorithms/russian/stem_Unicode.sbl @@ -0,0 +1,215 @@ +stringescapes {} + +/* the 32 Cyrillic letters in Unicode */ + +stringdef a hex '430' +stringdef b hex '431' +stringdef v hex '432' +stringdef g hex '433' +stringdef d hex '434' +stringdef e hex '435' +stringdef zh hex '436' +stringdef z hex '437' +stringdef i hex '438' +stringdef i` hex '439' +stringdef k hex '43A' +stringdef l hex '43B' +stringdef m hex '43C' +stringdef n hex '43D' +stringdef o hex '43E' +stringdef p hex '43F' +stringdef r hex '440' +stringdef s hex '441' +stringdef t hex '442' +stringdef u hex '443' +stringdef f hex '444' +stringdef kh hex '445' +stringdef ts hex '446' +stringdef ch hex '447' +stringdef sh hex '448' +stringdef shch hex '449' +stringdef " hex '44A' +stringdef y hex '44B' +stringdef ' hex '44C' +stringdef e` hex '44D' +stringdef iu hex '44E' +stringdef ia hex '44F' + +routines ( mark_regions R2 + perfective_gerund + adjective + adjectival + reflexive + verb + noun + derivational + tidy_up +) + +externals ( stem ) + +integers ( pV p2 ) + +groupings ( v ) + +define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' + +define mark_regions as ( + + $pV = limit + $p2 = limit + do ( + gopast v setmark pV gopast non-v + gopast v gopast non-v setmark p2 + ) +) + +backwardmode ( + + define R2 as $p2 <= cursor + + define perfective_gerund as ( + [substring] among ( + '{v}' + '{v}{sh}{i}' + '{v}{sh}{i}{s}{'}' + ('{a}' or '{ia}' delete) + '{i}{v}' + '{i}{v}{sh}{i}' + '{i}{v}{sh}{i}{s}{'}' + '{y}{v}' + '{y}{v}{sh}{i}' + '{y}{v}{sh}{i}{s}{'}' + (delete) + ) + ) + + define adjective as ( + [substring] among ( + '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' + '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' + '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' + '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' + '{ia}{ia}' + // and - + '{o}{iu}' // - which is somewhat archaic + '{e}{iu}' // - soft form of {o}{iu} + (delete) + ) + ) + + define adjectival as ( + adjective + + /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. + nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of + errors. Removing im, uem, enn creates too many errors. + */ + + try ( + [substring] among ( + '{e}{m}' // present passive participle + '{n}{n}' // adjective from past passive participle + '{v}{sh}' // past active participle + '{iu}{shch}' '{shch}' // present active participle + ('{a}' or '{ia}' delete) + + //but not '{i}{m}' '{u}{e}{m}' // present passive participle + //or '{e}{n}{n}' // adjective from past passive participle + + '{i}{v}{sh}' '{y}{v}{sh}'// past active participle + '{u}{iu}{shch}' // present active participle + (delete) + ) + ) + + ) + + define reflexive as ( + [substring] among ( + '{s}{ia}' + '{s}{'}' + (delete) + ) + ) + + define verb as ( + [substring] among ( + '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' + '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' + '{n}{y}' '{t}{'}' '{e}{sh}{'}' + + '{n}{n}{o}' + ('{a}' or '{ia}' delete) + + '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' + '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' + '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' + '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' + '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' + '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' + (delete) + /* note the short passive participle tests: + '{n}{a}' '{n}' '{n}{o}' '{n}{y}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' + */ + ) + ) + + define noun as ( + [substring] among ( + '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' + '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' + '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' + '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' + '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' + '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' + (delete) + /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' + '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' + omitted - they only occur on 12 words. + */ + ) + ) + + define derivational as ( + [substring] R2 among ( + '{o}{s}{t}' + '{o}{s}{t}{'}' + (delete) + ) + ) + + define tidy_up as ( + [substring] among ( + + '{e}{i`}{sh}' + '{e}{i`}{sh}{e}' // superlative forms + (delete + ['{n}'] '{n}' delete + ) + '{n}' + ('{n}' delete) // e.g. -nno endings + '{'}' + (delete) // with some slight false conflations + ) + ) +) + +define stem as ( + + do mark_regions + backwards setlimit tomark pV for ( + do ( + perfective_gerund or + ( try reflexive + adjectival or verb or noun + ) + ) + try([ '{i}' ] delete) + // because noun ending -i{iu} is being treated as verb ending -{iu} + + do derivational + do tidy_up + ) +) diff --git a/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..9dee289cc --- /dev/null +++ b/contrib/snowball/algorithms/spanish/stem_ISO_8859_1.sbl @@ -0,0 +1,230 @@ +routines ( + postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + y_verb_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a' hex 'E1' // a-acute +stringdef e' hex 'E9' // e-acute +stringdef i' hex 'ED' // i-acute +stringdef o' hex 'F3' // o-acute +stringdef u' hex 'FA' // u-acute +stringdef u" hex 'FC' // u-diaeresis +stringdef n~ hex 'F1' // n-tilde + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + '{i'}' (<- 'i') + '{o'}' (<- 'o') + '{u'}' (<- 'u') + // and possibly {u"}->u here, or in prelude + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' + 'las' 'les' 'los' 'nos' + ) + substring RV among( + 'i{e'}ndo' (] <- 'iendo') + '{a'}ndo' (] <- 'ando') + '{a'}r' (] <- 'ar') + '{e'}r' (] <- 'er') + '{i'}r' (] <- 'ir') + 'ando' + 'iendo' + 'ar' 'er' 'ir' + (delete) + 'yendo' ('u' delete) + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anzas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + 'able' 'ables' + 'ible' 'ibles' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amiento' 'amientos' + 'imiento' 'imientos' + ( + R2 delete + ) + 'adora' 'ador' 'aci{o'}n' + 'adoras' 'adores' 'aciones' + 'ante' 'antes' 'ancia' 'ancias'// Note 1 + ( + R2 delete + try ( ['ic'] R2 delete ) + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + 'encia' 'encias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'able' + 'ible' (R2 delete) + ) + ) + ) + 'idad' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + ) + ) + + define y_verb_suffix as ( + setlimit tomark pV for ([substring]) among( + 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' + 'yas' 'yes' 'yais' 'yamos' + ('u' delete) + ) + ) + + define verb_suffix as ( + setlimit tomark pV for ([substring]) among( + + 'en' 'es' '{e'}is' 'emos' + (try ('u' test 'g') ] delete) + + 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' + 'ar{e'}' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' + + 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' + 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + 'e' '{e'}' + ( RV delete try( ['u'] test 'g' RV delete ) ) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + y_verb_suffix or + verb_suffix + ) + do residual_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..db7a46201 --- /dev/null +++ b/contrib/snowball/algorithms/spanish/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,230 @@ +routines ( + postlude mark_regions + RV R1 R2 + attached_pronoun + standard_suffix + y_verb_suffix + verb_suffix + residual_suffix +) + +externals ( stem ) + +integers ( pV p1 p2 ) + +groupings ( v ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a' hex 'A0' // a-acute +stringdef e' hex '82' // e-acute +stringdef i' hex 'A1' // i-acute +stringdef o' hex 'A2' // o-acute +stringdef u' hex 'A3' // u-acute +stringdef u" hex '81' // u-diaeresis +stringdef n~ hex 'A4' // n-tilde + +define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' + +define mark_regions as ( + + $pV = limit + $p1 = limit + $p2 = limit // defaults + + do ( + ( v (non-v gopast v) or (v gopast non-v) ) + or + ( non-v (non-v gopast v) or (v next) ) + setmark pV + ) + do ( + gopast v gopast non-v setmark p1 + gopast v gopast non-v setmark p2 + ) +) + +define postlude as repeat ( + [substring] among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + '{i'}' (<- 'i') + '{o'}' (<- 'o') + '{u'}' (<- 'u') + // and possibly {u"}->u here, or in prelude + '' (next) + ) //or next +) + +backwardmode ( + + define RV as $pV <= cursor + define R1 as $p1 <= cursor + define R2 as $p2 <= cursor + + define attached_pronoun as ( + [substring] among( + 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' + 'las' 'les' 'los' 'nos' + ) + substring RV among( + 'i{e'}ndo' (] <- 'iendo') + '{a'}ndo' (] <- 'ando') + '{a'}r' (] <- 'ar') + '{e'}r' (] <- 'er') + '{i'}r' (] <- 'ir') + 'ando' + 'iendo' + 'ar' 'er' 'ir' + (delete) + 'yendo' ('u' delete) + ) + ) + + define standard_suffix as ( + [substring] among( + + 'anza' 'anzas' + 'ico' 'ica' 'icos' 'icas' + 'ismo' 'ismos' + 'able' 'ables' + 'ible' 'ibles' + 'ista' 'istas' + 'oso' 'osa' 'osos' 'osas' + 'amiento' 'amientos' + 'imiento' 'imientos' + ( + R2 delete + ) + 'adora' 'ador' 'aci{o'}n' + 'adoras' 'adores' 'aciones' + 'ante' 'antes' 'ancia' 'ancias'// Note 1 + ( + R2 delete + try ( ['ic'] R2 delete ) + ) + 'log{i'}a' + 'log{i'}as' + ( + R2 <- 'log' + ) + 'uci{o'}n' 'uciones' + ( + R2 <- 'u' + ) + 'encia' 'encias' + ( + R2 <- 'ente' + ) + 'amente' + ( + R1 delete + try ( + [substring] R2 delete among( + 'iv' (['at'] R2 delete) + 'os' + 'ic' + 'ad' + ) + ) + ) + 'mente' + ( + R2 delete + try ( + [substring] among( + 'ante' // Note 1 + 'able' + 'ible' (R2 delete) + ) + ) + ) + 'idad' + 'idades' + ( + R2 delete + try ( + [substring] among( + 'abil' + 'ic' + 'iv' (R2 delete) + ) + ) + ) + 'iva' 'ivo' + 'ivas' 'ivos' + ( + R2 delete + try ( + ['at'] R2 delete // but not a further ['ic'] R2 delete + ) + ) + ) + ) + + define y_verb_suffix as ( + setlimit tomark pV for ([substring]) among( + 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' + 'yas' 'yes' 'yais' 'yamos' + ('u' delete) + ) + ) + + define verb_suffix as ( + setlimit tomark pV for ([substring]) among( + + 'en' 'es' '{e'}is' 'emos' + (try ('u' test 'g') ] delete) + + 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' + 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' + 'ar{e'}' + 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' + 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' + 'er{e'}' + 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' + 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' + 'ir{e'}' + + 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' + 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' + 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' + 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' + 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' + 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' + 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' + 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' + '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' + (delete) + ) + ) + + define residual_suffix as ( + [substring] among( + 'os' + 'a' 'o' '{a'}' '{i'}' '{o'}' + ( RV delete ) + 'e' '{e'}' + ( RV delete try( ['u'] test 'g' RV delete ) ) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do attached_pronoun + do ( standard_suffix or + y_verb_suffix or + verb_suffix + ) + do residual_suffix + ) + do postlude +) + +/* + Note 1: additions of 15 Jun 2005 +*/ diff --git a/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl new file mode 100644 index 000000000..03ce1e22f --- /dev/null +++ b/contrib/snowball/algorithms/swedish/stem_ISO_8859_1.sbl @@ -0,0 +1,72 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in ISO Latin I) */ + +stringdef a" hex 'E4' +stringdef ao hex 'E5' +stringdef o" hex 'F6' + +define v 'aeiouy{a"}{ao}{o"}' + +define s_ending 'bcdfghjklmnoprtvy' + +define mark_regions as ( + + $p1 = limit + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' + 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' + 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' + 'hetens' 'erns' 'at' 'andet' 'het' 'ast' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as setlimit tomark p1 for ( + among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') + and ([next] delete) + ) + + define other_suffix as setlimit tomark p1 for ( + [substring] among( + 'lig' 'ig' 'els' (delete) + 'l{o"}st' (<-'l{o"}s') + 'fullt' (<-'full') + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl new file mode 100644 index 000000000..1631f401a --- /dev/null +++ b/contrib/snowball/algorithms/swedish/stem_MS_DOS_Latin_I.sbl @@ -0,0 +1,72 @@ +routines ( + mark_regions + main_suffix + consonant_pair + other_suffix +) + +externals ( stem ) + +integers ( p1 x ) + +groupings ( v s_ending ) + +stringescapes {} + +/* special characters (in MS-DOS Latin I) */ + +stringdef a" hex '84' +stringdef ao hex '86' +stringdef o" hex '94' + +define v 'aeiouy{a"}{ao}{o"}' + +define s_ending 'bcdfghjklmnoprtvy' + +define mark_regions as ( + + $p1 = limit + test ( hop 3 setmark x ) + goto v gopast non-v setmark p1 + try ( $p1 < x $p1 = x ) +) + +backwardmode ( + + define main_suffix as ( + setlimit tomark p1 for ([substring]) + among( + + 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' + 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' + 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' + 'hetens' 'erns' 'at' 'andet' 'het' 'ast' + (delete) + 's' + (s_ending delete) + ) + ) + + define consonant_pair as setlimit tomark p1 for ( + among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') + and ([next] delete) + ) + + define other_suffix as setlimit tomark p1 for ( + [substring] among( + 'lig' 'ig' 'els' (delete) + 'l{o"}st' (<-'l{o"}s') + 'fullt' (<-'full') + ) + ) +) + +define stem as ( + + do mark_regions + backwards ( + do main_suffix + do consonant_pair + do other_suffix + ) +) diff --git a/contrib/snowball/algorithms/turkish/stem_Unicode.sbl b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl new file mode 100644 index 000000000..16c02a51f --- /dev/null +++ b/contrib/snowball/algorithms/turkish/stem_Unicode.sbl @@ -0,0 +1,477 @@ +/* Stemmer for Turkish + * author: Evren (Kapusuz) Çilden + * email: evren.kapusuz at gmail.com + * version: 1.0 (15.01.2007) + + + * stems nominal verb suffixes + * stems nominal inflections + * more than one syllable word check + * (y,n,s,U) context check + * vowel harmony check + * last consonant check and conversion (b, c, d, ğ to p, ç, t, k) + + * The stemming algorithm is based on the paper "An Affix Stripping + * Morphological Analyzer for Turkish" by Gülşen Eryiğit and + * Eşref Adalı (Proceedings of the IAESTED International Conference + * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, + * Innsbruck, Austria + + * Turkish is an agglutinative language and has a very rich morphological + * structure. In Turkish, you can form many different words from a single stem + * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means + * "You had been the doctor of him". The stem of the word is "doktor" and it + * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about + * the append order of suffixes can be clearly described as FSMs. + * The paper referenced above defines some FSMs for right to left + * morphological analysis. I generated a method for constructing snowball + * expressions from right to left FSMs for stemming suffixes. +*/ + +routines ( + append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings + check_vowel_harmony // tests vowel harmony for suffixes + is_reserved_word // tests whether current string is a reserved word ('ad','soyad') + mark_cAsInA // nominal verb suffix + mark_DA // noun suffix + mark_DAn // noun suffix + mark_DUr // nominal verb suffix + mark_ki // noun suffix + mark_lAr // noun suffix, nominal verb suffix + mark_lArI // noun suffix + mark_nA // noun suffix + mark_ncA // noun suffix + mark_ndA // noun suffix + mark_ndAn // noun suffix + mark_nU // noun suffix + mark_nUn // noun suffix + mark_nUz // nominal verb suffix + mark_sU // noun suffix + mark_sUn // nominal verb suffix + mark_sUnUz // nominal verb suffix + mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, + mark_yA // noun suffix + mark_ylA // noun suffix + mark_yU // noun suffix + mark_yUm // nominal verb suffix + mark_yUz // nominal verb suffix + mark_yDU // nominal verb suffix + mark_yken // nominal verb suffix + mark_ymUs_ // nominal verb suffix + mark_ysA // nominal verb suffix + + mark_suffix_with_optional_y_consonant + mark_suffix_with_optional_U_vowel + mark_suffix_with_optional_n_consonant + mark_suffix_with_optional_s_consonant + + more_than_one_syllable_word + + post_process_last_consonants + postlude + + stem_nominal_verb_suffixes + stem_noun_suffixes + stem_suffix_chain_before_ki +) + +/* Special characters in Unicode Latin-1 and Latin Extended-A */ +stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA +stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE +stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT +stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS +stringdef s. hex '015F' // LATIN SMALL LETTER S WITH CEDILLA +stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS + +stringescapes { } + +integers ( strlen ) // length of a string + +booleans ( continue_stemming_noun_suffixes ) + +groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) + +define vowel 'ae{i'}io{o"}u{u"}' +define U '{i'}iu{u"}' + +// the vowel grouping definitions below are used for checking vowel harmony +define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' +define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' +define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' +define vowel4 'ei' // vowels that can end with suffixes containing 'i' +define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' +define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' + +externals ( stem ) + +backwardmode ( + // checks vowel harmony for possible suffixes, + // helps to detect whether the candidate for suffix applies to vowel harmony + // this rule is added to prevent over stemming + define check_vowel_harmony as ( + test + ( + (goto vowel) // if there is a vowel + ( + ('a' goto vowel1) or + ('e' goto vowel2) or + ('{i'}' goto vowel3) or + ('i' goto vowel4) or + ('o' goto vowel5) or + ('{o"}' goto vowel6) or + ('u' goto vowel5) or + ('{u"}' goto vowel6) + ) + ) + ) + + // if the last consonant before suffix is vowel and n then advance and delete + // if the last consonant before suffix is non vowel and n do nothing + // if the last consonant before suffix is not n then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_n_consonant as ( + ((test 'n') next (test vowel)) + or + ((not(test 'n')) test(next (test vowel))) + + ) + + // if the last consonant before suffix is vowel and s then advance and delete + // if the last consonant before suffix is non vowel and s do nothing + // if the last consonant before suffix is not s then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_s_consonant as ( + ((test 's') next (test vowel)) + or + ((not(test 's')) test(next (test vowel))) + ) + + // if the last consonant before suffix is vowel and y then advance and delete + // if the last consonant before suffix is non vowel and y do nothing + // if the last consonant before suffix is not y then only delete the suffix + // assumption: slice beginning is set correctly + define mark_suffix_with_optional_y_consonant as ( + ((test 'y') next (test vowel)) + or + ((not(test 'y')) test(next (test vowel))) + ) + + define mark_suffix_with_optional_U_vowel as ( + ((test U) next (test non-vowel)) + or + ((not(test U)) test(next (test non-vowel))) + + ) + + define mark_possessives as ( + among ('m{i'}z' 'miz' 'muz' 'm{u"}z' + 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') + (mark_suffix_with_optional_U_vowel) + ) + + define mark_sU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_s_consonant) + ) + + define mark_lArI as ( + among ('leri' 'lar{i'}') + ) + + define mark_yU as ( + check_vowel_harmony + U + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nU as ( + check_vowel_harmony + among ('n{i'}' 'ni' 'nu' 'n{u"}') + ) + + define mark_nUn as ( + check_vowel_harmony + among ('{i'}n' 'in' 'un' '{u"}n') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yA as ( + check_vowel_harmony + among('a' 'e') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_nA as ( + check_vowel_harmony + among('na' 'ne') + ) + + define mark_DA as ( + check_vowel_harmony + among('da' 'de' 'ta' 'te') + ) + + define mark_ndA as ( + check_vowel_harmony + among('nda' 'nde') + ) + + define mark_DAn as ( + check_vowel_harmony + among('dan' 'den' 'tan' 'ten') + ) + + define mark_ndAn as ( + check_vowel_harmony + among('ndan' 'nden') + ) + + define mark_ylA as ( + check_vowel_harmony + among('la' 'le') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ki as ( + 'ki' + ) + + define mark_ncA as ( + check_vowel_harmony + among('ca' 'ce') + (mark_suffix_with_optional_n_consonant) + ) + + define mark_yUm as ( + check_vowel_harmony + among ('{i'}m' 'im' 'um' '{u"}m') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUn as ( + check_vowel_harmony + among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) + ) + + define mark_yUz as ( + check_vowel_harmony + among ('{i'}z' 'iz' 'uz' '{u"}z') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_sUnUz as ( + among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') + ) + + define mark_lAr as ( + check_vowel_harmony + among ('ler' 'lar') + ) + + define mark_nUz as ( + check_vowel_harmony + among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') + ) + + define mark_DUr as ( + check_vowel_harmony + among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') + ) + + define mark_cAsInA as ( + among ('cas{i'}na' 'cesine') + ) + + define mark_yDU as ( + check_vowel_harmony + among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' + 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' + 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' + 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') + (mark_suffix_with_optional_y_consonant) + ) + + // does not fully obey vowel harmony + define mark_ysA as ( + among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_ymUs_ as ( + check_vowel_harmony + among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}') + (mark_suffix_with_optional_y_consonant) + ) + + define mark_yken as ( + 'ken' (mark_suffix_with_optional_y_consonant) + ) + + define stem_nominal_verb_suffixes as ( + [ + set continue_stemming_noun_suffixes + (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) + or + (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) + or + ( + mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) + unset continue_stemming_noun_suffixes + ) + or + (mark_nUz (mark_yDU or mark_ysA)) + or + ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) + or + (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) + ]delete + ) + + // stems noun suffix chains ending with -ki + define stem_suffix_chain_before_ki as ( + [ + mark_ki + ( + (mark_DA] delete try([ + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + + )) + or + (mark_nUn] delete try([ + (mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + or + (mark_ndA ( + (mark_lArI] delete) + or + ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) + or + (stem_suffix_chain_before_ki) + )) + ) + ) + + define stem_noun_suffixes as ( + ([mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + ([mark_ncA] delete + try( + ([mark_lArI] delete) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + ([mark_lAr] delete stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndA or mark_nA) + ( + (mark_lArI] delete) + or + (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + ) + ) + or + ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) + or + ( [mark_DAn] delete try ([ + ( + (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + (mark_lAr] delete try(stem_suffix_chain_before_ki)) + or + (stem_suffix_chain_before_ki) + )) + ) + or + ([mark_nUn or mark_ylA] delete + try( + ([mark_lAr] delete stem_suffix_chain_before_ki) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + or + stem_suffix_chain_before_ki + ) + ) + or + ([mark_lArI] delete) + or + (stem_suffix_chain_before_ki) + or + ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) + or + ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) + ) + + define post_process_last_consonants as ( + [substring] among ( + 'b' (<- 'p') + 'c' (<- '{c.}') + 'd' (<- 't') + '{g~}' (<- 'k') + ) + ) + + // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed + // like in 'kedim' -> 'ked' + // Turkish words don't usually end with 'd' or 'g' + // some very well known words are ignored (like 'ad' 'soyad' + // appends U to stems ending with d or g, decides which vowel to add + // based on the last vowel in the stem + define append_U_to_stems_ending_with_d_or_g as ( + test('d' or 'g') + (test((goto vowel) 'a' or '{i'}') <+ '{i'}') + or + (test((goto vowel) 'e' or 'i') <+ 'i') + or + (test((goto vowel) 'o' or 'u') <+ 'u') + or + (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') + ) + +) + +// Tests if there are more than one syllables +// In Turkish each vowel indicates a distinct syllable +define more_than_one_syllable_word as ( + test (atleast 2 (gopast vowel)) +) + +define is_reserved_word as ( + test(gopast 'ad' ($strlen = 2) ($strlen == limit)) + or + test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) +) + +define postlude as ( + not(is_reserved_word) + backwards ( + do append_U_to_stems_ending_with_d_or_g + do post_process_last_consonants + + ) +) + +define stem as ( + (more_than_one_syllable_word) + ( + backwards ( + do stem_nominal_verb_suffixes + continue_stemming_noun_suffixes + do stem_noun_suffixes + ) + + postlude + ) +) + + |