123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196 |
-
- /* Finnish stemmer.
-
- Numbers in square brackets refer to the sections in
- Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
- ISBN 0-415-20705-3
-
- */
-
- // Routines defined in this file (each has a `define ... as` further down).
- routines (
- mark_regions
- R2
- particle_etc possessive
- LONG VI
- case_ending
- i_plural
- t_plural
- other_endings
- tidy
- )
-
- // `stem` is the only routine declared as external.
- externals ( stem )
-
- // p1, p2: marks set by mark_regions and used as limits by the suffix routines.
- integers ( p1 p2 )
- // x: scratch string used by tidy when undoubling a consonant.
- strings ( x )
- // ending_removed: set by case_ending, tested in stem to choose i_plural vs t_plural.
- booleans ( ending_removed )
- groupings ( AEI V1 V2 particle_end )
-
- // Inside string literals, {name} refers to a stringdef'd name.
- stringescapes {}
-
- /* special characters (in ISO Latin I) */
-
- // a" is a-umlaut (hex E4), o" is o-umlaut (hex F6).
- stringdef a" hex 'E4'
- stringdef o" hex 'F6'
-
- // AEI: the letters tidy may strip from the end of a word.
- define AEI 'a{a"}ei'
- // V1: the vowels, including y and the two umlauted vowels.
- define V1 'aeiouy{a"}{o"}'
- // V2: V1 without y.
- define V2 'aeiou{a"}{o"}'
- // particle_end: any V1 character, or 'n' or 't'.
- define particle_end V1 + 'nt'
-
- // Sets the marks p1 and p2 used as limits by the suffix-stripping routines.
- // Both start at the end of the word (limit). p1 is then moved to the position
- // just past the first non-V1 character that follows a V1 character; p2 is
- // found by repeating the same scan starting from p1.
- // If either scan fails, the routine fails and the affected mark(s) stay at
- // limit; stem invokes this through `do`, so the failure is ignored there.
- define mark_regions as (
-
- $p1 = limit
- $p2 = limit
-
- goto V1 gopast non-V1 setmark p1
- goto V1 gopast non-V1 setmark p2
- )
-
- backwardmode (
-
- // Succeeds when the cursor is at or after the mark p2.
- define R2 as $p2 <= cursor
-
- // Removes a trailing particle or the adverb ending 'sti'.
- // The suffix search is limited to the part of the word after p1, and the
- // matched suffix becomes the slice that `delete` removes.
- define particle_etc as (
- setlimit tomark p1 for ([substring])
- among(
- 'kin'
- 'kaan' 'k{a"}{a"}n'
- 'ko' 'k{o"}'
- 'han' 'h{a"}n'
- 'pa' 'p{a"}' // Particles [91]
- // condition for the particles above: a particle_end character must precede
- (particle_end)
- 'sti' // Adverb [87]
- // condition for 'sti': the suffix must start at or after p2
- (R2)
- )
- delete
- )
- // Removes a possessive suffix found in the part of the word after p1.
- // Unlike the other suffix routines, each among action performs its own
- // delete, because the conditions differ per suffix.
- define possessive as ( // [36]
- setlimit tomark p1 for ([substring])
- among(
- 'si'
- (not 'k' delete) // leave 'ksi' alone: case_ending treats it as the Translative case
- 'ni'
- // the suffix is deleted first; the rewrite applies only if 'kse' then precedes
- (delete ['kse'] <- 'ksi') // kseni = ksi + ni
- 'nsa' 'ns{a"}'
- 'mme'
- 'nne'
- (delete)
- /* Now for Vn possessives after case endings: [36] */
- // each Vn suffix is removed only when one of the listed case endings precedes it
- 'an'
- (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete)
- '{a"}n'
- (among('t{a"}' 'ss{a"}' 'st{a"}'
- 'll{a"}' 'lt{a"}' 'n{a"}') delete)
- 'en'
- (among('lle' 'ine') delete)
- )
- )
-
- // Matches one of the listed doubled vowels (note: 'yy' is not in the list).
- define LONG as
- among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
-
- // Matches 'i' and then a V2 character. This runs in backward mode, so in
- // the word as written the V2 letter comes immediately before the 'i'.
- define VI as ('i' V2)
-
- // Removes a case ending found in the part of the word after p1 and records
- // the removal by setting ending_removed.
- // A parenthesised command or routine name after a group of strings is a
- // condition on what precedes the ending; if it fails nothing is deleted.
- define case_ending as (
- setlimit tomark p1 for ([substring])
- among(
- // h + vowel + n: the same vowel must also precede the ending
- 'han' ('a') //-.
- 'hen' ('e') // |
- 'hin' ('i') // |
- 'hon' ('o') // |
- 'h{a"}n' ('{a"}') // Illative [43]
- 'h{o"}n' ('{o"}') // |
- 'siin' VI // |
- 'seen' LONG //-'
-
- 'den' VI
- 'tten' VI // Genitive plurals [34]
- ()
- 'n' // Genitive or Illative
- // when a long vowel or 'ie' precedes the 'n', `next ]` moves the slice
- // boundary one character further, so the delete below removes that
- // character as well; `try` keeps the plain 'n' removal otherwise
- ( try ( LONG // Illative
- or 'ie' // Genitive
- and next ]
- )
- /* otherwise Genitive */
- )
-
- // NOTE(review): non-V1 matches any character outside V1, not only
- // consonant letters - confirm this is intended for digits/punctuation.
- 'a' '{a"}' //-.
- (V1 non-V1) // |
- 'tta' 'tt{a"}' // Partitive [32]
- ('e') // |
- 'ta' 't{a"}' //-'
-
- 'ssa' 'ss{a"}' // Inessive [41]
- 'sta' 'st{a"}' // Elative [42]
-
- 'lla' 'll{a"}' // Adessive [44]
- 'lta' 'lt{a"}' // Ablative [51]
- 'lle' // Allative [46]
- 'na' 'n{a"}' // Essive [49]
- 'ksi' // Translative[50]
- 'ine' // Comitative [51]
-
- /* Abessive and Instructive are too rare for
- inclusion [51] */
-
- )
- delete
- set ending_removed
- )
- // Removes comparative, superlative and agent endings. The suffix search is
- // limited to the part of the word after p2 (not p1 as in the routines above).
- define other_endings as (
- setlimit tomark p2 for ([substring])
- among(
- 'mpi' 'mpa' 'mp{a"}'
- 'mmi' 'mma' 'mm{a"}' // Comparative forms [85]
- // the comparative endings are not removed when 'po' precedes them
- (not 'po') //-improves things
- 'impi' 'impa' 'imp{a"}'
- 'immi' 'imma' 'imm{a"}' // Superlative forms [86]
- 'eja' 'ej{a"}' // indicates agent [93.1B]
- )
- delete
- )
- // Removes a final 'i' or 'j' lying in the part of the word after p1.
- // stem runs this only when case_ending has set ending_removed.
- define i_plural as ( // [26]
- setlimit tomark p1 for ([substring])
- among(
- 'i' 'j'
- )
- delete
- )
- // Removes a final 't' that follows a V1 character, then removes an
- // 'mma' / 'imma' ending exposed by that deletion.
- // The first step is limited to the part of the word after p1, the second to
- // the part after p2. If the first step fails, the whole routine fails and
- // the second step is not attempted.
- define t_plural as ( // [26]
- setlimit tomark p1 for (
- // `test` checks for the vowel without moving the cursor
- ['t'] test V1
- delete
- )
- setlimit tomark p2 for ([substring])
- among(
- // 'mma' is kept when 'po' precedes it, as in other_endings
- 'mma' (not 'po') //-mmat endings
- 'imma' //-immat endings
- )
- delete
- )
- // Final clean-up of the stem. The four `do` steps are limited to the part of
- // the word after p1 and are independent: each one is attempted whether or
- // not the previous ones succeeded. The last line runs without that limit.
- define tidy as (
- setlimit tomark p1 for (
- do ( LONG and ([next] delete ) ) // undouble vowel
- // NOTE(review): non-V1 matches any character outside V1, not only
- // consonant letters - confirm this is intended for digits/punctuation.
- do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
- do ( ['j'] 'o' or 'u' delete ) // remove trailing j after o or u
- do ( ['o'] 'j' delete ) // remove trailing o after j
- )
- // scan back to the nearest non-V1 character, copy it into x, and delete it
- // if the same character also immediately precedes it
- goto non-V1 [next] -> x x delete // undouble consonant
- )
- )
-
- // Entry point (declared in `externals`). Computes the marks p1/p2, clears
- // ending_removed, then applies the suffix routines from the end of the word.
- // Every step is wrapped in `do`, so a step that fails does not stop the
- // later ones.
- define stem as (
-
- do mark_regions
- unset ending_removed
- backwards (
- do particle_etc
- do possessive
- do case_ending
- do other_endings
- // i_plural is tried only if case_ending removed something; otherwise t_plural
- (ending_removed do i_plural) or do t_plural
- do tidy
- )
- )
|