You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

stem_ISO_8859_1.sbl 5.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  /* Finnish stemmer.

     Numbers in square brackets refer to the sections in
     Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
     ISBN 0-415-20705-3
  */
  // Internal routines.  mark_regions is defined in forward mode; the rest
  // are defined inside the backwardmode section of this file.
  routines (
      mark_regions
      R2
      particle_etc
      possessive
      LONG
      VI
      case_ending
      i_plural
      t_plural
      other_endings
      tidy
  )

  externals ( stem )                    // the only routine exported to callers

  integers  ( p1 p2 )                   // region marks set by mark_regions
  strings   ( x )                       // scratch string used by tidy
  booleans  ( ending_removed )          // set by case_ending, tested by stem
  groupings ( AEI V1 V2 particle_end )

  stringescapes {}                      // {name} inside a literal expands a stringdef
  /* Special characters, given as ISO Latin 1 byte values */
  stringdef a"   hex 'E4'               // a with diaeresis
  stringdef o"   hex 'F6'               // o with diaeresis

  /* Character classes */
  define AEI           'a{a"}ei'        // vowels that tidy may drop from the end
  define V1            'aeiouy{a"}{o"}' // every vowel
  define V2            'aeiou{a"}{o"}'  // V1 without 'y'
  define particle_end  V1 + 'nt'        // characters allowed just before a particle
  // Set the region marks p1 and p2 used by the suffix-removal routines.
  //
  // Both marks start at the end of the word.  p1 is then moved to just past
  // the first non-vowel that follows a vowel, and p2 to the position found by
  // repeating the same scan from p1.  If a scan fails, the marks not yet set
  // keep the end-of-word default.
  define mark_regions as (
      $p2 = limit
      $p1 = limit

      goto V1  gopast non-V1  setmark p1
      goto V1  gopast non-V1  setmark p2
  )
  36. backwardmode (
  // Succeeds when the cursor is at or after the mark p2.
  define R2 as ( $p2 <= cursor )
  // Remove a clitic particle, or the adverb ending 'sti'.
  // The suffix is looked for only in the part of the word after p1.
  define particle_etc as (
      setlimit tomark p1 for ( [substring] )
      among(
          'sti'                         // Adverb [87]
              ( R2 )                    // the ending must also lie after p2

          'kin'
          'ko'    'k{o"}'
          'pa'    'p{a"}'
          'han'   'h{a"}n'
          'kaan'  'k{a"}{a"}n'          // Particles [91]
              ( particle_end )          // a vowel, 'n' or 't' must precede the particle
      )
      delete
  )
  // Remove a possessive suffix found after p1.                       [36]
  // Each alternative performs its own delete, so nothing is removed
  // unless that alternative's condition holds.
  define possessive as (
      setlimit tomark p1 for ( [substring] )
      among(
          'ni'
              // 'ni' is removed first; a stem then ending in 'kse' is
              // rewritten to 'ksi' (kseni = ksi + ni)
              ( delete  ['kse'] <- 'ksi' )

          'si'
              // keep 'ksi' intact: case_ending handles it as a case ending
              // (listed there as the Translative)
              ( not 'k'  delete )

          'mme'
          'nne'
          'nsa'  'ns{a"}'
              ( delete )

          /* Vn possessives: removed only when a case ending precedes them,
             and the case ending itself is left in place. [36] */
          'an'
              ( among( 'na' 'ta' 'lla' 'lta' 'ssa' 'sta' )  delete )

          '{a"}n'
              ( among( 'n{a"}'  't{a"}'
                       'll{a"}' 'lt{a"}'
                       'ss{a"}' 'st{a"}' )  delete )

          'en'
              ( among( 'ine' 'lle' )  delete )
      )
  )
  // Matches a doubled (long) vowel.
  define LONG as
      among(
          '{a"}{a"}'  '{o"}{o"}'
          'aa'  'ee'  'ii'  'oo'  'uu'
      )
  // Read backwards: an 'i', then a V2 vowel before it (the text reads V2 + 'i').
  define VI as ( 'i'  V2 )
  // Remove a case ending found after p1 and set ending_removed.
  //
  // Several endings are accepted only in a particular context, tested on the
  // text immediately before the ending:
  //   - 'hVn' illatives need the same vowel V before the 'h';
  //   - 'siin', 'den', 'tten' need VI, 'seen' needs LONG;
  //   - 'a' / a-umlaut needs a vowel preceded by a consonant letter;
  //   - 'tta' / 'tt' + a-umlaut needs a preceding 'e'.
  //
  // The consonant test for 'a' / a-umlaut is an explicit list of consonant
  // letters rather than "any non-vowel", so that a digit or punctuation mark
  // in that position does not qualify and tokens containing numbers are left
  // alone.  The list is written inline because this file declares no
  // consonant grouping.
  define case_ending as (
      setlimit tomark p1 for ([substring])
      among(
          'han'    ('a')          //-.
          'hen'    ('e')          // |
          'hin'    ('i')          // |
          'hon'    ('o')          // |
          'h{a"}n' ('{a"}')       //  Illative   [43]
          'h{o"}n' ('{o"}')       // |
          'siin'   VI             // |
          'seen'   LONG           //-'

          'den'    VI
          'tten'   VI             // Genitive plurals [34]
                   ()
          'n'                     // Genitive or Illative
              // After a long vowel or 'ie', step back one character and move
              // the left end of the slice there, so the delete below removes
              // that character together with the 'n'.
              ( try ( LONG        // Illative
                      or 'ie'     // Genitive
                        and next ]
                    )
                /* otherwise Genitive */
              )

          'a' '{a"}'              //-.
              ( V1                // |
                among( 'b' 'c' 'd' 'f' 'g' 'h' 'j' 'k' 'l' 'm'
                       'n' 'p' 'q' 'r' 's' 't' 'v' 'w' 'x' 'z' )
              )                   // |
          'tta' 'tt{a"}'          // Partitive  [32]
              ('e')               // |
          'ta' 't{a"}'            //-'

          'ssa' 'ss{a"}'          // Inessive   [41]
          'sta' 'st{a"}'          // Elative    [42]

          'lla' 'll{a"}'          // Adessive   [44]
          'lta' 'lt{a"}'          // Ablative   [51]
          'lle'                   // Allative   [46]
          'na' 'n{a"}'            // Essive     [49]
          'ksi'                   // Translative[50]
          'ine'                   // Comitative [51]

          /* Abessive and Instructive are too rare for
             inclusion [51] */
      )
      delete
      set ending_removed
  )
  // Remove comparative, superlative and agent endings found after p2.
  define other_endings as (
      setlimit tomark p2 for ( [substring] )
      among(
          'mma'   'mm{a"}'   'mmi'
          'mpa'   'mp{a"}'   'mpi'      // Comparative forms [85]
              ( not 'po' )              //-improves things

          'imma'  'imm{a"}'  'immi'
          'impa'  'imp{a"}'  'impi'     // Superlative forms [86]
          'eja'   'ej{a"}'              // indicates agent [93.1B]
      )
      delete
  )
  // Remove a plural marker 'i' or 'j' found after p1.                 [26]
  define i_plural as (
      setlimit tomark p1 for ( [substring] )
      among( 'j'  'i' )
      delete
  )
  // Remove a plural 't' and then, if present, a comparative or superlative
  // ending exposed by its removal.                                    [26]
  define t_plural as (
      // Step 1: a final 't' after p1 is removed when a vowel precedes it.
      // The vowel is only tested, not consumed.
      setlimit tomark p1 for ( ['t']  test V1  delete )

      // Step 2: with the 't' gone, look after p2 for what was -mmat / -immat.
      setlimit tomark p2 for ( [substring] )
      among(
          'mma'                         //-mmat endings
              ( not 'po' )
          'imma'                        //-immat endings
      )
      delete
  )
  // Final clean-up of the stem.  Each step is wrapped in 'do', so a step
  // that does not apply leaves the word as it is and the next step still runs.
  //
  // The trailing-vowel step requires a consonant *letter* before the vowel
  // instead of "any non-vowel", so that a vowel following a digit or
  // punctuation mark is kept and tokens containing numbers are left alone.
  // The letters are listed inline because this file declares no consonant
  // grouping.
  define tidy as (
      setlimit tomark p1 for (
          // undouble vowel: drop the last character of a long vowel
          do ( LONG and ( [next] delete ) )

          // remove trailing a, a", e, i when a consonant letter precedes it
          do ( [AEI]
               among( 'b' 'c' 'd' 'f' 'g' 'h' 'j' 'k' 'l' 'm'
                      'n' 'p' 'q' 'r' 's' 't' 'v' 'w' 'x' 'z' )
               delete
             )

          // remove a final 'j' that follows 'o' or 'u'
          do ( ['j'] 'o' or 'u' delete )

          // remove a final 'o' that follows 'j'
          do ( ['o'] 'j' delete )
      )

      // undouble consonant: go back to the nearest non-vowel, copy it to x,
      // and delete it if the character before it is the same
      goto non-V1 [next] -> x  x delete
  )
  156. )
  // Entry point: stem one word in place.
  //
  // Every stage is invoked with 'do', so a stage that finds nothing to remove
  // does not stop the later ones.
  define stem as (
      unset ending_removed
      do mark_regions

      backwards (
          do particle_etc
          do possessive
          do case_ending
          do other_endings

          // i_plural is attempted only when case_ending removed something;
          // otherwise t_plural is attempted instead
          ( ending_removed  do i_plural ) or do t_plural

          do tidy
      )
  )