You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

stem_ISO_8859_1.sbl 6.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. routines (
  2. prelude postlude mark_regions
  3. RV R1 R2
  4. standard_suffix
  5. i_verb_suffix
  6. verb_suffix
  7. residual_suffix
  8. un_double
  9. un_accent
  10. )
  11. externals ( stem )
  12. integers ( pV p1 p2 )
  13. groupings ( v keep_with_s )
  14. stringescapes {}
  15. /* special characters (in ISO Latin I) */
  16. stringdef a^ hex 'E2' // a-circumflex
  17. stringdef a` hex 'E0' // a-grave
  18. stringdef c, hex 'E7' // c-cedilla
  19. stringdef e" hex 'EB' // e-diaeresis (rare)
  20. stringdef e' hex 'E9' // e-acute
  21. stringdef e^ hex 'EA' // e-circumflex
  22. stringdef e` hex 'E8' // e-grave
  23. stringdef i" hex 'EF' // i-diaeresis
  24. stringdef i^ hex 'EE' // i-circumflex
  25. stringdef o^ hex 'F4' // o-circumflex
  26. stringdef u^ hex 'FB' // u-circumflex
  27. stringdef u` hex 'F9' // u-grave
  28. define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
  29. define prelude as repeat goto (
  30. ( v [ ('u' ] v <- 'U') or
  31. ('i' ] v <- 'I') or
  32. ('y' ] <- 'Y')
  33. )
  34. or
  35. ( ['y'] v <- 'Y' )
  36. or
  37. ( 'q' ['u'] <- 'U' )
  38. )
  39. define mark_regions as (
  40. $pV = limit
  41. $p1 = limit
  42. $p2 = limit // defaults
  43. do (
  44. ( v v next )
  45. or
  46. among ( // this exception list begun Nov 2006
  47. 'par' // paris, parie, pari
  48. 'col' // colis
  49. 'tap' // tapis
  50. // extensions possible here
  51. )
  52. or
  53. ( next gopast v )
  54. setmark pV
  55. )
  56. do (
  57. gopast v gopast non-v setmark p1
  58. gopast v gopast non-v setmark p2
  59. )
  60. )
  61. define postlude as repeat (
  62. [substring] among(
  63. 'I' (<- 'i')
  64. 'U' (<- 'u')
  65. 'Y' (<- 'y')
  66. '' (next)
  67. )
  68. )
  69. backwardmode (
  70. define RV as $pV <= cursor
  71. define R1 as $p1 <= cursor
  72. define R2 as $p2 <= cursor
  73. define standard_suffix as (
  74. [substring] among(
  75. 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
  76. 'ances' 'iqUes' 'ismes' 'ables' 'istes'
  77. ( R2 delete )
  78. 'atrice' 'ateur' 'ation'
  79. 'atrices' 'ateurs' 'ations'
  80. ( R2 delete
  81. try ( ['ic'] (R2 delete) or <-'iqU' )
  82. )
  83. 'logie'
  84. 'logies'
  85. ( R2 <- 'log' )
  86. 'usion' 'ution'
  87. 'usions' 'utions'
  88. ( R2 <- 'u' )
  89. 'ence'
  90. 'ences'
  91. ( R2 <- 'ent' )
  92. 'ement'
  93. 'ements'
  94. (
  95. RV delete
  96. try (
  97. [substring] among(
  98. 'iv' (R2 delete ['at'] R2 delete)
  99. 'eus' ((R2 delete) or (R1<-'eux'))
  100. 'abl' 'iqU'
  101. (R2 delete)
  102. 'i{e`}r' 'I{e`}r' //)
  103. (RV <-'i') //)--new 2 Sept 02
  104. )
  105. )
  106. )
  107. 'it{e'}'
  108. 'it{e'}s'
  109. (
  110. R2 delete
  111. try (
  112. [substring] among(
  113. 'abil' ((R2 delete) or <-'abl')
  114. 'ic' ((R2 delete) or <-'iqU')
  115. 'iv' (R2 delete)
  116. )
  117. )
  118. )
  119. 'if' 'ive'
  120. 'ifs' 'ives'
  121. (
  122. R2 delete
  123. try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
  124. )
  125. 'eaux' (<- 'eau')
  126. 'aux' (R1 <- 'al')
  127. 'euse'
  128. 'euses'((R2 delete) or (R1<-'eux'))
  129. 'issement'
  130. 'issements'(R1 non-v delete) // verbal
  131. // fail(...) below forces entry to verb_suffix. -ment typically
  132. // follows the p.p., e.g 'confus{e'}ment'.
  133. 'amment' (RV fail(<- 'ant'))
  134. 'emment' (RV fail(<- 'ent'))
  135. 'ment'
  136. 'ments' (test(v RV) fail(delete))
  137. // v is e,i,u,{e'},I or U
  138. )
  139. )
  140. define i_verb_suffix as setlimit tomark pV for (
  141. [substring] among (
  142. '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
  143. 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
  144. 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
  145. 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
  146. 'issez' 'issiez' 'issions' 'issons' 'it'
  147. (non-v delete)
  148. )
  149. )
  150. define verb_suffix as setlimit tomark pV for (
  151. [substring] among (
  152. 'ions'
  153. (R2 delete)
  154. '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
  155. 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
  156. 'erons' 'eront' 'ez' 'iez'
  157. // 'ons' //-best omitted
  158. (delete)
  159. '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
  160. 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
  161. 'assions'
  162. (delete
  163. try(['e'] delete)
  164. )
  165. )
  166. )
  167. define keep_with_s 'aiou{e`}s'
  168. define residual_suffix as (
  169. try(['s'] test non-keep_with_s delete)
  170. setlimit tomark pV for (
  171. [substring] among(
  172. 'ion' (R2 's' or 't' delete)
  173. 'ier' 'i{e`}re'
  174. 'Ier' 'I{e`}re' (<-'i')
  175. 'e' (delete)
  176. '{e"}' ('gu' delete)
  177. )
  178. )
  179. )
  180. define un_double as (
  181. test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
  182. )
  183. define un_accent as (
  184. atleast 1 non-v
  185. [ '{e'}' or '{e`}' ] <-'e'
  186. )
  187. )
  188. define stem as (
  189. do prelude
  190. do mark_regions
  191. backwards (
  192. do (
  193. (
  194. ( standard_suffix or
  195. i_verb_suffix or
  196. verb_suffix
  197. )
  198. and
  199. try( [ ('Y' ] <- 'i' ) or
  200. ('{c,}'] <- 'c' )
  201. )
  202. ) or
  203. residual_suffix
  204. )
  205. // try(['ent'] RV delete) // is best omitted
  206. do un_double
  207. do un_accent
  208. )
  209. do postlude
  210. )