You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

stem_Unicode.sbl 5.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. /*
  2. Hungarian Stemmer
  3. Removes noun inflections
  4. */
  // Declares the name of every routine defined further down in this file.
  // All of them except mark_regions are defined inside the backwardmode
  // section; stem itself is declared separately as an external.
  5. routines (
  6. mark_regions
  7. R1
  8. v_ending
  9. case
  10. case_special
  11. case_other
  12. plural
  13. owned
  14. sing_owner
  15. plur_owner
  16. instrum
  17. factive
  18. undouble
  19. double
  20. )
  // stem is the only routine exported from this file.
  21. externals ( stem )
  // p1 is written by mark_regions and read by R1.
  22. integers ( p1 )
  // v is the character grouping defined below from the plain and accented
  // vowel letters.
  23. groupings ( v )
  // Inside string literals, {name} refers to a stringdef of that name.
  24. stringescapes {}
  // Named escapes for the accented letters used by the suffix tables below.
  // Each value is a hexadecimal character code; a name is written as {name}
  // inside a string literal (see stringescapes).
  25. /* special characters (in Unicode) */
  26. stringdef a' hex 'E1' //a-acute
  27. stringdef e' hex 'E9' //e-acute
  28. stringdef i' hex 'ED' //i-acute
  29. stringdef o' hex 'F3' //o-acute
  30. stringdef o" hex 'F6' //o-umlaut
  31. stringdef oq hex '151' //o-double acute
  32. stringdef u' hex 'FA' //u-acute
  33. stringdef u" hex 'FC' //u-umlaut
  34. stringdef uq hex '171' //u-double acute
  // v: the five plain vowel letters plus the nine accented letters defined
  // by the stringdefs above. mark_regions tests characters against v / non-v.
  35. define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
  // mark_regions: computes p1, the position at which region R1 begins.
  // p1 is first set to limit (the end of the word), so it keeps that value
  // if neither alternative below succeeds.
  36. define mark_regions as (
  37. $p1 = limit
  // Alternative 1 - the word starts with a character in v: advance to the
  // first non-v character, step over it (one of the listed multi-letter
  // clusters if present, otherwise a single character) and mark p1 there.
  38. (v goto non-v
  39. among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
  40. setmark p1)
  41. or
  // Alternative 2 - the word starts with a non-v character: advance past
  // the first character in v and mark p1 there.
  42. (non-v gopast v setmark p1)
  43. )
  44. backwardmode (
  // R1: succeeds when the cursor is at or beyond p1 (the mark set by
  // mark_regions). The suffix routines below call it right after
  // [substring], so a suffix is only acted on when this test passes.
  45. define R1 as $p1 <= cursor
  // v_ending: if the text ends in a-acute or e-acute and that letter passes
  // the R1 test, replaces it with plain 'a' or 'e' respectively.
  // Called by case after a case ending has been deleted.
  46. define v_ending as (
  47. [substring] R1 among(
  48. '{a'}' (<- 'a')
  49. '{e'}' (<- 'e')
  50. )
  51. )
  // double: succeeds when the text immediately before the cursor is one of
  // the listed doubled letters or doubled letter clusters. Wrapped in test,
  // so the cursor is left where it was. Used as the per-string condition in
  // instrum and factive.
  52. define double as (
  53. test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
  54. 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
  55. )
  // undouble: removes one character from a doubled ending. Working
  // backwards, it skips the last character (next), brackets the single
  // character before it ([hop 1]) and deletes that bracketed character.
  // Called by instrum and factive after double has succeeded.
  56. define undouble as (
  57. next [hop 1] delete
  58. )
  // instrum: handles the endings 'al' and 'el' (presumably the instrumental
  // case, judging by the routine name - not stated in this file).
  // The ending must pass the R1 test and the text in front of it must
  // satisfy double; the ending is then deleted and undouble shortens the
  // doubled letters that remain.
  59. define instrum as(
  60. [substring] R1 among(
  61. 'al' (double)
  62. 'el' (double)
  63. )
  64. delete
  65. undouble
  66. )
  // case: strips one of the listed case endings when it passes the R1 test.
  // None of the strings in the among carries an action of its own; every
  // match is handled by the single delete that follows the among.
  // v_ending then runs on the shortened word to turn a final a-acute /
  // e-acute into plain 'a' / 'e'.
  67. define case as (
  68. [substring] R1 among(
  69. 'ban' 'ben'
  70. 'ba' 'be'
  71. 'ra' 're'
  72. 'nak' 'nek'
  73. 'val' 'vel'
  74. 't{o'}l' 't{oq}l'
  75. 'r{o'}l' 'r{oq}l'
  76. 'b{o'}l' 'b{oq}l'
  77. 'hoz' 'hez' 'h{o"}z'
  78. 'n{a'}l' 'n{e'}l'
  79. 'ig'
  80. 'at' 'et' 'ot' '{o"}t'
  81. '{e'}rt'
  82. 'k{e'}pp' 'k{e'}ppen'
  83. 'kor'
  84. 'ul' '{u"}l'
  85. 'v{a'}' 'v{e'}'
  86. 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
  87. 'k{e'}nt'
  88. 'en' 'on' 'an' '{o"}n'
  89. 'n'
  90. 't'
  91. )
  92. delete
  93. v_ending
  94. )
  // case_special: endings that begin with an accented vowel. When the
  // ending passes the R1 test, the whole ending is replaced by the plain
  // vowel, so the vowel is kept in unaccented form and the rest is dropped.
  95. define case_special as(
  96. [substring] R1 among(
  97. '{e'}n' (<- 'e')
  98. '{a'}n' (<- 'a')
  99. '{a'}nk{e'}nt' (<- 'a')
  100. )
  101. )
  // case_other: handles the '-stul' / '-stul-with-umlaut' family of endings
  // when they pass the R1 test. Forms with a plain linking vowel, or with
  // none, are deleted outright; forms starting with a-acute / e-acute are
  // replaced by plain 'a' / 'e'.
  102. define case_other as(
  103. [substring] R1 among(
  104. 'astul' 'est{u"}l' (delete)
  105. 'stul' 'st{u"}l' (delete)
  106. '{a'}stul' (<- 'a')
  107. '{e'}st{u"}l' (<- 'e')
  108. )
  109. )
  // factive: same shape as instrum, but for a final a-acute or e-acute
  // (presumably the factive case, judging by the routine name - not stated
  // in this file). The letter must pass the R1 test and the text in front
  // of it must satisfy double; the letter is then deleted and undouble
  // shortens the doubled letters that remain.
  110. define factive as(
  111. [substring] R1 among(
  112. '{a'}' (double)
  113. '{e'}' (double)
  114. )
  115. delete
  116. undouble
  117. )
  // plural: handles endings in 'k' that pass the R1 test. An ending that
  // starts with a-acute / e-acute is replaced by plain 'a' / 'e'; every
  // other listed ending, including bare 'k', is deleted.
  118. define plural as (
  119. [substring] R1 among(
  120. '{a'}k' (<- 'a')
  121. '{e'}k' (<- 'e')
  122. '{o"}k' (delete)
  123. 'ak' (delete)
  124. 'ok' (delete)
  125. 'ek' (delete)
  126. 'k' (delete)
  127. )
  128. )
  // owned: handles endings built on e-acute (the '-ke-acute', '-e-acute-i'
  // and '-e-acute' groups) when they pass the R1 test.
  // Where the ending is preceded by a-acute / e-acute, the whole match is
  // replaced by plain 'a' / 'e'; otherwise the ending is deleted.
  // NOTE(review): the table has 'e-acute e-acute' but no matching
  // 'a-acute e-acute' entry - confirm whether that is intentional.
  129. define owned as (
  130. [substring] R1 among (
  131. 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
  132. '{e'}k{e'}' (<- 'e')
  133. '{a'}k{e'}' (<- 'a')
  134. 'k{e'}' (delete)
  135. '{e'}{e'}i' (<- 'e')
  136. '{a'}{e'}i' (<- 'a')
  137. '{e'}i' (delete)
  138. '{e'}{e'}' (<- 'e')
  139. '{e'}' (delete)
  140. )
  141. )
  // sing_owner: handles the listed endings when they pass the R1 test
  // (presumably possessive endings on a singular noun, judging by the
  // routine name - not stated in this file).
  // Same convention as the other tables: an ending that starts with
  // a-acute / e-acute is replaced by plain 'a' / 'e', every other ending is
  // deleted. The longest matching string is the one acted on, so the short
  // entries ('m', 'd', 'a', ...) only apply when no longer entry matches.
  142. define sing_owner as (
  143. [substring] R1 among(
  144. '{u"}nk' 'unk' (delete)
  145. '{a'}nk' (<- 'a')
  146. '{e'}nk' (<- 'e')
  147. 'nk' (delete)
  148. '{a'}juk' (<- 'a')
  149. '{e'}j{u"}k' (<- 'e')
  150. 'juk' 'j{u"}k' (delete)
  151. 'uk' '{u"}k' (delete)
  152. 'em' 'om' 'am' (delete)
  153. '{a'}m' (<- 'a')
  154. '{e'}m' (<- 'e')
  155. 'm' (delete)
  156. 'od' 'ed' 'ad' '{o"}d' (delete)
  157. '{a'}d' (<- 'a')
  158. '{e'}d' (<- 'e')
  159. 'd' (delete)
  160. 'ja' 'je' (delete)
  161. 'a' 'e' 'o' (delete)
  162. '{a'}' (<- 'a')
  163. '{e'}' (<- 'e')
  164. )
  165. )
  // plur_owner: handles the listed endings containing 'i' when they pass
  // the R1 test (presumably possessive endings on a plural noun, judging by
  // the routine name - not stated in this file).
  // An ending that starts with a-acute / e-acute is replaced by plain
  // 'a' / 'e'; every other ending is deleted.
  166. define plur_owner as (
  167. [substring] R1 among(
  168. 'jaim' 'jeim' (delete)
  169. '{a'}im' (<- 'a')
  170. '{e'}im' (<- 'e')
  171. 'aim' 'eim' (delete)
  172. 'im' (delete)
  173. 'jaid' 'jeid' (delete)
  174. '{a'}id' (<- 'a')
  175. '{e'}id' (<- 'e')
  176. 'aid' 'eid' (delete)
  177. 'id' (delete)
  178. 'jai' 'jei' (delete)
  179. '{a'}i' (<- 'a')
  180. '{e'}i' (<- 'e')
  181. 'ai' 'ei' (delete)
  182. 'i' (delete)
  183. 'jaink' 'jeink' (delete)
  184. 'eink' 'aink' (delete)
  185. '{a'}ink' (<- 'a')
  186. '{e'}ink' (<- 'e')
  // 'ink' has no action of its own: a string without a following command
  // shares the next command in the among, so it is deleted together with
  // the 'jaitok' / 'jeitek' group on the next line.
  187. 'ink'
  188. 'jaitok' 'jeitek' (delete)
  189. 'aitok' 'eitek' (delete)
  190. '{a'}itok' (<- 'a')
  191. '{e'}itek' (<- 'e')
  // NOTE(review): 'itek' is listed but there is no bare 'itok' entry -
  // confirm whether that is intentional.
  192. 'itek' (delete)
  193. 'jeik' 'jaik' (delete)
  194. 'aik' 'eik' (delete)
  195. '{a'}ik' (<- 'a')
  196. '{e'}ik' (<- 'e')
  197. 'ik' (delete)
  198. )
  199. )
  200. )
  // stem: the exported entry point (declared in externals).
  // First runs mark_regions to set p1, then processes the word backwards
  // from its end, trying each suffix routine once in the fixed order below.
  // Every step is wrapped in do, so a routine that does not apply does not
  // stop the later ones from running; a later routine sees whatever the
  // earlier ones have already removed.
  201. define stem as (
  202. do mark_regions
  203. backwards (
  204. do instrum
  205. do case
  206. do case_special
  207. do case_other
  208. do factive
  209. do owned
  210. do sing_owner
  211. do plur_owner
  212. do plural
  213. )
  214. )