123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241 |
- /*
- Hungarian Stemmer
- Removes noun inflections
- */
-
- routines (  // declares every internal routine that is defined further down in this file
- mark_regions
- R1
- v_ending
- case
- case_special
- case_other
- plural
- owned
- sing_owner
- plur_owner
- instrum
- factive
- undouble
- double
- )
-
- externals ( stem )  // stem is the only routine declared external, i.e. the entry point
-
- integers ( p1 )  // p1: integer mark, assigned in mark_regions and compared with the cursor in R1
- groupings ( v )  // v: character grouping, defined below as the set of vowels
-
- stringescapes {}  // inside string literals, {name} inserts the string given to name by stringdef
-
- /* Accented vowels (in Unicode), each given as a hexadecimal character
-    code and referenced later through the {name} escape. */
-
- stringdef a' hex 'E1' // a-acute
- stringdef e' hex 'E9' // e-acute
- stringdef i' hex 'ED' // i-acute
- stringdef o' hex 'F3' // o-acute
- stringdef o" hex 'F6' // o-umlaut
- stringdef oq hex '151' // o-double acute
- stringdef u' hex 'FA' // u-acute
- stringdef u" hex 'FC' // u-umlaut
- stringdef uq hex '171' // u-double acute
-
- define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'  // vowel grouping: the five plain vowels plus the nine accented vowels defined above
-
- define mark_regions as (  /*
-    Sets p1, the position that R1 compares the cursor against.
-    p1 starts as limit (the end of the word) and is moved only when one
-    of the two alternatives below succeeds; otherwise it stays at limit.
-    */
-
- $p1 = limit
-
- (v goto non-v  // first alternative: the word begins with a vowel, so advance to the next non-vowel
- among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next  // step over one of the listed consonant clusters if present, otherwise over a single character
- setmark p1)
- or
-
- (non-v gopast v setmark p1)  // second alternative: the word begins with a non-vowel, so p1 is placed just after the first vowel
- )
-
- backwardmode (
-
- define R1 as $p1 <= cursor  // succeeds when the cursor is at or after p1, i.e. the text from the cursor to the word end lies in region R1
-
- define v_ending as (  /*
-    If the text ending at the cursor is a-acute or e-acute and that
-    letter lies in R1, replaces it with plain a or plain e respectively.
-    Fails when neither letter is found or the R1 test fails.
-    */
- [substring] R1 among(
- '{a'}' (<- 'a')
- '{e'}' (<- 'e')
- )
- )
-
- // double: succeeds when the text ending at the cursor is one of the
- // doubled consonants or doubled consonant clusters listed here.
- // The match is wrapped in test, so the cursor is left where it was.
- define double as (
-     test among(
-         'bb'
-         'cc' 'ccs'
-         'dd'
-         'ff'
-         'gg' 'ggy'
-         'jj'
-         'kk'
-         'll' 'lly'
-         'mm'
-         'nn' 'nny'
-         'pp'
-         'rr'
-         'ss' 'ssz'
-         'tt' 'tty'
-         'vv'
-         'zz' 'zzs'
-     )
- )
-
- define undouble as (  // removes one letter of a doubled consonant; called after the suffix itself has been deleted
- next [hop 1] delete  // backward mode: step back over one character, then mark the single character before it as the slice and delete it
- )
-
- // instrum: when the word ends in al or el, that ending lies in R1, and
- // the text just before it passes the double test, the ending is deleted
- // and undouble then removes one letter of the doubled consonant.
- define instrum as (
-     [substring] R1 among(
-         'al' 'el' (double)
-     )
-     delete
-     undouble
- )
-
-
- define case as (  /*
-    Finds the longest of the suffixes listed below at the end of the
-    word; if it lies in R1 the suffix is deleted, and v_ending is then
-    applied to the new end of the word.
-    */
- [substring] R1 among(
- 'ban' 'ben'
- 'ba' 'be'
- 'ra' 're'
- 'nak' 'nek'
- 'val' 'vel'
- 't{o'}l' 't{oq}l'
- 'r{o'}l' 'r{oq}l'
- 'b{o'}l' 'b{oq}l'
- 'hoz' 'hez' 'h{o"}z'
- 'n{a'}l' 'n{e'}l'
- 'ig'
- 'at' 'et' 'ot' '{o"}t'
- '{e'}rt'
- 'k{e'}pp' 'k{e'}ppen'
- 'kor'
- 'ul' '{u"}l'
- 'v{a'}' 'v{e'}'
- 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
- 'k{e'}nt'
- 'en' 'on' 'an' '{o"}n'
- 'n'
- 't'
- )  // none of the strings above carries its own action; the single delete below applies to whichever one matched
- delete
- v_ending
- )
-
- define case_special as(  /*
-    Handles three suffixes that begin with a-acute or e-acute: when the
-    matched suffix lies in R1, the whole suffix is replaced by the plain
-    vowel a or e given on its line.
-    */
- [substring] R1 among(
- '{e'}n' (<- 'e')
- '{a'}n' (<- 'a')
- '{a'}nk{e'}nt' (<- 'a')
- )
- )
-
- // case_other: longest-match handling of the stul family of endings.
- // An ending that lies in R1 is deleted outright, except for the two
- // forms beginning with a-acute or e-acute, which are replaced by
- // plain a or plain e.
- define case_other as (
-     [substring] R1 among(
-         'stul' 'st{u"}l' 'astul' 'est{u"}l' (delete)
-         '{a'}stul' (<- 'a')
-         '{e'}st{u"}l' (<- 'e')
-     )
- )
-
- // factive: when the word ends in a-acute or e-acute, that letter lies
- // in R1, and the text just before it passes the double test, the
- // letter is deleted and undouble then removes one letter of the
- // doubled consonant.
- define factive as (
-     [substring] R1 among(
-         '{a'}' '{e'}' (double)
-     )
-     delete
-     undouble
- )
-
- // plural: longest-match handling of endings in k that lie in R1.
- // The two endings that begin with a-acute or e-acute are replaced by
- // plain a or plain e; every other listed ending is deleted.
- define plural as (
-     [substring] R1 among(
-         '{a'}k' (<- 'a')
-         '{e'}k' (<- 'e')
-         'k' 'ak' 'ek' 'ok' '{o"}k' (delete)
-     )
- )
-
- define owned as (  /*
-    Finds the longest of the endings listed below at the end of the
-    word; if it lies in R1, the action on its line is applied: the
-    ending is either deleted or replaced by plain a or plain e.
-    */
- [substring] R1 among (
- 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
- '{e'}k{e'}' (<- 'e')
- '{a'}k{e'}' (<- 'a')
- 'k{e'}' (delete)
- '{e'}{e'}i' (<- 'e')
- '{a'}{e'}i' (<- 'a')
- '{e'}i' (delete)
- '{e'}{e'}' (<- 'e')
- '{e'}' (delete)
- )
- )
-
- define sing_owner as (  /*
-    Finds the longest of the endings listed below at the end of the
-    word; if it lies in R1, the action on its line is applied. Endings
-    that begin with a-acute or e-acute are replaced by plain a or
-    plain e; all other endings are deleted.
-    (Presumably possessive endings for a single owned item, going by
-    the routine name - confirm against the algorithm description.)
-    */
- [substring] R1 among(
- '{u"}nk' 'unk' (delete)
- '{a'}nk' (<- 'a')
- '{e'}nk' (<- 'e')
- 'nk' (delete)
- '{a'}juk' (<- 'a')
- '{e'}j{u"}k' (<- 'e')
- 'juk' 'j{u"}k' (delete)
- 'uk' '{u"}k' (delete)
- 'em' 'om' 'am' (delete)
- '{a'}m' (<- 'a')
- '{e'}m' (<- 'e')
- 'm' (delete)
- 'od' 'ed' 'ad' '{o"}d' (delete)
- '{a'}d' (<- 'a')
- '{e'}d' (<- 'e')
- 'd' (delete)
- 'ja' 'je' (delete)
- 'a' 'e' 'o' (delete)
- '{a'}' (<- 'a')
- '{e'}' (<- 'e')
- )
- )
-
- define plur_owner as (  /*
-    Finds the longest of the endings listed below at the end of the
-    word; if it lies in R1, the action on its line is applied. Endings
-    that begin with a-acute or e-acute are replaced by plain a or
-    plain e; all other endings are deleted.
-    (Presumably possessive endings for several owned items, going by
-    the routine name - confirm against the algorithm description.)
-    */
- [substring] R1 among(
- 'jaim' 'jeim' (delete)
- '{a'}im' (<- 'a')
- '{e'}im' (<- 'e')
- 'aim' 'eim' (delete)
- 'im' (delete)
- 'jaid' 'jeid' (delete)
- '{a'}id' (<- 'a')
- '{e'}id' (<- 'e')
- 'aid' 'eid' (delete)
- 'id' (delete)
- 'jai' 'jei' (delete)
- '{a'}i' (<- 'a')
- '{e'}i' (<- 'e')
- 'ai' 'ei' (delete)
- 'i' (delete)
- 'jaink' 'jeink' (delete)
- 'eink' 'aink' (delete)
- '{a'}ink' (<- 'a')
- '{e'}ink' (<- 'e')
- 'ink'  // no action on this line: this string shares the (delete) that closes the next line
- 'jaitok' 'jeitek' (delete)
- 'aitok' 'eitek' (delete)
- '{a'}itok' (<- 'a')
- '{e'}itek' (<- 'e')
- 'itek' (delete)  // NOTE(review): the list has no itok counterpart to this entry - confirm that the omission is intended
- 'jeik' 'jaik' (delete)
- 'aik' 'eik' (delete)
- '{a'}ik' (<- 'a')
- '{e'}ik' (<- 'e')
- 'ik' (delete)
- )
- )
- )
-
- define stem as (  /*
-    Entry point (declared in externals). Computes p1 while scanning
-    forwards, then processes the word backwards from its end, running
-    the suffix routines in the fixed order below. Every step is wrapped
-    in do, so a step that fails does not stop the steps after it.
-    */
- do mark_regions
- backwards (
- do instrum
- do case
- do case_special
- do case_other
- do factive
- do owned
- do sing_owner
- do plur_owner
- do plural
- )
- )
|