diff options
Diffstat (limited to 'contrib/snowball/algorithms/hungarian/stem_Unicode.sbl')
-rw-r--r-- | contrib/snowball/algorithms/hungarian/stem_Unicode.sbl | 241 |
1 files changed, 241 insertions, 0 deletions
diff --git a/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl new file mode 100644 index 000000000..c812e055c --- /dev/null +++ b/contrib/snowball/algorithms/hungarian/stem_Unicode.sbl @@ -0,0 +1,241 @@ +/* +Hungarian Stemmer +Removes noun inflections +*/ + +routines ( + mark_regions + R1 + v_ending + case + case_special + case_other + plural + owned + sing_owner + plur_owner + instrum + factive + undouble + double +) + +externals ( stem ) + +integers ( p1 ) +groupings ( v ) + +stringescapes {} + +/* special characters (in Unicode) */ + +stringdef a' hex 'E1' //a-acute +stringdef e' hex 'E9' //e-acute +stringdef i' hex 'ED' //i-acute +stringdef o' hex 'F3' //o-acute +stringdef o" hex 'F6' //o-umlaut +stringdef oq hex '151' //o-double acute +stringdef u' hex 'FA' //u-acute +stringdef u" hex 'FC' //u-umlaut +stringdef uq hex '171' //u-double acute + +define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' + +define mark_regions as ( + + $p1 = limit + + (v goto non-v + among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next + setmark p1) + or + + (non-v gopast v setmark p1) +) + +backwardmode ( + + define R1 as $p1 <= cursor + + define v_ending as ( + [substring] R1 among( + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define double as ( + test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' + 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') + ) + + define undouble as ( + next [hop 1] delete + ) + + define instrum as( + [substring] R1 among( + 'al' (double) + 'el' (double) + ) + delete + undouble + ) + + + define case as ( + [substring] R1 among( + 'ban' 'ben' + 'ba' 'be' + 'ra' 're' + 'nak' 'nek' + 'val' 'vel' + 't{o'}l' 't{oq}l' + 'r{o'}l' 'r{oq}l' + 'b{o'}l' 'b{oq}l' + 'hoz' 'hez' 'h{o"}z' + 'n{a'}l' 'n{e'}l' + 'ig' + 'at' 'et' 'ot' '{o"}t' + '{e'}rt' + 'k{e'}pp' 'k{e'}ppen' + 'kor' + 'ul' '{u"}l' + 'v{a'}' 'v{e'}' + 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' + 'k{e'}nt' + 'en' 'on' 'an' '{o"}n' + 'n' + 't' + ) + delete + v_ending + ) + + define case_special as( + [substring] R1 among( + '{e'}n' (<- 'e') + '{a'}n' (<- 'a') + '{a'}nk{e'}nt' (<- 'a') + ) + ) + + define case_other as( + [substring] R1 among( + 'astul' 'est{u"}l' (delete) + 'stul' 'st{u"}l' (delete) + '{a'}stul' (<- 'a') + '{e'}st{u"}l' (<- 'e') + ) + ) + + define factive as( + [substring] R1 among( + '{a'}' (double) + '{e'}' (double) + ) + delete + undouble + ) + + define plural as ( + [substring] R1 among( + '{a'}k' (<- 'a') + '{e'}k' (<- 'e') + '{o"}k' (delete) + 'ak' (delete) + 'ok' (delete) + 'ek' (delete) + 'k' (delete) + ) + ) + + define owned as ( + [substring] R1 among ( + 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) + '{e'}k{e'}' (<- 'e') + '{a'}k{e'}' (<- 'a') + 'k{e'}' (delete) + '{e'}{e'}i' (<- 'e') + '{a'}{e'}i' (<- 'a') + '{e'}i' (delete) + '{e'}{e'}' (<- 'e') + '{e'}' (delete) + ) + ) + + define sing_owner as ( + [substring] R1 among( + '{u"}nk' 'unk' (delete) + '{a'}nk' (<- 'a') + '{e'}nk' (<- 'e') + 'nk' (delete) + '{a'}juk' (<- 'a') + '{e'}j{u"}k' (<- 'e') + 'juk' 'j{u"}k' (delete) + 'uk' '{u"}k' (delete) + 'em' 'om' 'am' (delete) + '{a'}m' (<- 'a') + '{e'}m' (<- 'e') + 'm' (delete) + 'od' 'ed' 'ad' '{o"}d' (delete) + '{a'}d' (<- 'a') + '{e'}d' (<- 'e') + 'd' (delete) + 'ja' 'je' (delete) + 'a' 'e' 'o' (delete) + '{a'}' (<- 'a') + '{e'}' (<- 'e') + ) + ) + + define plur_owner as ( + [substring] R1 among( + 'jaim' 'jeim' (delete) + '{a'}im' (<- 'a') + '{e'}im' (<- 'e') + 'aim' 'eim' (delete) + 'im' (delete) + 'jaid' 'jeid' (delete) + '{a'}id' (<- 'a') + '{e'}id' (<- 'e') + 'aid' 'eid' (delete) + 'id' (delete) + 'jai' 'jei' (delete) + '{a'}i' (<- 'a') + '{e'}i' (<- 'e') + 'ai' 'ei' (delete) + 'i' (delete) + 'jaink' 'jeink' (delete) + 'eink' 'aink' (delete) + '{a'}ink' (<- 'a') + '{e'}ink' (<- 'e') + 'ink' + 'jaitok' 'jeitek' (delete) + 'aitok' 'eitek' (delete) + '{a'}itok' (<- 'a') + '{e'}itek' (<- 'e') + 'itek' (delete) + 'jeik' 'jaik' (delete) + 'aik' 'eik' (delete) + '{a'}ik' (<- 'a') + '{e'}ik' (<- 'e') + 'ik' (delete) + ) + ) +) + +define stem as ( + do mark_regions + backwards ( + do instrum + do case + do case_special + do case_other + do factive + do owned + do sing_owner + do plur_owner + do plural + ) +) |