123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405 |
- /*
- * Affix stripping stemming algorithm for Tamil
- * By Damodharan Rajalingam
- */
-
- // Use { and } as the escape delimiters inside string literals: '{name}'
- // inserts the stringdef called name, '{U+XXXX}' inserts that code point.
- stringescapes {}
- 
- /* Aytham */
- stringdef aytham '{U+0B83}'
- 
- /* Uyir - independent vowels */
- stringdef a '{U+0B85}'
- stringdef aa '{U+0B86}'
- stringdef i '{U+0B87}'
- stringdef ii '{U+0B88}'
- stringdef u '{U+0B89}'
- stringdef uu '{U+0B8A}'
- stringdef e '{U+0B8E}'
- stringdef ee '{U+0B8F}'
- stringdef ai '{U+0B90}'
- stringdef o '{U+0B92}'
- stringdef oo '{U+0B93}'
- stringdef au '{U+0B94}'
- 
- /* Consonants */
- stringdef ka '{U+0B95}'
- stringdef nga '{U+0B99}'
- stringdef ca '{U+0B9A}'
- stringdef ja '{U+0B9C}'
- stringdef nya '{U+0B9E}'
- stringdef tta '{U+0B9F}'
- stringdef nna '{U+0BA3}'
- stringdef ta '{U+0BA4}'
- // 'tha' is a second name for the same code point as 'ta' (U+0BA4).
- stringdef tha '{U+0BA4}'
- stringdef na '{U+0BA8}'
- stringdef nnna '{U+0BA9}'
- stringdef pa '{U+0BAA}'
- stringdef ma '{U+0BAE}'
- stringdef ya '{U+0BAF}'
- stringdef ra '{U+0BB0}'
- stringdef rra '{U+0BB1}'
- stringdef la '{U+0BB2}'
- stringdef lla '{U+0BB3}'
- stringdef llla '{U+0BB4}'
- // 'zha' is a second name for the same code point as 'llla' (U+0BB4).
- stringdef zha '{U+0BB4}'
- stringdef va '{U+0BB5}'
- 
- /* Vatamozi - borrowed */
- stringdef sha '{U+0BB6}'
- stringdef ssa '{U+0BB7}'
- stringdef sa '{U+0BB8}'
- stringdef ha '{U+0BB9}'
- 
- 
- /* Dependent vowel signs (kombu etc.) */
- stringdef vs_aa '{U+0BBE}'
- stringdef vs_i '{U+0BBF}'
- stringdef vs_ii '{U+0BC0}'
- stringdef vs_u '{U+0BC1}'
- stringdef vs_uu '{U+0BC2}'
- stringdef vs_e '{U+0BC6}'
- stringdef vs_ee '{U+0BC7}'
- stringdef vs_ai '{U+0BC8}'
- stringdef vs_o '{U+0BCA}'
- stringdef vs_oo '{U+0BCB}'
- stringdef vs_au '{U+0BCC}'
- 
- /* Pulli */
- stringdef pulli '{U+0BCD}'
- 
- /* AU length mark */
- stringdef au_lmark '{U+0BD7}'
-
-
- // Internal routines. Each one is given a body by a 'define ... as' below.
- routines (
- remove_plural_suffix
- remove_question_suffixes
- remove_question_prefixes
- remove_pronoun_prefixes
- remove_command_suffixes
- remove_um
- remove_vetrumai_urupukal
- fix_va_start
- fix_ending
- fix_endings
- remove_tense_suffix
- remove_tense_suffixes
- remove_common_word_endings
- has_min_length
- )
- 
- // The only routine declared as external.
- externals ( stem )
- 
- // Flags shared between the routines.
- booleans (
- found_a_match // cleared on entry to most removal routines, set once an affix was stripped
- found_vetrumai_urupu // set by remove_vetrumai_urupukal; gates one rule in fix_ending
- )
-
- // Succeeds only for words of five characters or more; used as a guard so
- // that short words are left untouched.
- define has_min_length as (
-     $(len >= 5)
- )
-
- // At the cursor, rewrites "va" carrying the vowel sign oo / o / u / uu as the
- // corresponding independent vowel. The four forms are tried in that order;
- // the routine fails if none of them is present.
- define fix_va_start as (
-     ( [ '{va}{vs_oo}' ] <- '{oo}' )
-     or
-     ( [ '{va}{vs_o}' ] <- '{o}' )
-     or
-     ( [ '{va}{vs_u}' ] <- '{u}' )
-     or
-     ( [ '{va}{vs_uu}' ] <- '{uu}' )
- )
-
- // Keeps calling fix_ending until it signals f. Never fails itself.
- define fix_endings as (
-     do ( repeat fix_ending )
- )
-
- // Deletes a leading "e" + consonant + pulli, where the consonant is one of
- // the ten listed below, then lets fix_va_start repair what is left.
- // Fails, changing nothing, if that prefix is not present.
- define remove_question_prefixes as (
-     [
-         '{e}'
-         among(
-             '{ka}' '{ca}' '{tha}' '{va}' '{na}'
-             '{pa}' '{ma}' '{ya}' '{nga}' '{nya}'
-         )
-         '{pulli}'
-     ] delete
-     do fix_va_start
- )
-
- // Tidies the end of the word after an affix has been stripped. At most one
- // rule is applied per call: the alternatives are tried in the order written
- // and the first one that matches wins, so the order must not be changed.
- // Gives signal t if an ending was fixed, signal f otherwise (which includes
- // every word of three characters or fewer).
- define fix_ending as (
-     $(len > 3)
-     backwards (
-         // final na+pulli, na+pulli+ta or na+pulli+ta+pulli: dropped
-         ( [among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
-         or
-         // final ya+pulli preceded by the vowel sign ai, i or ii: dropped
-         ( ['{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
-         or
-         // final tta+pulli followed by pa+pulli or ka+pulli: becomes lla+pulli
-         ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
-         or
-         // final nnna+pulli+rra+pulli: becomes la+pulli
-         ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
-         or
-         // final rra+pulli+ka+pulli: becomes la+pulli
-         // (an earlier variant of this rule, kept for reference:)
-         // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
-         ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
-         or
-         // final doubled tta+pulli: becomes tta + vowel sign u
-         ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
-         or
-         // only when found_vetrumai_urupu is set: final doubled ta+pulli that
-         // is not preceded by the vowel sign ai becomes ma+pulli
-         ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
-         or
-         // final vowel sign u + ka+pulli, optionally followed by a second
-         // ka+pulli: becomes a bare pulli
-         ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
-         or
-         // final pair of pulli-marked consonants from ka/ca/tta/tha/pa/rra: dropped
-         ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
-         or
-         // final single pulli-marked consonant from the same set: dropped
-         ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
-         or
-         // final pulli-marked consonant from the two sets below, itself
-         // preceded by a pulli: the consonant and one pulli are dropped
-         ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
-         or
-         // final va, ya or va+pulli: dropped
-         ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
-         or
-         // final nnna + vowel sign u not preceded by a vowel sign: dropped
-         ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
-         or
-         // final nga+pulli not preceded by the vowel sign ai: becomes ma+pulli
-         ( [ '{nga}{pulli}' (test not '{vs_ai}')] <- '{ma}{pulli}' )
-         or
-         // any other final nga+pulli: dropped
-         ( [ '{nga}{pulli}' ] delete )
-         or
-         // final pulli preceded by a vowel sign or by another pulli: dropped
-         ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
-     )
- )
-
- // Deletes a leading a / i / u followed by one of the listed consonants and a
- // pulli. found_a_match is cleared on entry and set once the prefix has been
- // removed; fix_va_start then repairs what is left. Fails, changing nothing
- // but the flag, if the prefix is not present.
- define remove_pronoun_prefixes as (
-     unset found_a_match
-     [
-         among('{a}' '{i}' '{u}')
-         among(
-             '{ka}' '{ca}' '{tha}' '{va}' '{na}'
-             '{pa}' '{ma}' '{ya}' '{nga}' '{nya}'
-         )
-         '{pulli}'
-     ] delete
-     set found_a_match
-     do fix_va_start
- )
-
- // Strips the plural ending ka+lla+pulli from the end of the word, repairing
- // the consonant in front of it for the three special forms listed first.
- // found_a_match is cleared on entry and set when any of the four forms was
- // handled; the routine fails when none of them is present.
- define remove_plural_suffix as (
-     unset found_a_match
-     backwards (
-         (
-             // vowel sign u + nga+pulli + plural, not preceded by one of
-             // ka/ca/tta/tha/pa/rra: everything is replaced by a pulli
-             ( [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}')) ] <- '{pulli}' )
-             or
-             // rra+pulli + plural becomes la+pulli
-             ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' )
-             or
-             // tta+pulli + plural becomes lla+pulli
-             ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' )
-             or
-             // plain plural ending is simply removed
-             ( [ '{ka}{lla}{pulli}' ] delete )
-         )
-         set found_a_match
-     )
- )
-
- // For words that pass has_min_length: if the word ends in the vowel sign
- // oo, ee or aa, that sign is replaced by a pulli and found_a_match is set.
- // fix_endings is then run whether or not a sign was replaced.
- define remove_question_suffixes as (
-     has_min_length
-     unset found_a_match
-     backwards (
-         do (
-             [ ( '{vs_oo}' or '{vs_ee}' or '{vs_aa}' ) ] <- '{pulli}'
-             set found_a_match
-         )
-     )
-     do fix_endings
- )
-
- // For words that pass has_min_length: deletes a final pa or va carrying the
- // vowel sign i and sets found_a_match. Fails if neither ending is present.
- define remove_command_suffixes as (
-     has_min_length
-     unset found_a_match
-     backwards (
-         [ ( '{pa}{vs_i}' or '{va}{vs_i}' ) ] delete
-         set found_a_match
-     )
- )
-
- // For words that pass has_min_length: replaces a final vowel sign u +
- // ma+pulli by a bare pulli, sets found_a_match, and gives fix_ending one
- // chance to tidy the result. Fails if that ending is not present.
- define remove_um as (
-     unset found_a_match
-     has_min_length
-     backwards (
-         [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
-         set found_a_match
-     )
-     do fix_ending
- )
-
- // Strips a number of common words that are written attached to the end of
- // another word. These are not suffixes actually, but they can be removed for
- // stemming. found_a_match is cleared on entry and set when an ending was
- // removed, after which fix_endings tidies the result. The routine fails when
- // the word does not pass has_min_length or when none of the endings matches.
- define remove_common_word_endings as (
-     unset found_a_match
-     has_min_length
-     backwards (
-         // Endings that are replaced by a pulli. They are tried in the order
-         // written and the first match wins.
-         test ( [ '{vs_u}{tta}{nnna}{pulli}' or
-                  '{vs_i}{la}{pulli}{la}{vs_ai}' or
-                  '{vs_i}{tta}{ma}{pulli}' or
-                  '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
-                  '{vs_aa}{ka}{vs_i}' or
-                  '{vs_aa}{ka}{vs_i}{ya}' or
-                  '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
-                  '{vs_u}{lla}{pulli}{lla}' or
-                  '{vs_u}{tta}{vs_ai}{ya}' or
-                  '{vs_u}{tta}{vs_ai}' or
-                  '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
-                  // la+pulli+la only counts when no vowel sign precedes it
-                  ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
-                  '{vs_e}{nnna}' ] <- '{pulli}'
-                (set found_a_match)
-         )
-         or
-         // Endings that are removed outright.
-         test ( [ among('{pa}{tta}{vs_u}'
-                        '{pa}{tta}{pulli}{tta}'
-                        '{pa}{tta}{pulli}{tta}{vs_u}'
-                        '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
-                        '{pa}{tta}{pulli}{tta}{nna}'
-                        '{ka}{vs_u}{ra}{vs_i}{ya}'
-                        '{pa}{rra}{pulli}{rra}{vs_i}'
-                        '{va}{vs_i}{tta}{vs_u}'
-                        '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
-                        '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
-                        '{pa}{tta}{vs_i}'
-                        '{ta}{vs_aa}{nnna}'
-                        '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}')
-                ] delete
-                (set found_a_match)
-         )
-     )
-     do fix_endings
- )
-
- // Strips case-marker endings (vetrumai urupukal) from the end of the word.
- // Both flags are cleared on entry. The routine fails if the word does not
- // pass has_min_length or if none of the four groups below matches; otherwise
- // found_a_match and found_vetrumai_urupu are set and fix_endings is run.
- // The groups, and the alternatives inside each group, are tried in the order
- // written.
- define remove_vetrumai_urupukal as (
- unset found_a_match
- unset found_vetrumai_urupu
- has_min_length
- backwards (
- (
- // Group 1: final nnna + vowel sign ai is deleted.
- test ( ['{nnna}{vs_ai}'] delete )
- or
- // Group 2: endings built on the vowel sign ai, replaced by a pulli.
- test ([ ( '{vs_i}{nnna}{vs_ai}' or
- '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
- ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
- ] <- '{pulli}'
- )
- or
- // Group 3: further endings that are replaced by a pulli.
- test ( [
- '{vs_o}{tta}{vs_u}' or
- '{vs_oo}{tta}{vs_u}' or
- '{vs_i}{la}{pulli}' or
- '{vs_i}{rra}{pulli}' or
- ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
- '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
- '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
- '{va}{vs_i}{tta}' or
- // this ending is only considered for words of seven characters or more
- ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
- '{vs_aa}{la}{pulli}' or
- '{vs_u}{tta}{vs_ai}' or
- '{vs_aa}{ma}{la}{pulli}' or
- // la+pulli only counts when no vowel sign precedes it
- ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
- '{vs_u}{lla}{pulli}'
- ] <- '{pulli}'
- )
- or
- // Group 4: endings that are deleted outright.
- test ( [
- '{ka}{nna}{pulli}' or
- '{ma}{vs_u}{nnna}{pulli}' or
- '{ma}{vs_ee}{la}{pulli}' or
- '{ma}{vs_ee}{rra}{pulli}' or
- '{ka}{vs_ii}{llla}{pulli}' or
- '{pa}{vs_i}{nnna}{pulli}' or
- // ta + vowel sign u only counts when no vowel sign precedes it
- ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
- ] delete
- )
- or
- // Group 5: a final long vowel sign ii is shortened to i.
- test ([ '{vs_ii}' ] <- '{vs_i}')
- )
- (set found_a_match)
- (set found_vetrumai_urupu)
- // After any of the groups matched: a remaining final vowel sign i +
- // nnna+pulli is also replaced by a pulli, if present.
- do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
- )
- do fix_endings
- )
-
- // Calls remove_tense_suffix again and again for as long as the previous
- // call left found_a_match set. The flag is set up front so that the first
- // call always happens.
- define remove_tense_suffixes as (
-     set found_a_match
-     repeat (
-         found_a_match
-         do remove_tense_suffix
-     )
- )
-
- // Performs one round of tense-suffix stripping at the end of the word.
- // found_a_match is cleared on entry and set by whichever group below removes
- // something; remove_tense_suffixes uses the flag to decide whether to call
- // this routine again. The routine fails without touching the word if the
- // word does not pass has_min_length. The groups, and the alternatives inside
- // each group, are tried in the order written.
- define remove_tense_suffix as (
- unset found_a_match
- has_min_length
- backwards (
- do (
- // Group 1: two whole endings that are deleted.
- test ( [among(
- '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
- '{pa}{tta}{vs_u}'
- )] delete
- (set found_a_match)
- )
- or
- // Group 2: endings that are deleted outright.
- test ( [
- '{ma}{vs_aa}{ra}{pulli}' or
- '{ma}{vs_i}{nnna}{pulli}' or
- '{nnna}{nnna}{pulli}' or
- '{nnna}{vs_aa}{nnna}{pulli}' or
- '{nnna}{vs_aa}{lla}{pulli}' or
- '{nnna}{vs_aa}{ra}{pulli}' or
- // va+nnna+pulli only counts when no independent vowel precedes it
- ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
- '{nnna}{lla}{pulli}' or
- '{va}{lla}{pulli}' or
- '{nnna}{ra}{pulli}' or
- '{va}{ra}{pulli}' or
- '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
- '{pa}{nnna}{pulli}' or
- '{pa}{lla}{pulli}' or
- '{pa}{ra}{pulli}' or
- // ta + vowel sign u only counts when no vowel sign precedes it
- ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
- '{vs_i}{rra}{pulli}{rra}{vs_u}' or
- '{pa}{ma}{pulli}' or
- '{nnna}{ma}{pulli}' or
- '{ta}{vs_u}{ma}{pulli}' or
- '{rra}{vs_u}{ma}{pulli}' or
- '{ka}{vs_u}{ma}{pulli}' or
- '{nnna}{vs_e}{nnna}{pulli}' or
- '{nnna}{vs_ai}' or
- '{va}{vs_ai}'
- ] delete
- (set found_a_match)
- )
- or
- // Group 3: endings that are replaced by a pulli.
- // NOTE(review): '{ka}{vs_u}{ma}{pulli}', '{ta}{vs_u}{ma}{pulli}',
- // '{rra}{vs_u}{ma}{pulli}' and '{nnna}{vs_e}{nnna}{pulli}' are also listed
- // in group 2, which is tried first - confirm whether the copies here can
- // ever match.
- test ( [
- // vowel sign aa + nnna+pulli only counts when ca does not precede it
- ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
- '{vs_aa}{lla}{pulli}' or
- '{vs_aa}{ra}{pulli}' or
- '{vs_ee}{nnna}{pulli}' or
- '{vs_aa}' or
- '{vs_aa}{ma}{pulli}' or
- '{vs_e}{ma}{pulli}' or
- '{vs_ee}{ma}{pulli}' or
- '{vs_oo}{ma}{pulli}' or
- '{ka}{vs_u}{ma}{pulli}' or
- '{ta}{vs_u}{ma}{pulli}' or
- '{tta}{vs_u}{ma}{pulli}' or
- '{rra}{vs_u}{ma}{pulli}' or
- '{vs_aa}{ya}{pulli}' or
- '{nnna}{vs_e}{nnna}{pulli}' or
- '{nnna}{vs_i}{ra}{pulli}' or
- '{vs_ii}{ra}{pulli}' or
- '{vs_ii}{ya}{ra}{pulli}'
- ] <- '{pulli}'
- (set found_a_match)
- )
- or
- // Group 4: final ka or ta with vowel sign u, preceded by a pulli, is deleted.
- test ( ([ '{ka}{vs_u}' or '{ta}{vs_u}' ) (test '{pulli}') ] delete
- (set found_a_match)
- )
- )
- // Independently of the groups above: one of the endings listed here is
- // deleted from the end of the word if present.
- do ([among(
- '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
- '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
- '{ka}{vs_i}{nnna}{pulli}{rra}'
- '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
- '{ka}{vs_i}{rra}'
- '{ka}{vs_i}{rra}{pulli}'
- )] delete
- (set found_a_match)
- )
- )
- do fix_endings
- )
-
- // External entry point of the stemmer.
- // fix_ending is given one chance to tidy the raw word first, with
- // found_vetrumai_urupu cleared so that the rule in fix_ending guarded by that
- // flag cannot apply. The word must then pass has_min_length; if it does not,
- // stem fails at that point and none of the removal stages run.
- // Each stage is wrapped in 'do', so a stage that fails does not prevent the
- // stages after it from running. The stages run in exactly the order listed.
- define stem as (
- unset found_vetrumai_urupu
- do fix_ending
- has_min_length
- // prefixes
- do remove_question_prefixes
- do remove_pronoun_prefixes
- // suffixes
- do remove_question_suffixes
- do remove_um
- do remove_common_word_endings
- do remove_vetrumai_urupukal
- do remove_plural_suffix
- do remove_command_suffixes
- do remove_tense_suffixes
- )
|