Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

tamil.sbl 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. /*
  2. * Affix stripping stemming algorithm for Tamil
  3. * By Damodharan Rajalingam
  4. */
  // Inside string literals, {name} expands to the stringdef of that name
  // and {U+XXXX} to the Unicode code point.
  stringescapes {}

  /* Aytham */
  stringdef aytham    '{U+0B83}'

  /* Uyir - independent vowels */
  stringdef a         '{U+0B85}'
  stringdef aa        '{U+0B86}'
  stringdef i         '{U+0B87}'
  stringdef ii        '{U+0B88}'
  stringdef u         '{U+0B89}'
  stringdef uu        '{U+0B8A}'
  stringdef e         '{U+0B8E}'
  stringdef ee        '{U+0B8F}'
  stringdef ai        '{U+0B90}'
  stringdef o         '{U+0B92}'
  stringdef oo        '{U+0B93}'
  stringdef au        '{U+0B94}'

  /* Consonants */
  stringdef ka        '{U+0B95}'
  stringdef nga       '{U+0B99}'
  stringdef ca        '{U+0B9A}'
  stringdef ja        '{U+0B9C}'
  stringdef nya       '{U+0B9E}'
  stringdef tta       '{U+0B9F}'
  stringdef nna       '{U+0BA3}'
  // ta and tha are two names for the same code point
  stringdef ta        '{U+0BA4}'
  stringdef tha       '{U+0BA4}'
  stringdef na        '{U+0BA8}'
  stringdef nnna      '{U+0BA9}'
  stringdef pa        '{U+0BAA}'
  stringdef ma        '{U+0BAE}'
  stringdef ya        '{U+0BAF}'
  stringdef ra        '{U+0BB0}'
  stringdef rra       '{U+0BB1}'
  stringdef la        '{U+0BB2}'
  stringdef lla       '{U+0BB3}'
  // llla and zha are two names for the same code point
  stringdef llla      '{U+0BB4}'
  stringdef zha       '{U+0BB4}'
  stringdef va        '{U+0BB5}'

  /* Vatamozi - borrowed consonants */
  stringdef sha       '{U+0BB6}'
  stringdef ssa       '{U+0BB7}'
  stringdef sa        '{U+0BB8}'
  stringdef ha        '{U+0BB9}'

  /* Dependent vowel signs (kombu etc.) */
  stringdef vs_aa     '{U+0BBE}'
  stringdef vs_i      '{U+0BBF}'
  stringdef vs_ii     '{U+0BC0}'
  stringdef vs_u      '{U+0BC1}'
  stringdef vs_uu     '{U+0BC2}'
  stringdef vs_e      '{U+0BC6}'
  stringdef vs_ee     '{U+0BC7}'
  stringdef vs_ai     '{U+0BC8}'
  stringdef vs_o      '{U+0BCA}'
  stringdef vs_oo     '{U+0BCB}'
  stringdef vs_au     '{U+0BCC}'

  /* Pulli */
  stringdef pulli     '{U+0BCD}'

  /* AU length mark */
  stringdef au_lmark  '{U+0BD7}'
  // Internal routines; each one is defined further down in this file.
  routines (
      remove_plural_suffix
      remove_question_suffixes  remove_question_prefixes
      remove_pronoun_prefixes
      remove_command_suffixes
      remove_um
      remove_vetrumai_urupukal
      fix_va_start
      fix_ending  fix_endings
      remove_tense_suffix  remove_tense_suffixes
      remove_common_word_endings
      has_min_length
  )

  // The only routine exported to callers.
  externals ( stem )

  booleans (
      found_a_match         // set by a removal routine when it changed the word
      found_vetrumai_urupu  // set by remove_vetrumai_urupukal, read by fix_ending
  )
  // Guard used by the removal routines: signals t only when the word is
  // longer than 4 (i.e. at least 5) as measured by len.
  define has_min_length as (
      $(len >= 5)
  )
  // At the cursor, rewrites va followed by the vowel sign oo, o, u or uu
  // into the corresponding independent vowel. The alternatives are tried
  // in the order listed; signals f when none of them matches.
  define fix_va_start as (
      ( [ '{va}{vs_oo}' ] <- '{oo}' ) or
      ( [ '{va}{vs_o}'  ] <- '{o}'  ) or
      ( [ '{va}{vs_u}'  ] <- '{u}'  ) or
      ( [ '{va}{vs_uu}' ] <- '{uu}' )
  )
  // Applies fix_ending over and over until it signals f.
  // Wrapped in 'do', so this routine itself always signals t.
  define fix_endings as (
      do ( repeat fix_ending )
  )
  // Deletes a leading e + consonant + pulli, where the consonant is one of
  // those listed, and then lets fix_va_start adjust the new beginning.
  // Signals f (leaving the word unchanged) when that prefix is absent.
  define remove_question_prefixes as (
      [
          '{e}'
          among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}')
          '{pulli}'
      ] delete
      do fix_va_start
  )
  // Repairs the end of the word; the rules are tried in order and only the
  // first one that matches is applied per call.
  // Gives signal t if an ending was fixed, signal f otherwise (words whose
  // len is 3 or less are never touched).
  define fix_ending as (
      $(len > 3)
      backwards (
          ( [ among('{na}{pulli}' '{na}{pulli}{ta}' '{na}{pulli}{ta}{pulli}') ] delete )
          or
          // final ya+pulli is dropped only when the vowel sign ai, i or ii precedes it
          ( [ '{ya}{pulli}' test among('{vs_ai}' '{vs_i}' '{vs_ii}') ] delete )
          or
          ( [ '{tta}{pulli}{pa}{pulli}' or '{tta}{pulli}{ka}{pulli}' ] <- '{lla}{pulli}' )
          or
          ( [ '{nnna}{pulli}{rra}{pulli}' ] <- '{la}{pulli}' )
          or
          // Disabled variant of the next rule, kept for reference:
          // ( [ '{rra}{pulli}{ka}{pulli}' or '{nnna}{pulli}{nnna}{pulli}' ] <- '{la}{pulli}' )
          ( [ '{rra}{pulli}{ka}{pulli}' ] <- '{la}{pulli}' )
          or
          ( [ '{tta}{pulli}{tta}{pulli}' ] <- '{tta}{vs_u}' )
          or
          // Only considered when found_vetrumai_urupu is set, and only when the
          // matched text is not preceded by the vowel sign ai.
          // (A stray unmatched ']' that followed the replacement was removed.)
          ( found_vetrumai_urupu [ '{ta}{pulli}{ta}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
          or
          ( [ '{vs_u}{ka}{pulli}' or '{vs_u}{ka}{pulli}{ka}{pulli}' ] <- '{pulli}' )
          or
          ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
          or
          // A second "[ vs_u ka pulli ] <- pulli" rule used to sit here; it could
          // never match because the rule two steps above already rewrites that
          // ending, so it was dropped.
          ( [ '{pulli}' among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') ] delete )
          or
          ( [ '{pulli}' (among('{ya}' '{ra}' '{la}' '{va}' '{zha}' '{lla}') or among('{nga}' '{nya}' '{nna}' '{na}' '{ma}' '{nnna}')) '{pulli}' ] <- '{pulli}' )
          or
          ( [ among('{va}' '{ya}' '{va}{pulli}') ] delete )
          or
          ( [ '{nnna}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')) ] delete )
          or
          ( [ '{nga}{pulli}' (test not '{vs_ai}') ] <- '{ma}{pulli}' )
          or
          ( [ '{nga}{pulli}' ] delete )
          or
          // a final pulli that directly follows a vowel sign or another pulli
          ( [ '{pulli}' (test (among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}') or '{pulli}')) ] delete )
      )
  )
  // Deletes a leading a/i/u + consonant + pulli (consonant from the list
  // below). found_a_match is cleared first and set only after a deletion;
  // fix_va_start then adjusts the new beginning of the word.
  define remove_pronoun_prefixes as (
      unset found_a_match
      [
          among('{a}' '{i}' '{u}')
          among('{ka}' '{ca}' '{tha}' '{va}' '{na}' '{pa}' '{ma}' '{ya}' '{nga}' '{nya}')
          '{pulli}'
      ] delete
      set found_a_match
      do fix_va_start
  )
  // Strips a plural ending built on ka+lla+pulli. The longer, more specific
  // endings are tried before the bare one; found_a_match is set only when
  // one of the four rules applied.
  define remove_plural_suffix as (
      unset found_a_match
      backwards (
          (
              [ '{vs_u}{nga}{pulli}{ka}{lla}{pulli}'
                (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))
              ] <- '{pulli}'
          ) or
          ( [ '{rra}{pulli}{ka}{lla}{pulli}' ] <- '{la}{pulli}' ) or
          ( [ '{tta}{pulli}{ka}{lla}{pulli}' ] <- '{lla}{pulli}' ) or
          ( [ '{ka}{lla}{pulli}' ] delete )

          (set found_a_match)
      )
  )
  // For words passing has_min_length: replaces a final vowel sign oo, ee or
  // aa by pulli (setting found_a_match when it does), then runs fix_endings
  // whether or not a replacement happened.
  define remove_question_suffixes as (
      has_min_length
      unset found_a_match
      backwards (
          do (
              [ among('{vs_oo}' '{vs_ee}' '{vs_aa}') ] <- '{pulli}'
              set found_a_match
          )
      )
      do fix_endings
  )
  // For words passing has_min_length: deletes a final pa or va followed by
  // the vowel sign i, and records the change in found_a_match.
  // Signals f when neither ending is present.
  define remove_command_suffixes as (
      has_min_length
      unset found_a_match
      backwards (
          [ among('{pa}{vs_i}' '{va}{vs_i}') ] delete
          set found_a_match
      )
  )
  // For words passing has_min_length: replaces a final vowel sign u + ma +
  // pulli by a bare pulli, sets found_a_match, and then applies fix_ending
  // once. Signals f when the ending is absent.
  define remove_um as (
      unset found_a_match
      has_min_length
      backwards (
          [ '{vs_u}{ma}{pulli}' ] <- '{pulli}'
          set found_a_match
      )
      do fix_ending
  )
  // Removes common trailing words. These are not suffixes as such, but words
  // that get attached to other words and can be dropped for stemming.
  //
  // Two groups are tried in order, for words passing has_min_length:
  //   1. endings that are replaced by pulli ('or' chain, first match wins);
  //   2. endings that are deleted outright.
  // found_a_match is set when either group applied. If neither does, the
  // routine signals f and the trailing fix_endings is not reached.
  define remove_common_word_endings as (
      unset found_a_match
      has_min_length
      backwards (
          test (
              [
                  '{vs_u}{tta}{nnna}{pulli}' or
                  '{vs_i}{la}{pulli}{la}{vs_ai}' or
                  '{vs_i}{tta}{ma}{pulli}' or
                  '{vs_i}{nnna}{pulli}{rra}{vs_i}' or
                  '{vs_aa}{ka}{vs_i}' or
                  '{vs_aa}{ka}{vs_i}{ya}' or
                  '{vs_e}{nnna}{pulli}{rra}{vs_u}' or
                  '{vs_u}{lla}{pulli}{lla}' or
                  '{vs_u}{tta}{vs_ai}{ya}' or
                  '{vs_u}{tta}{vs_ai}' or
                  '{vs_e}{nnna}{vs_u}{ma}{pulli}' or
                  // la+pulli+la counts only when no vowel sign precedes it
                  ('{la}{pulli}{la}' test (not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                  '{vs_e}{nnna}'
                  // A second '{vs_aa}{ka}{vs_i}' used to close this chain; it
                  // duplicated the entry above and could never match, so it
                  // was removed.
              ] <- '{pulli}'
              (set found_a_match)
          )
          or
          test (
              [ among(
                  '{pa}{tta}{vs_u}'
                  '{pa}{tta}{pulli}{tta}'
                  '{pa}{tta}{pulli}{tta}{vs_u}'
                  '{pa}{tta}{pulli}{tta}{ta}{vs_u}'
                  '{pa}{tta}{pulli}{tta}{nna}'
                  '{ka}{vs_u}{ra}{vs_i}{ya}'
                  '{pa}{rra}{pulli}{rra}{vs_i}'
                  '{va}{vs_i}{tta}{vs_u}'
                  '{va}{vs_i}{tta}{pulli}{tta}{vs_u}'
                  '{pa}{tta}{vs_i}{ta}{vs_aa}{nnna}'
                  '{pa}{tta}{vs_i}'
                  '{ta}{vs_aa}{nnna}'
                  '{vs_e}{la}{pulli}{la}{vs_aa}{ma}{pulli}'
              ) ] delete
              (set found_a_match)
          )
      )
      do fix_endings
  )
  // Removes one vetrumai urupu (case marker) from the end of the word.
  // Five groups are tried in order inside 'backwards'; each is wrapped in
  // 'test', so the cursor is back at the end of the word afterwards.
  // When a group applies, both found_a_match and found_vetrumai_urupu are
  // set and a remaining final i+nnna+pulli is reduced to pulli. When none
  // applies the routine signals f and the trailing fix_endings is not run.
  define remove_vetrumai_urupukal as (
      unset found_a_match
      unset found_vetrumai_urupu
      has_min_length
      backwards (
          (
              // group 1: final nnna + ai is deleted
              test ( ['{nnna}{vs_ai}'] delete )
              or
              // group 2: endings in the vowel sign ai, replaced by pulli
              test ([
                  ( '{vs_i}{nnna}{vs_ai}' or
                    '{vs_ai}' (test not among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}'))) or
                  ( '{vs_ai}' (test (among('{ka}' '{ca}' '{tta}' '{tha}' '{pa}' '{rra}') '{pulli}')))
                  ] <- '{pulli}'
              )
              or
              // group 3: further endings replaced by pulli (first match wins)
              test ( [
                  '{vs_o}{tta}{vs_u}' or
                  '{vs_oo}{tta}{vs_u}' or
                  '{vs_i}{la}{pulli}' or
                  '{vs_i}{rra}{pulli}' or
                  ('{vs_i}{nnna}{pulli}' (test not '{ma}')) or
                  '{vs_i}{nnna}{pulli}{rra}{vs_u}' or
                  '{vs_i}{ra}{vs_u}{na}{pulli}{ta}{vs_u}' or
                  '{va}{vs_i}{tta}' or
                  ($(len >= 7) '{vs_i}{tta}{ma}{pulli}') or
                  '{vs_aa}{la}{pulli}' or
                  '{vs_u}{tta}{vs_ai}' or
                  '{vs_aa}{ma}{la}{pulli}' or
                  ('{la}{pulli}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                  '{vs_u}{lla}{pulli}'
                  ] <- '{pulli}'
              )
              or
              // group 4: endings deleted outright
              test ( [
                  '{ka}{nna}{pulli}' or
                  '{ma}{vs_u}{nnna}{pulli}' or
                  '{ma}{vs_ee}{la}{pulli}' or
                  '{ma}{vs_ee}{rra}{pulli}' or
                  '{ka}{vs_ii}{llla}{pulli}' or
                  '{pa}{vs_i}{nnna}{pulli}' or
                  ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}')))
                  ] delete
              )
              or
              // group 5: final vowel sign ii becomes vowel sign i
              test ([ '{vs_ii}' ] <- '{vs_i}')
          )
          (set found_a_match)
          (set found_vetrumai_urupu)
          do ( [ '{vs_i}{nnna}{pulli}' ] <- '{pulli}' )
      )
      do fix_endings
  )
  // Calls remove_tense_suffix again and again, stopping after the first
  // call that leaves found_a_match unset.
  define remove_tense_suffixes as (
      set found_a_match
      repeat (
          found_a_match
          do remove_tense_suffix
      )
  )
  // Removes one layer of tense marking from the end of the word, for words
  // passing has_min_length. found_a_match reports whether anything changed.
  //
  // Step 1 tries four groups in order and applies the first that matches:
  //   a. two long endings, deleted;
  //   b. an 'or' chain of endings, deleted;
  //   c. an 'or' chain of endings, replaced by pulli;
  //   d. ka+u or ta+u directly after a pulli, deleted.
  // Step 2 then deletes one of the endings in its 'among' list, if present.
  // Both steps are wrapped in 'do', and fix_endings runs afterwards.
  define remove_tense_suffix as (
      unset found_a_match
      has_min_length
      backwards (
          do (
              // group a
              test (
                  [ among(
                      '{ka}{vs_o}{nna}{pulli}{tta}{vs_i}{ra}{pulli}'
                      '{pa}{tta}{vs_u}'
                  ) ] delete
                  (set found_a_match)
              )
              or
              // group b
              test (
                  [
                      '{ma}{vs_aa}{ra}{pulli}' or
                      '{ma}{vs_i}{nnna}{pulli}' or
                      '{nnna}{nnna}{pulli}' or
                      '{nnna}{vs_aa}{nnna}{pulli}' or
                      '{nnna}{vs_aa}{lla}{pulli}' or
                      '{nnna}{vs_aa}{ra}{pulli}' or
                      // va+nnna+pulli counts only when no independent vowel precedes it
                      ('{va}{nnna}{pulli}' test (not among('{a}' '{aa}' '{i}' '{ii}' '{u}' '{uu}' '{e}' '{ee}' '{ai}' '{o}' '{oo}' '{au}')) ) or
                      '{nnna}{lla}{pulli}' or
                      '{va}{lla}{pulli}' or
                      '{nnna}{ra}{pulli}' or
                      '{va}{ra}{pulli}' or
                      '{nnna}' or '{pa}' or '{ka}' or '{ta}' or '{ya}' or
                      '{pa}{nnna}{pulli}' or
                      '{pa}{lla}{pulli}' or
                      '{pa}{ra}{pulli}' or
                      // ta+u counts only when no vowel sign precedes it
                      ('{ta}{vs_u}' (test not among('{vs_aa}' '{vs_i}' '{vs_ii}' '{vs_e}' '{vs_ee}' '{vs_u}' '{vs_uu}' '{vs_ai}'))) or
                      '{vs_i}{rra}{pulli}{rra}{vs_u}' or
                      '{pa}{ma}{pulli}' or
                      '{nnna}{ma}{pulli}' or
                      '{ta}{vs_u}{ma}{pulli}' or
                      '{rra}{vs_u}{ma}{pulli}' or
                      '{ka}{vs_u}{ma}{pulli}' or
                      '{nnna}{vs_e}{nnna}{pulli}' or
                      '{nnna}{vs_ai}' or
                      '{va}{vs_ai}'
                  ] delete
                  (set found_a_match)
              )
              or
              // group c
              // Four entries that used to be listed here (ka+u+ma+pulli,
              // ta+u+ma+pulli, rra+u+ma+pulli and nnna+e+nnna+pulli) repeated
              // strings from group b. Group b always deletes those endings
              // first, so the repeats were unreachable and have been removed.
              test (
                  [
                      ('{vs_aa}{nnna}{pulli}' test (not '{ca}')) or
                      '{vs_aa}{lla}{pulli}' or
                      '{vs_aa}{ra}{pulli}' or
                      '{vs_ee}{nnna}{pulli}' or
                      '{vs_aa}' or
                      '{vs_aa}{ma}{pulli}' or
                      '{vs_e}{ma}{pulli}' or
                      '{vs_ee}{ma}{pulli}' or
                      '{vs_oo}{ma}{pulli}' or
                      '{tta}{vs_u}{ma}{pulli}' or
                      '{vs_aa}{ya}{pulli}' or
                      '{nnna}{vs_i}{ra}{pulli}' or
                      '{vs_ii}{ra}{pulli}' or
                      '{vs_ii}{ya}{ra}{pulli}'
                  ] <- '{pulli}'
                  (set found_a_match)
              )
              or
              // group d
              test (
                  [ ('{ka}{vs_u}' or '{ta}{vs_u}') (test '{pulli}') ] delete
                  (set found_a_match)
              )
          )
          do (
              [ among(
                  '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}'
                  '{vs_aa}{na}{vs_i}{nnna}{pulli}{rra}{pulli}'
                  '{ka}{vs_i}{nnna}{pulli}{rra}'
                  '{ka}{vs_i}{nnna}{pulli}{rra}{pulli}'
                  '{ka}{vs_i}{rra}'
                  '{ka}{vs_i}{rra}{pulli}'
              ) ] delete
              (set found_a_match)
          )
      )
      do fix_endings
  )
  // Entry point of the stemmer (the only routine listed in 'externals').
  // One pass of fix_ending is always attempted. Words that then fail
  // has_min_length are left as they are and stem signals f. Otherwise each
  // removal step runs in turn under 'do', so a step that fails does not
  // stop the ones after it.
  define stem as (
      unset found_vetrumai_urupu
      do fix_ending
      has_min_length

      // prefixes
      do remove_question_prefixes
      do remove_pronoun_prefixes

      // suffixes
      do remove_question_suffixes
      do remove_um
      do remove_common_word_endings
      do remove_vetrumai_urupukal
      do remove_plural_suffix
      do remove_command_suffixes
      do remove_tense_suffixes
  )