aboutsummaryrefslogtreecommitdiffstats
path: root/contrib/snowball/algorithms/turkish.sbl
blob: eadd61d02b466cf707998068f481d67d4e503d9c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
/* Stemmer for Turkish
	* author: Evren (Kapusuz) Çilden
	* email: evren.kapusuz at gmail.com
	* version: 1.0 (15.01.2007)


	* stems nominal verb suffixes
	* stems nominal inflections
	* more than one syllable word check
	* (y,n,s,U) context check
	* vowel harmony check
	* last consonant check and conversion (b, c, d, ğ to p, ç, t, k)

	* The stemming algorithm is based on the paper "An Affix Stripping
	* Morphological Analyzer for Turkish" by Gülşen Eryiğit and
	* Eşref Adalı (Proceedings of the IASTED International Conference
	* ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
	* Innsbruck, Austria).

	* Turkish is an agglutinative language with a very rich morphological
	* structure. In Turkish, many different words can be formed from a single
	* stem by appending a sequence of suffixes. E.g. the word "doktoruymuşsunuz"
	* means "You had been the doctor of him". The stem of the word is "doktor"
	* and it takes three different suffixes: -sU, -ymUs, and -sUnUz. The rules
	* governing the order in which suffixes are appended can be described
	* clearly as FSMs. The paper referenced above defines some FSMs for
	* right-to-left morphological analysis. I devised a method for constructing
	* snowball expressions from right-to-left FSMs for stemming suffixes.
*/

// Declaration of every routine defined in this file.
//
// Naming convention for the mark_* routines (see each definition for the
// exact surface forms it accepts): capital letters stand for sounds that
// vary between forms -- A = a/e, U = dotless-i/i/u/u-diaeresis, D = d/t.
// In some names (mark_yA, mark_yU, mark_ylA, mark_ncA, mark_nUn, mark_sU,
// mark_yUm, mark_yUz, mark_yDU, mark_ysA, mark_ymUs_, mark_yken) the
// lowercase y/n/s is an optional consonant handled by one of the
// mark_suffix_with_optional_* helpers; in others (e.g. mark_nA, mark_nU,
// mark_sUn, mark_nUz) that letter is matched literally.
routines (
	append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
	check_vowel_harmony	// tests vowel harmony for suffixes
	is_reserved_word	// tests whether current string is a reserved word ('ad','soyad')
	mark_cAsInA		// nominal verb suffix
	mark_DA			// noun suffix
	mark_DAn		// noun suffix
	mark_DUr		// nominal verb suffix
	mark_ki			// noun suffix
	mark_lAr		// noun suffix, nominal verb suffix
	mark_lArI		// noun suffix
	mark_nA			// noun suffix
	mark_ncA		// noun suffix
	mark_ndA		// noun suffix
	mark_ndAn		// noun suffix
	mark_nU			// noun suffix
	mark_nUn		// noun suffix
	mark_nUz		// nominal verb suffix
	mark_sU			// noun suffix
	mark_sUn		// nominal verb suffix
	mark_sUnUz		// nominal verb suffix
	mark_possessives	// -(U)m, -(U)n, -(U)mUz, -(U)nUz
	mark_yA			// noun suffix
	mark_ylA		// noun suffix
	mark_yU			// noun suffix
	mark_yUm		// nominal verb suffix
	mark_yUz		// nominal verb suffix
	mark_yDU		// nominal verb suffix
	mark_yken		// nominal verb suffix
	mark_ymUs_		// nominal verb suffix
	mark_ysA		// nominal verb suffix

	// context checks applied to the text that precedes a matched suffix
	mark_suffix_with_optional_y_consonant
	mark_suffix_with_optional_U_vowel
	mark_suffix_with_optional_n_consonant
	mark_suffix_with_optional_s_consonant

	more_than_one_syllable_word	// tests that the word contains at least two vowels

	post_process_last_consonants	// rewrites a final b, c, d or g-breve
	postlude			// fix-ups applied after suffix removal

	stem_nominal_verb_suffixes	// removes nominal verb suffix chains
	stem_noun_suffixes		// removes noun suffix chains
	stem_suffix_chain_before_ki	// removes noun suffix chains ending with -ki
)

// '{' and '}' delimit escape sequences inside string literals
stringescapes	{ }

/* Special characters in Unicode Latin-1 and Latin Extended-A */
stringdef c,	'{U+00E7}'	// LATIN SMALL LETTER C WITH CEDILLA
stringdef g~	'{U+011F}'	// LATIN SMALL LETTER G WITH BREVE
stringdef i'	'{U+0131}'	// LATIN SMALL LETTER DOTLESS I
stringdef o"	'{U+00F6}'	// LATIN SMALL LETTER O WITH DIAERESIS
stringdef s,	'{U+015F}'	// LATIN SMALL LETTER S WITH CEDILLA
stringdef u"	'{U+00FC}'	// LATIN SMALL LETTER U WITH DIAERESIS

// set at the start of stem_nominal_verb_suffixes, cleared in its mark_lAr
// branch, and tested in stem before stem_noun_suffixes is attempted
booleans	( continue_stemming_noun_suffixes )

groupings	( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6)

define vowel	'ae{i'}io{o"}u{u"}'	// all eight vowels
define U	'{i'}iu{u"}'		// the four vowels the "U" in routine names stands for

// The groupings below are used by check_vowel_harmony: after it has matched
// a given vowel, it searches backward for a member of the grouping listed
// for that vowel.
define vowel1	'a{i'}ou'		// searched for after matching 'a'
define vowel2	'ei{o"}{u"}'		// searched for after matching 'e'
define vowel3	'a{i'}'			// searched for after matching dotless i
define vowel4	'ei'			// searched for after matching 'i'
define vowel5	'ou'			// searched for after matching 'o' or 'u'
define vowel6	'{o"}{u"}'		// searched for after matching o-diaeresis or u-diaeresis

// stem is the only routine declared external
externals	( stem )

backwardmode (
	// Vowel-harmony guard, added to limit over-stemming.
	// Scanning backward from the cursor it locates the nearest vowel
	// (typically the vowel of the suffix about to be matched), consumes it,
	// and then requires that somewhere further back there is a character
	// from the grouping paired with that vowel.
	// The whole check is wrapped in `test`, so the cursor is not moved.
	define check_vowel_harmony as (
		test (
			goto vowel	// fails when no vowel precedes the cursor
			(
				('a' goto vowel1)
				or ('e' goto vowel2)
				or ('{i'}' goto vowel3)
				or ('i' goto vowel4)
				or (('o' or 'u') goto vowel5)
				or (('{o"}' or '{u"}') goto vowel6)
			)
		)
	)

	// Context check for a suffix that may be preceded by a buffer 'n'.
	// The caller has already opened the slice and matched the suffix body.
	//   * preceding character is 'n' and the one before it is a vowel:
	//     succeed and consume the 'n' (it becomes part of the marked span)
	//   * preceding character is not 'n' and the one before it is a vowel:
	//     succeed without consuming anything
	//   * anything else: fail
	define mark_suffix_with_optional_n_consonant as (
		('n' test vowel)
		or
		(not 'n' test (next vowel))
	)

	// Context check for a suffix that may be preceded by a buffer 's'.
	// The caller has already opened the slice and matched the suffix body.
	//   * preceding character is 's' and the one before it is a vowel:
	//     succeed and consume the 's' (it becomes part of the marked span)
	//   * preceding character is not 's' and the one before it is a vowel:
	//     succeed without consuming anything
	//   * anything else: fail
	define mark_suffix_with_optional_s_consonant as (
		('s' test vowel)
		or
		(not 's' test (next vowel))
	)

	// Context check for a suffix that may be preceded by a buffer 'y'.
	// The caller has already opened the slice and matched the suffix body.
	//   * preceding character is 'y' and the one before it is a vowel:
	//     succeed and consume the 'y' (it becomes part of the marked span)
	//   * preceding character is not 'y' and the one before it is a vowel:
	//     succeed without consuming anything
	//   * anything else: fail
	define mark_suffix_with_optional_y_consonant as (
		('y' test vowel)
		or
		(not 'y' test (next vowel))
	)

	// Context check for a suffix that may be preceded by a buffer vowel
	// from the grouping U.
	//   * preceding character is in U and the one before it is not a vowel:
	//     succeed and consume that character
	//   * preceding character is not in U and the one before it is not a
	//     vowel: succeed without consuming anything
	//   * anything else: fail
	define mark_suffix_with_optional_U_vowel as (
		(U test non-vowel)
		or
		(not U test (next non-vowel))
	)

	// Possessive endings -(U)m, -(U)n, -(U)mUz, -(U)nUz.
	// The consonantal part is matched here; the optional leading U is
	// handled by the context check that follows.
	define mark_possessives as (
		among (
			'm'
			'n'
			'm{i'}z' 'miz' 'muz' 'm{u"}z'
			'n{i'}z' 'niz' 'nuz' 'n{u"}z'
		)
		mark_suffix_with_optional_U_vowel
	)

	// -(s)U: one vowel from U, subject to vowel harmony, followed by the
	// optional 's' context check on what precedes it.
	define mark_sU as (
		check_vowel_harmony
		U
		mark_suffix_with_optional_s_consonant
	)

	// -lArI: matches the literal endings 'leri' / 'lari' (dotless i).
	// No separate vowel-harmony check is made here.
	define mark_lArI as (
		'leri' or 'lar{i'}'
	)

	// -(y)U: one vowel from U, subject to vowel harmony, followed by the
	// optional 'y' context check on what precedes it.
	define mark_yU as (
		check_vowel_harmony
		U
		mark_suffix_with_optional_y_consonant
	)

	// -nU: 'n' followed by a vowel from U, subject to vowel harmony.
	// Written in backward-matching order: the final vowel first, then the
	// 'n' in front of it.
	define mark_nU as (
		check_vowel_harmony
		U 'n'
	)

	// -(n)Un: a vowel from U followed by 'n', subject to vowel harmony,
	// then the optional 'n' context check on what precedes it.
	// Backward-matching order: final 'n' first, then the vowel.
	define mark_nUn as (
		check_vowel_harmony
		'n' U
		mark_suffix_with_optional_n_consonant
	)

	// -(y)A: 'a' or 'e', subject to vowel harmony, then the optional 'y'
	// context check on what precedes it.
	define mark_yA as (
		check_vowel_harmony
		('a' or 'e')
		mark_suffix_with_optional_y_consonant
	)

	// -nA: 'na' or 'ne', subject to vowel harmony.
	// Backward-matching order: the vowel first, then the 'n' before it.
	define mark_nA as (
		check_vowel_harmony
		('a' or 'e') 'n'
	)

	// -DA: 'da', 'de', 'ta' or 'te', subject to vowel harmony.
	// Backward-matching order: the vowel first, then 'd' or 't' before it.
	define mark_DA as (
		check_vowel_harmony
		('a' or 'e') ('d' or 't')
	)

	// -ndA: 'nda' or 'nde', subject to vowel harmony.
	// Backward-matching order: the vowel first, then the 'nd' before it.
	define mark_ndA as (
		check_vowel_harmony
		('a' or 'e') 'nd'
	)

	// -DAn: 'dan', 'den', 'tan' or 'ten', subject to vowel harmony.
	// Backward-matching order: final 'n', then the vowel, then 'd' or 't'.
	define mark_DAn as (
		check_vowel_harmony
		'n' ('a' or 'e') ('d' or 't')
	)

	// -ndAn: 'ndan' or 'nden', subject to vowel harmony.
	// Backward-matching order: final 'n', then the vowel, then 'nd'.
	define mark_ndAn as (
		check_vowel_harmony
		'n' ('a' or 'e') 'nd'
	)

	// -(y)lA: 'la' or 'le', subject to vowel harmony, then the optional
	// 'y' context check on what precedes it.
	// Backward-matching order: the vowel first, then the 'l' before it.
	define mark_ylA as (
		check_vowel_harmony
		('a' or 'e') 'l'
		mark_suffix_with_optional_y_consonant
	)

	// -ki: matches the literal 'ki'; no harmony or context check.
	define mark_ki as 'ki'

	// -(n)cA: 'ca' or 'ce', subject to vowel harmony, then the optional
	// 'n' context check on what precedes it.
	// Backward-matching order: the vowel first, then the 'c' before it.
	define mark_ncA as (
		check_vowel_harmony
		('a' or 'e') 'c'
		mark_suffix_with_optional_n_consonant
	)

	// -(y)Um: a vowel from U followed by 'm', subject to vowel harmony,
	// then the optional 'y' context check on what precedes it.
	// Backward-matching order: final 'm' first, then the vowel.
	define mark_yUm as (
		check_vowel_harmony
		'm' U
		mark_suffix_with_optional_y_consonant
	)

	// -sUn: 's', a vowel from U, then 'n', subject to vowel harmony.
	// Backward-matching order: final 'n', then the vowel, then 's'.
	define mark_sUn as (
		check_vowel_harmony
		'n' U 's'
	)

	// -(y)Uz: a vowel from U followed by 'z', subject to vowel harmony,
	// then the optional 'y' context check on what precedes it.
	// Backward-matching order: final 'z' first, then the vowel.
	define mark_yUz as (
		check_vowel_harmony
		'z' U
		mark_suffix_with_optional_y_consonant
	)

	// -sUnUz: the four listed forms only (both vowels identical).
	// No separate vowel-harmony check is made here.
	define mark_sUnUz as (
		among (
			's{i'}n{i'}z'
			'siniz'
			'sunuz'
			's{u"}n{u"}z'
		)
	)

	// -lAr: 'lar' or 'ler', subject to vowel harmony.
	define mark_lAr as (
		check_vowel_harmony
		('lar' or 'ler')
	)

	// -nUz: 'n', a vowel from U, then 'z', subject to vowel harmony.
	// Backward-matching order: final 'z', then the vowel, then 'n'.
	define mark_nUz as (
		check_vowel_harmony
		'z' U 'n'
	)

	// -DUr: 't' or 'd', a vowel from U, then 'r', subject to vowel harmony.
	// Backward-matching order: final 'r', then the vowel, then 't' or 'd'.
	define mark_DUr as (
		check_vowel_harmony
		'r' U ('t' or 'd')
	)

	// -cAsInA: matches the literal endings 'casina' (with dotless i) or
	// 'cesine'. No separate vowel-harmony check is made here.
	define mark_cAsInA as (
		'cas{i'}na' or 'cesine'
	)

	// -(y)DU with optional person ending: 't' or 'd', a vowel from U, and
	// optionally a final 'm', 'n' or 'k' (32 surface forms in total),
	// subject to vowel harmony, then the optional 'y' context check.
	// Backward-matching order: optional final consonant, vowel, 't'/'d'.
	define mark_yDU as (
		check_vowel_harmony
		try ('m' or 'n' or 'k')
		U
		('t' or 'd')
		mark_suffix_with_optional_y_consonant
	)

	// -(y)sA with optional person ending: 's', then 'a' or 'e', and
	// optionally a final 'm', 'n' or 'k' (8 surface forms in total),
	// then the optional 'y' context check.
	// This suffix does not fully obey vowel harmony, so
	// check_vowel_harmony is deliberately not called.
	// Backward-matching order: optional final consonant, vowel, 's'.
	define mark_ysA as (
		try ('m' or 'n' or 'k')
		('a' or 'e')
		's'
		mark_suffix_with_optional_y_consonant
	)

	// -(y)mUs_: 'm', a vowel from U, then s-cedilla, subject to vowel
	// harmony, then the optional 'y' context check on what precedes it.
	// Backward-matching order: final s-cedilla, then the vowel, then 'm'.
	define mark_ymUs_ as (
		check_vowel_harmony
		'{s,}' U 'm'
		mark_suffix_with_optional_y_consonant
	)

	// -(y)ken: the literal 'ken', then the optional 'y' context check on
	// what precedes it. No vowel-harmony check is made.
	define mark_yken as (
		'ken'
		mark_suffix_with_optional_y_consonant
	)

	// Removes a chain of nominal verb suffixes from the end of the word.
	//
	// The leading `[` opens the slice at the cursor. Each alternative then
	// matches one or more suffix markers moving backward. Alternatives that
	// contain their own `] delete` remove the outermost suffix immediately
	// and use `try([ ...)` to re-open the slice and look for one more
	// suffix in front of it. The trailing `]delete` removes whatever span
	// is still open when the chosen alternative finishes.
	//
	// Side effect: continue_stemming_noun_suffixes is set on entry (and
	// stays set even if every alternative fails) and is cleared only by
	// the mark_lAr alternative.
	define stem_nominal_verb_suffixes as (
		[
			set continue_stemming_noun_suffixes
			// a single -(y)mUs_, -(y)DU, -(y)sA or -(y)ken
			(mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
			or
			// -cAsInA, then optionally a person marker, then a required -(y)mUs_
			(mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
			or
			(
				// -lAr is deleted first, then optionally one more suffix is marked;
				// noun-suffix stemming is switched off for this word
				mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
				unset continue_stemming_noun_suffixes
			)
			or
			// -nUz is only accepted when -(y)DU or -(y)sA precedes it
			(mark_nUz (mark_yDU or mark_ysA))
			or
			// a person marker is deleted, then optionally -(y)mUs_ is marked
			((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
			or
			// -DUr is deleted, then optionally [person marker +] -(y)mUs_ is marked
			(mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
		]delete
	)

	// stems noun suffix chains ending with -ki
	//
	// The slice is opened before mark_ki, so in each alternative the first
	// `] delete` removes 'ki' together with the suffix(es) matched so far.
	// The routine fails (and deletes nothing) unless 'ki' is preceded by
	// -DA, -(n)Un or -ndA. It calls itself recursively to handle further
	// -ki chains in front of the removed material.
	define stem_suffix_chain_before_ki as (
		[
			mark_ki
			(
				// ...DA-ki: delete both, then optionally strip -lAr or a
				// possessive (each possibly followed by more) in front of them
				(mark_DA] delete try([
					(mark_lAr] delete try(stem_suffix_chain_before_ki))
					or
					(mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))

				))
				or
				// ...(n)Un-ki: delete both, then optionally strip -lArI, a
				// possessive / -(s)U, or a further -ki chain
				(mark_nUn] delete try([
					(mark_lArI] delete)
					or
					([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
					or
					(stem_suffix_chain_before_ki)
				))
				or
				// ...ndA-ki: nothing is deleted unless one of the inner
				// alternatives succeeds; the first two close the slice so that
				// 'ki', -ndA and the inner suffix are removed in one deletion
				(mark_ndA (
					(mark_lArI] delete)
					or
					((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
					or
					(stem_suffix_chain_before_ki)
				))
			)
	)

	// Removes a chain of noun suffixes from the end of the word.
	//
	// The alternatives are tried in order and the first one that succeeds
	// wins. Within an alternative, `[ ... ] delete` removes the suffix just
	// matched, and `try(...)` then attempts to remove further suffixes in
	// front of it without making the alternative fail if none is found.
	// Deletions already performed are not undone when a later step fails.
	define stem_noun_suffixes as (
		// -lAr, optionally preceded by a -ki chain
		([mark_lAr] delete try(stem_suffix_chain_before_ki))
		or
		// -(n)cA, optionally preceded by -lArI, a possessive / -(s)U, or -lAr + -ki chain
		([mark_ncA] delete
			try(
				([mark_lArI] delete)
				or
				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				([mark_lAr] delete stem_suffix_chain_before_ki)
			)
		)
		or
		// -ndA / -nA: accepted only when -lArI, -(s)U or a -ki chain precedes;
		// the slice opened here is closed inside the first two inner alternatives
		([(mark_ndA or mark_nA)
			(
				(mark_lArI] delete)
				or
				(mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				(stem_suffix_chain_before_ki)
			)
		)
		or
		// -ndAn / -nU: accepted only when -(s)U or -lArI precedes.
		// NOTE(review): unlike the sibling alternatives, the (mark_lArI) case
		// below neither closes the slice nor deletes anything, yet it makes
		// this alternative succeed -- confirm this is intentional.
		([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
		or
		// -DAn, optionally preceded by a possessive, -lAr, or a -ki chain
		( [mark_DAn] delete try ([
			(
				(mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				(mark_lAr] delete try(stem_suffix_chain_before_ki))
				or
				(stem_suffix_chain_before_ki)
			))
		)
		or
		// -(n)Un / -(y)lA, optionally preceded by -lAr + -ki chain,
		// a possessive / -(s)U, or a -ki chain
		([mark_nUn or mark_ylA] delete
			try(
				([mark_lAr] delete stem_suffix_chain_before_ki)
				or
				([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
				or
				stem_suffix_chain_before_ki
			)
		)
		or
		// a bare -lArI
		([mark_lArI] delete)
		or
		// a bare -ki chain
		(stem_suffix_chain_before_ki)
		or
		// -DA / -(y)U / -(y)A, optionally preceded by [possessive +] -lAr,
		// after which a -ki chain is attempted with the slice re-opened
		([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
		or
		// a possessive or -(s)U, optionally preceded by -lAr + -ki chain
		([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
	)

	// Rewrites the final character of the stem when it is one of
	// b, c, d or g-breve:  b -> p,  c -> c-cedilla,  d -> t,  g-breve -> k.
	// Fails (changing nothing) when the stem ends in any other character.
	define post_process_last_consonants as (
		[
		(
			('b' ] <- 'p')
			or ('c' ] <- '{c,}')
			or ('d' ] <- 't')
			or ('{g~}' ] <- 'k')
		)
	)

	// Repairs a likely over-stemming: if the stem now ends in 'd' or 'g',
	// a final vowel from U has most probably been stripped by mistake
	// (e.g. 'kedim' -> 'ked'), since Turkish words do not usually end in
	// those letters. A few very common words that do (such as 'ad' and
	// 'soyad') are excluded by the caller.
	// The vowel inserted at the cursor is selected from the nearest vowel
	// found scanning backward:
	//   a / dotless i               -> dotless i
	//   e / i                       -> i
	//   o / u                       -> u
	//   o-diaeresis / u-diaeresis   -> u-diaeresis
	define append_U_to_stems_ending_with_d_or_g as (
		test ('d' or 'g')
		(
			(test (goto vowel ('a' or '{i'}')) <+ '{i'}')
			or
			(test (goto vowel ('e' or 'i')) <+ 'i')
			or
			(test (goto vowel ('o' or 'u')) <+ 'u')
			or
			(test (goto vowel ('{o"}' or '{u"}')) <+ '{u"}')
		)
	)

	// Succeeds when the text before the cursor, back to the limit, is
	// exactly 'ad' or exactly 'soyad'.
	define is_reserved_word as (
		('soyad' or 'ad') atlimit
	)
)

// Succeeds when the word has more than one syllable.
// In Turkish each vowel marks a distinct syllable, so it is enough to be
// able to move past a vowel twice. The cursor is left unchanged.
define more_than_one_syllable_word as (
	test (gopast vowel gopast vowel)
)

// Fix-ups applied after suffix removal, working backward from the end of
// the word. Nothing is done for the reserved words recognised by
// is_reserved_word; otherwise a final vowel is restored where appropriate
// and then the final consonant is rewritten. Each fix-up runs under `do`,
// so its failure does not make postlude fail.
define postlude as backwards (
	not is_reserved_word
	do append_U_to_stems_ending_with_d_or_g
	do post_process_last_consonants
)

// Entry point.
// Words with fewer than two vowels are rejected untouched. Otherwise
// nominal verb suffixes are stripped first; noun suffixes are stripped
// only if continue_stemming_noun_suffixes is still set afterwards. If that
// flag has been cleared the backward block fails, so stem fails and
// postlude is not run. When the block succeeds, postlude runs last.
define stem as (
	more_than_one_syllable_word
	backwards (
		do stem_nominal_verb_suffixes
		continue_stemming_noun_suffixes
		do stem_noun_suffixes
	)
	postlude
)