mirror of https://gitee.com/bigwinds/arangodb
478 lines
12 KiB
Plaintext
478 lines
12 KiB
Plaintext
/* Stemmer for Turkish
    * author: Evren (Kapusuz) Çilden
    * email: evren.kapusuz at gmail.com
    * version: 1.0 (15.01.2007)

    * stems nominal verb suffixes
    * stems nominal inflections
    * more than one syllable word check
    * (y,n,s,U) context check
    * vowel harmony check
    * last consonant check and conversion (b, c, d, ğ to p, ç, t, k)

    * The stemming algorithm is based on the paper "An Affix Stripping
    * Morphological Analyzer for Turkish" by Gülşen Eryiğit and
    * Eşref Adalı (Proceedings of the IASTED International Conference
    * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18, 2004,
    * Innsbruck, Austria).

    * Turkish is an agglutinative language and has a very rich morphological
    * structure. In Turkish, you can form many different words from a single stem
    * by appending a sequence of suffixes. E.g. the word "doktoruymuşsunuz" means
    * "You had been the doctor of him". The stem of the word is "doktor" and it
    * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about
    * the append order of suffixes can be clearly described as FSMs.
    * The paper referenced above defines some FSMs for right to left
    * morphological analysis. I generated a method for constructing snowball
    * expressions from right to left FSMs for stemming suffixes.
*/
|
||
|
||
// Forward declaration of every routine defined in this file.
routines (
    append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings
    check_vowel_harmony       // tests vowel harmony for suffixes
    is_reserved_word          // tests whether current string is a reserved word ('ad','soyad')
    mark_cAsInA               // nominal verb suffix
    mark_DA                   // noun suffix
    mark_DAn                  // noun suffix
    mark_DUr                  // nominal verb suffix
    mark_ki                   // noun suffix
    mark_lAr                  // noun suffix, nominal verb suffix
    mark_lArI                 // noun suffix
    mark_nA                   // noun suffix
    mark_ncA                  // noun suffix
    mark_ndA                  // noun suffix
    mark_ndAn                 // noun suffix
    mark_nU                   // noun suffix
    mark_nUn                  // noun suffix
    mark_nUz                  // nominal verb suffix
    mark_sU                   // noun suffix
    mark_sUn                  // nominal verb suffix
    mark_sUnUz                // nominal verb suffix
    mark_possessives          // -(U)m, -(U)n, -(U)mUz, -(U)nUz
    mark_yA                   // noun suffix
    mark_ylA                  // noun suffix
    mark_yU                   // noun suffix
    mark_yUm                  // nominal verb suffix
    mark_yUz                  // nominal verb suffix
    mark_yDU                  // nominal verb suffix
    mark_yken                 // nominal verb suffix
    mark_ymUs_                // nominal verb suffix
    mark_ysA                  // nominal verb suffix

    // context checks for suffixes that take an optional buffer letter
    mark_suffix_with_optional_y_consonant
    mark_suffix_with_optional_U_vowel
    mark_suffix_with_optional_n_consonant
    mark_suffix_with_optional_s_consonant

    more_than_one_syllable_word

    post_process_last_consonants
    postlude

    stem_nominal_verb_suffixes
    stem_noun_suffixes
    stem_suffix_chain_before_ki
)
|
||
|
||
/* Special characters in Unicode Latin-1 and Latin Extended-A.
 * Each stringdef gives a short escape name to one non-ASCII Turkish letter;
 * inside string literals the name is written between the { } escapes
 * declared by stringescapes below, e.g. '{i'}'.
 */
stringdef c.   hex 'E7'    // LATIN SMALL LETTER C WITH CEDILLA
stringdef g~   hex '011F'  // LATIN SMALL LETTER G WITH BREVE
stringdef i'   hex '0131'  // LATIN SMALL LETTER DOTLESS I
stringdef o"   hex 'F6'    // LATIN SMALL LETTER O WITH DIAERESIS
stringdef s.   hex '015F'  // LATIN SMALL LETTER S WITH CEDILLA
stringdef u"   hex 'FC'    // LATIN SMALL LETTER U WITH DIAERESIS

stringescapes { }
|
||
|
||
integers ( strlen )  // scratch length, compared against limit in is_reserved_word

// tested by stem; set and unset inside stem_nominal_verb_suffixes
booleans ( continue_stemming_noun_suffixes )

groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6 )

define vowel 'ae{i'}io{o"}u{u"}'   // all eight vowels
define U     '{i'}iu{u"}'          // the four vowels the "U" in suffix names stands for

// The groupings below are used only by check_vowel_harmony: after it matches
// a given vowel, it searches the text before that vowel for a member of the
// corresponding grouping.
define vowel1 'a{i'}ou'       // searched for after an 'a'
define vowel2 'ei{o"}{u"}'    // searched for after an 'e'
define vowel3 'a{i'}'         // searched for after a dotless i
define vowel4 'ei'            // searched for after an 'i'
define vowel5 'ou'            // searched for after an 'o' or a 'u'
define vowel6 '{o"}{u"}'      // searched for after an o-diaeresis or a u-diaeresis

externals ( stem )
|
||
|
||
backwardmode (
|
||
// Vowel harmony guard, added to prevent over-stemming.
// Working backwards from the cursor it moves to the nearest vowel, matches
// that vowel, and then requires that a character from the corresponding
// harmony grouping (vowel1..vowel6) occurs somewhere earlier in the word.
// Note that 'goto' skips over any characters, other vowels included, so this
// only checks that *some* earlier character is in the grouping.
// The whole check is wrapped in 'test', so the cursor is left where it was.
define check_vowel_harmony as (
    test
    (
        (goto vowel)   // fails if there is no vowel at all
        (
            ('a' goto vowel1) or
            ('e' goto vowel2) or
            ('{i'}' goto vowel3) or
            ('i' goto vowel4) or
            ('o' goto vowel5) or
            ('{o"}' goto vowel6) or
            ('u' goto vowel5) or
            ('{u"}' goto vowel6)
        )
    )
)
|
||
|
||
// Context check for suffixes that take an optional buffer 'n'.
// Assumption: the slice beginning is already set correctly by the caller.
// Succeeds in one of two ways (reading backwards from the cursor):
//   1. the next character is 'n' and the one before it is a vowel:
//      the 'n' is consumed, so it becomes part of what the caller deletes;
//   2. the next character is not 'n' and the one before it is a vowel:
//      nothing is consumed, only the already-matched suffix is affected.
// Otherwise (e.g. 'n' preceded by a non-vowel) the routine fails.
define mark_suffix_with_optional_n_consonant as (
    ('n' (test vowel))
    or
    ((not(test 'n')) test(next vowel))
)
|
||
|
||
// Context check for suffixes that take an optional buffer 's'.
// Assumption: the slice beginning is already set correctly by the caller.
// Succeeds in one of two ways (reading backwards from the cursor):
//   1. the next character is 's' and the one before it is a vowel:
//      the 's' is consumed, so it becomes part of what the caller deletes;
//   2. the next character is not 's' and the one before it is a vowel:
//      nothing is consumed, only the already-matched suffix is affected.
// Otherwise (e.g. 's' preceded by a non-vowel) the routine fails.
define mark_suffix_with_optional_s_consonant as (
    ('s' (test vowel))
    or
    ((not(test 's')) test(next vowel))
)
|
||
|
||
// Context check for suffixes that take an optional buffer 'y'.
// Assumption: the slice beginning is already set correctly by the caller.
// Succeeds in one of two ways (reading backwards from the cursor):
//   1. the next character is 'y' and the one before it is a vowel:
//      the 'y' is consumed, so it becomes part of what the caller deletes;
//   2. the next character is not 'y' and the one before it is a vowel:
//      nothing is consumed, only the already-matched suffix is affected.
// Otherwise (e.g. 'y' preceded by a non-vowel) the routine fails.
define mark_suffix_with_optional_y_consonant as (
    ('y' (test vowel))
    or
    ((not(test 'y')) test(next vowel))
)
|
||
|
||
// Context check for suffixes that take an optional buffer vowel from U.
// Succeeds in one of two ways (reading backwards from the cursor):
//   1. the next character is in U and the one before it is a non-vowel:
//      the U vowel is consumed;
//   2. the next character is not in U and the one before it is a non-vowel:
//      nothing is consumed.
define mark_suffix_with_optional_U_vowel as (
    (U (test non-vowel))
    or
    ((not(test U)) test(next non-vowel))
)
|
||
|
||
// Possessive suffixes -(U)m, -(U)n, -(U)mUz, -(U)nUz.
// Matches the part after the optional vowel with among, then applies the
// optional-U-vowel context check. No vowel harmony check is made here.
define mark_possessives as (
    among ('m{i'}z' 'miz' 'muz' 'm{u"}z'
           'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n')
    (mark_suffix_with_optional_U_vowel)
)
|
||
|
||
// Noun suffix -(s)U: vowel harmony check, one vowel from U, then the
// optional-'s' context check.
define mark_sU as (
    check_vowel_harmony
    U
    (mark_suffix_with_optional_s_consonant)
)
|
||
|
||
// Noun suffix -lArI: matches 'leri' or 'lar' + dotless i.
// No separate vowel harmony check is made.
define mark_lArI as (
    among ('leri' 'lar{i'}')
)
|
||
|
||
// Noun suffix -(y)U: vowel harmony check, one vowel from U, then the
// optional-'y' context check.
define mark_yU as (
    check_vowel_harmony
    U
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Noun suffix -nU: vowel harmony check, then 'n' followed by a U vowel.
define mark_nU as (
    check_vowel_harmony
    among ('n{i'}' 'ni' 'nu' 'n{u"}')
)
|
||
|
||
// Noun suffix -(n)Un: vowel harmony check, a U vowel followed by 'n',
// then the optional-'n' context check.
define mark_nUn as (
    check_vowel_harmony
    among ('{i'}n' 'in' 'un' '{u"}n')
    (mark_suffix_with_optional_n_consonant)
)
|
||
|
||
// Noun suffix -(y)A: vowel harmony check, 'a' or 'e', then the
// optional-'y' context check.
define mark_yA as (
    check_vowel_harmony
    among('a' 'e')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Noun suffix -nA: vowel harmony check, then 'na' or 'ne'.
define mark_nA as (
    check_vowel_harmony
    among('na' 'ne')
)
|
||
|
||
// Noun suffix -DA: vowel harmony check, then 'da', 'de', 'ta' or 'te'.
define mark_DA as (
    check_vowel_harmony
    among('da' 'de' 'ta' 'te')
)
|
||
|
||
// Noun suffix -ndA: vowel harmony check, then 'nda' or 'nde'.
define mark_ndA as (
    check_vowel_harmony
    among('nda' 'nde')
)
|
||
|
||
// Noun suffix -DAn: vowel harmony check, then 'dan', 'den', 'tan' or 'ten'.
define mark_DAn as (
    check_vowel_harmony
    among('dan' 'den' 'tan' 'ten')
)
|
||
|
||
// Noun suffix -ndAn: vowel harmony check, then 'ndan' or 'nden'.
define mark_ndAn as (
    check_vowel_harmony
    among('ndan' 'nden')
)
|
||
|
||
// Noun suffix -(y)lA: vowel harmony check, 'la' or 'le', then the
// optional-'y' context check.
define mark_ylA as (
    check_vowel_harmony
    among('la' 'le')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Noun suffix -ki: matches the literal 'ki'; no vowel harmony check.
define mark_ki as (
    'ki'
)
|
||
|
||
// Noun suffix -(n)cA: vowel harmony check, 'ca' or 'ce', then the
// optional-'n' context check.
define mark_ncA as (
    check_vowel_harmony
    among('ca' 'ce')
    (mark_suffix_with_optional_n_consonant)
)
|
||
|
||
// Nominal verb suffix -(y)Um: vowel harmony check, a U vowel followed by
// 'm', then the optional-'y' context check.
define mark_yUm as (
    check_vowel_harmony
    among ('{i'}m' 'im' 'um' '{u"}m')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Nominal verb suffix -sUn: vowel harmony check, then 's' + U vowel + 'n'.
define mark_sUn as (
    check_vowel_harmony
    among ('s{i'}n' 'sin' 'sun' 's{u"}n' )
)
|
||
|
||
// Nominal verb suffix -(y)Uz: vowel harmony check, a U vowel followed by
// 'z', then the optional-'y' context check.
define mark_yUz as (
    check_vowel_harmony
    among ('{i'}z' 'iz' 'uz' '{u"}z')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Nominal verb suffix -sUnUz: matches one of the four listed forms.
// No separate vowel harmony check is made.
define mark_sUnUz as (
    among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z')
)
|
||
|
||
// Noun suffix and nominal verb suffix -lAr: vowel harmony check, then
// 'ler' or 'lar'.
define mark_lAr as (
    check_vowel_harmony
    among ('ler' 'lar')
)
|
||
|
||
// Nominal verb suffix -nUz: vowel harmony check, then 'n' + U vowel + 'z'.
define mark_nUz as (
    check_vowel_harmony
    among ('n{i'}z' 'niz' 'nuz' 'n{u"}z')
)
|
||
|
||
// Nominal verb suffix -DUr: vowel harmony check, then 't' or 'd' + U vowel
// + 'r'.
define mark_DUr as (
    check_vowel_harmony
    among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r')
)
|
||
|
||
// Nominal verb suffix -cAsInA: matches 'cas' + dotless i + 'na' or 'cesine'.
// No separate vowel harmony check is made.
define mark_cAsInA as (
    among ('cas{i'}na' 'cesine')
)
|
||
|
||
// Nominal verb suffix -(y)DU: vowel harmony check, then 't' or 'd' + U vowel
// optionally followed by 'm', 'n' or 'k', then the optional-'y' context check.
define mark_yDU as (
    check_vowel_harmony
    among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m'
           't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n'
           't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k'
           't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Nominal verb suffix -(y)sA: 'sa' or 'se' optionally followed by 'm', 'n'
// or 'k', then the optional-'y' context check.
// This suffix does not fully obey vowel harmony, so check_vowel_harmony is
// deliberately not called.
define mark_ysA as (
    among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Nominal verb suffix -(y)mUs (with s-cedilla): vowel harmony check,
// 'm' + U vowel + s-cedilla, then the optional-'y' context check.
define mark_ymUs_ as (
    check_vowel_harmony
    among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}')
    (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Nominal verb suffix -(y)ken: the literal 'ken', then the optional-'y'
// context check. No vowel harmony check is made.
define mark_yken as (
    'ken' (mark_suffix_with_optional_y_consonant)
)
|
||
|
||
// Strips a chain of nominal verb suffixes from the end of the word.
// The six alternatives are tried in order and the first one that succeeds
// wins; whatever slice is marked at that point is removed by the final
// ']delete'. Some alternatives delete an outer suffix first and then 'try'
// to mark a further inner suffix for that final delete.
// continue_stemming_noun_suffixes is set on entry; only the alternative that
// begins with mark_lAr unsets it.
define stem_nominal_verb_suffixes as (
    [
        set continue_stemming_noun_suffixes
        (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken)
        or
        (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)
        or
        (
            mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_))
            unset continue_stemming_noun_suffixes
        )
        or
        (mark_nUz (mark_yDU or mark_ysA))
        or
        ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_))
        or
        (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_))
    ]delete
)
|
||
|
||
// Stems noun suffix chains ending with -ki.
// Requires 'ki' at the cursor, followed (reading backwards) by one of
// -DA, -nUn or -ndA, and then deletes the suffixes that precede it.
// The routine calls itself, so several -ki chains in a row can be stripped.
define stem_suffix_chain_before_ki as (
    [
        mark_ki
        (
            // ... -DA -ki
            (mark_DA] delete try([
                (mark_lAr] delete try(stem_suffix_chain_before_ki))
                or
                (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
            ))
            or
            // ... -nUn -ki
            (mark_nUn] delete try([
                (mark_lArI] delete)
                or
                ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
                or
                (stem_suffix_chain_before_ki)
            ))
            or
            // ... -ndA -ki
            (mark_ndA (
                (mark_lArI] delete)
                or
                ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki)))
                or
                (stem_suffix_chain_before_ki)
            ))
        )
)
|
||
|
||
// Strips a chain of noun suffixes from the end of the word.
// The alternatives are tried in order; the first one that succeeds wins.
// Each alternative marks an outermost suffix and then removes the suffixes
// allowed to precede it, handing over to stem_suffix_chain_before_ki where a
// -ki chain may follow.
define stem_noun_suffixes as (
    // -lAr
    ([mark_lAr] delete try(stem_suffix_chain_before_ki))
    or
    // -(n)cA
    ([mark_ncA] delete
        try(
            ([mark_lArI] delete)
            or
            ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
            or
            ([mark_lAr] delete stem_suffix_chain_before_ki)
        )
    )
    or
    // -ndA / -nA
    ([(mark_ndA or mark_nA)
        (
            (mark_lArI] delete)
            or
            (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
            or
            (stem_suffix_chain_before_ki)
        )
    )
    or
    // -ndAn / -nU
    // NOTE(review): the trailing (mark_lArI) alternative has no '] delete',
    // unlike the other -lArI alternatives in this routine, so when it matches
    // nothing is removed but the alternative still succeeds. Confirm whether
    // that is intended before changing it.
    ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI)))
    or
    // -DAn
    ( [mark_DAn] delete try ([
        (
            (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
            or
            (mark_lAr] delete try(stem_suffix_chain_before_ki))
            or
            (stem_suffix_chain_before_ki)
        ))
    )
    or
    // -(n)Un / -(y)lA
    ([mark_nUn or mark_ylA] delete
        try(
            ([mark_lAr] delete stem_suffix_chain_before_ki)
            or
            ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
            or
            stem_suffix_chain_before_ki
        )
    )
    or
    // -lArI
    ([mark_lArI] delete)
    or
    // bare -ki chain
    (stem_suffix_chain_before_ki)
    or
    // -DA / -(y)U / -(y)A
    ([mark_DA or mark_yU or mark_yA] delete
        try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki))
    or
    // possessives / -(s)U
    ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki))
)
|
||
|
||
// Replaces the final consonant of the word:
// b -> p, c -> c-cedilla, d -> t, g-breve -> k.
// Fails, changing nothing, if the word ends in any other character.
define post_process_last_consonants as (
    [substring] among (
        'b' (<- 'p')
        'c' (<- '{c.}')
        'd' (<- 't')
        '{g~}' (<- 'k')
    )
)
|
||
|
||
// After stemming, if the word ends with 'd' or 'g', most probably a final U
// vowel was over-stemmed, as in 'kedim' -> 'ked'.
// Turkish words do not usually end with 'd' or 'g'.
// Some very well known words are excluded (like 'ad', 'soyad'); that check
// is made by the caller, not here.
// Appends a U vowel to stems ending with 'd' or 'g'. The vowel added depends
// on the last vowel in the stem:
//   a, dotless i  -> dotless i
//   e, i          -> i
//   o, u          -> u
//   o-diaeresis, u-diaeresis -> u-diaeresis
define append_U_to_stems_ending_with_d_or_g as (
    test('d' or 'g')
    (test((goto vowel) 'a' or '{i'}') <+ '{i'}')
    or
    (test((goto vowel) 'e' or 'i') <+ 'i')
    or
    (test((goto vowel) 'o' or 'u') <+ 'u')
    or
    (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}')
)
|
||
|
||
)
|
||
|
||
// Tests whether the word has more than one syllable.
// In Turkish each vowel indicates a distinct syllable, so this succeeds when
// at least two vowels can be found. The cursor is not moved.
define more_than_one_syllable_word as (
    test (atleast 2 (gopast vowel))
)
|
||
|
||
// Tests whether the current string is one of the reserved words 'ad' or
// 'soyad'. Each alternative finds the word somewhere in the string and then
// requires the string length (limit) to equal the length of that word, which
// is stored in strlen. The cursor is not moved.
define is_reserved_word as (
    test(gopast 'ad' ($strlen = 2) ($strlen == limit))
    or
    test(gopast 'soyad' ($strlen = 5) ($strlen == limit))
)
|
||
|
||
// Final clean-up of the stem.
// Fails without changing anything if the string is a reserved word.
// Otherwise, working backwards from the end of the word, it first tries to
// append a vowel after a final 'd' or 'g' and then tries the final-consonant
// conversion. Both steps run under 'do', so neither can make postlude fail.
define postlude as (
    not(is_reserved_word)
    backwards (
        do append_U_to_stems_ending_with_d_or_g
        do post_process_last_consonants
    )
)
|
||
|
||
// Entry point of the stemmer.
// Words that do not have more than one syllable are left untouched.
// Otherwise nominal verb suffixes are stripped first; noun suffixes are
// stripped only if continue_stemming_noun_suffixes is still set afterwards.
// NOTE(review): when that flag is unset the 'backwards' block fails, so
// postlude is not run for that word and stem signals failure. Confirm whether
// this is intended.
define stem as (
    (more_than_one_syllable_word)
    (
        backwards (
            do stem_nominal_verb_suffixes
            continue_stemming_noun_suffixes
            do stem_noun_suffixes
        )

        postlude
    )
)
|
||
|
||
|