mirror of https://gitee.com/bigwinds/arangodb
242 lines
5.2 KiB
Plaintext
242 lines
5.2 KiB
Plaintext
/*
    Hungarian Stemmer
    Removes noun inflections
*/
|
|
|
|
// Every routine defined in this file. The order of names in this
// declaration has no effect; they are grouped here by role.
routines (
    // region marking and the region test
    mark_regions
    R1

    // helpers called from the suffix steps
    v_ending
    double
    undouble

    // suffix steps, listed in the order stem runs them
    instrum
    case
    case_special
    case_other
    factive
    owned
    sing_owner
    plur_owner
    plural
)
|
|
|
|
// stem is the only routine declared as external.
externals ( stem )

// p1 holds the position where region R1 begins; mark_regions sets it.
integers ( p1 )

// v is the vowel grouping used by mark_regions.
groupings ( v )

// Inside string literals, { and } delimit the name of a stringdef.
stringescapes {}
|
|
|
|
/* special characters (in Unicode) */

// Each name below is usable inside a string literal as {name}.
// The hex values are character codes; oq and uq (151, 171) lie
// above FF, i.e. outside the single-byte range.
stringdef a'   hex 'E1'    // a-acute
stringdef e'   hex 'E9'    // e-acute
stringdef i'   hex 'ED'    // i-acute
stringdef o'   hex 'F3'    // o-acute
stringdef o"   hex 'F6'    // o-umlaut
stringdef oq   hex '151'   // o-double acute
stringdef u'   hex 'FA'    // u-acute
stringdef u"   hex 'FC'    // u-umlaut
stringdef uq   hex '171'   // u-double acute
|
|
|
|
// Vowel grouping: the five unaccented vowels followed by the nine
// accented vowels declared with stringdef in this file.
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
|
|
|
|
// Sets p1, the start of region R1.
//
// p1 starts at limit (the end of the word), so R1 is empty unless one
// of the two alternatives below succeeds.
define mark_regions as (

    $p1 = limit

    (
        // Word begins with a vowel: move to the next non-vowel, step
        // over it (as a whole unit when it is one of the listed
        // multi-letter consonants, otherwise one character), and
        // mark p1 there.
        v
        goto non-v
        ( among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next )
        setmark p1
    )
    or
    (
        // Word begins with a non-vowel: p1 goes just past the first
        // vowel that follows.
        non-v
        gopast v
        setmark p1
    )
)
|
|
|
|
backwardmode (
|
|
|
|
// Region test: succeeds when the cursor is at or after p1.
define R1 as ( $p1 <= cursor )
|
|
|
|
// If the word now ends, inside R1, in a-acute or e-acute, replace that
// final letter with its unaccented form. Fails when neither matches.
define v_ending as (
    [substring] R1
    among(
        '{a'}'   (<- 'a')
        '{e'}'   (<- 'e')
    )
)
|
|
|
|
// Succeeds when the text ending at the cursor is one of the doubled
// consonants listed below. Wrapped in test, so the cursor is left
// where it was.
define double as (
    test among(
        // doubled single letters
        'bb' 'cc' 'dd' 'ff' 'gg' 'jj' 'kk' 'll'
        'mm' 'nn' 'pp' 'rr' 'ss' 'tt' 'vv' 'zz'
        // doubled multi-letter consonants
        'ccs' 'ggy' 'lly' 'nny' 'ssz' 'tty' 'zzs'
    )
)
|
|
|
|
// Deletes the character one position before the last one: step back
// over the final character, then bracket and delete the character
// before it.
define undouble as (
    next
    [ hop 1 ]
    delete
)
|
|
|
|
// Removes a final al or el in R1, but only when a doubled consonant
// (see double) stands directly before it; the doubled consonant is
// then reduced by undouble.
define instrum as (
    [substring] R1
    among(
        'al' 'el'   (double)
    )
    delete
    undouble
)
|
|
|
|
|
|
// Deletes the longest suffix from the list below that lies in R1, then
// runs v_ending on what remains. No entry carries its own action, so
// the entries are simply arranged by length here; among always takes
// the longest match regardless of listing order.
define case as (
    [substring] R1
    among(
        // one letter
        'n' 't'

        // two letters
        'ba' 'be' 'ra' 're'
        'ig'
        'at' 'et' 'ot' '{o"}t'
        'ul' '{u"}l'
        'v{a'}' 'v{e'}'
        'en' 'on' 'an' '{o"}n'

        // three letters
        'ban' 'ben'
        'nak' 'nek'
        'val' 'vel'
        't{o'}l' 't{oq}l'
        'r{o'}l' 'r{oq}l'
        'b{o'}l' 'b{oq}l'
        'hoz' 'hez' 'h{o"}z'
        'n{a'}l' 'n{e'}l'
        '{e'}rt'
        'kor'

        // four letters
        'k{e'}pp'
        'k{e'}nt'

        // six letters
        'k{e'}ppen'
        'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
    )
    delete
    v_ending
)
|
|
|
|
// Replaces a suffix in R1 that begins with an accented a or e by the
// plain vowel alone.
define case_special as (
    [substring] R1
    among(
        '{a'}n' '{a'}nk{e'}nt'   (<- 'a')
        '{e'}n'                  (<- 'e')
    )
)
|
|
|
|
// Handles the stul family of suffixes in R1: the unaccented forms are
// deleted outright, the forms starting with an accented vowel are
// replaced by the plain vowel.
define case_other as (
    [substring] R1
    among(
        'stul'  'st{u"}l'
        'astul' 'est{u"}l'   (delete)

        '{a'}stul'           (<- 'a')
        '{e'}st{u"}l'        (<- 'e')
    )
)
|
|
|
|
// Removes a final a-acute or e-acute in R1, but only when a doubled
// consonant (see double) stands directly before it; the doubled
// consonant is then reduced by undouble.
define factive as (
    [substring] R1
    among(
        '{a'}' '{e'}'   (double)
    )
    delete
    undouble
)
|
|
|
|
// Handles suffixes ending in k inside R1: with an accented a or e
// before the k the suffix becomes the plain vowel, every other listed
// form is deleted.
define plural as (
    [substring] R1
    among(
        'k' 'ak' 'ek' 'ok' '{o"}k'   (delete)

        '{a'}k'                      (<- 'a')
        '{e'}k'                      (<- 'e')
    )
)
|
|
|
|
// Handles suffixes in R1 that end in e-acute or e-acute + i. Entries
// are grouped by what happens to them: deleted, or replaced by a
// plain a or e.
define owned as (
    [substring] R1
    among(
        // removed entirely
        '{e'}'
        '{e'}i'
        'k{e'}'
        'ak{e'}' 'ek{e'}' 'ok{e'}' '{o"}k{e'}'
                                         (delete)

        // reduced to a
        '{a'}k{e'}'
        '{a'}{e'}i'                      (<- 'a')

        // reduced to e
        '{e'}{e'}'
        '{e'}k{e'}'
        '{e'}{e'}i'                      (<- 'e')
    )
)
|
|
|
|
// Handles the suffixes listed below when they lie in R1. Entries are
// grouped by action: most are deleted; those that begin with an
// accented a or e are replaced by the plain vowel.
define sing_owner as (
    [substring] R1
    among(
        // removed entirely
        'nk' 'unk' '{u"}nk'
        'uk' '{u"}k' 'juk' 'j{u"}k'
        'm' 'am' 'em' 'om'
        'd' 'ad' 'ed' 'od' '{o"}d'
        'a' 'e' 'o' 'ja' 'je'
                                    (delete)

        // reduced to a
        '{a'}'
        '{a'}m'
        '{a'}d'
        '{a'}nk'
        '{a'}juk'                   (<- 'a')

        // reduced to e
        '{e'}'
        '{e'}m'
        '{e'}d'
        '{e'}nk'
        '{e'}j{u"}k'                (<- 'e')
    )
)
|
|
|
|
// Handles the suffix families below when they lie in R1. Within each
// family the forms beginning with an accented a or e are replaced by
// the plain vowel, and all other forms are deleted.
//
// Every entry now carries an explicit action. Previously the bare ink
// entry had none and picked up (delete) from the entry that followed
// it; the result is the same, but it no longer depends on line order.
define plur_owner as (
    [substring] R1
    among(
        // family ending in im
        'jaim' 'jeim'       (delete)
        '{a'}im'            (<- 'a')
        '{e'}im'            (<- 'e')
        'aim' 'eim'         (delete)
        'im'                (delete)

        // family ending in id
        'jaid' 'jeid'       (delete)
        '{a'}id'            (<- 'a')
        '{e'}id'            (<- 'e')
        'aid' 'eid'         (delete)
        'id'                (delete)

        // family ending in i
        'jai' 'jei'         (delete)
        '{a'}i'             (<- 'a')
        '{e'}i'             (<- 'e')
        'ai' 'ei'           (delete)
        'i'                 (delete)

        // family ending in ink
        'jaink' 'jeink'     (delete)
        'eink' 'aink'       (delete)
        '{a'}ink'           (<- 'a')
        '{e'}ink'           (<- 'e')
        'ink'               (delete)

        // family ending in itok / itek
        // NOTE(review): the other rows pair an itok form with an itek
        // form, but the bare form lists only itek. Left as is; confirm
        // against the upstream algorithm before adding anything.
        'jaitok' 'jeitek'   (delete)
        'aitok' 'eitek'     (delete)
        '{a'}itok'          (<- 'a')
        '{e'}itek'          (<- 'e')
        'itek'              (delete)

        // family ending in ik
        'jeik' 'jaik'       (delete)
        'aik' 'eik'         (delete)
        '{a'}ik'            (<- 'a')
        '{e'}ik'            (<- 'e')
        'ik'                (delete)
    )
)
|
|
)
|
|
|
|
// External entry point. Marks region R1 working forwards, then runs
// each suffix step once, working backwards from the end of the word.
// Every step is wrapped in do, so a step that fails does not stop the
// steps after it.
define stem as (

    do mark_regions

    backwards (
        do instrum
        do case
        do case_special
        do case_other
        do factive
        do owned
        do sing_owner
        do plur_owner
        do plural
    )
)
|