mirror of https://gitee.com/bigwinds/arangodb
197 lines
5.1 KiB
Plaintext
197 lines
5.1 KiB
Plaintext
|
|
/* Finnish stemmer.

   Numbers in square brackets refer to the sections in
   Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
   ISBN 0-415-20705-3

*/
|
|
|
|
// Internal routines defined in this file, one per line.
routines (
    mark_regions
    R2
    particle_etc
    possessive
    LONG
    VI
    case_ending
    i_plural
    t_plural
    other_endings
    tidy
)
|
|
|
|
// The externally visible routine.
externals ( stem )

// Region marks; both are assigned in mark_regions.
integers ( p1 p2 )

// Scratch string used by tidy when undoubling a consonant.
strings ( x )

// Set by case_ending; stem reads it to choose between i_plural and t_plural.
booleans ( ending_removed )

// Character classes; their members are defined after the stringdefs.
groupings ( AEI V1 V2 particle_end )
|
|
|
|
// Inside string literals, {name} refers to a stringdef.
stringescapes {}

/* Special characters, given as ISO Latin I byte values. */
stringdef a"   hex 'E4'     // a with diaeresis
stringdef o"   hex 'F6'     // o with diaeresis
|
|
|
|
// AEI: the vowels that tidy may strip from the end of the stem.
define AEI          'a{a"}ei'

// V1: the full vowel set, including 'y'.
define V1           'aeiouy{a"}{o"}'

// V2: V1 without 'y'.
define V2           'aeiou{a"}{o"}'

// particle_end: any V1 vowel, or 'n', or 't'.
define particle_end V1 + 'nt'
|
|
|
|
/* Sets the region marks p1 and p2 (forward mode).

   Both marks start at limit. Each is then moved to the position just
   past the first non-V1 character found after a V1 character; the
   search for p2 continues from where p1 was set. If either search
   fails, the routine fails and the remaining marks keep the value limit.
*/
define mark_regions as (
    $p1 = limit
    $p2 = limit

    goto V1
    gopast non-V1
    setmark p1

    goto V1
    gopast non-V1
    setmark p2
)
|
|
|
|
backwardmode (
|
|
|
|
// Succeeds when the cursor is at or beyond the mark p2.
define R2 as (
    $p2 <= cursor
)
|
|
|
|
/* Removes a particle [91] or the adverb ending 'sti' [87].

   The suffix is searched for no further back than p1. A particle is
   removed only when the character before it belongs to particle_end;
   'sti' is removed only when R2 holds.
*/
define particle_etc as (
    setlimit tomark p1 for ( [substring] )
    among(
        // Particles [91]
        'kin'
        'kaan'
        'k{a"}{a"}n'
        'ko'
        'k{o"}'
        'han'
        'h{a"}n'
        'pa'
        'p{a"}'
            ( particle_end )

        // Adverb [87]
        'sti'
            ( R2 )
    )
    delete
)
|
|
/* Removes a possessive suffix [36].

   The suffix is searched for no further back than p1. There is no
   common trailing delete: each branch of the among performs its own.
*/
define possessive as (
    setlimit tomark p1 for ( [substring] )
    among(
        // 'si' is left alone after 'k', so that 'ksi' stays available to
        // case_ending (which lists it as the Translative).
        'si'
            ( not 'k' delete )

        // kseni = ksi + ni: drop 'ni', then turn the preceding 'kse' into 'ksi'.
        'ni'
            ( delete ['kse'] <- 'ksi' )

        // Unconditional removals.
        'nsa'
        'ns{a"}'
        'mme'
        'nne'
            ( delete )

        /* Vn possessives, removed only directly after a case ending [36]. */
        'an'
            ( among( 'ta' 'ssa' 'sta' 'lla' 'lta' 'na' ) delete )

        '{a"}n'
            ( among( 't{a"}' 'ss{a"}' 'st{a"}' 'll{a"}' 'lt{a"}' 'n{a"}' ) delete )

        'en'
            ( among( 'lle' 'ine' ) delete )
    )
)
|
|
|
|
// Matches a doubled (long) vowel.
define LONG as among(
    'aa' 'ee' 'ii' 'oo' 'uu'
    '{a"}{a"}' '{o"}{o"}'
)
|
|
|
|
// Matches 'i' and then a V2 vowel; in backward mode the V2 vowel is the
// character to the left of the 'i'.
define VI as (
    'i'
    V2
)
|
|
|
|
/* Removes a case ending and sets ending_removed.

   The ending is searched for no further back than p1. Some endings
   carry a condition on the text to their left; endings listed after
   the last bracketed command are removed unconditionally.
*/
define case_ending as (
    setlimit tomark p1 for ( [substring] )
    among(
        /* Illative [43]: h + vowel + n must follow that same vowel;
           'siin' must follow VI and 'seen' must follow LONG. */
        'han'       ( 'a' )
        'hen'       ( 'e' )
        'hin'       ( 'i' )
        'hon'       ( 'o' )
        'h{a"}n'    ( '{a"}' )
        'h{o"}n'    ( '{o"}' )
        'siin'  VI
        'seen'  LONG

        /* Genitive plurals [34]: both must follow VI. */
        'den'   VI
        'tten'  VI
            ()

        /* Genitive or Illative. After a long vowel (Illative) or after
           'ie' (Genitive) the slice is widened by one character, so that
           character is deleted together with the 'n'. Otherwise only
           the 'n' goes (Genitive). */
        'n'
            ( try ( LONG
                    or 'ie'
                    and next ]
                  )
            )

        /* Partitive [32]: 'a' / a" needs a V1 vowel to its left with a
           non-V1 character before that; 'tta' / 'tta"' needs a
           preceding 'e'; 'ta' / 'ta"' is unconditional. */
        'a'
        '{a"}'
            ( V1 non-V1 )
        'tta'
        'tt{a"}'
            ( 'e' )
        'ta'
        't{a"}'

        'ssa'   'ss{a"}'        // Inessive    [41]
        'sta'   'st{a"}'        // Elative     [42]

        'lla'   'll{a"}'        // Adessive    [44]
        // NOTE(review): [51] is also cited for the Comitative below --
        // confirm the section number for the Ablative.
        'lta'   'lt{a"}'        // Ablative    [51]
        'lle'                   // Allative    [46]
        'na'    'n{a"}'         // Essive      [49]
        'ksi'                   // Translative [50]
        'ine'                   // Comitative  [51]

        /* Abessive and Instructive are too rare for inclusion [51] */
    )
    delete
    set ending_removed
)
|
|
/* Removes comparative [85], superlative [86] and agent [93.1B] endings.

   The ending is searched for no further back than p2. The comparative
   forms are kept when 'po' stands immediately before them.
*/
define other_endings as (
    setlimit tomark p2 for ( [substring] )
    among(
        // Comparative forms [85]; the 'po' exclusion improves things.
        'mpi'
        'mpa'
        'mp{a"}'
        'mmi'
        'mma'
        'mm{a"}'
            ( not 'po' )

        // Superlative forms [86]
        'impi'
        'impa'
        'imp{a"}'
        'immi'
        'imma'
        'imm{a"}'

        // Indicates agent [93.1B]
        'eja'
        'ej{a"}'
    )
    delete
)
|
|
// Removes a plural 'i' or 'j' [26], searched for no further back than p1.
define i_plural as (
    setlimit tomark p1 for ( [substring] )
    among( 'i' 'j' )
    delete
)
|
|
/* Removes a plural 't' [26] and then an -mma / -imma ending.

   Step 1 (no further back than p1): a final 't' is removed when a V1
   vowel stands before it.
   Step 2 (no further back than p2): 'mma' is removed unless 'po'
   stands before it; 'imma' is removed unconditionally.
*/
define t_plural as (
    setlimit tomark p1 for (
        [ 't' ]
        test V1
        delete
    )

    setlimit tomark p2 for ( [substring] )
    among(
        'mma'                   //-mmat endings
            ( not 'po' )
        'imma'                  //-immat endings
    )
    delete
)
|
|
/* Final clean-up of the stem.

   The first four steps work no further back than p1 and each runs
   under 'do', so a step that does not apply is simply skipped. The
   last step is not limited to the region after p1.

   NOTE(review): non-V1 accepts any character outside V1, so digits and
   punctuation qualify as well as consonants -- confirm this is intended.
*/
define tidy as (
    setlimit tomark p1 for (
        // Undouble a final long vowel by deleting its last character.
        do ( LONG and ( [next] delete ) )

        // Remove a trailing a, a", e or i that follows a non-V1 character.
        do ( [AEI] non-V1 delete )

        // Remove a trailing 'j' that follows 'o' or 'u'.
        do ( ['j'] 'o' or 'u' delete )

        // Remove a trailing 'o' that follows 'j'.
        do ( ['o'] 'j' delete )
    )

    // Undouble consonant: locate the last non-V1 character, copy it into
    // x, and delete it if the same character also stands before it.
    goto non-V1
    [next] -> x
    x
    delete
)
|
|
)
|
|
|
|
/* Entry point.

   Marks the regions in forward mode, clears ending_removed, then runs
   the suffix-removal steps in backward mode. Every step runs under
   'do', so a step that fails does not stop the ones after it.
*/
define stem as (
    do mark_regions
    unset ending_removed

    backwards (
        do particle_etc
        do possessive
        do case_ending
        do other_endings

        // i_plural is tried when case_ending removed something;
        // otherwise t_plural is tried.
        ( ending_removed do i_plural )
            or do t_plural

        do tidy
    )
)
|
|
|