// Mirror of https://gitee.com/bigwinds/arangodb
// (listing metadata: 586 lines, 18 KiB, plaintext)
/*
 * Authors:
 * - Assem Chelli, < assem [dot] ch [at] gmail >
 * - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
 *
 */
|
||
|
||
// '{' and '}' are the escape delimiters inside string literals: '{name}'
// stands for the character bound to `name` by a stringdef.
stringescapes { }

/* the Arabic letters in Unicode */
// Hamza
stringdef o    hex '621'  // Hamza
stringdef ao   hex '623'  // Hamza above Alef
stringdef ao_  hex '625'  // Hamza below Alef
stringdef a~   hex '622'  // Alef madda
stringdef wo   hex '624'  // Hamza above waw
stringdef yo   hex '626'  // Hamza above yeh
|
||
|
||
// Letters
stringdef a    hex '627'  // Alef
stringdef a_   hex '649'  // Alef Maksura
stringdef b    hex '628'  // Beh
stringdef t_   hex '629'  // Teh_Marbuta
stringdef t    hex '62a'  // Teh
stringdef th   hex '62b'  // Theh
stringdef j    hex '62c'  // Jeem
stringdef h    hex '62d'  // Hah
stringdef x    hex '62e'  // Khah
stringdef d    hex '62f'  // Dal
stringdef dz   hex '630'  // Thal
stringdef r    hex '631'  // Reh
stringdef z    hex '632'  // Zain
stringdef s    hex '633'  // Seen
stringdef sh   hex '634'  // Sheen
stringdef c    hex '635'  // Sad
stringdef dh   hex '636'  // Dad
stringdef tt   hex '637'  // Tah
stringdef zh   hex '638'  // Zah
stringdef i    hex '639'  // Ain
stringdef gh   hex '63a'  // Ghain
stringdef f    hex '641'  // Feh
stringdef q    hex '642'  // Qaf
stringdef k    hex '643'  // Kaf
stringdef l    hex '644'  // Lam
stringdef m    hex '645'  // Meem
stringdef n    hex '646'  // Noon
stringdef e    hex '647'  // Heh
stringdef w    hex '648'  // Waw
stringdef y    hex '64a'  // Yeh
|
||
|
||
// Diacritics (vocalization marks; Normalize_pre deletes all of them)
stringdef aan  hex '64b'  // FatHatan
stringdef uun  hex '64c'  // Dammatan
stringdef iin  hex '64d'  // Kasratan
stringdef aa   hex '64e'  // FatHa
stringdef uu   hex '64f'  // Damma
stringdef ii   hex '650'  // Kasra
stringdef oo   hex '652'  // Sukun
stringdef ~    hex '651'  // Shadda
|
||
|
||
// Hindu–Arabic numerals (U+0660..U+0669); Normalize_pre rewrites each one
// to the ASCII digit of the same name.
stringdef 0  hex '0660'
stringdef 1  hex '0661'
stringdef 2  hex '0662'
stringdef 3  hex '0663'
stringdef 4  hex '0664'
stringdef 5  hex '0665'
stringdef 6  hex '0666'
stringdef 7  hex '0667'
stringdef 8  hex '0668'
stringdef 9  hex '0669'
|
||
|
||
|
||
// Kasheeda
stringdef _  hex '640'  // Kasheeda, Tatweel
|
||
|
||
// Shaped forms
// Code points U+FE80..U+FEF4. The numeric suffix only tells apart the
// variants of one letter; Normalize_pre maps every variant back to the
// base letter named in the trailing comment.
stringdef o1    hex 'fe80'  // HAMZA
stringdef ao1   hex 'fe83'  // ALEF_HAMZA_ABOVE
stringdef ao2   hex 'fe84'  // ALEF_HAMZA_ABOVE
stringdef ao_1  hex 'fe87'  // ALEF_HAMZA_BELOW
stringdef ao_2  hex 'fe88'  // ALEF_HAMZA_BELOW
stringdef yo1   hex 'fe8b'  // YEH_HAMZA
stringdef yo2   hex 'fe8c'  // YEH_HAMZA
stringdef yo3   hex 'fe89'  // YEH_HAMZA
stringdef yo4   hex 'fe8a'  // YEH_HAMZA
stringdef a~1   hex 'fe81'  // ALEF_MADDA
stringdef a~2   hex 'fe82'  // ALEF_MADDA
stringdef wo1   hex 'fe85'  // WAW_HAMZA
stringdef wo2   hex 'fe86'  // WAW_HAMZA
stringdef a1    hex 'fe8d'  // ALEF
stringdef a2    hex 'fe8e'  // ALEF
stringdef b1    hex 'fe8f'  // BEH
stringdef b2    hex 'fe90'  // BEH
stringdef b3    hex 'fe91'  // BEH
stringdef b4    hex 'fe92'  // BEH
stringdef t_1   hex 'fe93'  // TEH_MARBUTA
stringdef t_2   hex 'fe94'  // TEH_MARBUTA
stringdef t1    hex 'fe97'  // TEH
stringdef t2    hex 'fe98'  // TEH
stringdef t3    hex 'fe95'  // TEH
stringdef t4    hex 'fe96'  // TEH
stringdef th1   hex 'fe9b'  // THEH
stringdef th2   hex 'fe9c'  // THEH
stringdef th3   hex 'fe9a'  // THEH
stringdef th4   hex 'fe99'  // THEH
stringdef j1    hex 'fe9f'  // JEEM
stringdef j2    hex 'fea0'  // JEEM
stringdef j3    hex 'fe9d'  // JEEM
stringdef j4    hex 'fe9e'  // JEEM
stringdef h1    hex 'fea3'  // HAH
stringdef h2    hex 'fea4'  // HAH
stringdef h3    hex 'fea1'  // HAH
stringdef h4    hex 'fea2'  // HAH
stringdef x1    hex 'fea7'  // KHAH
stringdef x2    hex 'fea8'  // KHAH
stringdef x3    hex 'fea5'  // KHAH
stringdef x4    hex 'fea6'  // KHAH
stringdef d1    hex 'fea9'  // DAL
stringdef d2    hex 'feaa'  // DAL
stringdef dz1   hex 'feab'  // THAL
stringdef dz2   hex 'feac'  // THAL
stringdef r1    hex 'fead'  // REH
stringdef r2    hex 'feae'  // REH
stringdef z1    hex 'feaf'  // ZAIN
stringdef z2    hex 'feb0'  // ZAIN
stringdef s1    hex 'feb3'  // SEEN
stringdef s2    hex 'feb4'  // SEEN
stringdef s3    hex 'feb1'  // SEEN
stringdef s4    hex 'feb2'  // SEEN
stringdef sh1   hex 'feb7'  // SHEEN
stringdef sh2   hex 'feb8'  // SHEEN
stringdef sh3   hex 'feb5'  // SHEEN
stringdef sh4   hex 'feb6'  // SHEEN
stringdef c1    hex 'febb'  // SAD
stringdef c2    hex 'febc'  // SAD
stringdef c3    hex 'feb9'  // SAD
stringdef c4    hex 'feba'  // SAD
stringdef dh1   hex 'febf'  // DAD
stringdef dh2   hex 'fec0'  // DAD
stringdef dh3   hex 'febd'  // DAD
stringdef dh4   hex 'febe'  // DAD
stringdef tt1   hex 'fec3'  // TAH
stringdef tt2   hex 'fec4'  // TAH
stringdef tt3   hex 'fec1'  // TAH
stringdef tt4   hex 'fec2'  // TAH
stringdef zh1   hex 'fec7'  // ZAH
stringdef zh2   hex 'fec8'  // ZAH
stringdef zh3   hex 'fec5'  // ZAH
stringdef zh4   hex 'fec6'  // ZAH
stringdef i1    hex 'fecb'  // AIN
stringdef i2    hex 'fecc'  // AIN
stringdef i3    hex 'fec9'  // AIN
stringdef i4    hex 'feca'  // AIN
stringdef gh1   hex 'fecf'  // GHAIN
stringdef gh2   hex 'fed0'  // GHAIN
stringdef gh3   hex 'fecd'  // GHAIN
stringdef gh4   hex 'fece'  // GHAIN
stringdef f1    hex 'fed3'  // FEH
stringdef f2    hex 'fed4'  // FEH
stringdef f3    hex 'fed1'  // FEH
stringdef f4    hex 'fed2'  // FEH
stringdef q1    hex 'fed7'  // QAF
stringdef q2    hex 'fed8'  // QAF
stringdef q3    hex 'fed5'  // QAF
stringdef q4    hex 'fed6'  // QAF
stringdef k1    hex 'fedb'  // KAF
stringdef k2    hex 'fedc'  // KAF
stringdef k3    hex 'fed9'  // KAF
stringdef k4    hex 'feda'  // KAF
stringdef l1    hex 'fedf'  // LAM
stringdef l2    hex 'fee0'  // LAM
stringdef l3    hex 'fedd'  // LAM
stringdef l4    hex 'fede'  // LAM
stringdef m1    hex 'fee3'  // MEEM
stringdef m2    hex 'fee4'  // MEEM
stringdef m3    hex 'fee1'  // MEEM
stringdef m4    hex 'fee2'  // MEEM
stringdef n1    hex 'fee7'  // NOON
stringdef n2    hex 'fee8'  // NOON
stringdef n3    hex 'fee5'  // NOON
stringdef n4    hex 'fee6'  // NOON
stringdef e1    hex 'feeb'  // HEH
stringdef e2    hex 'feec'  // HEH
stringdef e3    hex 'fee9'  // HEH
stringdef e4    hex 'feea'  // HEH
stringdef w1    hex 'feed'  // WAW
stringdef w2    hex 'feee'  // WAW
stringdef a_1   hex 'feef'  // ALEF_MAKSURA
stringdef a_2   hex 'fef0'  // ALEF_MAKSURA
stringdef y1    hex 'fef3'  // YEH
stringdef y2    hex 'fef4'  // YEH
stringdef y3    hex 'fef1'  // YEH
stringdef y4    hex 'fef2'  // YEH
|
||
|
||
// Ligatures Lam-Alef (single code points; Normalize_pre expands each one
// into Lam followed by the matching Alef variant)
stringdef la     hex 'fefb'  // LAM_ALEF
stringdef la2    hex 'fefc'  // LAM_ALEF
stringdef lao    hex 'fef7'  // LAM_ALEF_HAMZA_ABOVE
stringdef lao2   hex 'fef8'  // LAM_ALEF_HAMZA_ABOVE
stringdef lao_   hex 'fef9'  // LAM_ALEF_HAMZA_BELOW
stringdef lao_2  hex 'fefa'  // LAM_ALEF_HAMZA_BELOW
stringdef la~    hex 'fef5'  // LAM_ALEF_MADDA_ABOVE
stringdef la~2   hex 'fef6'  // LAM_ALEF_MADDA_ABOVE
|
||
|
||
|
||
|
||
// Integer variables.
integers (
    word_len    // word length; routines refresh it with `$word_len = len` before testing it
)
|
||
|
||
// Boolean flags. `stem` initialises all three; Checks1 and
// Prefix_Step4_Verb are the routines that change them afterwards.
booleans (
    is_noun
    is_verb
    is_defined
)
|
||
|
||
// Internal routines, grouped by role.
routines (
    // prefix removal
    Prefix_Step1
    Prefix_Step2
    Prefix_Step3a_Noun
    Prefix_Step3b_Noun
    Prefix_Step3_Verb
    Prefix_Step4_Verb

    // suffix removal
    Suffix_All_alef_maqsura
    Suffix_Noun_Step1a
    Suffix_Noun_Step1b
    Suffix_Noun_Step2a
    Suffix_Noun_Step2b
    Suffix_Noun_Step2c1
    Suffix_Noun_Step2c2
    Suffix_Noun_Step3
    Suffix_Verb_Step1
    Suffix_Verb_Step2a
    Suffix_Verb_Step2b
    Suffix_Verb_Step2c

    // normalisation
    Normalize_post
    Normalize_pre

    // word-type guess
    Checks1
)
|
||
|
||
// `stem` is the only routine exported to callers.
externals ( stem )

// No character groupings are used by this stemmer.
groupings ( )
|
||
|
||
|
||
// Normalizations
|
||
// Pre-stemming normalisation.
// Walks the word left to right, one character per loop iteration. A
// character listed in the `among` below is deleted (vocalization marks,
// kasheeda) or replaced (Arabic-Indic digit -> ASCII digit, shaped form ->
// base letter, Lam-Alef ligature -> Lam + Alef variant); any other
// character is skipped with `next`.
define Normalize_pre as (
    // Loop count = character length of the word on entry.
    loop len (
        (
            [substring] among (
                '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}' ( delete ) // strip vocalization
                '{_}' ( delete ) // strip kasheeda

                // Hindu–Arabic numerals
                '{0}' ( <- '0' )
                '{1}' ( <- '1' )
                '{2}' ( <- '2' )
                '{3}' ( <- '3' )
                '{4}' ( <- '4' )
                '{5}' ( <- '5' )
                '{6}' ( <- '6' )
                '{7}' ( <- '7' )
                '{8}' ( <- '8' )
                '{9}' ( <- '9' )

                // Shaped forms
                '{o1}' ( <- '{o}' ) // HAMZA
                '{ao1}' '{ao2}' ( <- '{ao}' ) // ALEF_HAMZA_ABOVE
                '{ao_1}' '{ao_2}' ( <- '{ao_}' ) // ALEF_HAMZA_BELOW
                '{yo1}' '{yo2}' '{yo3}' '{yo4}' ( <- '{yo}' ) // YEH_HAMZA
                '{a~1}' '{a~2}' ( <- '{a~}' ) // ALEF_MADDA
                '{wo1}' '{wo2}' ( <- '{wo}' ) // WAW_HAMZA
                '{a1}' '{a2}' ( <- '{a}' ) // ALEF
                '{b1}' '{b2}' '{b3}' '{b4}' ( <- '{b}' ) // BEH
                '{t_1}' '{t_2}' ( <- '{t_}' ) // TEH_MARBUTA
                '{t1}' '{t2}' '{t3}' '{t4}' ( <- '{t}' ) // TEH
                '{th1}' '{th2}' '{th3}' '{th4}' ( <- '{th}' ) // THEH
                '{j1}' '{j2}' '{j3}' '{j4}' ( <- '{j}' ) // JEEM
                '{h1}' '{h2}' '{h3}' '{h4}' ( <- '{h}' ) // HAH
                '{x1}' '{x2}' '{x3}' '{x4}' ( <- '{x}' ) // KHAH
                '{d1}' '{d2}' ( <- '{d}' ) // DAL
                '{dz1}' '{dz2}' ( <- '{dz}' ) // THAL
                '{r1}' '{r2}' ( <- '{r}' ) // REH
                '{z1}' '{z2}' ( <- '{z}' ) // ZAIN
                '{s1}' '{s2}' '{s3}' '{s4}' ( <- '{s}' ) // SEEN
                '{sh1}' '{sh2}' '{sh3}' '{sh4}' ( <- '{sh}' ) // SHEEN
                '{c1}' '{c2}' '{c3}' '{c4}' ( <- '{c}' ) // SAD
                '{dh1}' '{dh2}' '{dh3}' '{dh4}' ( <- '{dh}' ) // DAD
                '{tt1}' '{tt2}' '{tt3}' '{tt4}' ( <- '{tt}' ) // TAH
                '{zh1}' '{zh2}' '{zh3}' '{zh4}' ( <- '{zh}' ) // ZAH
                '{i1}' '{i2}' '{i3}' '{i4}' ( <- '{i}' ) // AIN
                '{gh1}' '{gh2}' '{gh3}' '{gh4}' ( <- '{gh}' ) // GHAIN
                '{f1}' '{f2}' '{f3}' '{f4}' ( <- '{f}' ) // FEH
                '{q1}' '{q2}' '{q3}' '{q4}' ( <- '{q}' ) // QAF
                '{k1}' '{k2}' '{k3}' '{k4}' ( <- '{k}' ) // KAF
                '{l1}' '{l2}' '{l3}' '{l4}' ( <- '{l}' ) // LAM
                '{m1}' '{m2}' '{m3}' '{m4}' ( <- '{m}' ) // MEEM
                '{n1}' '{n2}' '{n3}' '{n4}' ( <- '{n}' ) // NOON
                '{e1}' '{e2}' '{e3}' '{e4}' ( <- '{e}' ) // HEH
                '{w1}' '{w2}' ( <- '{w}' ) // WAW
                '{a_1}' '{a_2}' ( <- '{a_}' ) // ALEF_MAKSURA
                '{y1}' '{y2}' '{y3}' '{y4}' ( <- '{y}' ) // YEH

                // Ligatures Lam-Alef
                '{la}' '{la2}' ( <- '{l}{a}' )
                '{lao}' '{lao2}' ( <- '{l}{ao}' )
                '{lao_}' '{lao_2}' ( <- '{l}{ao_}' )
                '{la~}' '{la~2}' ( <- '{l}{a~}' )
            )
        )
        or
        next    // not a character we normalise: step over it
    )
)
|
||
|
||
// Post-stemming normalisation of hamza-carrying letters.
//   1. If the word ends with one of them, it becomes a bare Hamza '{o}'.
//   2. Every remaining one is reduced to its carrier letter:
//      Alef variants -> Alef, Hamza above waw -> Waw, Hamza above yeh -> Yeh.
// Both passes are wrapped in `do`, so the routine signals success whether
// or not anything was rewritten.
define Normalize_post as (

    do (
        // normalize last hamza
        backwards (
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{o}' )
                '{wo}' ( <- '{o}' )
                '{yo}' ( <- '{o}' )
            )
        )
    )

    // Iterate once per character of the *current* word. This used to loop
    // `word_len` times, i.e. whatever length the last prefix/suffix routine
    // happened to record before it shortened the word; `len` removes that
    // hidden dependency and matches Normalize_pre. Every rewrite below is
    // one character for one character, so the count stays exact.
    do loop len (
        (
            // normalize other hamza's
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{a}' )
                '{wo}' ( <- '{w}' )
                '{yo}' ( <- '{y}' )
            )
        )
        or
        next
    )
)
|
||
|
||
// Checks
|
||
// Guess the word type from how it begins.
// When the word starts with one of the listed prefixes and is long enough,
// it is flagged as a defined noun: is_noun set, is_verb unset, is_defined
// set. The matched slice is only inspected here, never deleted or replaced.
// The routine fails when no prefix matches or the length test fails.
define Checks1 as (
    $word_len = len
    [substring] among (
        '{b}{a}{l}' '{k}{a}{l}' ( $word_len > 4 set is_noun unset is_verb set is_defined )
        '{l}{l}' '{a}{l}'       ( $word_len > 3 set is_noun unset is_verb set is_defined )
    )
)
|
||
|
||
|
||
//prefixes
|
||
// Prefix step 1: a word longer than 3 characters that starts with
// Hamza-above-Alef followed by another Alef/hamza letter has that pair
// replaced by a single letter. The second letter is kept, except for
// '{ao}{wo}', which becomes '{ao}'.
define Prefix_Step1 as (
    $word_len = len
    [substring] among (
        '{ao}{ao}'  ( $word_len > 3 <- '{ao}' )
        '{ao}{a~}'  ( $word_len > 3 <- '{a~}' )
        '{ao}{wo}'  ( $word_len > 3 <- '{ao}' )
        '{ao}{a}'   ( $word_len > 3 <- '{a}' )
        '{ao}{ao_}' ( $word_len > 3 <- '{ao_}' )
        // '{ao}' ($word_len > 3 delete) //rare case
    )
)
|
||
|
||
// Prefix step 2: delete a leading Feh or Waw from a word longer than
// 3 characters. The two `not` guards make the routine fail, leaving the
// word untouched, when that letter is immediately followed by Alef.
define Prefix_Step2 as (
    $word_len = len
    not '{f}{a}'
    not '{w}{a}'
    [substring] among (
        '{f}' ( $word_len > 3 delete )
        '{w}' ( $word_len > 3 delete )
    )
)
|
||
|
||
// Prefix step 3a (it is noun and defined): delete a leading three-letter
// prefix ('{b}{a}{l}', '{k}{a}{l}') from words longer than 5 characters, or
// a leading two-letter prefix ('{l}{l}', '{a}{l}') from words longer than 4.
define Prefix_Step3a_Noun as (
    $word_len = len
    [substring] among (
        '{b}{a}{l}' '{k}{a}{l}' ( $word_len > 5 delete )
        '{l}{l}' '{a}{l}'       ( $word_len > 4 delete )
    )
)
|
||
|
||
// Prefix step 3b (probably noun and defined), for words longer than
// 3 characters: a doubled leading Beh or Kaf is reduced to one letter, and
// a single leading Beh is deleted. `substring` picks the longest entry, so
// '{b}{b}' takes precedence over '{b}'.
define Prefix_Step3b_Noun as (
    $word_len = len
    not '{b}{a}' // exception
    [substring] among (
        '{b}' ( $word_len > 3 delete )
        // '{k}' '{l}' ($word_len > 3 delete) // BUG: cause confusion
        '{b}{b}' ( $word_len > 3 <- '{b}' )
        '{k}{k}' ( $word_len > 3 <- '{k}' )
    )
)
|
||
|
||
// Verb prefix step 3: in a word longer than 4 characters, a leading Seen
// followed by Yeh, Teh, Noon or Hamza-above-Alef is dropped and the letter
// that followed it is kept.
define Prefix_Step3_Verb as (
    $word_len = len
    [substring] among (
        //'{s}' ($word_len > 4 delete)// BUG: cause confusion
        '{s}{y}'  ( $word_len > 4 <- '{y}' )
        '{s}{t}'  ( $word_len > 4 <- '{t}' )
        '{s}{n}'  ( $word_len > 4 <- '{n}' )
        '{s}{ao}' ( $word_len > 4 <- '{ao}' )
    )
)
|
||
|
||
// Verb prefix step 4: in a word longer than 4 characters, a leading
// '{y}{s}{t}', '{n}{s}{t}' or '{t}{s}{t}' is rewritten to '{a}{s}{t}'.
// The flags are updated first (is_verb set, is_noun unset), then the
// slice is replaced.
define Prefix_Step4_Verb as (
    $word_len = len
    [substring] among (
        '{y}{s}{t}' '{n}{s}{t}' '{t}{s}{t}' (
            $word_len > 4
            set is_verb
            unset is_noun
            <- '{a}{s}{t}'
        )
    )
)
|
||
|
||
// suffixes
|
||
backwardmode (
|
||
|
||
// Noun suffix step 1a (backward mode: matching is against the end of the
// word). Deletes a one-, two- or three-letter ending when the word has at
// least 4, 5 or 6 characters respectively.
define Suffix_Noun_Step1a as (
    $word_len = len
    [substring] among (
        '{y}' '{k}' '{e}' ( $word_len >= 4 delete )
        '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}' ( $word_len >= 5 delete )
        '{k}{m}{a}' '{e}{m}{a}' ( $word_len >= 6 delete )
    )
)
|
||
// Noun suffix step 1b (backward mode): delete a final Noon from a word
// longer than 5 characters.
define Suffix_Noun_Step1b as (
    $word_len = len
    [substring] among (
        '{n}' ( $word_len > 5 delete )
    )
)
|
||
|
||
// Noun suffix step 2a (backward mode): delete a final Alef, Yeh or Waw
// from a word longer than 4 characters.
define Suffix_Noun_Step2a as (
    $word_len = len
    [substring] among (
        '{a}' '{y}' '{w}' ( $word_len > 4 delete )
    )
)
|
||
|
||
// Noun suffix step 2b (backward mode): delete a final '{a}{t}' from a word
// of at least 5 characters.
define Suffix_Noun_Step2b as (
    $word_len = len
    [substring] among (
        '{a}{t}' ( $word_len >= 5 delete )
    )
)
|
||
|
||
// Noun suffix step 2c1 (backward mode): delete a final Teh from a word of
// at least 4 characters.
define Suffix_Noun_Step2c1 as (
    $word_len = len
    [substring] among (
        '{t}' ( $word_len >= 4 delete )
    )
)
|
||
// Noun suffix step 2c2 (backward mode), feminine t_: delete a final
// Teh Marbuta from a word of at least 4 characters.
define Suffix_Noun_Step2c2 as (
    $word_len = len
    [substring] among (
        '{t_}' ( $word_len >= 4 delete )
    )
)
|
||
// Noun suffix step 3 (backward mode), ya' nisbiya: delete a final Yeh
// from a word of at least 3 characters.
define Suffix_Noun_Step3 as (
    $word_len = len
    [substring] among (
        '{y}' ( $word_len >= 3 delete )
    )
)
|
||
|
||
// Verb suffix step 1 (backward mode). Deletes a one-, two- or three-letter
// ending when the word has at least 4, 5 or 6 characters respectively.
define Suffix_Verb_Step1 as (
    $word_len = len
    [substring] among (
        '{e}' '{k}' ( $word_len >= 4 delete )
        '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}' ( $word_len >= 5 delete )
        '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}' ( $word_len >= 6 delete )
    )
)
|
||
// Verb suffix step 2a (backward mode). Deletes one of the listed endings
// when the word meets the length condition attached to that ending.
define Suffix_Verb_Step2a as (
    $word_len = len
    [substring] among (
        '{t}' ( $word_len >= 4 delete )
        '{a}' '{n}' '{y}' ( $word_len >= 4 delete )
        '{n}{a}' '{t}{a}' '{t}{n}' ( $word_len >= 5 delete ) // past
        '{a}{n}' '{w}{n}' '{y}{n}' ( $word_len > 5 delete ) // present
        '{t}{m}{a}' ( $word_len >= 6 delete )
    )
)
|
||
|
||
// Verb suffix step 2b (backward mode): delete a final '{w}{a}' or '{t}{m}'
// from a word of at least 5 characters.
define Suffix_Verb_Step2b as (
    $word_len = len
    [substring] among (
        '{w}{a}' '{t}{m}' ( $word_len >= 5 delete ) // len >= 5
    )
)
|
||
|
||
|
||
// Verb suffix step 2c (backward mode): delete a final Waw (word of at
// least 4 characters) or a final '{t}{m}{w}' (at least 6 characters).
define Suffix_Verb_Step2c as (
    $word_len = len
    [substring] among (
        '{w}' ( $word_len >= 4 delete )
        '{t}{m}{w}' ( $word_len >= 6 delete )
    )
)
|
||
|
||
// Alef Maksura step (backward mode): a final Alef Maksura is rewritten to
// Yeh, with no length condition. `word_len` is refreshed but not tested;
// the two commented-out lines are alternatives left disabled by the authors.
define Suffix_All_alef_maqsura as (
    $word_len = len
    [substring] among (
        '{a_}' ( <- '{y}' ) // spell error
        // '{a_}' ( delete ) // if noun > 3
        // '{a_}' ( <- '{a}') // if verb
    )
)
|
||
)
|
||
|
||
// Entry point of the stemmer.
// Order of operations:
//   1. reset the flags (word may be a noun and a verb, not defined);
//   2. Checks1 guesses the type from the beginning of the raw word;
//   3. Normalize_pre cleans the word;
//   4. suffixes are removed in backward mode: the verb chain is tried first,
//      then the noun chain, then the Alef Maksura rewrite;
//   5. prefixes are removed;
//   6. Normalize_post unifies the hamza forms.
// Steps 2-6 are each wrapped in `do`, so a step that does not apply never
// makes `stem` fail.
define stem as (
    // set initial values
    set is_noun
    set is_verb
    unset is_defined

    // guess type and properties
    do Checks1

    // normalization pre-stemming
    do Normalize_pre

    backwards (
        do (
            // Suffixes for verbs
            (
                is_verb
                (
                    (
                        // one or more Step1 endings, then one further step;
                        // `next` is the fallback when neither Step2a nor
                        // Step2c applies
                        ( atleast 1 Suffix_Verb_Step1 )
                        ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next )
                    )
                    or Suffix_Verb_Step2b
                    or Suffix_Verb_Step2a
                )
            )
            // Suffixes for nouns
            or (
                is_noun
                (
                    // at most one of the alternatives below is applied ...
                    try (
                        Suffix_Noun_Step2c2
                        or ( not is_defined Suffix_Noun_Step1a (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                                or next ) )
                        or ( Suffix_Noun_Step1b (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1 ) )
                        or ( not is_defined Suffix_Noun_Step2a )
                        or ( Suffix_Noun_Step2b )
                    )
                    // ... and the noun branch only succeeds if Step3 does
                    Suffix_Noun_Step3
                )
            )

            // Suffixes for alef maqsura
            or Suffix_All_alef_maqsura
        )
    )

    // Prefixes
    do (
        try Prefix_Step1
        try Prefix_Step2
        ( Prefix_Step3a_Noun
          or ( is_noun Prefix_Step3b_Noun )
          or ( is_verb try Prefix_Step3_Verb Prefix_Step4_Verb )
        )
    )

    // normalization post-stemming
    do Normalize_post

)
|