1
0
Fork 0
arangodb/3rdParty/snowball/algorithms/arabic/stem_Unicode.sbl

586 lines
18 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Authors:
* - Assem Chelli, < assem [dot] ch [at] gmail >
* - Abdelkrim Aries <ab [underscore] aries [at] esi [dot] dz>
*
*/
// Inside string literals, {name} stands for the stringdef called `name`.
stringescapes { }

/* Arabic letters, given by their Unicode code points */

// Hamza, alone and on its carrier letters
stringdef o    hex '0621'  // Hamza
stringdef ao   hex '0623'  // Hamza above Alef
stringdef ao_  hex '0625'  // Hamza below Alef
stringdef a~   hex '0622'  // Alef madda
stringdef wo   hex '0624'  // Hamza above Waw
stringdef yo   hex '0626'  // Hamza above Yeh
// Base letters
stringdef a    hex '0627'  // Alef
stringdef a_   hex '0649'  // Alef Maksura
stringdef b    hex '0628'  // Beh
stringdef t_   hex '0629'  // Teh Marbuta
stringdef t    hex '062a'  // Teh
stringdef th   hex '062b'  // Theh
stringdef j    hex '062c'  // Jeem
stringdef h    hex '062d'  // Hah
stringdef x    hex '062e'  // Khah
stringdef d    hex '062f'  // Dal
stringdef dz   hex '0630'  // Thal
stringdef r    hex '0631'  // Reh
stringdef z    hex '0632'  // Zain
stringdef s    hex '0633'  // Seen
stringdef sh   hex '0634'  // Sheen
stringdef c    hex '0635'  // Sad
stringdef dh   hex '0636'  // Dad
stringdef tt   hex '0637'  // Tah
stringdef zh   hex '0638'  // Zah
stringdef i    hex '0639'  // Ain
stringdef gh   hex '063a'  // Ghain
stringdef f    hex '0641'  // Feh
stringdef q    hex '0642'  // Qaf
stringdef k    hex '0643'  // Kaf
stringdef l    hex '0644'  // Lam
stringdef m    hex '0645'  // Meem
stringdef n    hex '0646'  // Noon
stringdef e    hex '0647'  // Heh
stringdef w    hex '0648'  // Waw
stringdef y    hex '064a'  // Yeh
// Diacritics (vocalization marks); Normalize_pre deletes all of them
stringdef aan  hex '064b'  // Fathatan
stringdef uun  hex '064c'  // Dammatan
stringdef iin  hex '064d'  // Kasratan
stringdef aa   hex '064e'  // Fatha
stringdef uu   hex '064f'  // Damma
stringdef ii   hex '0650'  // Kasra
stringdef oo   hex '0652'  // Sukun
stringdef ~    hex '0651'  // Shadda
// Hindu-Arabic numerals; Normalize_pre rewrites each one to the ASCII digit
// that names it
stringdef 0    hex '0660'  // digit 0
stringdef 1    hex '0661'  // digit 1
stringdef 2    hex '0662'  // digit 2
stringdef 3    hex '0663'  // digit 3
stringdef 4    hex '0664'  // digit 4
stringdef 5    hex '0665'  // digit 5
stringdef 6    hex '0666'  // digit 6
stringdef 7    hex '0667'  // digit 7
stringdef 8    hex '0668'  // digit 8
stringdef 9    hex '0669'  // digit 9

// Kasheeda (Tatweel); Normalize_pre deletes it
stringdef _    hex '0640'
// Shaped forms.
// Naming: the base letter's name followed by a digit. Normalize_pre maps
// every entry of a group back to the unsuffixed base letter.

// Hamza
stringdef o1    hex 'fe80'

// Alef with Hamza above
stringdef ao1   hex 'fe83'
stringdef ao2   hex 'fe84'

// Alef with Hamza below
stringdef ao_1  hex 'fe87'
stringdef ao_2  hex 'fe88'

// Yeh with Hamza
stringdef yo1   hex 'fe8b'
stringdef yo2   hex 'fe8c'
stringdef yo3   hex 'fe89'
stringdef yo4   hex 'fe8a'

// Alef madda
stringdef a~1   hex 'fe81'
stringdef a~2   hex 'fe82'

// Waw with Hamza
stringdef wo1   hex 'fe85'
stringdef wo2   hex 'fe86'

// Alef
stringdef a1    hex 'fe8d'
stringdef a2    hex 'fe8e'

// Beh
stringdef b1    hex 'fe8f'
stringdef b2    hex 'fe90'
stringdef b3    hex 'fe91'
stringdef b4    hex 'fe92'

// Teh Marbuta
stringdef t_1   hex 'fe93'
stringdef t_2   hex 'fe94'

// Teh
stringdef t1    hex 'fe97'
stringdef t2    hex 'fe98'
stringdef t3    hex 'fe95'
stringdef t4    hex 'fe96'

// Theh
stringdef th1   hex 'fe9b'
stringdef th2   hex 'fe9c'
stringdef th3   hex 'fe9a'
stringdef th4   hex 'fe99'

// Jeem
stringdef j1    hex 'fe9f'
stringdef j2    hex 'fea0'
stringdef j3    hex 'fe9d'
stringdef j4    hex 'fe9e'

// Hah
stringdef h1    hex 'fea3'
stringdef h2    hex 'fea4'
stringdef h3    hex 'fea1'
stringdef h4    hex 'fea2'

// Khah
stringdef x1    hex 'fea7'
stringdef x2    hex 'fea8'
stringdef x3    hex 'fea5'
stringdef x4    hex 'fea6'

// Dal
stringdef d1    hex 'fea9'
stringdef d2    hex 'feaa'

// Thal
stringdef dz1   hex 'feab'
stringdef dz2   hex 'feac'

// Reh
stringdef r1    hex 'fead'
stringdef r2    hex 'feae'

// Zain
stringdef z1    hex 'feaf'
stringdef z2    hex 'feb0'

// Seen
stringdef s1    hex 'feb3'
stringdef s2    hex 'feb4'
stringdef s3    hex 'feb1'
stringdef s4    hex 'feb2'

// Sheen
stringdef sh1   hex 'feb7'
stringdef sh2   hex 'feb8'
stringdef sh3   hex 'feb5'
stringdef sh4   hex 'feb6'

// Sad
stringdef c1    hex 'febb'
stringdef c2    hex 'febc'
stringdef c3    hex 'feb9'
stringdef c4    hex 'feba'

// Dad
stringdef dh1   hex 'febf'
stringdef dh2   hex 'fec0'
stringdef dh3   hex 'febd'
stringdef dh4   hex 'febe'

// Tah
stringdef tt1   hex 'fec3'
stringdef tt2   hex 'fec4'
stringdef tt3   hex 'fec1'
stringdef tt4   hex 'fec2'

// Zah
stringdef zh1   hex 'fec7'
stringdef zh2   hex 'fec8'
stringdef zh3   hex 'fec5'
stringdef zh4   hex 'fec6'

// Ain
stringdef i1    hex 'fecb'
stringdef i2    hex 'fecc'
stringdef i3    hex 'fec9'
stringdef i4    hex 'feca'

// Ghain
stringdef gh1   hex 'fecf'
stringdef gh2   hex 'fed0'
stringdef gh3   hex 'fecd'
stringdef gh4   hex 'fece'

// Feh
stringdef f1    hex 'fed3'
stringdef f2    hex 'fed4'
stringdef f3    hex 'fed1'
stringdef f4    hex 'fed2'

// Qaf
stringdef q1    hex 'fed7'
stringdef q2    hex 'fed8'
stringdef q3    hex 'fed5'
stringdef q4    hex 'fed6'

// Kaf
stringdef k1    hex 'fedb'
stringdef k2    hex 'fedc'
stringdef k3    hex 'fed9'
stringdef k4    hex 'feda'

// Lam
stringdef l1    hex 'fedf'
stringdef l2    hex 'fee0'
stringdef l3    hex 'fedd'
stringdef l4    hex 'fede'

// Meem
stringdef m1    hex 'fee3'
stringdef m2    hex 'fee4'
stringdef m3    hex 'fee1'
stringdef m4    hex 'fee2'

// Noon
stringdef n1    hex 'fee7'
stringdef n2    hex 'fee8'
stringdef n3    hex 'fee5'
stringdef n4    hex 'fee6'

// Heh
stringdef e1    hex 'feeb'
stringdef e2    hex 'feec'
stringdef e3    hex 'fee9'
stringdef e4    hex 'feea'

// Waw
stringdef w1    hex 'feed'
stringdef w2    hex 'feee'

// Alef Maksura
stringdef a_1   hex 'feef'
stringdef a_2   hex 'fef0'

// Yeh
stringdef y1    hex 'fef3'
stringdef y2    hex 'fef4'
stringdef y3    hex 'fef1'
stringdef y4    hex 'fef2'
// Lam-Alef ligatures: single code points that Normalize_pre expands into
// Lam followed by the matching Alef variant

// Lam + Alef
stringdef la     hex 'fefb'
stringdef la2    hex 'fefc'

// Lam + Alef with Hamza above
stringdef lao    hex 'fef7'
stringdef lao2   hex 'fef8'

// Lam + Alef with Hamza below
stringdef lao_   hex 'fef9'
stringdef lao_2  hex 'fefa'

// Lam + Alef with Madda above
stringdef la~    hex 'fef5'
stringdef la~2   hex 'fef6'
// Shared scratch integer: each step routine stores the current word length
// in it before applying its minimum-length tests.
integers (
    word_len
)

// Word-class flags, initialised by `stem` and updated by Checks1 and
// Prefix_Step4_Verb.
booleans (
    is_noun
    is_verb
    is_defined
)

routines (
    // prefix steps
    Prefix_Step1
    Prefix_Step2
    Prefix_Step3a_Noun
    Prefix_Step3b_Noun
    Prefix_Step3_Verb
    Prefix_Step4_Verb

    // suffix steps (defined in backwardmode)
    Suffix_All_alef_maqsura
    Suffix_Noun_Step1a
    Suffix_Noun_Step1b
    Suffix_Noun_Step2a
    Suffix_Noun_Step2b
    Suffix_Noun_Step2c1
    Suffix_Noun_Step2c2
    Suffix_Noun_Step3
    Suffix_Verb_Step1
    Suffix_Verb_Step2a
    Suffix_Verb_Step2b
    Suffix_Verb_Step2c

    // normalization and word-class detection
    Normalize_post
    Normalize_pre
    Checks1
)

// `stem` is the only routine exported to callers.
externals ( stem )

// No character groupings are used.
groupings ( )
// Normalizations
// Pre-stemming normalization.
// Walks forward through the word; the loop runs once per character counted
// by `len` at entry. On each iteration the character at the cursor is either
// rewritten through the among table below or stepped over with `next`.
define Normalize_pre as (
    loop len (
        (
            [substring] among (
                // vocalization marks: removed
                '{aan}' '{uun}' '{iin}' '{aa}' '{uu}' '{ii}' '{oo}' '{~}' ( delete )

                // kasheeda: removed
                '{_}' ( delete )

                // Hindu-Arabic numerals: replaced by ASCII digits
                '{0}' ( <- '0' )
                '{1}' ( <- '1' )
                '{2}' ( <- '2' )
                '{3}' ( <- '3' )
                '{4}' ( <- '4' )
                '{5}' ( <- '5' )
                '{6}' ( <- '6' )
                '{7}' ( <- '7' )
                '{8}' ( <- '8' )
                '{9}' ( <- '9' )

                // shaped forms: replaced by their base letter
                '{o1}'                              ( <- '{o}' )    // Hamza
                '{ao1}' '{ao2}'                     ( <- '{ao}' )   // Alef with Hamza above
                '{ao_1}' '{ao_2}'                   ( <- '{ao_}' )  // Alef with Hamza below
                '{yo1}' '{yo2}' '{yo3}' '{yo4}'     ( <- '{yo}' )   // Yeh with Hamza
                '{a~1}' '{a~2}'                     ( <- '{a~}' )   // Alef madda
                '{wo1}' '{wo2}'                     ( <- '{wo}' )   // Waw with Hamza
                '{a1}' '{a2}'                       ( <- '{a}' )    // Alef
                '{b1}' '{b2}' '{b3}' '{b4}'         ( <- '{b}' )    // Beh
                '{t_1}' '{t_2}'                     ( <- '{t_}' )   // Teh Marbuta
                '{t1}' '{t2}' '{t3}' '{t4}'         ( <- '{t}' )    // Teh
                '{th1}' '{th2}' '{th3}' '{th4}'     ( <- '{th}' )   // Theh
                '{j1}' '{j2}' '{j3}' '{j4}'         ( <- '{j}' )    // Jeem
                '{h1}' '{h2}' '{h3}' '{h4}'         ( <- '{h}' )    // Hah
                '{x1}' '{x2}' '{x3}' '{x4}'         ( <- '{x}' )    // Khah
                '{d1}' '{d2}'                       ( <- '{d}' )    // Dal
                '{dz1}' '{dz2}'                     ( <- '{dz}' )   // Thal
                '{r1}' '{r2}'                       ( <- '{r}' )    // Reh
                '{z1}' '{z2}'                       ( <- '{z}' )    // Zain
                '{s1}' '{s2}' '{s3}' '{s4}'         ( <- '{s}' )    // Seen
                '{sh1}' '{sh2}' '{sh3}' '{sh4}'     ( <- '{sh}' )   // Sheen
                '{c1}' '{c2}' '{c3}' '{c4}'         ( <- '{c}' )    // Sad
                '{dh1}' '{dh2}' '{dh3}' '{dh4}'     ( <- '{dh}' )   // Dad
                '{tt1}' '{tt2}' '{tt3}' '{tt4}'     ( <- '{tt}' )   // Tah
                '{zh1}' '{zh2}' '{zh3}' '{zh4}'     ( <- '{zh}' )   // Zah
                '{i1}' '{i2}' '{i3}' '{i4}'         ( <- '{i}' )    // Ain
                '{gh1}' '{gh2}' '{gh3}' '{gh4}'     ( <- '{gh}' )   // Ghain
                '{f1}' '{f2}' '{f3}' '{f4}'         ( <- '{f}' )    // Feh
                '{q1}' '{q2}' '{q3}' '{q4}'         ( <- '{q}' )    // Qaf
                '{k1}' '{k2}' '{k3}' '{k4}'         ( <- '{k}' )    // Kaf
                '{l1}' '{l2}' '{l3}' '{l4}'         ( <- '{l}' )    // Lam
                '{m1}' '{m2}' '{m3}' '{m4}'         ( <- '{m}' )    // Meem
                '{n1}' '{n2}' '{n3}' '{n4}'         ( <- '{n}' )    // Noon
                '{e1}' '{e2}' '{e3}' '{e4}'         ( <- '{e}' )    // Heh
                '{w1}' '{w2}'                       ( <- '{w}' )    // Waw
                '{a_1}' '{a_2}'                     ( <- '{a_}' )   // Alef Maksura
                '{y1}' '{y2}' '{y3}' '{y4}'         ( <- '{y}' )    // Yeh

                // Lam-Alef ligatures: expanded to two letters
                '{la}' '{la2}'                      ( <- '{l}{a}' )
                '{lao}' '{lao2}'                    ( <- '{l}{ao}' )
                '{lao_}' '{lao_2}'                  ( <- '{l}{ao_}' )
                '{la~}' '{la~2}'                    ( <- '{l}{a~}' )
            )
        )
        or
        next
    )
)
// Post-stemming hamza normalization, in two passes:
//   1. If the word ends in a hamza carrier ({ao}, {ao_}, {a~}, {wo}, {yo}),
//      that final letter becomes the bare hamza {o}.
//   2. Every carrier still present anywhere in the word is folded to a plain
//      letter: the alef carriers to {a}, {wo} to {w}, {yo} to {y}.
// Both passes are wrapped in `do`, so this routine always signals success.
define Normalize_post as (
    // Pass 1: normalize the last hamza (matched from the end of the word).
    do (
        backwards (
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{o}' )
                '{wo}' ( <- '{o}' )
                '{yo}' ( <- '{o}' )
            )
        )
    )

    // Pass 2: normalize the other hamzas.
    // The scan uses `repeat` rather than a loop counted by word_len:
    // word_len holds whatever length the last step routine recorded, which is
    // not necessarily the current length of the word. `repeat` stops by
    // itself once `next` can no longer advance, i.e. at the end of the word.
    do repeat (
        (
            [substring] among (
                '{ao}' '{ao_}' '{a~}' ( <- '{a}' )
                '{wo}' ( <- '{w}' )
                '{yo}' ( <- '{y}' )
            )
        )
        or
        next
    )
)
// Checks
// Guesses the word class from its leading letters.
// When the word starts with one of the listed prefixes and is long enough,
// it is flagged as a defined noun (is_noun and is_defined set, is_verb
// cleared). Nothing is removed from the word here.
define Checks1 as (
    $word_len = len
    [substring] among (
        // three-letter prefixes: word must be longer than 4
        '{b}{a}{l}'
        '{k}{a}{l}'
            (
                $word_len > 4
                set is_noun
                unset is_verb
                set is_defined
            )

        // two-letter prefixes: word must be longer than 3
        '{l}{l}'
        '{a}{l}'
            (
                $word_len > 3
                set is_noun
                unset is_verb
                set is_defined
            )
    )
)
//prefixes
// Prefix step 1: collapses a leading {ao} followed by an alef/hamza variant
// into a single letter. Only applies to words longer than 3 characters.
define Prefix_Step1 as (
    $word_len = len
    [substring] among (
        '{ao}{ao}'   ( $word_len > 3  <- '{ao}' )
        '{ao}{a~}'   ( $word_len > 3  <- '{a~}' )
        '{ao}{wo}'   ( $word_len > 3  <- '{ao}' )
        '{ao}{a}'    ( $word_len > 3  <- '{a}' )
        '{ao}{ao_}'  ( $word_len > 3  <- '{ao_}' )
        // Disabled (rare case): '{ao}' ($word_len > 3 delete)
    )
)
// Prefix step 2: removes a leading {f} or {w} from words longer than
// 3 characters, unless the word starts with {f}{a} or {w}{a}.
define Prefix_Step2 as (
    $word_len = len

    // Guard: give up when either excluded two-letter start is present.
    not ( '{f}{a}' or '{w}{a}' )

    [substring] among (
        '{f}' ( $word_len > 3  delete )
        '{w}' ( $word_len > 3  delete )
    )
)
// Prefix step 3a (the word is a noun and defined): removes the leading
// prefix itself, subject to a minimum word length.
define Prefix_Step3a_Noun as (
    $word_len = len
    [substring] among (
        // three-letter prefixes: word must be longer than 5
        '{b}{a}{l}'
        '{k}{a}{l}'
            ( $word_len > 5  delete )

        // two-letter prefixes: word must be longer than 4
        '{l}{l}'
        '{a}{l}'
            ( $word_len > 4  delete )
    )
)
// Prefix step 3b (the word is probably a noun and defined): removes a
// leading {b}, or reduces a doubled leading {b}/{k} to a single letter.
// Only applies to words longer than 3 characters.
define Prefix_Step3b_Noun as (
    $word_len = len

    // Exception: words starting with {b}{a} are left alone.
    not '{b}{a}'

    [substring] among (
        '{b}'     ( $word_len > 3  delete )
        // Disabled (BUG: causes confusion): '{k}' '{l}' ($word_len > 3 delete)
        '{b}{b}'  ( $word_len > 3  <- '{b}' )
        '{k}{k}'  ( $word_len > 3  <- '{k}' )
    )
)
// Verb prefix step 3: drops a leading {s} when it is followed by {y}, {t},
// {n} or {ao}, keeping that second letter. Only applies to words longer
// than 4 characters.
define Prefix_Step3_Verb as (
    $word_len = len
    [substring] among (
        // Disabled (BUG: causes confusion): '{s}' ($word_len > 4 delete)
        '{s}{y}'   ( $word_len > 4  <- '{y}' )
        '{s}{t}'   ( $word_len > 4  <- '{t}' )
        '{s}{n}'   ( $word_len > 4  <- '{n}' )
        '{s}{ao}'  ( $word_len > 4  <- '{ao}' )
    )
)
// Verb prefix step 4: a leading {y}{s}{t}, {n}{s}{t} or {t}{s}{t} is
// rewritten to {a}{s}{t}, and the word is flagged as a verb (is_verb set,
// is_noun cleared). Only applies to words longer than 4 characters.
define Prefix_Step4_Verb as (
    $word_len = len
    [substring] among (
        '{y}{s}{t}'
        '{n}{s}{t}'
        '{t}{s}{t}'
            (
                $word_len > 4
                set is_verb
                unset is_noun
                <- '{a}{s}{t}'
            )
    )
)
// suffixes
backwardmode (
// Noun suffix step 1a: removes one of the listed endings. The minimum word
// length grows with the length of the ending (4, 5 or 6 characters).
define Suffix_Noun_Step1a as (
    $word_len = len
    [substring] among (
        // one-letter endings
        '{y}' '{k}' '{e}'
            ( $word_len >= 4  delete )

        // two-letter endings
        '{n}{a}' '{k}{m}' '{e}{a}' '{e}{n}' '{e}{m}'
            ( $word_len >= 5  delete )

        // three-letter endings
        '{k}{m}{a}' '{e}{m}{a}'
            ( $word_len >= 6  delete )
    )
)
// Noun suffix step 1b: removes a final {n} from words longer than
// 5 characters.
define Suffix_Noun_Step1b as (
    $word_len = len
    [substring] among (
        '{n}' ( $word_len > 5  delete )
    )
)
// Noun suffix step 2a: removes a final {a}, {y} or {w} from words longer
// than 4 characters.
define Suffix_Noun_Step2a as (
    $word_len = len
    [substring] among (
        '{a}'
        '{y}'
        '{w}'
            ( $word_len > 4  delete )
    )
)
// Noun suffix step 2b: removes a final {a}{t} from words of at least
// 5 characters.
define Suffix_Noun_Step2b as (
    $word_len = len
    [substring] among (
        '{a}{t}' ( $word_len >= 5  delete )
    )
)
// Noun suffix step 2c1: removes a final {t} from words of at least
// 4 characters.
define Suffix_Noun_Step2c1 as (
    $word_len = len
    [substring] among (
        '{t}' ( $word_len >= 4  delete )
    )
)
// Noun suffix step 2c2 (feminine {t_}): removes a final Teh Marbuta from
// words of at least 4 characters.
define Suffix_Noun_Step2c2 as (
    $word_len = len
    [substring] among (
        '{t_}' ( $word_len >= 4  delete )
    )
)
// Noun suffix step 3 (ya' nisbiya): removes a final {y} from words of at
// least 3 characters.
define Suffix_Noun_Step3 as (
    $word_len = len
    [substring] among (
        '{y}' ( $word_len >= 3  delete )
    )
)
// Verb suffix step 1: removes one of the listed endings. The minimum word
// length grows with the length of the ending (4, 5 or 6 characters).
define Suffix_Verb_Step1 as (
    $word_len = len
    [substring] among (
        // one-letter endings
        '{e}' '{k}'
            ( $word_len >= 4  delete )

        // two-letter endings
        '{n}{y}' '{n}{a}' '{e}{a}' '{e}{m}' '{e}{n}' '{k}{m}' '{k}{n}'
            ( $word_len >= 5  delete )

        // three-letter endings
        '{e}{m}{a}' '{k}{m}{a}' '{k}{m}{w}'
            ( $word_len >= 6  delete )
    )
)
// Verb suffix step 2a: removes one of the listed endings, each with its own
// minimum word length.
define Suffix_Verb_Step2a as (
    $word_len = len
    [substring] among (
        // one-letter endings
        '{t}' '{a}' '{n}' '{y}'
            ( $word_len >= 4  delete )

        // two-letter endings, past
        '{n}{a}' '{t}{a}' '{t}{n}'
            ( $word_len >= 5  delete )

        // two-letter endings, present
        '{a}{n}' '{w}{n}' '{y}{n}'
            ( $word_len > 5  delete )

        // three-letter ending
        '{t}{m}{a}'
            ( $word_len >= 6  delete )
    )
)
// Verb suffix step 2b: removes a final {w}{a} or {t}{m} from words of at
// least 5 characters.
define Suffix_Verb_Step2b as (
    $word_len = len
    [substring] among (
        '{w}{a}'
        '{t}{m}'
            ( $word_len >= 5  delete )
    )
)
// Verb suffix step 2c: removes a final {w} (words of at least 4 characters)
// or a final {t}{m}{w} (words of at least 6 characters).
define Suffix_Verb_Step2c as (
    $word_len = len
    [substring] among (
        '{w}'        ( $word_len >= 4  delete )
        '{t}{m}{w}'  ( $word_len >= 6  delete )
    )
)
// Alef Maksura suffix: a final {a_} is rewritten to {y} (treated as a
// spelling error). word_len is refreshed here although no length test
// follows.
define Suffix_All_alef_maqsura as (
    $word_len = len
    [substring] among (
        '{a_}' ( <- '{y}' )
        // Disabled alternatives:
        //   '{a_}' ( delete )      -- if noun > 3
        //   '{a_}' ( <- '{a}' )    -- if verb
    )
)
)
// Entry point of the stemmer. Order of work:
//   1. reset the word-class flags and let Checks1 refine them
//   2. Normalize_pre
//   3. suffix removal, working backwards from the end of the word
//   4. prefix removal
//   5. Normalize_post
// Every stage is wrapped in `do`, so a stage that fails does not abort the
// stages after it.
define stem as (
    // Initial values: the word may be a noun or a verb, and is not defined.
    set is_noun
    set is_verb
    unset is_defined

    // Guess type and properties.
    do Checks1

    // Normalization before stemming.
    do Normalize_pre

    backwards (
        do (
            // Verb suffixes: either one or more rounds of Suffix_Verb_Step1
            // followed by Step2a, Step2c or a plain `next`; or, failing
            // that, Step2b; or, failing that, Step2a.
            (
                is_verb
                (
                    (
                        (atleast 1 Suffix_Verb_Step1)
                        ( Suffix_Verb_Step2a or Suffix_Verb_Step2c or next )
                    )
                    or Suffix_Verb_Step2b
                    or Suffix_Verb_Step2a
                )
            )

            // Noun suffixes: at most one of the alternatives inside `try`,
            // after which Suffix_Noun_Step3 has to succeed.
            or (
                is_noun
                (
                    try (
                        Suffix_Noun_Step2c2
                        or (
                            not is_defined
                            Suffix_Noun_Step1a
                            (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                                or next
                            )
                        )
                        or (
                            Suffix_Noun_Step1b
                            (
                                Suffix_Noun_Step2a
                                or Suffix_Noun_Step2b
                                or Suffix_Noun_Step2c1
                            )
                        )
                        or ( not is_defined Suffix_Noun_Step2a )
                        or Suffix_Noun_Step2b
                    )
                    Suffix_Noun_Step3
                )
            )

            // Last alternative: the Alef Maksura suffix.
            or Suffix_All_alef_maqsura
        )
    )

    // Prefixes: steps 1 and 2 are optional, then the first step 3/4
    // alternative that succeeds is applied.
    do (
        try Prefix_Step1
        try Prefix_Step2
        (
            Prefix_Step3a_Noun
            or ( is_noun Prefix_Step3b_Noun )
            or ( is_verb try Prefix_Step3_Verb Prefix_Step4_Verb )
        )
    )

    // Normalization after stemming.
    do Normalize_post
)