In Snowball, `len` is very similar to `size` (the difference is that `len` gives the length in Unicode characters rather than bytes), so change `len` to Name.Builtin for consistency with the existing highlighting of `size`.
111 lines
2.3 KiB
Text
111 lines
2.3 KiB
Text
/* Stemmer for Esperanto in UTF-8 */
|
|
|
|
// Variables: no strings or integers are declared; one boolean flag.
strings ()
integers ()
booleans ( foreign )

// Internal routines (defined later in this file).
routines (
    apostrophe canonical_form correlative interjection
    short_word standard_suffix unuj
)

// Routine exported to callers.
externals ( stem )

// Character classes (defined inside the backwardmode section).
groupings ( vowel aiou ao ou )
|
|
|
|
// Named string macros, each holding one character given by its code point.

// Acute-accented vowels.
stringdef a' decimal '225'   // U+00E1 (a-acute); 225 decimal = E1 hex
stringdef e' hex 'E9'        // U+00E9 (e-acute)
stringdef i' hex 'ED'        // U+00ED (i-acute)
stringdef o' hex ' f3'       // U+00F3 (o-acute)
stringdef u' hex 'fa '       // U+00FA (u-acute)

// Letters carrying a circumflex or breve.
stringdef cx hex '0109'      // U+0109 (c-circumflex)
stringdef gx hex '011D'      // U+011D (g-circumflex)
stringdef hx hex '0125'      // U+0125 (h-circumflex)
stringdef jx hex '0135'      // U+0135 (j-circumflex)
stringdef sx hex '015D'      // U+015D (s-circumflex)
stringdef ux hex '016D'      // U+016D (u-breve)
|
|
|
|
// canonical_form: scan the word from left to right, rewriting one matched
// substring per iteration until no further step is possible.
//   - an acute-accented vowel becomes the plain vowel and sets `foreign`
//   - an x-digraph (cx gx hx jx sx ux) becomes the matching stringdef character
//   - anything else: advance by one character
define canonical_form as repeat (
    [substring]
    among (
        // With both escape characters set to '/', the text /a'/ inside a
        // string inserts the stringdef named a' (likewise e' i' o' u').
stringescapes //
        '/a'/'  ( <- 'a'  set foreign )
        '/e'/'  ( <- 'e'  set foreign )
        '/i'/'  ( <- 'i'  set foreign )
        '/o'/'  ( <- 'o'  set foreign )
        '/u'/'  ( <- 'u'  set foreign )

        // From here the escapes are a backquote (open) and an apostrophe
        // (close), so the replacement strings below insert cx, gx, ...
stringescapes `'
        'cx'    ( <- '`cx'' )
        'gx'    ( <- '`gx'' )
        'hx'    ( <- '`hx'' )
        'jx'    ( <- '`jx'' )
        'sx'    ( <- '`sx'' )
        'ux'    ( <- '`ux'' )

        // Fallback: the empty string always matches; step over one character.
        ''      ( next )
    )
)
|
|
|
|
backwardmode (
|
|
stringescapes { }
|
|
|
|
// apostrophe: rewrite a word-final apostrophe (runs in backward mode).
//   whole word is  un + apostrophe  ->  'unu'
//   whole word is  l  + apostrophe  ->  'la'
//   any other trailing apostrophe   ->  replaced by 'o'
// Fails when the word does not end in an apostrophe.
define apostrophe as (
    ( ['un{'}'] atlimit <- 'unu' )
    or ( ['l{'}'] atlimit <- 'la' )
    or ( ['{'}'] <- 'o' )
)
|
|
|
|
// Character classes used by the suffix routines in this section.
define vowel  'aeiou'
define aiou   vowel - 'e'      // every vowel except 'e'
define ao     'ao'
define ou     'ou'
|
|
|
|
// short_word: succeeds when the cursor cannot be moved past two vowels,
// i.e. fewer than two characters of the `vowel` grouping remain to be scanned.
define short_word as not (
    // The count expression evaluates to 2 (maxint * 0 is 0, 4 / 2 is 2).
    loop (maxint * 0 + 4 / 2) gopast vowel
)
|
|
|
|
// interjection: succeeds when the text up to the limit is exactly one of the
// listed words; the word itself is left unchanged.
define interjection as (
    among (
        'adia{ux}' 'aha'      'amen' 'hola'
        'hura'     'mia{ux}'  'muu'  'oho'
    )
    atlimit
)
|
|
|
|
// correlative: recognise a correlative word and delete its trailing 'n' / 'j'.
// Scanning backwards from the end, the word must consist of:
//   ending:  'a'   |   optional 'n' + 'e'   |   optional 'n', optional 'j' + o/u
//   then 'i', then optionally one of k, t, {cx}, nen, and nothing further.
// The slice starts empty at the end of the word; the `]` inside the second and
// third alternatives extends it over the 'n' / 'j' just matched, so `delete`
// removes only those letters (and nothing at all for the 'a' alternative).
define correlative as (
    []
    // Ignore -al, -am, etc. since they can't be confused with suffixes.
    test (
        (
            'a'
            or (try 'n'] 'e')
            or (try 'n' try 'j'] ou)
        )
        'i'
        try ('k' or 't' or '{cx}' or 'nen')
        atlimit
    )
    delete
)
|
|
|
|
// unuj: when the whole word is 'unu' followed by 'j' and an optional 'n',
// delete that 'j' / 'jn' ending.
define unuj as (
    [try 'n' 'j']
    'unu' atlimit
    delete
)
|
|
|
|
// standard_suffix: delete a regular ending, scanning backwards from the end.
// The slice covers, in scan order, each of these optional parts:
//   1. one of:  optional 'n', optional 'j', then a/o
//               optional 's', then a/i/o/u
//               optional 'n', then 'e'
//   2. a hyphen
//   3. 'a{ux}'
// Every part is wrapped in `try`, so the slice may end up empty.
define standard_suffix as (
    [
        try (
            (try 'n' try 'j' ao)
            or (try 's' aiou)
            or (try 'n' 'e')
        )
        try '-'
        try 'a{ux}'
    ]
    delete
)
|
|
)
|
|
|
|
// stem: exported entry point.
//   1. Clear `foreign`, then normalise the spelling with canonical_form.
//   2. Give up (signal failure) if canonical_form flagged the word as foreign.
//   3. Working backwards from the end of the word: expand a trailing
//      apostrophe, then apply the first routine that succeeds out of
//      short_word, interjection, correlative, unuj; if none does, run
//      standard_suffix.
define stem as (
    // `foreign` is set by canonical_form and cleared nowhere else in this
    // file. Reset it here so that a flag raised by an earlier word cannot
    // make `not foreign` fail for this one.
    unset foreign

    do canonical_form
    not foreign

    backwards (
        do apostrophe
        short_word or interjection or
        correlative or unuj or do standard_suffix
    )

    // Integer tests on character length versus byte size. The character
    // count can never exceed the byte count, so neither test can fail.
    $(lenof 'a' <= sizeof 'a')
    $(len <= size)
)
|