tilde-projects/Code/irc/inflect.py

3982 lines
103 KiB
Python
Raw Normal View History

2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
inflect.py: correctly generate plurals, ordinals, indefinite articles;
convert numbers to words
Copyright (C) 2010 Paul Dyson
Based upon the Perl module Lingua::EN::Inflect by Damian Conway.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
The original Perl module Lingua::EN::Inflect by Damian Conway is
available from http://search.cpan.org/~dconway/
This module can be downloaded at http://pypi.python.org/pypi/inflect
methods:
classical inflect
plural plural_noun plural_verb plural_adj singular_noun no num a an
compare compare_nouns compare_verbs compare_adjs
present_participle
ordinal
number_to_words
join
defnoun defverb defadj defa defan
INFLECTIONS: classical inflect
plural plural_noun plural_verb plural_adj singular_noun compare
no num a an present_participle
PLURALS: classical inflect
plural plural_noun plural_verb plural_adj singular_noun no num
compare compare_nouns compare_verbs compare_adjs
COMPARISONS: classical
compare compare_nouns compare_verbs compare_adjs
ARTICLES: classical inflect num a an
NUMERICAL: ordinal number_to_words
USER_DEFINED: defnoun defverb defadj defa defan
Exceptions:
UnknownClassicalModeError
BadNumValueError
BadChunkingOptionError
NumOutOfRangeError
BadUserDefinedPatternError
BadRcFileError
BadGenderError
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
from re import match, search, subn, IGNORECASE, VERBOSE
from re import split as splitre
from re import error as reerror
from re import sub as resub
class UnknownClassicalModeError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; the name suggests an
    unrecognised classical-mode option -- confirm against callers.
    """
class BadNumValueError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; presumably signals an
    invalid numeric value -- confirm against callers.
    """
class BadChunkingOptionError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; presumably signals an
    invalid chunking/grouping option -- confirm against callers.
    """
class NumOutOfRangeError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; presumably signals a
    number outside the supported range -- confirm against callers.
    """
class BadUserDefinedPatternError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; presumably signals an
    invalid user-supplied pattern -- confirm against callers.
    """
class BadRcFileError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; presumably signals a
    problem with an rc/config file -- confirm against callers.
    """
class BadGenderError(Exception):
    """Error type listed under "Exceptions" in the module docstring.

    NOTE(review): raise sites are outside this excerpt; presumably signals an
    unsupported pronoun gender -- confirm against callers.
    """
2018-10-05 20:02:38 +00:00
2015-01-02 16:59:08 +00:00
# Version components; __version__ below is assembled from these four values.
__ver_major__, __ver_minor__, __ver_patch__ = 0, 2, 4
__ver_sub__ = ""
__version__ = "%d.%d.%d%s" % (
    __ver_major__,
    __ver_minor__,
    __ver_patch__,
    __ver_sub__,
)
2015-01-02 16:59:08 +00:00
# Module-wide switch: print3() only writes to stdout while this is True.
STDOUT_ON = False


def print3(txt):
    """Print ``txt`` if the module-level ``STDOUT_ON`` flag is set.

    While the flag is False the call does nothing.  Always returns None.
    """
    if not STDOUT_ON:
        return
    print(txt)
def enclose(s):
    """Wrap the regex fragment ``s`` in a non-capturing group: ``(?:s)``."""
    grouped = "(?:%s)" % s
    return grouped
2018-10-05 20:02:38 +00:00
def joinstem(cutpoint=0, words=""):
    """
    Build a regex alternation out of the stems of ``words``.

    Each word is sliced as ``word[:cutpoint]``.  ``cutpoint`` is usually
    negative, i.e. the number of trailing letters to drop from every word.
    The stems are joined with "|" and wrapped by ``enclose``.

    e.g.
    joinstem(-2, ["ephemeris", "iris", ".*itis"]) returns
    (?:ephemer|ir|.*it)
    """
    stems = [word[:cutpoint] for word in words]
    return enclose("|".join(stems))
2015-01-02 16:59:08 +00:00
def bysize(words):
    """
    Group ``words`` by their length.

    Returns a dict that maps each occurring length to the set of words of
    that length, e.g.

    ret[3]=set(['ant', 'cat', 'dog', 'pig'])
    ret[4]=set(['frog', 'goat'])
    ret[5]=set(['horse'])
    ret[8]=set(['elephant'])
    """
    grouped = {}
    for word in words:
        # setdefault creates the bucket the first time a length is seen.
        grouped.setdefault(len(word), set()).add(word)
    return grouped
def make_pl_si_lists(lst, plending, siendingsize, dojoinstem=True):
    """
    Derive the plural-side lookup structures for a list of singular words.

    Arguments:
    lst          -- the singular words
    plending     -- the ending appended to form each plural
    siendingsize -- how many characters to strip from the end of each
                    singular before appending ``plending``; None strips nothing
    dojoinstem   -- whether to also build the stem regex

    Returns a tuple, in this order:
    si_list   -- the pluralised words (named "si" because these are what is
                 looked up when converting back to the singular)
    si_bysize -- the pluralised words as a dict of sets keyed by word length
    pl_bysize -- the singular words as a dict of sets keyed by word length
    stem      -- only when dojoinstem is True: a regex matching any stem
    """
    # A positive strip count becomes a negative slice end; None keeps the word.
    cut = None if siendingsize is None else -siendingsize
    plurals = [singular[:cut] + plending for singular in lst]
    result = (plurals, bysize(plurals), bysize(lst))
    if not dojoinstem:
        return result
    return result + (joinstem(cut, lst),)
# 1. PLURALS
# Irregular plurals of nouns ending in "s".  A value may hold two plural
# forms separated by "|".
pl_sb_irregular_s = {
    "corpus": "corpuses|corpora", "opus": "opuses|opera",
    "genus": "genera", "mythos": "mythoi",
    "penis": "penises|penes", "testis": "testes",
    "atlas": "atlases|atlantes", "yes": "yeses",
}

# All other irregular plurals (same "|" convention for alternatives).
pl_sb_irregular = {
    "child": "children", "brother": "brothers|brethren",
    "loaf": "loaves", "hoof": "hoofs|hooves",
    "beef": "beefs|beeves", "thief": "thiefs|thieves",
    "money": "monies", "mongoose": "mongooses",
    "ox": "oxen", "cow": "cows|kine",
    "graffito": "graffiti", "octopus": "octopuses|octopodes",
    "genie": "genies|genii", "ganglion": "ganglions|ganglia",
    "trilby": "trilbys", "turf": "turfs|turves",
    "numen": "numina", "atman": "atmas",
    "occiput": "occiputs|occipita", "sabretooth": "sabretooths",
    "sabertooth": "sabertooths", "lowlife": "lowlifes",
    "flatfoot": "flatfoots", "tenderfoot": "tenderfoots",
    "romany": "romanies", "jerry": "jerries",
    "mary": "maries", "talouse": "talouses",
    "blouse": "blouses", "rom": "roma",
    "carmen": "carmina",
}
# The "..s" irregulars are folded into the general table as well.
pl_sb_irregular.update(pl_sb_irregular_s)
# pl_sb_irregular_keys = enclose('|'.join(pl_sb_irregular.keys()))
# Irregular plurals keyed by capitalised singulars.
pl_sb_irregular_caps = {
    "Romany": "Romanies", "Jerry": "Jerrys", "Mary": "Marys", "Rom": "Roma",
}

# Irregular plurals of multi-word nouns ("|" separates two plural forms).
pl_sb_irregular_compound = {
    "prima donna": "prima donnas|prime donne",
}
2015-01-02 16:59:08 +00:00
# Plural -> singular lookups, obtained by inverting the plural tables.
# A key of the form "a|b" (two plural forms) is replaced by two separate
# keys "a" and "b" that both map to the same singular.
si_sb_irregular = {v: k for (k, v) in pl_sb_irregular.items()}
keys = list(si_sb_irregular)
for k in keys:
    if "|" not in k:
        continue
    k1, k2 = k.split("|")
    si_sb_irregular[k1] = si_sb_irregular[k2] = si_sb_irregular.pop(k)

si_sb_irregular_caps = {v: k for (k, v) in pl_sb_irregular_caps.items()}

si_sb_irregular_compound = {v: k for (k, v) in pl_sb_irregular_compound.items()}
keys = list(si_sb_irregular_compound)
for k in keys:
    if "|" not in k:
        continue
    k1, k2 = k.split("|")
    si_sb_irregular_compound[k1] = si_sb_irregular_compound[k2] = (
        si_sb_irregular_compound.pop(k)
    )
# si_sb_irregular_keys = enclose('|'.join(si_sb_irregular.keys()))
# Z's that don't double
pl_sb_z_zes_list = ("quartz", "topaz")
pl_sb_ze_zes_list = ("snooze",)

# Same words grouped by length.
pl_sb_z_zes_bysize = bysize(pl_sb_z_zes_list)
pl_sb_ze_zes_bysize = bysize(pl_sb_ze_zes_list)
# CLASSICAL "..is" -> "..ides"

# GENERAL WORDS...
pl_sb_C_is_ides_complete = ["ephemeris", "iris", "clitoris", "chrysalis", "epididymis"]

# INFLAMMATIONS...
pl_sb_C_is_ides_endings = ["itis"]

# Stem regex: complete words plus ".*<ending>" patterns, each minus its
# final two letters.
pl_sb_C_is_ides = joinstem(
    -2,
    pl_sb_C_is_ides_complete + [".*%s" % w for w in pl_sb_C_is_ides_endings],
)

pl_sb_C_is_ides_list = pl_sb_C_is_ides_complete + pl_sb_C_is_ides_endings

si_sb_C_is_ides_list, si_sb_C_is_ides_bysize, pl_sb_C_is_ides_bysize = make_pl_si_lists(
    pl_sb_C_is_ides_list, "ides", 2, dojoinstem=False
)
2015-01-02 16:59:08 +00:00
# CLASSICAL "..a" -> "..ata"
pl_sb_C_a_ata_list = (
    "anathema", "bema", "carcinoma", "charisma", "diploma", "dogma",
    "drama", "edema", "enema", "enigma", "lemma", "lymphoma",
    "magma", "melisma", "miasma", "oedema", "sarcoma", "schema",
    "soma", "stigma", "stoma", "trauma", "gumma", "pragma",
)

# Drop the final "a", append "ata".
si_sb_C_a_ata_list, si_sb_C_a_ata_bysize, pl_sb_C_a_ata_bysize, pl_sb_C_a_ata = (
    make_pl_si_lists(pl_sb_C_a_ata_list, "ata", 1)
)
2015-01-02 16:59:08 +00:00
# UNCONDITIONAL "..a" -> "..ae"
pl_sb_U_a_ae_list = ("alumna", "alga", "vertebra", "persona")

# Nothing is stripped (None); "e" is simply appended.
si_sb_U_a_ae_list, si_sb_U_a_ae_bysize, pl_sb_U_a_ae_bysize, pl_sb_U_a_ae = (
    make_pl_si_lists(pl_sb_U_a_ae_list, "e", None)
)

# CLASSICAL "..a" -> "..ae"
pl_sb_C_a_ae_list = (
    "amoeba", "antenna", "formula", "hyperbola", "medusa",
    "nebula", "parabola", "abscissa", "hydra", "nova",
    "lacuna", "aurora", "umbra", "flora", "fauna",
)

si_sb_C_a_ae_list, si_sb_C_a_ae_bysize, pl_sb_C_a_ae_bysize, pl_sb_C_a_ae = (
    make_pl_si_lists(pl_sb_C_a_ae_list, "e", None)
)

# CLASSICAL "..en" -> "..ina"
pl_sb_C_en_ina_list = ("stamen", "foramen", "lumen")

si_sb_C_en_ina_list, si_sb_C_en_ina_bysize, pl_sb_C_en_ina_bysize, pl_sb_C_en_ina = (
    make_pl_si_lists(pl_sb_C_en_ina_list, "ina", 2)
)
2015-01-02 16:59:08 +00:00
# UNCONDITIONAL "..um" -> "..a"
pl_sb_U_um_a_list = (
    "bacterium", "agendum", "desideratum", "erratum", "stratum",
    "datum", "ovum", "extremum", "candelabrum",
)

# Drop the final "um", append "a".
si_sb_U_um_a_list, si_sb_U_um_a_bysize, pl_sb_U_um_a_bysize, pl_sb_U_um_a = (
    make_pl_si_lists(pl_sb_U_um_a_list, "a", 2)
)
2015-01-02 16:59:08 +00:00
# CLASSICAL "..um" -> "..a"
# NOTE: "encomium" was previously misspelled "enconium", so the real word
# never matched this table.
pl_sb_C_um_a_list = (
    "maximum", "minimum", "momentum", "optimum", "quantum",
    "cranium", "curriculum", "dictum", "phylum", "aquarium",
    "compendium", "emporium", "encomium", "gymnasium", "honorarium",
    "interregnum", "lustrum", "memorandum", "millennium", "rostrum",
    "spectrum", "speculum", "stadium", "trapezium", "ultimatum",
    "medium", "vacuum", "velum", "consortium", "arboretum",
)

# Drop the final "um", append "a".
si_sb_C_um_a_list, si_sb_C_um_a_bysize, pl_sb_C_um_a_bysize, pl_sb_C_um_a = (
    make_pl_si_lists(pl_sb_C_um_a_list, "a", 2)
)
2015-01-02 16:59:08 +00:00
# UNCONDITIONAL "..us" -> "i"
pl_sb_U_us_i_list = (
    "alumnus", "alveolus", "bacillus", "bronchus", "locus",
    "nucleus", "stimulus", "meniscus", "sarcophagus",
)

# Drop the final "us", append "i".
si_sb_U_us_i_list, si_sb_U_us_i_bysize, pl_sb_U_us_i_bysize, pl_sb_U_us_i = (
    make_pl_si_lists(pl_sb_U_us_i_list, "i", 2)
)

# CLASSICAL "..us" -> "..i"
pl_sb_C_us_i_list = (
    "focus", "radius", "genius", "incubus", "succubus",
    "nimbus", "fungus", "nucleolus", "stylus", "torus",
    "umbilicus", "uterus", "hippopotamus", "cactus",
)

si_sb_C_us_i_list, si_sb_C_us_i_bysize, pl_sb_C_us_i_bysize, pl_sb_C_us_i = (
    make_pl_si_lists(pl_sb_C_us_i_list, "i", 2)
)

# CLASSICAL "..us" -> "..us" (ASSIMILATED 4TH DECLENSION LATIN NOUNS)
pl_sb_C_us_us = (
    "status", "apparatus", "prospectus", "sinus", "hiatus", "impetus", "plexus",
)
pl_sb_C_us_us_bysize = bysize(pl_sb_C_us_us)
# UNCONDITIONAL "..on" -> "a"
pl_sb_U_on_a_list = (
    "criterion", "perihelion", "aphelion", "phenomenon", "prolegomenon",
    "noumenon", "organon", "asyndeton", "hyperbaton",
)

# Drop the final "on", append "a".
si_sb_U_on_a_list, si_sb_U_on_a_bysize, pl_sb_U_on_a_bysize, pl_sb_U_on_a = (
    make_pl_si_lists(pl_sb_U_on_a_list, "a", 2)
)

# CLASSICAL "..on" -> "..a"
pl_sb_C_on_a_list = ("oxymoron",)

si_sb_C_on_a_list, si_sb_C_on_a_bysize, pl_sb_C_on_a_bysize, pl_sb_C_on_a = (
    make_pl_si_lists(pl_sb_C_on_a_list, "a", 2)
)
2015-01-02 16:59:08 +00:00
# CLASSICAL "..o" -> "..i" (BUT NORMALLY -> "..os")
# list not tuple so can concat for pl_sb_U_o_os
pl_sb_C_o_i = [
    "solo", "soprano", "basso", "alto",
    "contralto", "tempo", "piano", "virtuoso",
]
pl_sb_C_o_i_bysize = bysize(pl_sb_C_o_i)
# Plural forms: final "o" replaced by "i".
si_sb_C_o_i_bysize = bysize(["%si" % w[:-1] for w in pl_sb_C_o_i])

pl_sb_C_o_i_stems = joinstem(-1, pl_sb_C_o_i)
# ALWAYS "..o" -> "..os"
pl_sb_U_o_os_complete = {"ado", "ISO", "NATO", "NCO", "NGO", "oto"}
si_sb_U_o_os_complete = {"%ss" % w for w in pl_sb_U_o_os_complete}

# The classical "..o" -> "..i" words are appended at the end of this list.
pl_sb_U_o_os_endings = [
    "aficionado", "aggro", "albino", "allegro", "ammo", "Antananarivo",
    "archipelago", "armadillo", "auto", "avocado", "Bamako", "Barquisimeto",
    "bimbo", "bingo", "Biro", "bolero", "Bolzano", "bongo",
    "Boto", "burro", "Cairo", "canto", "cappuccino", "casino",
    "cello", "Chicago", "Chimango", "cilantro", "cochito", "coco",
    "Colombo", "Colorado", "commando", "concertino", "contango", "credo",
    "crescendo", "cyano", "demo", "ditto", "Draco", "dynamo",
    "embryo", "Esperanto", "espresso", "euro", "falsetto", "Faro",
    "fiasco", "Filipino", "flamenco", "furioso", "generalissimo", "Gestapo",
    "ghetto", "gigolo", "gizmo", "Greensboro", "gringo", "Guaiabero",
    "guano", "gumbo", "gyro", "hairdo", "hippo", "Idaho",
    "impetigo", "inferno", "info", "intermezzo", "intertrigo", "Iquico",
    "jumbo", "junto", "Kakapo", "kilo", "Kinkimavo", "Kokako",
    "Kosovo", "Lesotho", "libero", "libido", "libretto", "lido",
    "Lilo", "limbo", "limo", "lineno", "lingo", "lino",
    "livedo", "loco", "logo", "lumbago", "macho", "macro",
    "mafioso", "magneto", "magnifico", "Majuro", "Malabo", "manifesto",
    "Maputo", "Maracaibo", "medico", "memo", "metro", "Mexico",
    "micro", "Milano", "Monaco", "mono", "Montenegro", "Morocco",
    "Muqdisho", "myo", "neutrino", "Ningbo", "octavo", "oregano",
    "Orinoco", "Orlando", "Oslo", "panto", "Paramaribo", "Pardusco",
    "pedalo", "photo", "pimento", "pinto", "pleco", "Pluto",
    "pogo", "polo", "poncho", "Porto-Novo", "Porto", "pro",
    "psycho", "pueblo", "quarto", "Quito", "rhino", "risotto",
    "rococo", "rondo", "Sacramento", "saddo", "sago", "salvo",
    "Santiago", "Sapporo", "Sarajevo", "scherzando", "scherzo", "silo",
    "sirocco", "sombrero", "staccato", "sterno", "stucco", "stylo",
    "sumo", "Taiko", "techno", "terrazzo", "testudo", "timpano",
    "tiro", "tobacco", "Togo", "Tokyo", "torero", "Torino",
    "Toronto", "torso", "tremolo", "typo", "tyro", "ufo",
    "UNESCO", "vaquero", "vermicello", "verso", "vibrato", "violoncello",
    "Virgo", "weirdo", "WHO", "WTO", "Yamoussoukro", "yo-yo",
    "zero", "Zibo",
] + pl_sb_C_o_i

pl_sb_U_o_os_bysize = bysize(pl_sb_U_o_os_endings)
si_sb_U_o_os_bysize = bysize(["%ss" % w for w in pl_sb_U_o_os_endings])
2015-01-02 16:59:08 +00:00
# UNCONDITIONAL "..ch" -> "..chs"
pl_sb_U_ch_chs_list = ("czech", "eunuch", "stomach")

si_sb_U_ch_chs_list, si_sb_U_ch_chs_bysize, pl_sb_U_ch_chs_bysize, pl_sb_U_ch_chs = (
    make_pl_si_lists(pl_sb_U_ch_chs_list, "s", None)
)

# UNCONDITIONAL "..[ei]x" -> "..ices"
pl_sb_U_ex_ices_list = ("codex", "murex", "silex")

si_sb_U_ex_ices_list, si_sb_U_ex_ices_bysize, pl_sb_U_ex_ices_bysize, pl_sb_U_ex_ices = (
    make_pl_si_lists(pl_sb_U_ex_ices_list, "ices", 2)
)

pl_sb_U_ix_ices_list = ("radix", "helix")

si_sb_U_ix_ices_list, si_sb_U_ix_ices_bysize, pl_sb_U_ix_ices_bysize, pl_sb_U_ix_ices = (
    make_pl_si_lists(pl_sb_U_ix_ices_list, "ices", 2)
)

# CLASSICAL "..[ei]x" -> "..ices"
pl_sb_C_ex_ices_list = (
    "vortex", "vertex", "cortex", "latex",
    "pontifex", "apex", "index", "simplex",
)

si_sb_C_ex_ices_list, si_sb_C_ex_ices_bysize, pl_sb_C_ex_ices_bysize, pl_sb_C_ex_ices = (
    make_pl_si_lists(pl_sb_C_ex_ices_list, "ices", 2)
)

pl_sb_C_ix_ices_list = ("appendix",)

si_sb_C_ix_ices_list, si_sb_C_ix_ices_bysize, pl_sb_C_ix_ices_bysize, pl_sb_C_ix_ices = (
    make_pl_si_lists(pl_sb_C_ix_ices_list, "ices", 2)
)
2015-01-02 16:59:08 +00:00
# ARABIC: ".." -> "..i"
pl_sb_C_i_list = ("afrit", "afreet", "efreet")

si_sb_C_i_list, si_sb_C_i_bysize, pl_sb_C_i_bysize, pl_sb_C_i = make_pl_si_lists(
    pl_sb_C_i_list, "i", None
)

# HEBREW: ".." -> "..im"
pl_sb_C_im_list = ("goy", "seraph", "cherub")

si_sb_C_im_list, si_sb_C_im_bysize, pl_sb_C_im_bysize, pl_sb_C_im = make_pl_si_lists(
    pl_sb_C_im_list, "im", None
)

# UNCONDITIONAL "..man" -> "..mans"
pl_sb_U_man_mans_list = """
    ataman caiman cayman ceriman
    desman dolman farman harman hetman
    human leman ottoman shaman talisman
""".split()

pl_sb_U_man_mans_caps_list = """
    Alabaman Bahaman Burman German
    Hiroshiman Liman Nakayaman Norman Oklahoman
    Panaman Roman Selman Sonaman Tacoman Yakiman
    Yokohaman Yuman
""".split()

# No stem regex is requested here, so only three values come back.
si_sb_U_man_mans_list, si_sb_U_man_mans_bysize, pl_sb_U_man_mans_bysize = (
    make_pl_si_lists(pl_sb_U_man_mans_list, "s", None, dojoinstem=False)
)
(
    si_sb_U_man_mans_caps_list,
    si_sb_U_man_mans_caps_bysize,
    pl_sb_U_man_mans_caps_bysize,
) = make_pl_si_lists(pl_sb_U_man_mans_caps_list, "s", None, dojoinstem=False)
2015-01-02 16:59:08 +00:00
pl_sb_uninflected_s_complete = [
    # PAIRS OR GROUPS SUBSUMED TO A SINGULAR...
    "breeches", "britches", "pajamas", "pyjamas", "clippers", "gallows",
    "hijinks", "headquarters", "pliers", "scissors", "testes", "herpes",
    "pincers", "shears", "proceedings", "trousers",
    # UNASSIMILATED LATIN 4th DECLENSION
    "cantus", "coitus", "nexus",
    # RECENT IMPORTS...
    "contretemps", "corps", "debris", "siemens",
    # DISEASES
    "mumps",
    # MISCELLANEOUS OTHERS...
    "diabetes", "jackanapes", "series", "species", "subspecies", "rabies",
    "chassis", "innings", "news", "mews", "haggis",
]

pl_sb_uninflected_s_endings = [
    # RECENT IMPORTS...
    "ois",
    # DISEASES
    "measles",
]

# Complete words followed by ".*<ending>" regex fragments.
pl_sb_uninflected_s = pl_sb_uninflected_s_complete + [
    ".*%s" % w for w in pl_sb_uninflected_s_endings
]
2015-01-02 16:59:08 +00:00
# DON'T INFLECT IN CLASSICAL MODE, OTHERWISE NORMAL INFLECTION
pl_sb_uninflected_herd = (
    "wildebeest", "swine", "eland", "bison", "buffalo", "elk",
    "rhinoceros", "zucchini", "caribou", "dace", "grouse",
    "guinea fowl", "guinea-fowl",
    "haddock", "hake", "halibut", "herring", "mackerel", "pickerel",
    "pike", "roe", "seed", "shad", "snipe", "teal", "turbot",
    "water fowl", "water-fowl",
)
# Words treated as uninflected, followed by the uninflected "..s" words.
pl_sb_uninflected_complete = [
    # SOME FISH AND HERD ANIMALS
    "tuna", "salmon", "mackerel", "trout", "bream", "sea-bass", "sea bass",
    "carp", "cod", "flounder", "whiting", "moose",
    # OTHER ODDITIES
    # NOTE: "samurai" was previously misspelled "samuri", so the real word
    # never matched this list.
    "graffiti", "djinn", "samurai", "offspring", "pence", "quid", "hertz",
] + pl_sb_uninflected_s_complete
# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE)
pl_sb_uninflected_caps = [
    # ALL NATIONALS ENDING IN -ese
    "Portuguese", "Amoyese", "Borghese", "Congoese", "Faroese", "Foochowese",
    "Genevese", "Genoese", "Gilbertese", "Hottentotese", "Kiplingese", "Kongoese",
    "Lucchese", "Maltese", "Nankingese", "Niasese", "Pekingese", "Piedmontese",
    "Pistoiese", "Sarawakese", "Shavese", "Vermontese", "Wenchowese", "Yengeese",
]
# Word endings treated as uninflected, followed by the "..s" endings.
pl_sb_uninflected_endings = [
    # SOME FISH AND HERD ANIMALS
    "fish", "deer", "sheep",
    # ALL NATIONALS ENDING IN -ese
    "nese", "rese", "lese", "mese",
    # DISEASES
    "pox",
    # OTHER ODDITIES
    "craft",
] + pl_sb_uninflected_s_endings
# SOME WORDS ENDING IN ...s (OFTEN PAIRS TAKEN AS A WHOLE)

pl_sb_uninflected_bysize = bysize(pl_sb_uninflected_endings)
# SINGULAR WORDS ENDING IN ...s (ALL INFLECT WITH ...es)
pl_sb_singular_s_complete = [
    "acropolis", "aegis", "alias", "asbestos", "bathos", "bias",
    "bronchitis", "bursitis", "caddis", "cannabis", "canvas", "chaos",
    "cosmos", "dais", "digitalis", "epidermis", "ethos", "eyas",
    "gas", "glottis", "hubris", "ibis", "lens", "mantis",
    "marquis", "metropolis", "pathos", "pelvis", "polis", "rhinoceros",
    "sassafras", "trellis",
] + pl_sb_C_is_ides_complete

pl_sb_singular_s_endings = ["ss", "us"] + pl_sb_C_is_ides_endings
pl_sb_singular_s_bysize = bysize(pl_sb_singular_s_endings)

# Plural side: every word/ending above with "es" appended.
si_sb_singular_s_complete = ["%ses" % w for w in pl_sb_singular_s_complete]
si_sb_singular_s_endings = ["%ses" % w for w in pl_sb_singular_s_endings]
si_sb_singular_s_bysize = bysize(si_sb_singular_s_endings)

pl_sb_singular_s_es = ["[A-Z].*es"]

# One regex group: complete words | ".*<ending>" patterns | capitalised "..es".
pl_sb_singular_s = enclose(
    "|".join(
        pl_sb_singular_s_complete
        + [".*%s" % w for w in pl_sb_singular_s_endings]
        + pl_sb_singular_s_es
    )
)
2015-01-02 16:59:08 +00:00
# PLURALS ENDING IN uses -> use
si_sb_ois_oi_case = ("Bolshois", "Hanois")

si_sb_uses_use_case = ("Betelgeuses", "Duses", "Meuses", "Syracuses", "Toulouses")

si_sb_uses_use = (
    "abuses", "applauses", "blouses", "carouses", "causes", "chartreuses",
    "clauses", "contuses", "douses", "excuses", "fuses", "grouses",
    "hypotenuses", "masseuses", "menopauses", "misuses", "muses", "overuses",
    "pauses", "peruses", "profuses", "recluses", "reuses", "ruses",
    "souses", "spouses", "suffuses", "transfuses", "uses",
)
# Capitalised plurals in "..ies" listed for the "..ies" -> "..ie" table.
si_sb_ies_ie_case = (
    "Addies", "Aggies", "Allies", "Amies", "Angies", "Annies",
    "Annmaries", "Archies", "Arties", "Aussies", "Barbies", "Barries",
    "Basies", "Bennies", "Bernies", "Berties", "Bessies", "Betties",
    "Billies", "Blondies", "Bobbies", "Bonnies", "Bowies", "Brandies",
    "Bries", "Brownies", "Callies", "Carnegies", "Carries", "Cassies",
    "Charlies", "Cheries", "Christies", "Connies", "Curies", "Dannies",
    "Debbies", "Dixies", "Dollies", "Donnies", "Drambuies", "Eddies",
    "Effies", "Ellies", "Elsies", "Eries", "Ernies", "Essies",
    "Eugenies", "Fannies", "Flossies", "Frankies", "Freddies", "Gillespies",
    "Goldies", "Gracies", "Guthries", "Hallies", "Hatties", "Hetties",
    "Hollies", "Jackies", "Jamies", "Janies", "Jannies", "Jeanies",
    "Jeannies", "Jennies", "Jessies", "Jimmies", "Jodies", "Johnies",
    "Johnnies", "Josies", "Julies", "Kalgoorlies", "Kathies", "Katies",
    "Kellies", "Kewpies", "Kristies", "Laramies", "Lassies", "Lauries",
    "Leslies", "Lessies", "Lillies", "Lizzies", "Lonnies", "Lories",
    "Lorries", "Lotties", "Louies", "Mackenzies", "Maggies", "Maisies",
    "Mamies", "Marcies", "Margies", "Maries", "Marjories", "Matties",
    "McKenzies", "Melanies", "Mickies", "Millies", "Minnies", "Mollies",
    "Mounties", "Nannies", "Natalies", "Nellies", "Netties", "Ollies",
    "Ozzies", "Pearlies", "Pottawatomies", "Reggies", "Richies", "Rickies",
    "Robbies", "Ronnies", "Rosalies", "Rosemaries", "Rosies", "Roxies",
    "Rushdies", "Ruthies", "Sadies", "Sallies", "Sammies", "Scotties",
    "Selassies", "Sherries", "Sophies", "Stacies", "Stefanies", "Stephanies",
    "Stevies", "Susies", "Sylvies", "Tammies", "Terries", "Tessies",
    "Tommies", "Tracies", "Trekkies", "Valaries", "Valeries", "Valkyries",
    "Vickies", "Virgies", "Willies", "Winnies", "Wylies", "Yorkies",
)
# Lower-case plurals in "..ies" listed for the "..ies" -> "..ie" table.
# Every entry is a plural form; "eyries" was previously listed as the
# singular "eyrie", so the plural never matched this table.
si_sb_ies_ie = (
    "aeries", "baggies", "belies", "biggies", "birdies", "bogies",
    "bonnies", "boogies", "bookies", "bourgeoisies", "brownies", "budgies",
    "caddies", "calories", "camaraderies", "cockamamies", "collies", "cookies",
    "coolies", "cooties", "coteries", "crappies", "curies", "cutesies",
    "dogies", "eyries", "floozies", "footsies", "freebies", "genies",
    "goalies", "groupies", "hies", "jalousies", "junkies", "kiddies",
    "laddies", "lassies", "lies", "lingeries", "magpies", "menageries",
    "mommies", "movies", "neckties", "newbies", "nighties", "oldies",
    "organdies", "overlies", "pies", "pinkies", "pixies", "potpies",
    "prairies", "quickies", "reveries", "rookies", "rotisseries", "softies",
    "sorties", "species", "stymies", "sweeties", "ties", "underlies",
    "unties", "veggies", "vies", "yuppies", "zombies",
)
# Plurals in "..oes" listed for the "..oes" -> "..oe" tables
# (capitalised words first, then lower-case words).
si_sb_oes_oe_case = (
    "Chloes", "Crusoes", "Defoes", "Faeroes", "Ivanhoes",
    "Joes", "McEnroes", "Moes", "Monroes", "Noes",
    "Poes", "Roscoes", "Tahoes", "Tippecanoes", "Zoes",
)

si_sb_oes_oe = (
    "aloes", "backhoes", "canoes", "does", "floes", "foes",
    "hoes", "mistletoes", "oboes", "pekoes", "roes", "sloes",
    "throes", "tiptoes", "toes", "woes",
)

si_sb_z_zes = ("quartzes", "topazes")

si_sb_zzes_zz = ("buzzes", "fizzes", "frizzes", "razzes")
2015-01-02 16:59:08 +00:00
# Plurals in "..ches" listed for the "..ches" -> "..che" tables
# (capitalised words first, then lower-case words).
si_sb_ches_che_case = (
    "Andromaches", "Apaches", "Blanches", "Comanches",
    "Nietzsches", "Porsches", "Roches",
)

si_sb_ches_che = (
    "aches", "avalanches", "backaches", "bellyaches", "caches",
    "cloches", "creches", "douches", "earaches", "fiches",
    "headaches", "heartaches", "microfiches", "niches", "pastiches",
    "psyches", "quiches", "stomachaches", "toothaches",
)
2018-10-05 20:02:38 +00:00
si_sb_xes_xe = ("annexes", "axes", "deluxes", "pickaxes")

si_sb_sses_sse_case = ("Hesses", "Jesses", "Larousses", "Matisses")

si_sb_sses_sse = (
    "bouillabaisses", "crevasses", "demitasses",
    "impasses", "mousses", "posses",
)

si_sb_ves_ve_case = (
    # *[nwl]ives -> [nwl]ive
    "Clives", "Palmolives",
)

si_sb_ves_ve = (
    # *[^d]eaves -> eave
    "interweaves", "weaves",
    # *[nwl]ives -> [nwl]ive
    "olives",
    # *[eoa]lves -> [eoa]lve
    "bivalves", "dissolves", "resolves", "salves", "twelves", "valves",
)
2018-10-05 20:02:38 +00:00
# One regex group built from: the singular "..s" noun group, the
# uninflected "..s" words/patterns, the irregular "..s" singulars and
# three literal patterns.
plverb_special_s = enclose(
    "|".join(
        [pl_sb_singular_s]
        + pl_sb_uninflected_s
        + list(pl_sb_irregular_s)
        + ["(.*[csx])is", "(.*)ceps", "[A-Z].*s"]
    )
)
2015-01-02 16:59:08 +00:00
# Postfix adjective -> regex alternatives for the word in front of it.
# The "general" pattern is a raw string so that "\S" is not parsed as an
# (invalid) string escape; the resulting pattern text is unchanged.
pl_sb_postfix_adj = {
    "general": [r"(?!major|lieutenant|brigadier|adjutant|.*star)\S+"],
    "martial": ["court"],
}

# Replace each list of alternatives by one regex group that matches an
# alternative only when it is followed by "-" or whitespace and the adjective.
for k in list(pl_sb_postfix_adj.keys()):
    pl_sb_postfix_adj[k] = enclose(
        enclose("|".join(pl_sb_postfix_adj[k])) + "(?=(?:-|\\s+)%s)" % k
    )

# Group 1: any of the patterns above; group 2: the rest of the text.
pl_sb_postfix_adj_stems = "(" + "|".join(list(pl_sb_postfix_adj.values())) + ")(.*)"
2015-01-02 16:59:08 +00:00
# PLURAL WORDS ENDING IN es GO TO SINGULAR is
si_sb_es_is = (
    "amanuenses", "amniocenteses", "analyses", "antitheses", "apotheoses",
    "arterioscleroses", "atheroscleroses", "axes",
    # 'bases',  # bases -> basis
    "catalyses", "catharses", "chasses", "cirrhoses", "cocces",
    "crises", "diagnoses", "dialyses", "diereses", "electrolyses",
    "emphases", "exegeses", "geneses", "halitoses", "hydrolyses",
    "hypnoses", "hypotheses", "hystereses", "metamorphoses", "metastases",
    "misdiagnoses", "mitoses", "mononucleoses", "narcoses", "necroses",
    "nemeses", "neuroses", "oases", "osmoses", "osteoporoses",
    "paralyses", "parentheses", "parthenogeneses", "periphrases", "photosyntheses",
    "probosces", "prognoses", "prophylaxes", "prostheses", "preces",
    "psoriases", "psychoanalyses", "psychokineses", "psychoses", "scleroses",
    "scolioses", "sepses", "silicoses", "symbioses", "synopses",
    "syntheses", "taxes", "telekineses", "theses", "thromboses",
    "tuberculoses", "urinalyses",
)
# Prepositions, whitespace-separated.
pl_prep_list = """
    about above across after among around at athwart before behind
    below beneath beside besides between betwixt beyond but by
    during except for from in into near of off on onto out over
    since till to under until unto upon with""".split()

# The same list extended with "de", "du" and "da".
pl_prep_list_da = pl_prep_list + ["de", "du", "da"]

pl_prep_bysize = bysize(pl_prep_list_da)
pl_prep = enclose("|".join(pl_prep_list_da))

# Regex for "<head> <prep> a <tail>" compounds; the separators may be "-"
# or whitespace.  Groups: head, separator+preposition+separator, tail.
pl_sb_prep_dual_compound = (
    r"(.*?)((?:-|\s+)(?:" + pl_prep + r")(?:-|\s+))a(?:-|\s+)(.*)"
)
2015-01-02 16:59:08 +00:00
2018-10-05 20:02:38 +00:00
# The gender labels used as keys of the gendered singular-pronoun tables.
singular_pronoun_genders = {
    "neuter",
    "feminine",
    "masculine",
    "gender-neutral",
    "feminine or masculine",
    "masculine or feminine",
}
2015-01-02 16:59:08 +00:00
# Singular -> plural pronouns.
pl_pron_nom = {
    # NOMINATIVE    REFLEXIVE
    "i": "we", "myself": "ourselves",
    "you": "you", "yourself": "yourselves",
    "she": "they", "herself": "themselves",
    "he": "they", "himself": "themselves",
    "it": "they", "itself": "themselves",
    "they": "they", "themself": "themselves",
    # POSSESSIVE
    "mine": "ours",
    "yours": "yours",
    "hers": "theirs",
    "his": "theirs",
    "its": "theirs",
    "theirs": "theirs",
}

# Plural -> singular pronouns, keyed by case.  Where several singulars share
# a plural, the inversion keeps the one that appears last in the table above.
si_pron = {}
si_pron["nom"] = {v: k for (k, v) in pl_pron_nom.items()}
# The inversion yields lower-case "i"; store the capitalised pronoun instead.
si_pron["nom"]["we"] = "I"
2015-01-02 16:59:08 +00:00
# Singular -> plural pronouns.
pl_pron_acc = {
    # ACCUSATIVE    REFLEXIVE
    "me": "us", "myself": "ourselves",
    "you": "you", "yourself": "yourselves",
    "her": "them", "herself": "themselves",
    "him": "them", "himself": "themselves",
    "it": "them", "itself": "themselves",
    "them": "them", "themself": "themselves",
}

# The singular pronouns as one regex group and grouped by length.
pl_pron_acc_keys = enclose("|".join(pl_pron_acc))
pl_pron_acc_keys_bysize = bysize(list(pl_pron_acc))

# Plural -> singular; for shared plurals the last singular in the table wins.
si_pron["acc"] = {v: k for (k, v) in pl_pron_acc.items()}
2015-01-02 16:59:08 +00:00
# Gender-dependent singulars: for each (case, plural) pair below, the plain
# string stored in si_pron is replaced by a dict mapping gender -> singular.
for thecase, plur, gend, sing in (
    ("nom", "they", "neuter", "it"),
    ("nom", "they", "feminine", "she"),
    ("nom", "they", "masculine", "he"),
    ("nom", "they", "gender-neutral", "they"),
    ("nom", "they", "feminine or masculine", "she or he"),
    ("nom", "they", "masculine or feminine", "he or she"),
    ("nom", "themselves", "neuter", "itself"),
    ("nom", "themselves", "feminine", "herself"),
    ("nom", "themselves", "masculine", "himself"),
    ("nom", "themselves", "gender-neutral", "themself"),
    ("nom", "themselves", "feminine or masculine", "herself or himself"),
    ("nom", "themselves", "masculine or feminine", "himself or herself"),
    ("nom", "theirs", "neuter", "its"),
    ("nom", "theirs", "feminine", "hers"),
    ("nom", "theirs", "masculine", "his"),
    ("nom", "theirs", "gender-neutral", "theirs"),
    ("nom", "theirs", "feminine or masculine", "hers or his"),
    ("nom", "theirs", "masculine or feminine", "his or hers"),
    ("acc", "them", "neuter", "it"),
    ("acc", "them", "feminine", "her"),
    ("acc", "them", "masculine", "him"),
    ("acc", "them", "gender-neutral", "them"),
    ("acc", "them", "feminine or masculine", "her or him"),
    ("acc", "them", "masculine or feminine", "him or her"),
    ("acc", "themselves", "neuter", "itself"),
    ("acc", "themselves", "feminine", "herself"),
    ("acc", "themselves", "masculine", "himself"),
    ("acc", "themselves", "gender-neutral", "themself"),
    ("acc", "themselves", "feminine or masculine", "herself or himself"),
    ("acc", "themselves", "masculine or feminine", "himself or herself"),
):
    # First row for this plural: the entry is still the plain string left by
    # the inversion, so swap it for an empty per-gender dict.
    if not isinstance(si_pron[thecase][plur], dict):
        si_pron[thecase][plur] = {}
    si_pron[thecase][plur][gend] = sing

# The accusative plural pronouns as one regex group and grouped by length.
si_pron_acc_keys = enclose("|".join(si_pron["acc"]))
si_pron_acc_keys_bysize = bysize(list(si_pron["acc"]))
2015-01-02 16:59:08 +00:00
def get_si_pron(thecase, word, gender):
    """
    Return the singular form of the plural pronoun ``word``.

    thecase -- key into si_pron (the table above uses "nom" and "acc")
    word    -- the plural pronoun to look up
    gender  -- selects the singular when the entry is a per-gender dict

    Raises KeyError when ``word`` is not a known pronoun for ``thecase``.
    """
    # A failed lookup means "not a pronoun"; the KeyError propagates as-is.
    entry = si_pron[thecase][word]
    try:
        # Entries that vary with gender are dicts keyed on gender.
        return entry[gender]
    except TypeError:
        # Plain string entry: the answer does not depend on gender.
        return entry
2018-10-05 20:02:38 +00:00
2015-01-02 16:59:08 +00:00
# Irregular present/past verb forms: singular form -> plural form.
# The first-, second- and third-person singular forms of each verb all map
# to one plural; forms shared between persons are listed once.
plverb_irregular_pres = {
    # to be (present)
    "am": "are",
    "are": "are",
    "is": "are",
    # to be (past)
    "was": "were",
    "were": "were",
    # to have
    "have": "have",
    "has": "have",
    # to do
    "do": "do",
    "does": "do",
}
# Present-tense verbs that could also be read as nouns: singular form ->
# plural form. Built from (plural form, third-person singular) pairs; both
# members of a pair map to the plural form. Key order matters because the
# keys are later joined into a regex alternation.
plverb_ambiguous_pres = {
    form: plural_form
    for plural_form, third_person in (
        ("act", "acts"),
        ("blame", "blames"),
        ("can", "can"),
        ("must", "must"),
        ("fly", "flies"),
        ("copy", "copies"),
        ("drink", "drinks"),
        ("fight", "fights"),
        ("fire", "fires"),
        ("like", "likes"),
        ("look", "looks"),
        ("make", "makes"),
        ("reach", "reaches"),
        ("run", "runs"),
        ("sink", "sinks"),
        ("sleep", "sleeps"),
        ("view", "views"),
    )
    for form in (plural_form, third_person)
}
2018-10-05 20:02:38 +00:00
# "|"-joined alternation of every ambiguous present-tense form, passed
# through enclose() (helper defined elsewhere in this file).
plverb_ambiguous_pres_keys = enclose("|".join(plverb_ambiguous_pres))

# Irregular verb forms that are not present tense.
plverb_irregular_non_pres = (
    "did",
    "had",
    "ate",
    "made",
    "put",
    "spent",
    "fought",
    "sank",
    "gave",
    "sought",
    "shall",
    "could",
    "ought",
    "should",
)

# Non-present verb forms that could also be read as another part of speech.
plverb_ambiguous_non_pres = enclose(
    "|".join(["thought", "saw", "bent", "will", "might", "cut"])
)
2015-01-02 16:59:08 +00:00
# "..oes" -> "..oe" (the rest are "..oes" -> "o")
2018-10-05 20:02:38 +00:00
pl_v_oes_oe = ("canoes", "floes", "oboes", "roes", "throes", "woes")
pl_v_oes_oe_endings_size4 = ("hoes", "toes")
# NOTE(review): unlike the size4 entry this is a bare string, not a tuple,
# so an ``x in pl_v_oes_oe_endings_size5`` test is a substring test --
# confirm that is intended where it is used.
pl_v_oes_oe_endings_size5 = "shoes"

# Count words meaning "none".
pl_count_zero = ("0", "no", "zero", "nil")

# Count words meaning "exactly one".
pl_count_one = ("1", "a", "an", "one", "each", "every", "this", "that")

# Determiners with an irregular plural.
pl_adj_special = {
    "a": "some",
    "an": "some",
    "this": "these",
    "that": "those",
}
pl_adj_special_keys = enclose("|".join(pl_adj_special))

# Possessive adjectives: singular -> plural.
pl_adj_poss = {
    "my": "our",
    "your": "your",
    "its": "their",
    "her": "their",
    "his": "their",
    "their": "their",
}
pl_adj_poss_keys = enclose("|".join(pl_adj_poss))
2015-01-02 16:59:08 +00:00
# 2. INDEFINITE ARTICLES

# Matches strings of capitals that start with a "vowel-sound" consonant
# (F, H, L, M, N, R, S, X) followed by another capital, and that are not
# likely to be real words -- i.e. probable abbreviations.
# The pattern is laid out with whitespace for readability; presumably it is
# compiled in verbose mode where it is used -- TODO confirm.
A_abbrev = r"""
(?! FJO | [HLMNS]Y. | RY[EO] | SQU
| ( F[LR]? | [HL] | MN? | N | RH? | S[CHKLMNPTVW]? | X(YL)?) [AEIOU])
[FHLMNRSX][A-Z]
"""

# Codes the beginnings of all English words beginning with a 'y' followed
# by a consonant. Any other y-consonant prefix therefore implies an
# abbreviation.
A_y_cons = "y(b[lor]|cl[ea]|fere|gg|p[ios]|rou|tt)"

# Exceptions to exceptions: words that take "a" / "an" explicitly.
A_explicit_a = enclose("|".join(["unabomber", "unanimous", "US"]))

A_explicit_an = enclose(
    "|".join(["euler", "hour(?!i)", "heir", "honest", "hono[ur]", "mpeg"])
)

# Single-letter ordinals ("f-th", "nth", ...) split by the article they take.
A_ordinal_an = enclose("[aefhilmnorsx]-?th")

A_ordinal_a = enclose("[bcdgjkpqtuvwyz]-?th")
2015-01-02 16:59:08 +00:00
# NUMERICAL INFLECTIONS

# Ordinal suffix per final digit, plus the 11/12/13 exceptions.
nth = {digit: "th" for digit in range(10)}
nth.update({1: "st", 2: "nd", 3: "rd", 11: "th", 12: "th", 13: "th"})

# Cardinal word endings whose ordinal is irregular.
ordinal = {
    "ty": "tieth",
    "one": "first",
    "two": "second",
    "three": "third",
    "five": "fifth",
    "eight": "eighth",
    "nine": "ninth",
    "twelve": "twelfth",
}

# Alternation of the irregular endings above, in definition order.
ordinal_suff = "|".join(ordinal)

# NUMBERS

# Words for 0-9; zero is the empty string.
unit = [""] + "one two three four five six seven eight nine".split()

# Words for 10-19.
teen = (
    "ten eleven twelve thirteen fourteen "
    "fifteen sixteen seventeen eighteen nineteen"
).split()

# Words for the tens; indexes 0 and 1 are empty placeholders.
ten = ["", ""] + "twenty thirty forty fifty sixty seventy eighty ninety".split()

# Names of successive groups of three digits, each with a leading space;
# the units group is a lone space.
mill = [" "] + [
    " " + scale
    for scale in (
        "thousand",
        "million",
        "billion",
        "trillion",
        "quadrillion",
        "quintillion",
        "sextillion",
        "septillion",
        "octillion",
        "nonillion",
        "decillion",
    )
]
2015-01-02 16:59:08 +00:00
# SUPPORT CLASSICAL PLURALIZATIONS

# Default on/off state of each classical mode: everything off except "names".
def_classical = {
    "all": False,
    "zero": False,
    "herd": False,
    "names": True,
    "persons": False,
    "ancient": False,
}

# The same modes with every switch on / every switch off.
all_classical = dict.fromkeys(def_classical, True)
no_classical = dict.fromkeys(def_classical, False)
# TODO: .inflectrc file does not work
# can't just execute methods from another file like this
# for rcfile in (pathjoin(dirname(__file__), '.inflectrc'),
# expanduser(pathjoin(('~'), '.inflectrc'))):
# if isfile(rcfile):
# try:
# execfile(rcfile)
# except:
# print3("\nBad .inflectrc file (%s):\n" % rcfile)
# raise BadRcFileError
class engine:
def __init__(self):
    """
    Create an engine with default settings: default classical modes, no
    persistent count, empty user-defined rule lists and neuter gender.
    """
    self.classical_dict = dict(def_classical)
    self.persistent_count = None
    self.mill_count = 0
    # User-defined rule lists start out empty; the def* methods append
    # pattern/replacement pairs to them.
    for registry in (
        "pl_sb_user_defined",
        "pl_v_user_defined",
        "pl_adj_user_defined",
        "si_sb_user_defined",
        "A_a_user_defined",
    ):
        setattr(self, registry, [])
    self.thegender = "neuter"
# Old method names -> the current method that replaces each of them.
# Consulted by __getattr__ to tell callers which name to use instead.
deprecated_methods = dict(
    pl="plural",
    plnoun="plural_noun",
    plverb="plural_verb",
    pladj="plural_adj",
    # The replacement method is singular_noun(); "single_noun" does not exist.
    sinoun="singular_noun",
    prespart="present_participle",
    numwords="number_to_words",
    plequal="compare",
    plnounequal="compare_nouns",
    plverbequal="compare_verbs",
    pladjequal="compare_adjs",
    wordlist="join",
)
2015-01-02 16:59:08 +00:00
def __getattr__(self, meth):
    """
    Handle access to an attribute that normal lookup did not find.

    If ``meth`` is one of the old method names in deprecated_methods, print
    a hint naming the replacement and raise DeprecationWarning; otherwise
    raise AttributeError. Both exceptions carry a message so the failing
    name is visible in tracebacks.
    """
    if meth in self.deprecated_methods:
        message = "%s() deprecated, use %s()" % (meth, self.deprecated_methods[meth])
        print3(message)
        raise DeprecationWarning(message)
    raise AttributeError(
        "%r object has no attribute %r" % (type(self).__name__, meth)
    )
def defnoun(self, singular, plural):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Set the noun plural of singular to plural.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
self.checkpat(singular)
self.checkpatplural(plural)
self.pl_sb_user_defined.extend((singular, plural))
self.si_sb_user_defined.extend((plural, singular))
return 1
def defverb(self, s1, p1, s2, p2, s3, p3):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Set the verb plurals for s1, s2 and s3 to p1, p2 and p3 respectively.
Where 1, 2 and 3 represent the 1st, 2nd and 3rd person forms of the verb.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
self.checkpat(s1)
self.checkpat(s2)
self.checkpat(s3)
self.checkpatplural(p1)
self.checkpatplural(p2)
self.checkpatplural(p3)
self.pl_v_user_defined.extend((s1, p1, s2, p2, s3, p3))
return 1
def defadj(self, singular, plural):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Set the adjective plural of singular to plural.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
self.checkpat(singular)
self.checkpatplural(plural)
self.pl_adj_user_defined.extend((singular, plural))
return 1
def defa(self, pattern):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Define the indefinate article as 'a' for words matching pattern.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
self.checkpat(pattern)
2018-10-05 20:02:38 +00:00
self.A_a_user_defined.extend((pattern, "a"))
2015-01-02 16:59:08 +00:00
return 1
def defan(self, pattern):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Define the indefinate article as 'an' for words matching pattern.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
self.checkpat(pattern)
2018-10-05 20:02:38 +00:00
self.A_a_user_defined.extend((pattern, "an"))
2015-01-02 16:59:08 +00:00
return 1
def checkpat(self, pattern):
    """
    Check a user-supplied regex pattern for errors.

    None is accepted without checking. A pattern the regex engine rejects
    is reported and BadUserDefinedPatternError is raised.
    """
    if pattern is not None:
        try:
            # Matching against the empty string forces the pattern to compile.
            match(pattern, "")
        except reerror:
            print3("\nBad user-defined singular pattern:\n\t%s\n" % pattern)
            raise BadUserDefinedPatternError
def checkpatplural(self, pattern):
    """
    Check a user-supplied regex replacement pattern for errors.

    Currently a no-op: every pattern is accepted and None is returned.
    """
    # A probe of the form resub('', pattern, '') guarded by ``except reerror``
    # used to live here, but no replacement pattern could be found that
    # fails it, so the check is disabled.
    return None
def ud_match(self, word, wordlist):
    """
    Look ``word`` up in a list of user-defined rules.

    wordlist -- flat list of alternating [pattern, replacement, ...] items,
                as built by the def* methods.

    Rules are tried from the most recently added backwards; each pattern
    must match the whole word, ignoring case. Returns the replacement with
    every ``$n`` substituted by group n of the match, or None when no rule
    matches or the matching rule's replacement is None.
    """
    # Walk the even (pattern) positions from the end of the list backwards.
    for i in range(len(wordlist) - 2, -2, -2):
        mo = search(r"^%s$" % wordlist[i], word, IGNORECASE)
        if mo:
            replacement = wordlist[i + 1]
            if replacement is None:
                return None
            # Rewrite $n as \n for Match.expand. The template is a literal
            # backslash ("\\") followed by the captured digits ("\1");
            # a plain r"\\1" would turn every $n into \1.
            template = resub(r"\$(\d+)", r"\\\1", replacement)
            return mo.expand(template)
    return None
def classical(self, **kwargs):
    """
    Turn classical mode on and off for various categories.

    Turn on all classical modes:
        classical()
        classical(all=True)

    Turn on or off specific classical modes, e.g.:
        classical(herd=True)
        classical(names=False)

    By default all classical modes are off except names.

    An unknown key in kwargs raises UnknownClassicalModeError. All keys are
    validated before anything is changed, so a failed call leaves the
    current settings untouched.
    """
    if not kwargs:
        self.classical_dict = all_classical.copy()
        return
    # Reject the whole call up front rather than after a partial update.
    for mode in kwargs:
        if mode not in def_classical:
            raise UnknownClassicalModeError
    # "all" resets every mode first; the explicit keys are applied on top.
    if "all" in kwargs:
        base = all_classical if kwargs["all"] else no_classical
        self.classical_dict = base.copy()
    self.classical_dict.update(kwargs)
def num(self, count=None, show=None): # (;$count,$show)
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Set the number to be used in other method calls.
Returns count.
Set show to False to return '' instead.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
if count is not None:
try:
self.persistent_count = int(count)
except ValueError:
raise BadNumValueError
if (show is None) or show:
return str(count)
else:
self.persistent_count = None
2018-10-05 20:02:38 +00:00
return ""
2015-01-02 16:59:08 +00:00
def gender(self, gender):
    """
    Set the gender for the singular of plural pronouns.

    Can be one of:
        'neuter'                ('they' -> 'it')
        'feminine'              ('they' -> 'she')
        'masculine'             ('they' -> 'he')
        'gender-neutral'        ('they' -> 'they')
        'feminine or masculine' ('they' -> 'she or he')
        'masculine or feminine' ('they' -> 'he or she')

    Any value not in singular_pronoun_genders raises BadGenderError.
    """
    if gender not in singular_pronoun_genders:
        raise BadGenderError
    self.thegender = gender
def nummo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
num but take a matchobject
use groups 1 and 2 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.num(matchobject.group(1), matchobject.group(2))
def plmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
plural but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.plural(matchobject.group(1), matchobject.group(3))
def plnounmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
plural_noun but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.plural_noun(matchobject.group(1), matchobject.group(3))
def plverbmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
plural_verb but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.plural_verb(matchobject.group(1), matchobject.group(3))
def pladjmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
plural_adj but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.plural_adj(matchobject.group(1), matchobject.group(3))
def sinounmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
singular_noun but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.singular_noun(matchobject.group(1), matchobject.group(3))
def amo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
A but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
if matchobject.group(3) is None:
return self.a(matchobject.group(1))
return self.a(matchobject.group(1), matchobject.group(3))
def nomo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
NO but take a matchobject
use groups 1 and 3 in matchobject
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.no(matchobject.group(1), matchobject.group(3))
def ordinalmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
ordinal but take a matchobject
use group 1
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.ordinal(matchobject.group(1))
def numwordsmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
number_to_words but take a matchobject
use group 1
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.number_to_words(matchobject.group(1))
def prespartmo(self, matchobject):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
prespart but take a matchobject
use group 1
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self.present_participle(matchobject.group(1))
2018-10-05 20:02:38 +00:00
# 0. PERFORM GENERAL INFLECTIONS IN A STRING
2015-01-02 16:59:08 +00:00
def inflect(self, text):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Perform inflections in a string.
e.g. inflect('The plural of cat is plural(cat)') returns
'The plural of cat is cats'
can use plural, plural_noun, plural_verb, plural_adj, singular_noun, a, an, no, ordinal,
number_to_words and prespart
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
save_persistent_count = self.persistent_count
sections = splitre(r"(num\([^)]*\))", text)
inflection = []
for section in sections:
2018-10-05 20:02:38 +00:00
(section, count) = subn(
r"num\(\s*?(?:([^),]*)(?:,([^)]*))?)?\)", self.nummo, section
)
2015-01-02 16:59:08 +00:00
if not count:
total = -1
while total:
(section, total) = subn(
r"(?x)\bplural \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.plmo,
section,
)
2015-01-02 16:59:08 +00:00
(section, count) = subn(
r"(?x)\bplural_noun \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.plnounmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bplural_verb \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.plverbmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bplural_adj \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.pladjmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bsingular_noun \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.sinounmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\ban? \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.amo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bno \( ([^),]*) (, ([^)]*) )? \) ",
2018-10-05 20:02:38 +00:00
self.nomo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bordinal \( ([^)]*) \) ",
2018-10-05 20:02:38 +00:00
self.ordinalmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bnumber_to_words \( ([^)]*) \) ",
2018-10-05 20:02:38 +00:00
self.numwordsmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
(section, count) = subn(
r"(?x)\bpresent_participle \( ([^)]*) \) ",
2018-10-05 20:02:38 +00:00
self.prespartmo,
section,
)
2015-01-02 16:59:08 +00:00
total += count
inflection.append(section)
self.persistent_count = save_persistent_count
return "".join(inflection)
2018-10-05 20:02:38 +00:00
# ## PLURAL SUBROUTINES
2015-01-02 16:59:08 +00:00
def postprocess(self, orig, inflected):
"""
FIX PEDANTRY AND CAPITALIZATION :-)
"""
2018-10-05 20:02:38 +00:00
if "|" in inflected:
inflected = inflected.split("|")[self.classical_dict["all"]]
2015-01-02 16:59:08 +00:00
if orig == "I":
return inflected
if orig == orig.upper():
return inflected.upper()
if orig[0] == orig[0].upper():
2018-10-05 20:02:38 +00:00
return "%s%s" % (inflected[0].upper(), inflected[1:])
2015-01-02 16:59:08 +00:00
return inflected
def partition_word(self, text):
    """
    Split text into (leading whitespace, word, trailing whitespace).

    Returns three empty strings when the pattern does not match, e.g. for
    the empty string.
    """
    mo = search(r"\A(\s*)(.+?)(\s*)\Z", text)
    if mo is None:  # empty string
        return "", "", ""
    return mo.group(1), mo.group(2), mo.group(3)
# def pl(self, *args, **kwds):
# print 'pl() deprecated, use plural()'
# raise DeprecationWarning
# return self.plural(*args, **kwds)
#
# def plnoun(self, *args, **kwds):
# print 'plnoun() deprecated, use plural_noun()'
# raise DeprecationWarning
# return self.plural_noun(*args, **kwds)
#
# def plverb(self, *args, **kwds):
# print 'plverb() deprecated, use plural_verb()'
# raise DeprecationWarning
# return self.plural_verb(*args, **kwds)
#
# def pladj(self, *args, **kwds):
# print 'pladj() deprecated, use plural_adj()'
# raise DeprecationWarning
# return self.plural_adj(*args, **kwds)
#
# def sinoun(self, *args, **kwds):
# print 'sinoun() deprecated, use singular_noun()'
# raise DeprecationWarning
# return self.singular_noun(*args, **kwds)
#
# def prespart(self, *args, **kwds):
# print 'prespart() deprecated, use present_participle()'
# raise DeprecationWarning
# return self.present_participle(*args, **kwds)
#
# def numwords(self, *args, **kwds):
# print 'numwords() deprecated, use number_to_words()'
# raise DeprecationWarning
# return self.number_to_words(*args, **kwds)
2015-01-02 16:59:08 +00:00
def plural(self, text, count=None):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Return the plural of text.
If count supplied, then return text if count is one of:
1, a, an, one, each, every, this, that
otherwise return the plural.
Whitespace at the start and end is preserved.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
pre, word, post = self.partition_word(text)
if not word:
return text
plural = self.postprocess(
word,
2018-10-05 20:02:38 +00:00
self._pl_special_adjective(word, count)
or self._pl_special_verb(word, count)
or self._plnoun(word, count),
)
2015-01-02 16:59:08 +00:00
return "%s%s%s" % (pre, plural, post)
def plural_noun(self, text, count=None):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Return the plural of text, where text is a noun.
If count supplied, then return text if count is one of:
1, a, an, one, each, every, this, that
otherwise return the plural.
Whitespace at the start and end is preserved.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
pre, word, post = self.partition_word(text)
if not word:
return text
plural = self.postprocess(word, self._plnoun(word, count))
return "%s%s%s" % (pre, plural, post)
def plural_verb(self, text, count=None):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Return the plural of text, where text is a verb.
If count supplied, then return text if count is one of:
1, a, an, one, each, every, this, that
otherwise return the plural.
Whitespace at the start and end is preserved.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
pre, word, post = self.partition_word(text)
if not word:
return text
2018-10-05 20:02:38 +00:00
plural = self.postprocess(
word,
self._pl_special_verb(word, count) or self._pl_general_verb(word, count),
)
2015-01-02 16:59:08 +00:00
return "%s%s%s" % (pre, plural, post)
def plural_adj(self, text, count=None):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Return the plural of text, where text is an adjective.
If count supplied, then return text if count is one of:
1, a, an, one, each, every, this, that
otherwise return the plural.
Whitespace at the start and end is preserved.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
pre, word, post = self.partition_word(text)
if not word:
return text
plural = self.postprocess(word, self._pl_special_adjective(word, count) or word)
return "%s%s%s" % (pre, plural, post)
def compare(self, word1, word2):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
compare word1 and word2 for equality regardless of plurality
return values:
eq - the strings are equal
p:s - word1 is the plural of word2
s:p - word2 is the plural of word1
p:p - word1 and word2 are two different plural forms of the one word
False - otherwise
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return (
2018-10-05 20:02:38 +00:00
self._plequal(word1, word2, self.plural_noun)
or self._plequal(word1, word2, self.plural_verb)
or self._plequal(word1, word2, self.plural_adj)
)
2015-01-02 16:59:08 +00:00
def compare_nouns(self, word1, word2):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
compare word1 and word2 for equality regardless of plurality
word1 and word2 are to be treated as nouns
return values:
eq - the strings are equal
p:s - word1 is the plural of word2
s:p - word2 is the plural of word1
p:p - word1 and word2 are two different plural forms of the one word
False - otherwise
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self._plequal(word1, word2, self.plural_noun)
def compare_verbs(self, word1, word2):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
compare word1 and word2 for equality regardless of plurality
word1 and word2 are to be treated as verbs
return values:
eq - the strings are equal
p:s - word1 is the plural of word2
s:p - word2 is the plural of word1
p:p - word1 and word2 are two different plural forms of the one word
False - otherwise
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self._plequal(word1, word2, self.plural_verb)
def compare_adjs(self, word1, word2):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
compare word1 and word2 for equality regardless of plurality
word1 and word2 are to be treated as adjectives
return values:
eq - the strings are equal
p:s - word1 is the plural of word2
s:p - word2 is the plural of word1
p:p - word1 and word2 are two different plural forms of the one word
False - otherwise
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
return self._plequal(word1, word2, self.plural_adj)
def singular_noun(self, text, count=None, gender=None):
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
Return the singular of text, where text is a plural noun.
If count supplied, then return the singular if count is one of:
1, a, an, one, each, every, this, that or if count is None
otherwise return text unchanged.
Whitespace at the start and end is preserved.
2018-10-05 20:02:38 +00:00
"""
2015-01-02 16:59:08 +00:00
pre, word, post = self.partition_word(text)
if not word:
return text
sing = self._sinoun(word, count=count, gender=gender)
if sing is not False:
2018-10-05 20:02:38 +00:00
plural = self.postprocess(
word, self._sinoun(word, count=count, gender=gender)
)
2015-01-02 16:59:08 +00:00
return "%s%s%s" % (pre, plural, post)
return False
def _plequal(self, word1, word2, pl):
classval = self.classical_dict.copy()
self.classical_dict = all_classical.copy()
if word1 == word2:
return "eq"
if word1 == pl(word2):
return "p:s"
if pl(word1) == word2:
return "s:p"
self.classical_dict = no_classical.copy()
if word1 == pl(word2):
return "p:s"
if pl(word1) == word2:
return "s:p"
self.classical_dict = classval.copy()
if pl == self.plural or pl == self.plural_noun:
if self._pl_check_plurals_N(word1, word2):
return "p:p"
if self._pl_check_plurals_N(word2, word1):
return "p:p"
if pl == self.plural or pl == self.plural_adj:
if self._pl_check_plurals_adj(word1, word2):
return "p:p"
return False
def _pl_reg_plurals(self, pair, stems, end1, end2):
if search(r"(%s)(%s\|\1%s|%s\|\1%s)" % (stems, end1, end2, end2, end1), pair):
return True
return False
def _pl_check_plurals_N(self, word1, word2):
    """
    Tell whether word1 and word2 are two plural forms of the same noun.

    The pair "word1|word2" is looked up among the values of the irregular
    plural tables, then tested against each family of alternative endings.
    Returns True or False.
    """
    pair = "%s|%s" % (word1, word2)
    irregular_tables = (pl_sb_irregular_s, pl_sb_irregular, pl_sb_irregular_caps)
    if any(pair in table.values() for table in irregular_tables):
        return True
    # (stem pattern, first ending, second ending)
    ending_families = (
        (pl_sb_C_a_ata, "as", "ata"),
        (pl_sb_C_is_ides, "is", "ides"),
        (pl_sb_C_a_ae, "s", "e"),
        (pl_sb_C_en_ina, "ens", "ina"),
        (pl_sb_C_um_a, "ums", "a"),
        (pl_sb_C_us_i, "uses", "i"),
        (pl_sb_C_on_a, "ons", "a"),
        (pl_sb_C_o_i_stems, "os", "i"),
        (pl_sb_C_ex_ices, "exes", "ices"),
        (pl_sb_C_ix_ices, "ixes", "ices"),
        (pl_sb_C_i, "s", "i"),
        (pl_sb_C_im, "s", "im"),
        (".*eau", "s", "x"),
        (".*ieu", "s", "x"),
        (".*tri", "xes", "ces"),
        (".{2,}[yia]n", "xes", "ges"),
    )
    return any(
        self._pl_reg_plurals(pair, stems, end1, end2)
        for stems, end1, end2 in ending_families
    )
def _pl_check_plurals_adj(self, word1, word2):
2018-10-05 20:02:38 +00:00
# VERSION: tuple in endswith requires python 2.5
word1a = word1[: word1.rfind("'")] if word1.endswith(("'s", "'")) else ""
word2a = word2[: word2.rfind("'")] if word2.endswith(("'s", "'")) else ""
2015-01-02 16:59:08 +00:00
# TODO: BUG? report upstream. I don't think you should chop off the s'
# word1b = word1[:-2] if word1.endswith("s'") else ''
# word2b = word2[:-2] if word2.endswith("s'") else ''
# TODO: dresses', dresses's -> dresses, dresses when chop off letters
# then they return False because they are the same. Need to fix this.
if word1a:
2018-10-05 20:02:38 +00:00
if word2a and (
self._pl_check_plurals_N(word1a, word2a)
or self._pl_check_plurals_N(word2a, word1a)
):
2015-01-02 16:59:08 +00:00
return True
# if word2b and ( self._pl_check_plurals_N(word1a, word2b)
# or self._pl_check_plurals_N(word2b, word1a) ):
# return True
# if word1b:
# if word2a and ( self._pl_check_plurals_N(word1b, word2a)
# or self._pl_check_plurals_N(word2a, word1b) ):
# return True
# if word2b and ( self._pl_check_plurals_N(word1b, word2b)
# or self._pl_check_plurals_N(word2b, word1b) ):
# return True
return False
def get_count(self, count=None):
    """
    Reduce a count to 1 (singular), 2 (plural) or "" (no count known).

    When count is None the persistent count set by num() is used, if any.
    A count is singular when it is one of the words in pl_count_one, or --
    with classical mode "zero" on -- one of the words in pl_count_zero.
    Both word lists are matched case-insensitively, so "One" or "An" at the
    start of a sentence counts as singular just like "one" or "an".
    """
    if count is None and self.persistent_count is not None:
        count = self.persistent_count
    if count is None:
        return ""
    # Both tables hold lower-case words only; normalise once for both tests.
    token = str(count).lower()
    if token in pl_count_one:
        return 1
    if self.classical_dict["zero"] and token in pl_count_zero:
        return 1
    return 2
# @profile
def _plnoun(self, word, count=None):
    """
    Return the plural form of the noun word.

    count is normalised with self.get_count; when it normalises to 1 the
    word is returned unchanged.  Otherwise the rule groups below are
    tried in order and the first rule that applies decides the result.
    If nothing more specific applies, "s" is appended.

    An empty or whitespace-only word is returned unchanged.

    Compounds that contain an entry of pl_prep_list_da are rebuilt from
    the lower-cased word, so their capitalisation is not preserved.
    """
    count = self.get_count(count)

    # DEFAULT TO PLURAL

    if count == 1:
        return word

    # HANDLE USER-DEFINED NOUNS

    value = self.ud_match(word, self.pl_sb_user_defined)
    if value is not None:
        return value

    # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS

    if word == "":
        return word

    lowerword = word.lower()

    if lowerword in pl_sb_uninflected_complete:
        return word

    if word in pl_sb_uninflected_caps:
        return word

    for k, v in pl_sb_uninflected_bysize.items():
        if lowerword[-k:] in v:
            return word

    if self.classical_dict["herd"] and lowerword in pl_sb_uninflected_herd:
        return word

    # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.)

    mo = search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, IGNORECASE)
    if mo and mo.group(2) != "":
        return "%s%s" % (self._plnoun(mo.group(1), 2), mo.group(2))

    if " a " in lowerword or "-a-" in lowerword:
        mo = search(r"^(?:%s)$" % pl_sb_prep_dual_compound, word, IGNORECASE)
        if mo and mo.group(2) != "" and mo.group(3) != "":
            return "%s%s%s" % (
                self._plnoun(mo.group(1), 2),
                mo.group(2),
                self._plnoun(mo.group(3)),
            )

    # Space-separated compounds are tried before hyphenated ones.  The part
    # just before the first pl_prep_list_da entry takes the plural; the
    # phrase is re-joined with the separator it was split on, so hyphenated
    # compounds of more than three parts keep all their hyphens.
    for sep in (" ", "-"):
        parts = lowerword.split(sep)
        if len(parts) >= 3:
            for numword in range(1, len(parts) - 1):
                if parts[numword] in pl_prep_list_da:
                    return sep.join(
                        parts[: numword - 1]
                        + [self._plnoun(parts[numword - 1], 2)]
                        + parts[numword:]
                    )

    # HANDLE PRONOUNS

    for k, v in pl_pron_acc_keys_bysize.items():
        if lowerword[-k:] in v:  # ends with accusative pronoun
            for pk, pv in pl_prep_bysize.items():
                if lowerword[:pk] in pv:  # starts with a prep
                    if lowerword.split() == [
                        lowerword[:pk],
                        lowerword[-k:],
                    ]:  # only whitespace in between
                        return lowerword[:-k] + pl_pron_acc[lowerword[-k:]]

    try:
        return pl_pron_nom[lowerword]
    except KeyError:
        pass

    try:
        return pl_pron_acc[lowerword]
    except KeyError:
        pass

    # HANDLE ISOLATED IRREGULAR PLURALS

    wordsplit = word.split()
    if not wordsplit:
        # whitespace-only word: there is no last word to inspect and
        # nothing to inflect
        return word
    wordlast = wordsplit[-1]
    lowerwordlast = wordlast.lower()

    if wordlast in pl_sb_irregular_caps:
        llen = len(wordlast)
        return "%s%s" % (word[:-llen], pl_sb_irregular_caps[wordlast])

    if lowerwordlast in pl_sb_irregular:
        llen = len(lowerwordlast)
        return "%s%s" % (word[:-llen], pl_sb_irregular[lowerwordlast])

    lasttwo = " ".join(wordsplit[-2:])
    if lasttwo.lower() in pl_sb_irregular_compound:
        llen = len(lasttwo)  # TODO: what if 2 spaces between these words?
        return "%s%s" % (
            word[:-llen],
            pl_sb_irregular_compound[lasttwo.lower()],
        )

    if lowerword[-3:] == "quy":
        return word[:-1] + "ies"

    if lowerword[-6:] == "person":
        if self.classical_dict["persons"]:
            return word + "s"
        else:
            return word[:-4] + "ople"

    # HANDLE FAMILIES OF IRREGULAR PLURALS

    if lowerword[-3:] == "man":
        for k, v in pl_sb_U_man_mans_bysize.items():
            if lowerword[-k:] in v:
                return word + "s"
        for k, v in pl_sb_U_man_mans_caps_bysize.items():
            if word[-k:] in v:
                return word + "s"
        return word[:-3] + "men"
    if lowerword[-5:] == "mouse":
        return word[:-5] + "mice"
    if lowerword[-5:] == "louse":
        return word[:-5] + "lice"
    if lowerword[-5:] == "goose":
        return word[:-5] + "geese"
    if lowerword[-5:] == "tooth":
        return word[:-5] + "teeth"
    if lowerword[-4:] == "foot":
        return word[:-4] + "feet"

    if lowerword == "die":
        return "dice"

    # HANDLE UNASSIMILATED IMPORTS

    if lowerword[-4:] == "ceps":
        return word
    if lowerword[-4:] == "zoon":
        return word[:-2] + "a"
    if lowerword[-3:] in ("cis", "sis", "xis"):
        return word[:-2] + "es"

    # (last letter, suffix table, slice end applied to word, text appended)
    for lastlet, d, numend, post in (
        ("h", pl_sb_U_ch_chs_bysize, None, "s"),
        ("x", pl_sb_U_ex_ices_bysize, -2, "ices"),
        ("x", pl_sb_U_ix_ices_bysize, -2, "ices"),
        ("m", pl_sb_U_um_a_bysize, -2, "a"),
        ("s", pl_sb_U_us_i_bysize, -2, "i"),
        ("n", pl_sb_U_on_a_bysize, -2, "a"),
        ("a", pl_sb_U_a_ae_bysize, None, "e"),
    ):
        if lowerword[-1] == lastlet:  # this test to add speed
            for k, v in d.items():
                if lowerword[-k:] in v:
                    return word[:numend] + post

    # HANDLE INCOMPLETELY ASSIMILATED IMPORTS

    if self.classical_dict["ancient"]:
        if lowerword[-4:] == "trix":
            return word[:-1] + "ces"
        if lowerword[-3:] in ("eau", "ieu"):
            return word + "x"
        if lowerword[-3:] in ("ynx", "inx", "anx") and len(word) > 4:
            return word[:-1] + "ges"

        for lastlet, d, numend, post in (
            ("n", pl_sb_C_en_ina_bysize, -2, "ina"),
            ("x", pl_sb_C_ex_ices_bysize, -2, "ices"),
            ("x", pl_sb_C_ix_ices_bysize, -2, "ices"),
            ("m", pl_sb_C_um_a_bysize, -2, "a"),
            ("s", pl_sb_C_us_i_bysize, -2, "i"),
            ("s", pl_sb_C_us_us_bysize, None, ""),
            ("a", pl_sb_C_a_ae_bysize, None, "e"),
            ("a", pl_sb_C_a_ata_bysize, None, "ta"),
            ("s", pl_sb_C_is_ides_bysize, -1, "des"),
            ("o", pl_sb_C_o_i_bysize, -1, "i"),
            ("n", pl_sb_C_on_a_bysize, -2, "a"),
        ):
            if lowerword[-1] == lastlet:  # this test to add speed
                for k, v in d.items():
                    if lowerword[-k:] in v:
                        return word[:numend] + post

        for d, numend, post in (
            (pl_sb_C_i_bysize, None, "i"),
            (pl_sb_C_im_bysize, None, "im"),
        ):
            for k, v in d.items():
                if lowerword[-k:] in v:
                    return word[:numend] + post

    # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SIBILANTS

    if lowerword in pl_sb_singular_s_complete:
        return word + "es"

    for k, v in pl_sb_singular_s_bysize.items():
        if lowerword[-k:] in v:
            return word + "es"

    if lowerword[-2:] == "es" and word[0] == word[0].upper():
        return word + "es"

    # NOTE: the Perl original also had a classical "names" rule here that
    # appended "es" to capitalised words ending in "s".  It was left out of
    # this port because no input could be found that reaches it.
    # TODO: check this again

    if lowerword[-1] == "z":
        for k, v in pl_sb_z_zes_bysize.items():
            if lowerword[-k:] in v:
                return word + "es"

        if lowerword[-2:-1] != "z":
            return word + "zes"

    if lowerword[-2:] == "ze":
        for k, v in pl_sb_ze_zes_bysize.items():
            if lowerword[-k:] in v:
                return word + "s"

    if lowerword[-2:] in ("ch", "sh", "zz", "ss") or lowerword[-1] == "x":
        return word + "es"

    # ## (r"(.*)(us)$", "%s%ses"), TODO: why is this commented?

    # HANDLE ...f -> ...ves

    if lowerword[-3:] in ("elf", "alf", "olf"):
        return word[:-1] + "ves"
    if lowerword[-3:] == "eaf" and lowerword[-4:-3] != "d":
        return word[:-1] + "ves"
    if lowerword[-4:] in ("nife", "life", "wife"):
        return word[:-2] + "ves"
    if lowerword[-3:] == "arf":
        return word[:-1] + "ves"

    # HANDLE ...y

    if lowerword[-1] == "y":
        # for a one-letter word the slice is empty, which also passes the
        # vowel test
        if lowerword[-2:-1] in "aeiou" or len(word) == 1:
            return word + "s"

        if self.classical_dict["names"]:
            if word[0] == word[0].upper():
                return word + "s"

        return word[:-1] + "ies"

    # HANDLE ...o

    if lowerword in pl_sb_U_o_os_complete:
        return word + "s"

    for k, v in pl_sb_U_o_os_bysize.items():
        if lowerword[-k:] in v:
            return word + "s"

    if lowerword[-2:] in ("ao", "eo", "io", "oo", "uo"):
        return word + "s"

    if lowerword[-1] == "o":
        return word + "es"

    # OTHERWISE JUST ADD ...s

    return "%ss" % word
def _pl_special_verb(self, word, count=None):
    """
    Return the plural of a verb covered by the special-case rules.

    Return False when no rule here applies (including when classical
    "zero" mode is on and count is a zero word, when word is empty, and
    when word contains whitespace but does not start with a known
    irregular verb).  Return word unchanged when the count normalises
    to 1.

    The "...ies" and "...oes" rules build their result from the
    lower-cased word; the other rules slice the word as given.
    """
    if self.classical_dict["zero"] and str(count).lower() in pl_count_zero:
        return False

    if self.get_count(count) == 1:
        return word

    # HANDLE USER-DEFINED VERBS

    user_value = self.ud_match(word, self.pl_v_user_defined)
    if user_value is not None:
        return user_value

    # HANDLE IRREGULAR PRESENT TENSE (SIMPLE AND COMPOUND)

    lowered = word.lower()
    tokens = lowered.split()
    if not tokens:
        return False  # word is ''
    first = tokens[0]
    remainder = word[len(first) :]

    if first in plverb_irregular_pres:
        return "%s%s" % (plverb_irregular_pres[first], remainder)

    # HANDLE IRREGULAR FUTURE, PRETERITE AND PERFECT TENSES

    if first in plverb_irregular_non_pres:
        return word

    # HANDLE PRESENT NEGATIONS (SIMPLE AND COMPOUND)

    if first.endswith("n't"):
        positive = first[:-3]
        if positive in plverb_irregular_pres:
            return "%sn't%s" % (plverb_irregular_pres[positive], remainder)
        return word

    # HANDLE SPECIAL CASES

    if search(r"^(%s)$" % plverb_special_s, word) or search(r"\s", word):
        return False

    if lowered == "quizzes":
        return "quiz"

    # HANDLE STANDARD 3RD PERSON (CHOP THE ...(e)s OFF SINGLE WORDS)

    if lowered.endswith(("ches", "shes", "zzes", "sses", "xes")):
        return word[:-2]

    if lowered.endswith("ies") and len(word) > 3:
        return lowered[:-3] + "y"

    keeps_final_e = (
        lowered in pl_v_oes_oe
        or lowered[-4:] in pl_v_oes_oe_endings_size4
        or lowered[-5:] in pl_v_oes_oe_endings_size5
    )
    if keeps_final_e:
        return word[:-1]

    if lowered.endswith("oes") and len(word) > 3:
        return lowered[:-2]

    trailing_s = search(r"^(.*[^s])s$", word, IGNORECASE)
    if trailing_s:
        return trailing_s.group(1)

    # OTHERWISE, A REGULAR VERB (HANDLE ELSEWHERE)

    return False
def _pl_general_verb(self, word, count=None):
    """
    Return the plural of a verb not handled by the special-case rules.

    When the count normalises to 1 the word is returned unchanged.  A
    word that starts with one of the ambiguous present-tense verbs
    (plverb_ambiguous_pres_keys) has that verb replaced by its entry in
    plverb_ambiguous_pres, with any following text kept.  Every other
    word is returned unchanged.
    """
    if self.get_count(count) == 1:
        return word

    # HANDLE AMBIGUOUS PRESENT TENSES (SIMPLE AND COMPOUND)

    present = search(
        r"^(%s)((\s.*)?)$" % plverb_ambiguous_pres_keys, word, IGNORECASE
    )
    if present:
        verb, rest = present.group(1), present.group(2)
        return "%s%s" % (plverb_ambiguous_pres[verb.lower()], rest)

    # HANDLE AMBIGUOUS PRETERITE AND PERFECT TENSES
    # OTHERWISE, 1st OR 2ND PERSON IS UNINFLECTED
    # (both cases leave the word exactly as it was given)

    return word
def _pl_special_adjective(self, word, count=None):
    """
    Return the plural of an adjective covered by the special-case rules.

    Return word unchanged when the count normalises to 1, and False when
    no rule here applies.

    A word ending in "'" or "'s" is treated as a possessive: the part
    before the apostrophe is pluralised with self.plural_noun and the
    possessive ending is put back ("'" after a plural ending in "s",
    "'s" otherwise).
    """
    count = self.get_count(count)

    if count == 1:
        return word

    # HANDLE USER-DEFINED ADJECTIVES

    value = self.ud_match(word, self.pl_adj_user_defined)
    if value is not None:
        return value

    # HANDLE KNOWN CASES

    mo = search(r"^(%s)$" % pl_adj_special_keys, word, IGNORECASE)
    if mo:
        return "%s" % (pl_adj_special[mo.group(1).lower()])

    # HANDLE POSSESSIVES

    mo = search(r"^(%s)$" % pl_adj_poss_keys, word, IGNORECASE)
    if mo:
        return "%s" % (pl_adj_poss[mo.group(1).lower()])

    mo = search(r"^(.*)'s?$", word)
    if mo:
        pl = self.plural_noun(mo.group(1))
        # endswith() instead of pl[-1]: a bare "'s" leaves nothing in front
        # of the apostrophe, and indexing an empty plural raises IndexError
        trailing_s = "" if pl.endswith("s") else "s"
        return "%s'%s" % (pl, trailing_s)

    # OTHERWISE, NO IDEA

    return False
# @profile
def _sinoun(self, word, count=None, gender=None):
    """
    Return the singular form of the plural noun word.

    count is normalised with self.get_count; when it normalises to 2 the
    word is returned unchanged.  gender selects the singular pronoun
    forms and defaults to self.thegender.

    Return False when no singular could be found.  This includes
    compounds whose head noun has no singular, and whitespace-only words.
    An empty word is returned unchanged.

    Raises BadGenderError when gender is given but is not one of
    singular_pronoun_genders.

    Compounds that contain an entry of pl_prep_list_da are rebuilt from
    the lower-cased word, so their capitalisation is not preserved.
    """
    count = self.get_count(count)

    # DEFAULT TO PLURAL

    if count == 2:
        return word

    # SET THE GENDER

    try:
        if gender is None:
            gender = self.thegender
        elif gender not in singular_pronoun_genders:
            raise BadGenderError
    except (TypeError, IndexError):
        raise BadGenderError

    # HANDLE USER-DEFINED NOUNS

    value = self.ud_match(word, self.si_sb_user_defined)
    if value is not None:
        return value

    # HANDLE EMPTY WORD, SINGULAR COUNT AND UNINFLECTED PLURALS

    if word == "":
        return word

    lowerword = word.lower()

    if word in si_sb_ois_oi_case:
        return word[:-1]

    if lowerword in pl_sb_uninflected_complete:
        return word

    if word in pl_sb_uninflected_caps:
        return word

    for k, v in pl_sb_uninflected_bysize.items():
        if lowerword[-k:] in v:
            return word

    if self.classical_dict["herd"] and lowerword in pl_sb_uninflected_herd:
        return word

    # HANDLE COMPOUNDS ("Governor General", "mother-in-law", "aide-de-camp", ETC.)

    mo = search(r"^(?:%s)$" % pl_sb_postfix_adj_stems, word, IGNORECASE)
    if mo and mo.group(2) != "":
        head = self._sinoun(mo.group(1), 1, gender=gender)
        if head is False:
            # no singular for the head noun: report failure rather than
            # formatting the False into the result
            return False
        return "%s%s" % (head, mo.group(2))

    # TODO: how to reverse the pl_sb_prep_dual_compound rule that _plnoun
    # applies?  It is not handled here.

    # Space-separated compounds are tried before hyphenated ones.  The part
    # just before the first pl_prep_list_da entry takes the singular; the
    # phrase is re-joined with the separator it was split on, so hyphenated
    # compounds of more than three parts keep all their hyphens.
    for sep in (" ", "-"):
        parts = lowerword.split(sep)
        if len(parts) >= 3:
            for numword in range(1, len(parts) - 1):
                if parts[numword] in pl_prep_list_da:
                    head = self._sinoun(parts[numword - 1], 1, gender=gender)
                    if head is False:
                        # joining False into the phrase would raise
                        # TypeError; there is simply no singular
                        return False
                    return sep.join(
                        parts[: numword - 1] + [head] + parts[numword:]
                    )

    # HANDLE PRONOUNS

    for k, v in si_pron_acc_keys_bysize.items():
        if lowerword[-k:] in v:  # ends with accusative pronoun
            for pk, pv in pl_prep_bysize.items():
                if lowerword[:pk] in pv:  # starts with a prep
                    if lowerword.split() == [
                        lowerword[:pk],
                        lowerword[-k:],
                    ]:  # only whitespace in between
                        return lowerword[:-k] + get_si_pron(
                            "acc", lowerword[-k:], gender
                        )

    try:
        return get_si_pron("nom", lowerword, gender)
    except KeyError:
        pass

    try:
        return get_si_pron("acc", lowerword, gender)
    except KeyError:
        pass

    # HANDLE ISOLATED IRREGULAR PLURALS

    wordsplit = word.split()
    if not wordsplit:
        # whitespace-only word: there is no last word to inspect and no
        # singular to find
        return False
    wordlast = wordsplit[-1]
    lowerwordlast = wordlast.lower()

    if wordlast in si_sb_irregular_caps:
        llen = len(wordlast)
        return "%s%s" % (word[:-llen], si_sb_irregular_caps[wordlast])

    if lowerwordlast in si_sb_irregular:
        llen = len(lowerwordlast)
        return "%s%s" % (word[:-llen], si_sb_irregular[lowerwordlast])

    lasttwo = " ".join(wordsplit[-2:])
    if lasttwo.lower() in si_sb_irregular_compound:
        llen = len(lasttwo)  # TODO: what if 2 spaces between these words?
        return "%s%s" % (
            word[:-llen],
            si_sb_irregular_compound[lasttwo.lower()],
        )

    if lowerword[-5:] == "quies":
        return word[:-3] + "y"

    if lowerword[-7:] == "persons":
        return word[:-1]
    if lowerword[-6:] == "people":
        return word[:-4] + "rson"

    # HANDLE FAMILIES OF IRREGULAR PLURALS

    if lowerword[-4:] == "mans":
        for k, v in si_sb_U_man_mans_bysize.items():
            if lowerword[-k:] in v:
                return word[:-1]
        for k, v in si_sb_U_man_mans_caps_bysize.items():
            if word[-k:] in v:
                return word[:-1]
    if lowerword[-3:] == "men":
        return word[:-3] + "man"
    if lowerword[-4:] == "mice":
        return word[:-4] + "mouse"
    if lowerword[-4:] == "lice":
        return word[:-4] + "louse"
    if lowerword[-5:] == "geese":
        return word[:-5] + "goose"
    if lowerword[-5:] == "teeth":
        return word[:-5] + "tooth"
    if lowerword[-4:] == "feet":
        return word[:-4] + "foot"

    if lowerword == "dice":
        return "die"

    # HANDLE UNASSIMILATED IMPORTS

    if lowerword[-4:] == "ceps":
        return word
    if lowerword[-3:] == "zoa":
        return word[:-1] + "on"

    # (last letter, suffix table, slice end applied to word, text appended)
    for lastlet, d, numend, post in (
        ("s", si_sb_U_ch_chs_bysize, -1, ""),
        ("s", si_sb_U_ex_ices_bysize, -4, "ex"),
        ("s", si_sb_U_ix_ices_bysize, -4, "ix"),
        ("a", si_sb_U_um_a_bysize, -1, "um"),
        ("i", si_sb_U_us_i_bysize, -1, "us"),
        ("a", si_sb_U_on_a_bysize, -1, "on"),
        ("e", si_sb_U_a_ae_bysize, -1, ""),
    ):
        if lowerword[-1] == lastlet:  # this test to add speed
            for k, v in d.items():
                if lowerword[-k:] in v:
                    return word[:numend] + post

    # HANDLE INCOMPLETELY ASSIMILATED IMPORTS

    if self.classical_dict["ancient"]:

        if lowerword[-6:] == "trices":
            return word[:-3] + "x"
        if lowerword[-4:] in ("eaux", "ieux"):
            return word[:-1]
        if lowerword[-5:] in ("ynges", "inges", "anges") and len(word) > 6:
            return word[:-3] + "x"

        for lastlet, d, numend, post in (
            ("a", si_sb_C_en_ina_bysize, -3, "en"),
            ("s", si_sb_C_ex_ices_bysize, -4, "ex"),
            ("s", si_sb_C_ix_ices_bysize, -4, "ix"),
            ("a", si_sb_C_um_a_bysize, -1, "um"),
            ("i", si_sb_C_us_i_bysize, -1, "us"),
            ("s", pl_sb_C_us_us_bysize, None, ""),
            ("e", si_sb_C_a_ae_bysize, -1, ""),
            ("a", si_sb_C_a_ata_bysize, -2, ""),
            ("s", si_sb_C_is_ides_bysize, -3, "s"),
            ("i", si_sb_C_o_i_bysize, -1, "o"),
            ("a", si_sb_C_on_a_bysize, -1, "on"),
            ("m", si_sb_C_im_bysize, -2, ""),
            ("i", si_sb_C_i_bysize, -1, ""),
        ):
            if lowerword[-1] == lastlet:  # this test to add speed
                for k, v in d.items():
                    if lowerword[-k:] in v:
                        return word[:numend] + post

    # HANDLE PLURALS ENDING IN uses -> use

    if (
        lowerword[-6:] == "houses"
        or word in si_sb_uses_use_case
        or lowerword in si_sb_uses_use
    ):
        return word[:-1]

    # HANDLE PLURALS ENDING IN ies -> ie

    if word in si_sb_ies_ie_case or lowerword in si_sb_ies_ie:
        return word[:-1]

    # HANDLE PLURALS ENDING IN oes -> oe

    if (
        lowerword[-5:] == "shoes"
        or word in si_sb_oes_oe_case
        or lowerword in si_sb_oes_oe
    ):
        return word[:-1]

    # HANDLE SINGULAR NOUNS ENDING IN ...s OR OTHER SIBILANTS

    if word in si_sb_sses_sse_case or lowerword in si_sb_sses_sse:
        return word[:-1]

    if lowerword in si_sb_singular_s_complete:
        return word[:-2]

    for k, v in si_sb_singular_s_bysize.items():
        if lowerword[-k:] in v:
            return word[:-2]

    if lowerword[-4:] == "eses" and word[0] == word[0].upper():
        return word[:-2]

    # NOTE: the Perl original also had a classical "names" rule here for
    # capitalised words ending in "ses".  It was left out of this port
    # because no input could be found that reaches it.
    # TODO: check this again

    if lowerword in si_sb_z_zes:
        return word[:-2]

    if lowerword in si_sb_zzes_zz:
        return word[:-2]

    if lowerword[-4:] == "zzes":
        return word[:-3]

    if word in si_sb_ches_che_case or lowerword in si_sb_ches_che:
        return word[:-1]

    if lowerword[-4:] in ("ches", "shes"):
        return word[:-2]

    if lowerword in si_sb_xes_xe:
        return word[:-1]

    if lowerword[-3:] == "xes":
        return word[:-2]

    # (r"(.*)(us)es$", "%s%s"), TODO: why is this commented?

    # HANDLE ...f -> ...ves

    if word in si_sb_ves_ve_case or lowerword in si_sb_ves_ve:
        return word[:-1]

    if lowerword[-3:] == "ves":
        if lowerword[-5:-3] in ("el", "al", "ol"):
            return word[:-3] + "f"
        # the "d" exclusion is tested on the lower-cased word, matching the
        # corresponding "...eaf" rule in _plnoun
        if lowerword[-5:-3] == "ea" and lowerword[-6:-5] != "d":
            return word[:-3] + "f"
        if lowerword[-5:-3] in ("ni", "li", "wi"):
            return word[:-3] + "fe"
        if lowerword[-5:-3] == "ar":
            return word[:-3] + "f"

    # HANDLE ...y

    if lowerword[-2:] == "ys":
        if len(lowerword) > 2 and lowerword[-3] in "aeiou":
            return word[:-1]

        if self.classical_dict["names"]:
            if word[0] == word[0].upper():
                return word[:-1]

    if lowerword[-3:] == "ies":
        return word[:-3] + "y"

    # HANDLE ...o

    if lowerword[-2:] == "os":

        if lowerword in si_sb_U_o_os_complete:
            return word[:-1]

        for k, v in si_sb_U_o_os_bysize.items():
            if lowerword[-k:] in v:
                return word[:-1]

        if lowerword[-3:] in ("aos", "eos", "ios", "oos", "uos"):
            return word[:-1]

    if lowerword[-3:] == "oes":
        return word[:-2]

    # UNASSIMILATED IMPORTS FINAL RULE

    if word in si_sb_es_is:
        return word[:-2] + "is"

    # OTHERWISE JUST REMOVE ...s

    if lowerword[-1] == "s":
        return word[:-1]

    # COULD NOT FIND SINGULAR

    return False
# ADJECTIVES
def a(self, text, count=1):
    """
    Return text with the appropriate indefinite article in front.

    An "a" or "an" already at the start of text is dropped first; the
    remaining word is passed to self._indef_article together with count,
    which supplies either the article or the count.

    Whitespace at the start and end is preserved.
    An empty text (or one the pattern cannot match) gives "".
    """
    found = search(r"\A(\s*)(?:an?\s+)?(.+?)(\s*)\Z", text, IGNORECASE)
    if found is None:
        return ""

    leading, word, trailing = found.groups()
    if not word:
        return text

    return "%s%s%s" % (leading, self._indef_article(word, count), trailing)


an = a
def _indef_article(self, word, count):
mycount = self.get_count(count)
if mycount != 1:
return "%s %s" % (count, word)
2018-10-05 20:02:38 +00:00
# HANDLE USER-DEFINED VARIANTS
2015-01-02 16:59:08 +00:00
value = self.ud_match(word, self.A_a_user_defined)
if value is not None:
return "%s %s" % (value, word)
2018-10-05 20:02:38 +00:00
# HANDLE ORDINAL FORMS
2015-01-02 16:59:08 +00:00
2018-10-05 20:02:38 +00:00
for a in ((r"^(%s)" % A_ordinal_a, "a"), (r"^(%s)" % A_ordinal_an, "an")):
2015-01-02 16:59:08 +00:00
mo = search(a[0], word, IGNORECASE)
if mo:
return "%s %s" % (a[1], word)
2018-10-05 20:02:38 +00:00
# HANDLE SPECIAL CASES
2015-01-02 16:59:08 +00:00
for a in (
2018-10-05 20:02:38 +00:00
(r"^(%s)" % A_explicit_an, "an"),
(r"^[aefhilmnorsx]$", "an"),
(r"^[bcdgjkpqtuvwyz]$", "a"),
2015-01-02 16:59:08 +00:00
):
mo = search(a[0], word, IGNORECASE)
if mo:
return "%s %s" % (a[1], word)
2018-10-05 20:02:38 +00:00
# HANDLE ABBREVIATIONS
2015-01-02 16:59:08 +00:00
for a in (
2018-10-05 20:02:38 +00:00
(r"(%s)" % A_abbrev, "an", VERBOSE),
(r"^[aefhilmnorsx][.-]", "an", IGNORECASE),
(r"^[a-z][.-]", "a", IGNORECASE),
2015-01-02 16:59:08 +00:00
):
mo = search(a[0], word, a[2])
if mo:
return "%s %s" % (a[1], word)
2018-10-05 20:02:38 +00:00
# HANDLE CONSONANTS
2015-01-02 16:59:08 +00:00
mo = search(r"^[^aeiouy]", word, IGNORECASE)
if mo:
return "a %s" % word
2018-10-05 20:02:38 +00:00
# HANDLE SPECIAL VOWEL-FORMS
2015-01-02 16:59:08 +00:00
for a in (
2018-10-05 20:02:38 +00:00
(r"^e[uw]", "a"),
(r"^onc?e\b", "a"),
(r"^onetime\b", "a"),
(r"^uni([^nmd]|mo)", "a"),
(r"^u[bcfghjkqrst][aeiou]", "a"),
(r"^ukr", "a"),
(r"^(%s)" % A_explicit_a, "a"),
2015-01-02 16:59:08 +00:00
):
mo = search(a[0], word, IGNORECASE)
if mo:
return "%s %s" % (a[1], word)
2018-10-05 20:02:38 +00:00
# HANDLE SPECIAL CAPITALS
2015-01-02 16:59:08 +00:00
mo = search(r"^U[NK][AIEO]?", word)
if mo:
return "a %s" % word
2018-10-05 20:02:38 +00:00
# HANDLE VOWELS
2015-01-02 16:59:08 +00:00
mo = search(r"^[aeiou]", word, IGNORECASE)
if mo:
return "an %s" % word
2018-10-05 20:02:38 +00:00
# HANDLE y... (BEFORE CERTAIN CONSONANTS IMPLIES (UNNATURALIZED) "i.." SOUND)
2015-01-02 16:59:08 +00:00
mo = search(r"^(%s)" % A_y_cons, word, IGNORECASE)
if mo:
return "an %s" % word
2018-10-05 20:02:38 +00:00
# OTHERWISE, GUESS "a"
2015-01-02 16:59:08 +00:00
return "a %s" % word
# 2. TRANSLATE ZERO-QUANTIFIED $word TO "no plural($word)"
def no(self, text, count=None):
    """
    If count is 0, no, zero or nil, return 'no' followed by the plural
    of text.

    If count is one of:
        1, a, an, one, each, every, this, that
    return count followed by text.

    Otherwise return count follow by the plural of text.

    In the return value count is always followed by a space.

    Whitespace at the start and end is preserved.

    When count is None, self.persistent_count is used; if that is None
    as well, count defaults to 0.
    """
    if count is None and self.persistent_count is not None:
        count = self.persistent_count

    if count is None:
        count = 0

    mo = search(r"\A(\s*)(.+?)(\s*)\Z", text)
    if mo:
        pre, word, post = mo.groups()
    else:
        # The pattern fails on empty text and on text whose core contains a
        # line break.  Pass the text through whole instead of failing on
        # mo.group() with an AttributeError.
        pre, word, post = "", text, ""

    if str(count).lower() in pl_count_zero:
        return "%sno %s%s" % (pre, self.plural(word, 0), post)
    else:
        return "%s%s %s%s" % (pre, count, self.plural(word, count), post)
# PARTICIPLES
def present_participle(self, word):
    """
    Return the present participle for word.

    word is the 3rd person singular verb.

    The verb is first converted with self.plural_verb(word, 2).  The
    rewrite rules below are then tried in order on that form; the first
    rule that matches is applied and "ing" is appended.  If no rule
    matches, "ing" is appended to the converted verb as it is.
    """
    plv = self.plural_verb(word, 2)

    for pat, repl in (
        (r"ie$", r"y"),
        (r"ue$", r"u"),  # TODO: isn't ue$ -> u encompassed in the following rule?
        (r"([auy])e$", r"\g<1>"),
        (r"ski$", r"ski"),
        (r"[^b]i$", r""),
        (r"^(are|were)$", r"be"),
        (r"^(had)$", r"hav"),
        (r"^(hoe)$", r"\g<1>"),
        (r"([^e])e$", r"\g<1>"),
        (r"er$", r"er"),
        # raw string: "\g" in a normal literal is an invalid escape sequence
        (r"([^aeiou][aeiouy]([bdgmnprst]))$", r"\g<1>\g<2>"),
    ):
        (ans, num) = subn(pat, repl, plv)
        if num:
            return "%sing" % ans

    # no rule matched, so the verb is still in its converted form
    return "%sing" % plv
# NUMERICAL INFLECTIONS
def ordinal(self, num):
    """
    Return the ordinal of num.

    num can be an integer or text

    e.g. ordinal(1) returns '1st'
         ordinal('one') returns 'first'

    Input whose string form starts with a digit gets a suffix from the
    nth table appended.  Anything else is treated as a spelled-out
    number: a trailing ordinal_suff match is replaced by its entry in the
    ordinal table, and "th" is appended when there is no such match.
    """
    if not match(r"\d", str(num)):
        # spelled-out number
        suffix_pattern = r"(%s)\Z" % ordinal_suff
        found = search(suffix_pattern, num)
        if found is None:
            return "%sth" % num
        return resub(suffix_pattern, ordinal[found.group(1)], num)

    # digits: find the integer that selects the suffix
    try:
        num % 2
    except TypeError:
        # num is text, not a number
        if "." in str(num):
            try:
                # numbers after decimal, so only need last one for ordinal
                n = int(num[-1])
            except ValueError:  # ends with '.', so need to use whole string
                n = int(num[:-1])
        else:
            n = int(num)
    else:
        n = num

    # prefer an entry for the last two digits, else use the last digit
    key = n % 100
    if key not in nth:
        key = n % 10
    return "%s%s" % (num, nth[key])
def millfn(self, ind=0):
    """
    Return the entry of ``mill`` at position ind.

    Prints "number out of range" and raises NumOutOfRangeError when ind
    is greater than the last index of ``mill``.
    """
    out_of_range = ind > len(mill) - 1
    if out_of_range:
        print3("number out of range")
        raise NumOutOfRangeError
    return mill[ind]
def unitfn(self, units, mindex=0):
    """
    Return ``unit[units]`` followed by the ``millfn`` entry for mindex.
    """
    digit_word = unit[units]
    suffix = self.millfn(mindex)
    return "%s%s" % (digit_word, suffix)
def tenfn(self, tens, units, mindex=0):
    """
    Return the words for a two-digit value plus the suffix for mindex.

    tens: the tens digit
    units: the units digit
    mindex: index of the suffix, resolved through ``millfn``

    When tens is 1 the word comes from ``teen``; otherwise it is built
    from ``ten`` and ``unit``, hyphenated when both digits are non-zero.

    Raises NumOutOfRangeError (from ``millfn``) when mindex is out of
    range.  Both branches go through ``millfn`` so an out-of-range
    mindex is reported the same way whatever the tens digit is.
    """
    if tens == 1:
        return "%s%s" % (teen[units], self.millfn(mindex))
    hyphen = "-" if tens and units else ""
    return "%s%s%s%s" % (ten[tens], hyphen, unit[units], self.millfn(mindex))
def hundfn(self, hundreds, tens, units, mindex):
    """
    Return the words for one three-digit group, terminated by ", ".

    hundreds, tens, units: the digits of the group
    mindex: index handed to ``millfn`` for the group's suffix

    Returns "" when all three digits are zero.
    """
    has_remainder = bool(tens or units)

    if not hundreds:
        if not has_remainder:
            return ""
        return "%s%s, " % (self.tenfn(tens, units), self.millfn(mindex))

    hundreds_word = unit[hundreds]  # use unit not unitfn as simpler
    # The configured "and" word only appears when something follows
    # the hundreds.
    joiner = " %s " % self.number_args["andword"] if has_remainder else ""
    remainder_words = self.tenfn(tens, units)
    suffix = self.millfn(mindex)
    return "%s hundred%s%s%s, " % (hundreds_word, joiner, remainder_words, suffix)
2015-01-02 16:59:08 +00:00
def group1sub(self, mo):
    """
    Regex replacement callback: spell out the single digit in group 1.

    0 and 1 use the configured "zero" / "one" words from
    ``self.number_args``; other digits come from ``unit``.
    """
    digit = int(mo.group(1))
    if digit == 0:
        return " %s, " % self.number_args["zero"]
    if digit == 1:
        return " %s, " % self.number_args["one"]
    # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl
    return "%s, " % unit[digit]
2015-01-02 16:59:08 +00:00
def group1bsub(self, mo):
    """
    Regex replacement callback: spell out the single digit in group 1.

    Like group1sub, but 1 is taken from ``unit`` rather than from the
    configured "one" word; 0 still uses the configured "zero" word.
    """
    digit = int(mo.group(1))
    if not digit:
        return " %s, " % self.number_args["zero"]
    # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl
    return "%s, " % unit[digit]
2015-01-02 16:59:08 +00:00
def group2sub(self, mo):
    """
    Regex replacement callback: spell out the digit pair in groups 1-2.

    A non-zero tens digit is delegated to ``tenfn``; otherwise the
    leading zero is spelled with the configured "zero" word, followed
    by the units word (or a second "zero").
    """
    tens, units = int(mo.group(1)), int(mo.group(2))
    if tens:
        return "%s, " % self.tenfn(tens, units)
    zero_word = self.number_args["zero"]
    second_word = unit[units] if units else self.number_args["zero"]
    return " %s %s, " % (zero_word, second_word)
2015-01-02 16:59:08 +00:00
def group3sub(self, mo):
    """
    Regex replacement callback: spell out the digit triple in groups 1-3.

    The hundreds digit is spelled on its own (configured "one" / "zero"
    words for 1 and 0, ``unit`` otherwise); the remaining pair is
    spelled the same way group2sub spells a pair.
    """
    hundreds = int(mo.group(1))
    tens = int(mo.group(2))
    units = int(mo.group(3))

    # TODO: bug one and zero are padded with a space but other numbers aren't. check this in perl
    if not hundreds:
        hunword = " %s" % self.number_args["zero"]
    elif hundreds == 1:
        hunword = " %s" % self.number_args["one"]
    else:
        hunword = "%s" % unit[hundreds]

    if tens:
        tenword = self.tenfn(tens, units)
    else:
        zero_word = self.number_args["zero"]
        last_word = unit[units] if units else self.number_args["zero"]
        tenword = " %s %s" % (zero_word, last_word)

    return "%s %s, " % (hunword, tenword)
def hundsub(self, mo):
    """
    Regex replacement callback for a three-digit group.

    Passes the three digits and the current ``self.mill_count`` to
    ``hundfn``, then increments ``self.mill_count`` for the next group.
    """
    hundreds = int(mo.group(1))
    tens = int(mo.group(2))
    units = int(mo.group(3))
    words = self.hundfn(hundreds, tens, units, self.mill_count)
    self.mill_count += 1
    return words
def tensub(self, mo):
    """
    Regex replacement callback for a two-digit group.

    Returns the ``tenfn`` words for the digits at the current
    ``self.mill_count``, followed by ", ".
    """
    tens_digit = int(mo.group(1))
    units_digit = int(mo.group(2))
    words = self.tenfn(tens_digit, units_digit, self.mill_count)
    return "%s, " % words
def unitsub(self, mo):
    """
    Regex replacement callback for a single leading digit.

    Returns the ``unitfn`` words for the digit at the current
    ``self.mill_count``, followed by ", ".
    """
    units_digit = int(mo.group(1))
    words = self.unitfn(units_digit, self.mill_count)
    return "%s, " % words
def enword(self, num, group):
    """
    Convert a string of digits to words.

    num: the digits, as a string
    group: 1, 2 or 3 spells the digits in runs of that size from the
    left; any other value spells num as a single number.

    When spelling a single number, ``self.mill_count`` is reset to 0
    and three-digit groups are consumed from the right-hand end.
    """
    if group == 1:
        return resub(r"(\d)", self.group1sub, num)

    if group == 2:
        num = resub(r"(\d)(\d)", self.group2sub, num)
        # A single left-over digit goes through group1bsub, which is
        # the same as group1sub except that it does not use the
        # configured word for one.
        # NOTE(review): the original comment calls this a bug and says
        # "Fixed. TODO: report upstream", yet group1bsub is still used
        # here -- confirm which behaviour is intended.
        return resub(r"(\d)", self.group1bsub, num, 1)

    if group == 3:
        num = resub(r"(\d)(\d)(\d)", self.group3sub, num)
        num = resub(r"(\d)(\d)", self.group2sub, num, 1)
        return resub(r"(\d)", self.group1sub, num, 1)

    value = int(num)
    if value == 0:
        return self.number_args["zero"]
    if value == 1:
        return self.number_args["one"]

    num = num.lstrip().lstrip("0")
    self.mill_count = 0
    # Matches the last three digits that are followed only by
    # non-digits, i.e. the right-most group not yet converted.
    trailing_triple = r"(\d)(\d)(\d)(?=\D*\Z)"
    while search(trailing_triple, num):
        num = resub(trailing_triple, self.hundsub, num, 1)
    # At most two leading digits can remain at this point.
    num = resub(r"(\d)(\d)(?=\D*\Z)", self.tensub, num, 1)
    return resub(r"(\d)(?=\D*\Z)", self.unitsub, num, 1)
def blankfn(self, mo):
    """
    Replacement callback that turns every match into the empty string.

    TODO: surely this can be done with an option to resub
          rather than this fn
    """
    replacement = ""
    return replacement
2015-01-02 16:59:08 +00:00
def commafn(self, mo):
    """
    Replacement callback that turns every match into a single ','.

    TODO: surely this can be done with an option to resub
          rather than this fn
    """
    replacement = ","
    return replacement
2015-01-02 16:59:08 +00:00
def spacefn(self, mo):
    """
    Replacement callback that turns every match into a single ' '.

    TODO: surely this can be done with an option to resub
          rather than this fn
    """
    replacement = " "
    return replacement
def number_to_words(
    self,
    num,
    wantlist=False,
    group=0,
    comma=",",
    andword="and",
    zero="zero",
    one="one",
    decimal="point",
    threshold=None,
):
    """
    Return a number in words.

    num: the number to convert; it is formatted with "%s" first, so
    numbers and numeric strings are both accepted.  A leading '+' or
    '-' becomes "plus" / "minus", and a trailing 'st', 'nd', 'rd' or
    'th' requests ordinal words.

    wantlist: if true, return a list of word chunks instead of one
    string

    group = 1, 2 or 3 to group numbers before turning into words
    comma: define comma
    andword: word for 'and'. Can be set to ''.
        e.g. "one hundred and one" vs "one hundred one"
    zero: word for '0'
    one: word for '1'
    decimal: word for decimal point
    threshold: numbers above threshold not turned into words

    Raises BadChunkingOptionError when group is not between 0 and 3.

    parameters not remembered from last call. Departure from Perl version.
    """
    # The per-digit helper callbacks read these words from self.
    self.number_args = dict(andword=andword, zero=zero, one=one)
    num = "%s" % num

    # Handle "stylistic" conversions (up to a given threshold)...
    if threshold is not None and float(num) > threshold:
        spnum = num.split(".", 1)
        # Repeatedly insert a ',' before the last ungrouped run of three
        # digits in the integer part until nothing changes.  Skipped
        # entirely when comma is empty.
        while comma:
            (spnum[0], n) = subn(r"(\d)(\d{3}(?:,|\Z))", r"\1,\2", spnum[0])
            if n == 0:
                break
        try:
            return "%s.%s" % (spnum[0], spnum[1])
        except IndexError:
            # No fractional part was present.
            return "%s" % spnum[0]

    if group < 0 or group > 3:
        raise BadChunkingOptionError
    nowhite = num.lstrip()
    if nowhite[0] == "+":
        sign = "plus"
    elif nowhite[0] == "-":
        sign = "minus"
    else:
        sign = ""

    # A trailing ordinal suffix is removed here; the words are turned
    # into an ordinal further down.
    myord = num[-2:] in ("st", "nd", "rd", "th")
    if myord:
        num = num[:-2]
    finalpoint = False
    if decimal:
        # When grouping, every '.' separates chunks; otherwise only the
        # first '.' splits integer part from fraction.
        if group != 0:
            chunks = num.split(".")
        else:
            chunks = num.split(".", 1)
        if chunks[-1] == "":  # remove blank string if nothing after decimal
            chunks = chunks[:-1]
            finalpoint = True  # add 'point' to end of output
    else:
        chunks = [num]

    # `first` is a three-state flag:
    #   1  -> the first chunk has not been converted yet
    #   "" -> the first chunk has been converted
    #   0  -> the number started with '.', so there is no leading chunk
    first = 1
    loopstart = 0

    if chunks[0] == "":
        first = 0
        if len(chunks) > 1:
            loopstart = 1

    for i in range(loopstart, len(chunks)):
        chunk = chunks[i]
        # remove all non numeric \D
        chunk = resub(r"\D", self.blankfn, chunk)
        if chunk == "":
            chunk = "0"

        # Without grouping, every chunk after the first is spelled
        # digit by digit.
        if group == 0 and (first == 0 or first == ""):
            chunk = self.enword(chunk, 1)
        else:
            chunk = self.enword(chunk, group)

        # Tidy the helper output: drop the trailing ", " and any
        # whitespace in front of a comma.
        if chunk[-2:] == ", ":
            chunk = chunk[:-2]
        chunk = resub(r"\s+,", self.commafn, chunk)

        if group == 0 and first:
            # Replace the comma before the final word with andword.
            chunk = resub(r", (\S+)\s+\Z", " %s \\1" % andword, chunk)
        chunk = resub(r"\s+", self.spacefn, chunk)
        chunk = chunk.strip()
        if first:
            first = ""
        chunks[i] = chunk

    numchunks = []
    if first != 0:
        numchunks = chunks[0].split("%s " % comma)

    if myord and numchunks:
        # Convert the last word of the leading chunk to its ordinal form.
        # TODO: can this be just one re as it is in perl?
        mo = search(r"(%s)\Z" % ordinal_suff, numchunks[-1])
        if mo:
            numchunks[-1] = resub(
                r"(%s)\Z" % ordinal_suff, ordinal[mo.group(1)], numchunks[-1]
            )
        else:
            numchunks[-1] += "th"

    # Each remaining chunk is preceded by the decimal word.
    for chunk in chunks[1:]:
        numchunks.append(decimal)
        numchunks.extend(chunk.split("%s " % comma))

    if finalpoint:
        numchunks.append(decimal)

    # wantlist: Perl list context. can explicitly specify in Python
    if wantlist:
        if sign:
            numchunks = [sign] + numchunks
        return numchunks
    elif group:
        signout = "%s " % sign if sign else ""
        return "%s%s" % (signout, ", ".join(numchunks))
    else:
        signout = "%s " % sign if sign else ""
        num = "%s%s" % (signout, numchunks.pop(0))
        # `first` is reused here: true while still before the decimal
        # word, where chunks are joined with comma; after it they are
        # joined with a plain space.
        if decimal is None:
            first = True
        else:
            first = not num.endswith(decimal)
        for nc in numchunks:
            if nc == decimal:
                num += " %s" % nc
                first = 0
            elif first:
                num += "%s %s" % (comma, nc)
            else:
                num += " %s" % nc
        return num
2018-10-05 20:02:38 +00:00
# Join words with commas and a trailing 'and' (when appropriate)...
def join(
    self,
    words,
    sep=None,
    sep_spaced=True,
    final_sep=None,
    conj="and",
    conj_spaced=True,
):
    """
    Join words into a list.

    e.g. join(['ant', 'bee', 'fly']) returns 'ant, bee, and fly'

    options:
    conj: replacement for 'and'
    sep: separator. default ',', unless ',' is in the list then ';'
    sep_spaced: boolean. Should sep be followed by a space
    final_sep: final separator. default ',', unless ',' is in the list then ';'
    conj_spaced: boolean. Should conj have spaces around it

    An empty list gives '' and a single word is returned unchanged.
    Two words are joined by conj alone, with no separator.
    """
    if not words:
        return ""
    if len(words) == 1:
        return words[0]

    if conj_spaced:
        # An empty conjunction collapses to a single space.
        conj = " %s " % conj if conj != "" else " "

    if len(words) == 2:
        return "%s%s%s" % (words[0], conj, words[1])

    if sep is None:
        sep = ";" if "," in "".join(words) else ","
    if final_sep is None:
        final_sep = sep

    closing = "%s%s" % (final_sep, conj)
    glue = sep + " " if sep_spaced else sep
    leading = glue.join(words[:-1])
    return "%s%s%s" % (leading, closing, words[-1])