Skip to content

Commit 2105a53

Browse files
committed
add tests for rbnf engine
1 parent 6896cf6 commit 2105a53

File tree

3 files changed

+87
-58
lines changed

3 files changed

+87
-58
lines changed

babel/rbnf.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -641,8 +641,11 @@ def get_rule_fractional(self, val):
641641

642642
# TODO create simpler repr and move logic to testing utils
643643
def __repr__(self):
644+
return f"<Ruleset {self.name} {'private' if self.private else ' '}{len(self.rules)} rules>"
645+
646+
def __str__(self):
644647
rules_str = '\n'.join(['\t' + str(r) for r in self.rules])
645-
return f'Ruleset {self.name} {self.private}\n{rules_str}\n'
648+
return f'Ruleset {self.name} {"private" if self.private else " "}\n{rules_str}\n'
646649

647650

648651
class Rule:
@@ -719,7 +722,6 @@ def apply(self, number, context):
719722

720723
elif t.type == PLURAL_TOKEN:
721724
form = context.speller.plural_rule(number)
722-
print(t.reference, type(t.reference))
723725
if form not in t.reference and "other" not in t.reference:
724726
raise RuleMalformed(f"Plural form {form} not in {self} and no fallback option ('other') either!")
725727

@@ -730,10 +732,18 @@ def apply(self, number, context):
730732

731733
return ''.join(res)
732734

733-
# TODO create simpler repr and move logic to testing utils
734735
def __repr__(self):
735-
tokens_str = '\n'.join(['\t\t' + str(t) for t in self.tokens])
736-
return f'Rule {self.value} - {self.divisor}\n{tokens_str}\n'
736+
return f"<Rule {self.value} - {self.divisor} {len(self.tokens)} tokens>"
737+
738+
def __str__(self):
739+
tokens_str = '\n'.join([f"[{t.reference}]" if t.optional else t.reference if t.type == TEXT_TOKEN else {
740+
INTEGRAL_TOKEN: f"←{t.reference}←",
741+
REMAINDER_TOKEN: f"→{t.reference}→",
742+
SUBSTITUTION_TOKEN: f"={t.reference}=",
743+
PREVIOUS_TOKEN: "→→→",
744+
PLURAL_TOKEN: f"$({','.join([f'{k}{{{v}}}' for k,v in t.reference.items()])})$",
745+
}[t.type] for t in self.tokens])
746+
return f'Rule {self.value} - {self.divisor}: {tokens_str}'
737747

738748

739749
@dataclass
@@ -759,3 +769,28 @@ def return_value_by_type(self, typ: int):
759769
REMAINDER_TOKEN: self.REMAINDER,
760770
SUBSTITUTION_TOKEN: self.SUBSTITUTION,
761771
}[typ]
772+
773+
774+
def parse_rbnf_rules(data, tree):
775+
"""
776+
Parse rules based on:
777+
http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
778+
"""
779+
rbnf_rules = data.setdefault('rbnf_rules', {})
780+
781+
# ElementTree.dump(tree)
782+
783+
for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
784+
group_name = ruleset_grouping.attrib['type']
785+
rbnf_rules[group_name] = [] # TODO check for overwrite
786+
for ruleset in ruleset_grouping.findall('ruleset'):
787+
ruleset_name = ruleset.attrib['type']
788+
private = ruleset.attrib.get('access') == 'private'
789+
ruleset_obj = Ruleset(ruleset_name, private)
790+
for rule in ruleset.findall('rbnfrule'):
791+
radix = rule.attrib.get('radix')
792+
if radix == "1,000": # HACK: work around misspelled radix in mt.xml
793+
radix = "1000"
794+
rule_obj = Rule(rule.attrib['value'], rule.text, radix)
795+
ruleset_obj.rules.append(rule_obj)
796+
rbnf_rules[group_name].append(ruleset_obj)

scripts/import_cldr.py

Lines changed: 4 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,10 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
472472
rbnf_filename = os.path.join(srcdir, 'rbnf', filename)
473473
if os.path.isfile(rbnf_filename):
474474
rbnf_tree = parse(rbnf_filename)
475-
parse_rbnf_rules(data, rbnf_tree)
475+
try:
476+
rbnf.parse_rbnf_rules(data, rbnf_tree)
477+
except rbnf.RBNFError as e:
478+
log(f"{data['locale_id']}: Unable to parse rule: {e}")
476479

477480
write_datafile(data_filename, data, dump_json=dump_json)
478481

@@ -1054,38 +1057,5 @@ def parse_measurement_systems(data, tree):
10541057
_import_type_text(measurement_systems, measurement_system, type=type)
10551058

10561059

1057-
def parse_rbnf_rules(data, tree):
1058-
"""
1059-
Parse rules based on:
1060-
http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
1061-
"""
1062-
rbnf_rules = data.setdefault('rbnf_rules', {})
1063-
1064-
# ElementTree.dump(tree)
1065-
1066-
for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
1067-
group_name = ruleset_grouping.attrib['type']
1068-
rbnf_rules[group_name] = [] # TODO check for overwrite
1069-
for ruleset in ruleset_grouping.findall('ruleset'):
1070-
ruleset_name = ruleset.attrib['type']
1071-
private = ruleset.attrib.get('access') == 'private'
1072-
ruleset_obj = rbnf.Ruleset(ruleset_name, private)
1073-
for rule in ruleset.findall('rbnfrule'):
1074-
radix = rule.attrib.get('radix')
1075-
if radix == "1,000": # HACK: work around misspelled radix in mt.xml
1076-
radix = "1000"
1077-
try:
1078-
rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix)
1079-
ruleset_obj.rules.append(rule_obj)
1080-
except rbnf.TokenizationError:
1081-
log('%s: Unable to parse rule "%s%s: %s "' % (
1082-
data['locale_id'],
1083-
rule.attrib['value'],
1084-
rule.text,
1085-
'' if radix is None else ('/%s' % radix),
1086-
))
1087-
rbnf_rules[group_name].append(ruleset_obj)
1088-
1089-
10901060
if __name__ == '__main__':
10911061
main()

tests/test_number_spelling.py

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1+
import os
12
import sys
23

34
import pytest
45

56
from babel import numbers, rbnf
6-
from babel.localedata import locale_identifiers
77

88
soft_hyphen = '\xad'
99

@@ -14,16 +14,19 @@ def test_basic():
1414
assert 'spellout-numbering' in x.available_rulesets
1515

1616

17-
def test_negotiation():
18-
for lid in locale_identifiers():
19-
try:
20-
loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
21-
except rbnf.RulesetNotFound:
22-
# generate warning if necessary
23-
continue
24-
# test groups
25-
for k in loc._data['rbnf_rules']:
26-
assert k in rbnf.RuleBasedNumberFormat.group_types
17+
@pytest.mark.all_rbnf_locales
18+
def test_negotiation(locale):
19+
negotiated_speller = rbnf.RuleBasedNumberFormat.negotiate(locale)
20+
negotiated_locale = negotiated_speller._locale
21+
22+
# test groups
23+
for k in negotiated_locale._data['rbnf_rules']:
24+
assert k in rbnf.RuleBasedNumberFormat.group_types
25+
26+
negotiated_speller.match_ruleset("numbering")
27+
28+
with pytest.raises(rbnf.RulesetNotFound):
29+
negotiated_speller.match_ruleset("nonexistent")
2730

2831

2932
def test_tokenization():
@@ -34,18 +37,39 @@ def test_tokenization():
3437
]
3538
assert x == res
3639

40+
rbnf.tokenize("→→→;") # should not raise
3741

38-
def test_xml_parsing():
42+
with pytest.raises(ValueError, match=r"Unable to.*"):
43+
list(rbnf.tokenize("==="))
44+
45+
with pytest.warns(SyntaxWarning, match=r"Reference parsing error.*"):
46+
list(rbnf.tokenize("←bad←;"))
47+
48+
49+
@pytest.mark.all_rbnf_locales
50+
def test_xml_parsing(locale):
3951
"""
40-
all the rules should be able to go through the parser and tokenizer
41-
made up some rules and run the tokenizer on them
52+
All the rules implicitly go through the parsing during CLDR import.
4253
43-
TODO
44-
read data from all the locales that have rbnf_rules defined
45-
all the raw rules should be in a specific structure based
46-
on the XML specification
54+
This test replicates the parsing for the given locale to
55+
add coverage to the parsing parts of the code.
4756
"""
48-
assert True
57+
from xml.etree import ElementTree
58+
59+
rules = numbers.get_rbnf_rules(locale)
60+
61+
assert rules
62+
63+
rbnf_file = f"cldr/cldr-common-47.0/common/rbnf/{locale}.xml"
64+
65+
assert os.path.isfile(rbnf_file)
66+
67+
data = {}
68+
69+
rbnf_tree = ElementTree.parse(rbnf_file)
70+
rbnf.parse_rbnf_rules(data, rbnf_tree)
71+
72+
assert 'rbnf_rules' in data
4973

5074

5175
def test_compute_divisor():

0 commit comments

Comments
 (0)