add tests for rbnf engine

blagasz · blagasz · commit 2105a531a368 · 2025-09-29T09:53:44.000+02:00
diff --git a/babel/rbnf.py b/babel/rbnf.py
@@ -641,8 +641,11 @@ def get_rule_fractional(self, val):
 
     # TODO create simpler repr and move logic to testing utils
     def __repr__(self):
+        return f"<Ruleset {self.name} {'private' if self.private else ' '}{len(self.rules)} rules>"
+
+    def __str__(self):
         rules_str = '\n'.join(['\t' + str(r) for r in self.rules])
-        return f'Ruleset {self.name} {self.private}\n{rules_str}\n'
+        return f'Ruleset {self.name} {"private" if self.private else " "}\n{rules_str}\n'
 
 
 class Rule:
@@ -719,7 +722,6 @@ def apply(self, number, context):
 
             elif t.type == PLURAL_TOKEN:
                 form = context.speller.plural_rule(number)
-                print(t.reference, type(t.reference))
                 if form not in t.reference and "other" not in t.reference:
                     raise RuleMalformed(f"Plural form {form} not in {self} and no fallback option ('other') either!")
 
@@ -730,10 +732,18 @@ def apply(self, number, context):
 
         return ''.join(res)
 
-    # TODO create simpler repr and move logic to testing utils
     def __repr__(self):
-        tokens_str = '\n'.join(['\t\t' + str(t) for t in self.tokens])
-        return f'Rule {self.value} - {self.divisor}\n{tokens_str}\n'
+        return f"<Rule {self.value} - {self.divisor} {len(self.tokens)} tokens>"
+
+    def __str__(self):
+        tokens_str = '\n'.join([f"[{t.reference}]" if t.optional else t.reference if t.type == TEXT_TOKEN else {
+            INTEGRAL_TOKEN: f"←{t.reference}←",
+            REMAINDER_TOKEN: f"→{t.reference}→",
+            SUBSTITUTION_TOKEN: f"={t.reference}=",
+            PREVIOUS_TOKEN: "→→→",
+            PLURAL_TOKEN: f"$({','.join([f'{k}{{{v}}}' for k,v in t.reference.items()])})$",
+        }[t.type] for t in self.tokens])
+        return f'Rule {self.value} - {self.divisor}: {tokens_str}'
 
 
 @dataclass
@@ -759,3 +769,28 @@ def return_value_by_type(self, typ: int):
             REMAINDER_TOKEN: self.REMAINDER,
             SUBSTITUTION_TOKEN: self.SUBSTITUTION,
         }[typ]
+
+
+def parse_rbnf_rules(data, tree):
+    """
+    Parse rules based on:
+    http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
+    """
+    rbnf_rules = data.setdefault('rbnf_rules', {})
+
+    # ElementTree.dump(tree)
+
+    for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
+        group_name = ruleset_grouping.attrib['type']
+        rbnf_rules[group_name] = []  # TODO check for overwrite
+        for ruleset in ruleset_grouping.findall('ruleset'):
+            ruleset_name = ruleset.attrib['type']
+            private = ruleset.attrib.get('access') == 'private'
+            ruleset_obj = Ruleset(ruleset_name, private)
+            for rule in ruleset.findall('rbnfrule'):
+                radix = rule.attrib.get('radix')
+                if radix == "1,000":  # HACK: work around misspelled radix in mt.xml
+                    radix = "1000"
+                rule_obj = Rule(rule.attrib['value'], rule.text, radix)
+                ruleset_obj.rules.append(rule_obj)
+            rbnf_rules[group_name].append(ruleset_obj)
diff --git a/scripts/import_cldr.py b/scripts/import_cldr.py
@@ -472,7 +472,10 @@ def _process_local_datas(sup, srcdir, destdir, force=False, dump_json=False):
         rbnf_filename = os.path.join(srcdir, 'rbnf', filename)
         if os.path.isfile(rbnf_filename):
             rbnf_tree = parse(rbnf_filename)
-            parse_rbnf_rules(data, rbnf_tree)
+            try:
+                rbnf.parse_rbnf_rules(data, rbnf_tree)
+            except rbnf.RBNFError as e:
+                log(f"{data['locale_id']}: Unable to parse rule: {e}")
 
         write_datafile(data_filename, data, dump_json=dump_json)
 
@@ -1054,38 +1057,5 @@ def parse_measurement_systems(data, tree):
             _import_type_text(measurement_systems, measurement_system, type=type)
 
 
-def parse_rbnf_rules(data, tree):
-    """
-    Parse rules based on:
-    http://www.unicode.org/reports/tr35/tr35-47/tr35-numbers.html#Rule-Based_Number_Formatting
-    """
-    rbnf_rules = data.setdefault('rbnf_rules', {})
-
-    # ElementTree.dump(tree)
-
-    for ruleset_grouping in tree.findall('.//rbnf/rulesetGrouping'):
-        group_name = ruleset_grouping.attrib['type']
-        rbnf_rules[group_name] = []  # TODO check for overwrite
-        for ruleset in ruleset_grouping.findall('ruleset'):
-            ruleset_name = ruleset.attrib['type']
-            private = ruleset.attrib.get('access') == 'private'
-            ruleset_obj = rbnf.Ruleset(ruleset_name, private)
-            for rule in ruleset.findall('rbnfrule'):
-                radix = rule.attrib.get('radix')
-                if radix == "1,000":  # HACK: work around misspelled radix in mt.xml
-                    radix = "1000"
-                try:
-                    rule_obj = rbnf.Rule(rule.attrib['value'], rule.text, radix)
-                    ruleset_obj.rules.append(rule_obj)
-                except rbnf.TokenizationError:
-                    log('%s: Unable to parse rule "%s%s: %s "' % (
-                        data['locale_id'],
-                        rule.attrib['value'],
-                        rule.text,
-                        '' if radix is None else ('/%s' % radix),
-                    ))
-            rbnf_rules[group_name].append(ruleset_obj)
-
-
 if __name__ == '__main__':
     main()
diff --git a/tests/test_number_spelling.py b/tests/test_number_spelling.py
@@ -1,9 +1,9 @@
+import os
 import sys
 
 import pytest
 
 from babel import numbers, rbnf
-from babel.localedata import locale_identifiers
 
 soft_hyphen = '\xad'
 
@@ -14,16 +14,19 @@ def test_basic():
     assert 'spellout-numbering' in x.available_rulesets
 
 
-def test_negotiation():
-    for lid in locale_identifiers():
-        try:
-            loc = rbnf.RuleBasedNumberFormat.negotiate(lid)._locale
-        except rbnf.RulesetNotFound:
-            # generate warning if necessary
-            continue
-        # test groups
-        for k in loc._data['rbnf_rules']:
-            assert k in rbnf.RuleBasedNumberFormat.group_types
+@pytest.mark.all_rbnf_locales
+def test_negotiation(locale):
+    negotiated_speller = rbnf.RuleBasedNumberFormat.negotiate(locale)
+    negotiated_locale = negotiated_speller._locale
+
+    # test groups
+    for k in negotiated_locale._data['rbnf_rules']:
+        assert k in rbnf.RuleBasedNumberFormat.group_types
+
+    negotiated_speller.match_ruleset("numbering")
+
+    with pytest.raises(rbnf.RulesetNotFound):
+        negotiated_speller.match_ruleset("nonexistent")
 
 
 def test_tokenization():
@@ -34,18 +37,39 @@ def test_tokenization():
     ]
     assert x == res
 
+    rbnf.tokenize("→→→;")  # should not raise
 
-def test_xml_parsing():
+    with pytest.raises(ValueError, match=r"Unable to.*"):
+        list(rbnf.tokenize("==="))
+
+    with pytest.warns(SyntaxWarning, match=r"Reference parsing error.*"):
+        list(rbnf.tokenize("←bad←;"))
+
+
+@pytest.mark.all_rbnf_locales
+def test_xml_parsing(locale):
     """
-    all the rules should be able to go through the parser and tokenizer
-    made up some rules and run the tokenizer on them
+    All the rules implicitly go through the parsing during CLDR import.
 
-    TODO
-    read data from all the locales that have rbnf_rules defined
-    all the raw rules should be in a specific structure based
-    on the XML specification
+    This test replicates the parsing for the given locale to
+    add coverage to the parsing parts of the code.
     """
-    assert True
+    from xml.etree import ElementTree
+
+    rules = numbers.get_rbnf_rules(locale)
+
+    assert rules
+
+    rbnf_file = f"cldr/cldr-common-47.0/common/rbnf/{locale}.xml"
+
+    assert os.path.isfile(rbnf_file)
+
+    data = {}
+
+    rbnf_tree = ElementTree.parse(rbnf_file)
+    rbnf.parse_rbnf_rules(data, rbnf_tree)
+
+    assert 'rbnf_rules' in data
 
 
 def test_compute_divisor():