Skip to content

Commit 1bdcb85

Browse files
authored
Merge pull request #12
Improve the parsing of lists of quantities and allow the parsing of percentage values.
2 parents 39d930f + 6629b6b commit 1bdcb85

File tree

4 files changed

+38
-5
lines changed

4 files changed

+38
-5
lines changed

src/unit_parse/config.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,6 @@ def __init__(self):
112112
["cu m", "m**3"], # pint gets confused
113113
["cu cm", "cm**3"], # pint gets confused
114114
["cu mm", "mm**3"], # pint gets confused
115-
["[0-9]{1,5} ?%", ""]
116115
]
117116

118117
self.pre_proc_split = [";"]

src/unit_parse/pre_processing_multiple.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from unit_parse.config import config
55
from unit_parse.logger import log_debug, log_info
6-
from unit_parse.utils import remove_empty_str
6+
from unit_parse.utils import contains_number, remove_empty_str
77

88

99
@log_info
@@ -52,6 +52,33 @@ def multiple_quantities(text_in: str, sep: list[str]) -> List[str]:
5252
result = re.split(sep, text_in)
5353
return [text.strip() for text in result]
5454

55+
def split_on_quantities(text_in: str) -> list[str]:
    """
    Split a string into fragments that each hold a single quantity.

    The text is cut at every run of whitespace that is immediately followed by
    an (optionally negative) digit. A fragment for which ``contains_number``
    is false is joined with the fragment that follows it, so a leading label
    stays attached to the first quantity.

    Parameters
    ----------
    text_in : str
        text that may contain several quantities

    Returns
    -------
    list[str]
        fragments in their original order, each stripped of surrounding
        whitespace

    Examples
    --------
    '18 mm Hg @ 68 °F' --> ['18 mm Hg @', '68 °F']
    'Melting point: 75% -17.5 °C' --> ['Melting point: 75%', '-17.5 °C']
    'Pass me a 300 ml beer.' --> ['Pass me a 300 ml beer.']
    """
    # Split on whitespace that precedes a number. The lookahead (?=...) keeps
    # the digit (and its sign) in the following piece, and the capturing group
    # keeps the whitespace itself in the output so that pieces merged below
    # retain their original spacing.
    pieces = re.split(r'(\s+)(?=[-]?\d)', text_in)

    # The split also yields pieces without a number (labels, the captured
    # whitespace). Keep appending to the last fragment until it holds a
    # number; only then start a new fragment.
    results = []
    for piece in pieces:
        if results and not contains_number(results[-1]):
            results[-1] = results[-1] + piece
        else:
            results.append(piece)

    # The captured separator ends up at the front of the fragment it was
    # merged into; strip it so the output matches the documented examples.
    return [fragment.strip() for fragment in results]
81+
5582

5683
@log_debug
5784
def condition_finder(text_in: str) -> List[str]:
@@ -89,7 +116,8 @@ def condition_finder(text_in: str) -> List[str]:
89116
result = re.split("@", text)
90117
out2 += [t.strip() for t in result]
91118
else:
92-
out2.append(text)
119+
result = split_on_quantities(text)
120+
out2 += result
93121

94122
return [text.strip() for text in out2]
95123

tests/test_main.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
['40°F', Quantity('40 degF')],
1515
['20.80 mmHg', Quantity('20.80 mmHg')],
1616
['20.80 mm Hg', Quantity('20.80 mmHg')], # correcting a unit that pint gets wrong
17+
['10%', Quantity('10 %')],
1718

1819
# scientific notation
1920
["15*10**2 s", Quantity("15*10**2 s")], # standard
@@ -76,7 +77,11 @@
7677
[[Quantity('18 mmHg'), Quantity('68 degF')], [Quantity('20 mmHg'), Quantity('77 degF')]]],
7778
["Low threshold= 13.1150 mg/cu m; High threshold= 26840 mg/cu m; Irritating concn= 22875 mg/cu m.",
7879
Quantity('22875 mg/m**3')],
79-
['Melting point: 75% -17.5 °C; 80% 4.6 °C; 85% 21 °C.', Quantity("4.6 degC")],
80+
['Melting point: 75% -17.5 °C; 80% 4.6 °C; 85% 21 °C.', [
81+
[Quantity('75%'), Quantity('-17.5 degC')],
82+
[Quantity("80%"), Quantity('4.6 degC')],
83+
[Quantity("85%"), Quantity('21 degC')],
84+
]],
8085

8186
# ranges
8287
['115.2-115.3 °C', Quantity('115.2 degC')],

tests/test_pre_processing_multiple.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ def test_reduce_parenthesis(input_, output_):
3535
['18 mm Hg @ 68 °F ', ['18 mm Hg', '68 °F']],
3636
['20 mm Hg @ 77° F', ['20 mm Hg', '77° F']],
3737
[' 20 mm Hg @ 77° F (NTP, 1992)', ['20 mm Hg', '77° F', 'NTP, 1992']],
38+
['Melting point: 75% -17.5 °C', ['Melting point: 75%', '-17.5 °C']],
39+
['20.8 mm Hg 25 °C', ['20.8 mm Hg', '25 °C']],
3840

3941
['40 °F (4 °C) (Closed cup)', ['40 °F', '4 °C', 'Closed cup']],
4042
['40 °F (4 °C)', ['40 °F', '4 °C']],
@@ -45,7 +47,6 @@ def test_reduce_parenthesis(input_, output_):
4547
['(4 °C Closed cup)', ['4 °C Closed cup']],
4648

4749
# negative control (fails)
48-
['20.8 mm Hg 25 °C', ['20.8 mm Hg 25 °C']],
4950
['20.8 mm Hgat25 °C', ['20.8 mm Hgat25 °C']],
5051
['Pass me a 300 ml beer.', ['Pass me a 300 ml beer.']],
5152
["42.3 gcm-3", ["42.3 gcm-3"]],

0 commit comments

Comments
 (0)