|
3 | 3 |
|
4 | 4 | from unit_parse.config import config |
5 | 5 | from unit_parse.logger import log_debug, log_info |
6 | | -from unit_parse.utils import remove_empty_str |
| 6 | +from unit_parse.utils import contains_number, remove_empty_str |
7 | 7 |
|
8 | 8 |
|
9 | 9 | @log_info |
@@ -52,6 +52,33 @@ def multiple_quantities(text_in: str, sep: list[str]) -> List[str]: |
52 | 52 | result = re.split(sep, text_in) |
53 | 53 | return [text.strip() for text in result] |
54 | 54 |
|
| 55 | +def split_on_quantities(text_in: str) -> list[str]: |
| 56 | + """ |
| 57 | + Split the string into a list of strings, where each string contains a single quantity. |
| 58 | +
|
| 59 | + Examples |
| 60 | + -------- |
| 61 | + '18 mm Hg @ 68 °F' --> ['18 mm Hg @', ' 68 °F'] |
| 62 | + 'Melting point: 75% -17.5 °C' --> ['Melting point: 75%', ' -17.5 °C'] |
| 63 | + 'Pass me a 300 ml beer.' --> ['Pass me a 300 ml beer.'] |
| 64 | + """ |
| 65 | + # Use a regular expression to split the input text into possible groups of quantities. |
| 66 | + # The pattern matches whitespace (\s+) followed by an optional minus sign and a digit ([-]?\d). |
| 67 | + # The positive lookahead (?=...) ensures that the split happens without |
| 68 | + # consuming the sign or digit; the whitespace is captured, so re.split keeps it in the list. |
| 69 | + quantities = re.split(r'(\s+)(?=[-]?\d)', text_in) |
| 70 | + |
| 71 | + # This regex will sometimes produce groups of just text, so merge subsequent groups until |
| 72 | + # each group contains a number. This could be done in a more complex regex, |
| 73 | + # but a loop is pretty simple. |
| 74 | + results = [] |
| 75 | + for result in quantities: |
| 76 | + if results and not contains_number(results[-1]): |
| 77 | + results[-1] = results[-1] + result |
| 78 | + else: |
| 79 | + results.append(result) |
| 80 | + return results |
| 81 | + |
55 | 82 |
|
56 | 83 | @log_debug |
57 | 84 | def condition_finder(text_in: str) -> List[str]: |
@@ -89,7 +116,8 @@ def condition_finder(text_in: str) -> List[str]: |
89 | 116 | result = re.split("@", text) |
90 | 117 | out2 += [t.strip() for t in result] |
91 | 118 | else: |
92 | | - out2.append(text) |
| 119 | + result = split_on_quantities(text) |
| 120 | + out2 += result |
93 | 121 |
|
94 | 122 | return [text.strip() for text in out2] |
95 | 123 |
|
|
0 commit comments