
Commit b8cc3e3

percevalw and Jungack committed
feat: improve eds.table matcher
Co-Authored-By: Jacques Ung <[email protected]>
1 parent 967808d commit b8cc3e3

File tree

changelog.md
edsnlp/pipes/misc/tables/__init__.py
edsnlp/pipes/misc/tables/patterns.py
edsnlp/pipes/misc/tables/tables.py
tests/pipelines/misc/test_tables.py

5 files changed · +183 -52 lines changed

changelog.md

Lines changed: 4 additions & 0 deletions
@@ -24,6 +24,10 @@
 - Added a new `eds.ner_overlap_scorer` to evaluate matches between two lists of entities, counting true when the dice overlap is above a given threshold
 - `edsnlp.load` now accepts EDS-NLP models from the huggingface hub 🤗 !
 - New `python -m edsnlp.package` command to package a model for the huggingface hub or pypi-like registries
+- Improve table detection in `eds.tables` and support new options in `table._.to_pd_table(...)`:
+    - `header=True` to use first row as header
+    - `index=True` to use first column as index
+    - `as_spans=True` to fill cells as document spans instead of strings

 ### Changed
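A minimal usage sketch of the new options (not part of this commit; the small document below is invented, and the pipeline is assembled as in the updated test further down, assuming `edsnlp.blank("eds")` is available):

import edsnlp

nlp = edsnlp.blank("eds")
nlp.add_pipe("eds.normalizer")
nlp.add_pipe("eds.tables")

doc = nlp(
    "|Libellé    |Unité   |Valeur |\n"
    "|Leucocytes |x10*9/L |4.97   |\n"
    "|Hématies   |x10*12/L|4.68   |\n"
)
table = doc.spans["tables"][0]

# header=True   -> the first row becomes the column names
# index=True    -> the first column becomes the row index
# as_spans=True -> cells are kept as spacy Spans instead of strings
df = table._.to_pd_table(header=True, index=True, as_spans=True)
print(df.loc["Leucocytes", "Valeur"])  # expected: the span "4.97"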

edsnlp/pipes/misc/tables/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -1,2 +1 @@
-from .patterns import regex, sep
 from .tables import TablesMatcher
edsnlp/pipes/misc/tables/patterns.py

Lines changed: 2 additions & 4 deletions

@@ -1,4 +1,2 @@
-sep = r"¦|\|"
-regex = dict(
-    tables=rf"(\b.*{sep}.*\n)+",
-)
+sep = ["¦", "|"]
+regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"]
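For reference, this template is expanded once per separator in `TablesMatcher.__init__` (see tables.py below); a standalone sketch of that expansion, where `table_regexes` is a name chosen here for illustration:

import re

sep = ["¦", "|"]
regex_template = [r"(?:{sep}?(?:[^{sep}\n]*{sep})+[^{sep}\n]*{sep}?\n)+"]

# one regex per separator, deduplicated while preserving order
table_regexes = list(
    dict.fromkeys(
        template.format(sep=re.escape(s))
        for s in sep
        for template in regex_template
    )
)

# the "|" variant matches a run of consecutive pipe-delimited lines
assert table_regexes[1] == r"(?:\|?(?:[^\|\n]*\|)+[^\|\n]*\|?\n)+"
assert re.match(table_regexes[1], "|Hématies |x10*12/L|4.68 |\n")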

edsnlp/pipes/misc/tables/tables.py

Lines changed: 135 additions & 45 deletions
@@ -1,16 +1,18 @@
-from io import StringIO
+import re
 from typing import Dict, Optional, Union

 import pandas as pd
 from spacy.tokens import Doc, Span

 from edsnlp.core import PipelineProtocol
-from edsnlp.pipes.core.matcher.matcher import GenericMatcher
+from edsnlp.matchers.phrase import EDSPhraseMatcher
+from edsnlp.matchers.regex import RegexMatcher
+from edsnlp.pipes.base import BaseComponent
 from edsnlp.pipes.misc.tables import patterns
-from edsnlp.utils.filter import get_spans
+from edsnlp.utils.typing import AsList


-class TablesMatcher(GenericMatcher):
+class TablesMatcher(BaseComponent):
     '''
     The `eds.tables` matcher detects tables in a documents.
@@ -70,7 +72,11 @@ class TablesMatcher(GenericMatcher):
     # VMP ¦fL ¦11.5 + ¦7.4-10.8

     # Convert span to Pandas table
-    df = table._.to_pd_table()
+    df = table._.to_pd_table(
+        as_spans=False,  # set True to set the table cells as spans instead of strings
+        header=False,  # set True to use the first row as header
+        index=False,  # set True to use the first column as index
+    )
     type(df)
     # Out: pandas.core.frame.DataFrame
     ```
@@ -96,7 +102,7 @@ class TablesMatcher(GenericMatcher):
     Parameters
     ----------
     nlp : PipelineProtocol
-        spaCy nlp pipeline to use for matching.
+        Pipeline object
     name: str
         Name of the component.
     tables_pattern : Optional[Dict[str, str]]
@@ -120,41 +126,106 @@ class TablesMatcher(GenericMatcher):
     def __init__(
         self,
         nlp: PipelineProtocol,
-        name: str = "tables",
+        name: Optional[str] = "tables",
         *,
-        tables_pattern: Optional[Dict[str, str]] = None,
-        sep_pattern: Optional[str] = None,
+        tables_pattern: Optional[AsList[str]] = None,
+        sep_pattern: Optional[AsList[str]] = None,
         attr: Union[Dict[str, str], str] = "TEXT",
         ignore_excluded: bool = True,
     ):
-        if tables_pattern is None and sep_pattern is None:
-            self.tables_pattern = patterns.regex
-            self.sep = patterns.sep
-        elif tables_pattern is None or sep_pattern is None:
-            raise ValueError(
-                "Both tables_pattern and sep_pattern must be provided "
-                "for custom eds.table pipeline."
-            )
-        else:
-            self.tables_pattern = tables_pattern
-            self.sep = sep_pattern
-
-        super().__init__(
-            nlp=nlp,
-            name=name,
-            terms=None,
-            regex=self.tables_pattern,
-            attr=attr,
-            ignore_excluded=ignore_excluded,
+        super().__init__(nlp, name)
+        if tables_pattern is None:
+            tables_pattern = patterns.regex_template
+
+        if sep_pattern is None:
+            sep_pattern = patterns.sep
+
+        self.regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded)
+        self.regex_matcher.add(
+            "table",
+            list(
+                dict.fromkeys(
+                    template.format(sep=re.escape(sep))
+                    for sep in sep_pattern
+                    for template in tables_pattern
+                )
+            ),
+        )
+
+        self.term_matcher = EDSPhraseMatcher(
+            nlp.vocab, attr=attr, ignore_excluded=ignore_excluded
+        )
+        self.term_matcher.build_patterns(
+            nlp,
+            {
+                "eol_pattern": "\n",
+                "sep_pattern": sep_pattern,
+            },
         )

         if not Span.has_extension("to_pd_table"):
             Span.set_extension("to_pd_table", method=self.to_pd_table)

-        self.set_extensions()
+    @classmethod
+    def set_extensions(cls) -> None:
+        """
+        Set extensions for the tables pipeline.
+        """
+
+        if not Span.has_extension("table"):
+            Span.set_extension("table", default=None)
+
+    def get_table(self, table):
+        """
+        Convert spans of tables to dictionaries
+        Parameters
+        ----------
+        table : Span
+
+        Returns
+        -------
+        List[Span]
+        """
+
+        # We store each row in a list and store each of these lists
+        # in processed_table for post processing
+        # considering the self.col_names and self.row_names var
+        processed_table = []
+        delimiters = [
+            delimiter
+            for delimiter in self.term_matcher(table, as_spans=True)
+            if delimiter.start >= table.start and delimiter.end <= table.end
+        ]
+
+        last = table.start
+        row = []
+        # Parse the table to match each cell thanks to delimiters
+        for delimiter in delimiters:
+            row.append(table[last - table.start : delimiter.start - table.start])
+            last = delimiter.end
+
+            # End the current row if there is an end of line
+            if delimiter.label_ == "eol_pattern":
+                processed_table.append(row)
+                row = []
+
+        # Remove first or last column in case the separator pattern is
+        # also used in the raw table to draw the outlines
+        max_len = max(len(row) for row in processed_table)
+        if all(row[0].start == row[0].end for row in processed_table):
+            processed_table = [row[1:] for row in processed_table]
+        if all(
+            row[-1].start == row[-1].end
+            for row in processed_table
+            if len(row) == max_len
+        ):
+            processed_table = [row[:-1] for row in processed_table]
+
+        return processed_table

     def __call__(self, doc: Doc) -> Doc:
-        """Find spans that contain tables
+        """
+        Find spans that contain tables

         Parameters
         ----------
@@ -164,21 +235,40 @@ def __call__(self, doc: Doc) -> Doc:
         -------
         Doc
         """
-        matches = self.process(doc)
-        tables = get_spans(matches, "tables")
-        # parsed = self.parse(tables=tables)
+        matches = list(self.regex_matcher(doc, as_spans=True))
+        doc.spans["tables"] = matches
+        return doc

-        doc.spans["tables"] = tables
+    def to_pd_table(
+        self,
+        span,
+        as_spans=False,
+        header: bool = False,
+        index: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Return pandas DataFrame

-        return doc
+        Parameters
+        ----------
+        span : Span
+            The span containing the table
+        as_spans : bool
+            Whether to return the table cells as spans
+        header : bool
+            Whether the table has a header
+        index : bool
+            Whether the table has an index
+        """
+        table = self.get_table(span)
+        if not as_spans:
+            table = [[str(cell) for cell in data] for data in table]

-    def to_pd_table(self, span) -> pd.DataFrame:
-        table_str_io = StringIO(span.text)
-        parsed = pd.read_csv(
-            table_str_io,
-            sep=self.sep,
-            engine="python",
-            header=None,
-            on_bad_lines="skip",
-        )
-        return parsed
+        table = pd.DataFrame.from_records(table)
+        if header:
+            table.columns = [str(k) for k in table.iloc[0]]
+            table = table[1:]
+        if index:
+            table.index = [str(k) for k in table.iloc[:, 0]]
+            table = table.iloc[:, 1:]
+        return table
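The tail of the new `to_pd_table` is plain pandas; the same `header`/`index` promotion can be reproduced outside the pipe (a sketch over invented string cells):

import pandas as pd

rows = [
    ["Libellé", "Unité", "Valeur"],
    ["Leucocytes", "x10*9/L", "4.97"],
    ["Hématies", "x10*12/L", "4.68"],
]
df = pd.DataFrame.from_records(rows)

# header=True: promote the first row to column names, then drop it
df.columns = [str(k) for k in df.iloc[0]]
df = df[1:]

# index=True: promote the first column to the index, then drop it
df.index = [str(k) for k in df.iloc[:, 0]]
df = df.iloc[:, 1:]

assert df.loc["Leucocytes", "Valeur"] == "4.97"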

tests/pipelines/misc/test_tables.py

Lines changed: 42 additions & 2 deletions
@@ -1,3 +1,6 @@
+import pytest
+from spacy.tokens.span import Span
+
 TEXT = """
 Le patientqsfqfdf bla bla bla
 Leucocytes ¦x10*9/L ¦4.97 ¦4.09-11
@@ -14,18 +17,55 @@
 2/2Pat : <NOM> <Prenom> |<date> | <ipp> |Intitulé RCP

+|Libellé | Unité | Valeur | Intervalle |
+|Leucocytes |x10*9/L |4.97 | 4.09-11 |
+|Hématies |x10*12/L|4.68 | 4.53-5.79 |
+|Hémoglobine |g/dL |14.8 | 13.4-16.7 |
+|Hématocrite ||44.2 | 39.2-48.6 |
+|VGM |fL | 94.4 + | 79.6-94 |
+|TCMH |pg |31.6 |
+|CCMH |g/dL
+|Plaquettes |x10*9/L |191 | 172-398 |
+|VMP |fL |11.5 + | 7.4-10.8 |

 """


 def test_tables(blank_nlp):
+    if blank_nlp.lang != "eds":
+        pytest.skip("Test only for eds language")
     blank_nlp.add_pipe("eds.normalizer")
     blank_nlp.add_pipe("eds.tables")

     doc = blank_nlp(TEXT)

-    assert len(doc.spans["tables"]) == 1
+    assert len(doc.spans["tables"]) == 2

     span = doc.spans["tables"][0]
     df = span._.to_pd_table()
-    assert df.iloc[5, 0] == "TCMH "
+    assert len(df.columns) == 4
+    assert len(df) == 9
+    assert str(df.iloc[5, 0]) == "TCMH"
+
+    span = doc.spans["tables"][1]
+    df = span._.to_pd_table(header=True, index=True, as_spans=True)
+    print(df)
+    assert df.columns.tolist() == [
+        "Unité",
+        "Valeur",
+        "Intervalle",
+    ]
+    assert df.index.tolist() == [
+        "Leucocytes",
+        "Hématies",
+        "Hémoglobine",
+        "Hématocrite",
+        "VGM",
+        "TCMH",
+        "CCMH",
+        "Plaquettes",
+        "VMP",
+    ]
+    cell = df.loc["TCMH", "Valeur"]
+    assert isinstance(cell, Span)
+    assert cell.text == "31.6"

0 commit comments
