- from io import StringIO
+ import re
  from typing import Dict, Optional, Union

  import pandas as pd
  from spacy.tokens import Doc, Span

  from edsnlp.core import PipelineProtocol
- from edsnlp.pipes.core.matcher.matcher import GenericMatcher
+ from edsnlp.matchers.phrase import EDSPhraseMatcher
+ from edsnlp.matchers.regex import RegexMatcher
+ from edsnlp.pipes.base import BaseComponent
  from edsnlp.pipes.misc.tables import patterns
- from edsnlp.utils.filter import get_spans
+ from edsnlp.utils.typing import AsList


- class TablesMatcher(GenericMatcher):
+ class TablesMatcher(BaseComponent):
      '''
      The `eds.tables` matcher detects tables in a document.

@@ -70,7 +72,11 @@ class TablesMatcher(GenericMatcher):
      # VMP ¦fL ¦11.5 + ¦7.4-10.8

      # Convert span to Pandas table
-     df = table._.to_pd_table()
+     df = table._.to_pd_table(
+         as_spans=False,  # set True to return the table cells as spans instead of strings
+         header=False,  # set True to use the first row as header
+         index=False,  # set True to use the first column as index
+     )
      type(df)
      # Out: pandas.core.frame.DataFrame
      ```
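
For review context, a minimal end-to-end sketch of the new `to_pd_table` options. The pipeline setup and sample text are illustrative, not taken from the source; the default separator patterns are assumed to cover `¦`, as in the docstring example above.

```python
import edsnlp
import edsnlp.pipes as eds

nlp = edsnlp.blank("eds")
nlp.add_pipe(eds.tables())

text = "Param ¦Unit ¦Valeur ¦Norme\nVMP ¦fL ¦11.5 ¦7.4-10.8\n"
doc = nlp(text)

if doc.spans["tables"]:
    table = doc.spans["tables"][0]
    # First row promoted to column names, cells converted to strings
    df = table._.to_pd_table(header=True)
    # Keep the cells as spaCy spans to preserve character offsets
    df_spans = table._.to_pd_table(as_spans=True)
```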
@@ -96,7 +102,7 @@ class TablesMatcher(GenericMatcher):
      Parameters
      ----------
      nlp : PipelineProtocol
-         spaCy nlp pipeline to use for matching.
+         Pipeline object
      name: str
          Name of the component.
      tables_pattern : Optional[Dict[str, str]]
@@ -120,41 +126,106 @@ class TablesMatcher(GenericMatcher):
      def __init__(
          self,
          nlp: PipelineProtocol,
-         name: str = "tables",
+         name: Optional[str] = "tables",
          *,
-         tables_pattern: Optional[Dict[str, str]] = None,
-         sep_pattern: Optional[str] = None,
+         tables_pattern: Optional[AsList[str]] = None,
+         sep_pattern: Optional[AsList[str]] = None,
          attr: Union[Dict[str, str], str] = "TEXT",
          ignore_excluded: bool = True,
      ):
-         if tables_pattern is None and sep_pattern is None:
-             self.tables_pattern = patterns.regex
-             self.sep = patterns.sep
-         elif tables_pattern is None or sep_pattern is None:
-             raise ValueError(
-                 "Both tables_pattern and sep_pattern must be provided "
-                 "for custom eds.table pipeline."
-             )
-         else:
-             self.tables_pattern = tables_pattern
-             self.sep = sep_pattern
-
-         super().__init__(
-             nlp=nlp,
-             name=name,
-             terms=None,
-             regex=self.tables_pattern,
-             attr=attr,
-             ignore_excluded=ignore_excluded,
+         super().__init__(nlp, name)
+         if tables_pattern is None:
+             tables_pattern = patterns.regex_template
+
+         if sep_pattern is None:
+             sep_pattern = patterns.sep
+
+         self.regex_matcher = RegexMatcher(attr=attr, ignore_excluded=ignore_excluded)
+         self.regex_matcher.add(
+             "table",
+             list(
+                 dict.fromkeys(
+                     template.format(sep=re.escape(sep))
+                     for sep in sep_pattern
+                     for template in tables_pattern
+                 )
+             ),
+         )
+
+         self.term_matcher = EDSPhraseMatcher(
+             nlp.vocab, attr=attr, ignore_excluded=ignore_excluded
+         )
+         self.term_matcher.build_patterns(
+             nlp,
+             {
+                 "eol_pattern": "\n",
+                 "sep_pattern": sep_pattern,
+             },
          )

          if not Span.has_extension("to_pd_table"):
              Span.set_extension("to_pd_table", method=self.to_pd_table)

-         self.set_extensions()
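
For reviewers, a small sketch of how the table regexes are assembled from the templates and separators in the new `__init__`. The template string below is invented for illustration; the real ones live in `patterns.regex_template`.

```python
import re

# Hypothetical template: one or more lines containing at least one separator
tables_pattern = [r"(?:[^\n{sep}]*{sep}[^\n{sep}]*\n?)+"]
sep_pattern = ["¦", "|"]

# One regex per (separator, template) pair, deduplicated while keeping order
regexes = list(
    dict.fromkeys(
        template.format(sep=re.escape(sep))
        for sep in sep_pattern
        for template in tables_pattern
    )
)
print(regexes)  # -> two patterns, one per separator
```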
+     @classmethod
+     def set_extensions(cls) -> None:
+         """
+         Set extensions for the tables pipeline.
+         """
+
+         if not Span.has_extension("table"):
+             Span.set_extension("table", default=None)
+
+     def get_table(self, table):
+         """
+         Convert a table span into a list of rows of cell spans.
+
+         Parameters
+         ----------
+         table : Span
+
+         Returns
+         -------
+         List[List[Span]]
+         """
+
+         # We store each row in a list and store each of these lists
+         # in processed_table for post-processing,
+         # considering the self.col_names and self.row_names var
+         processed_table = []
+         delimiters = [
+             delimiter
+             for delimiter in self.term_matcher(table, as_spans=True)
+             if delimiter.start >= table.start and delimiter.end <= table.end
+         ]
+
+         last = table.start
+         row = []
+         # Parse the table and extract each cell using the matched delimiters
+         for delimiter in delimiters:
+             row.append(table[last - table.start : delimiter.start - table.start])
+             last = delimiter.end
+
+             # End the current row if there is an end of line
+             if delimiter.label_ == "eol_pattern":
+                 processed_table.append(row)
+                 row = []
+
+         # Remove the first or last column in case the separator pattern is
+         # also used in the raw table to draw the outlines
+         max_len = max(len(row) for row in processed_table)
+         if all(row[0].start == row[0].end for row in processed_table):
+             processed_table = [row[1:] for row in processed_table]
+         if all(
+             row[-1].start == row[-1].end
+             for row in processed_table
+             if len(row) == max_len
+         ):
+             processed_table = [row[:-1] for row in processed_table]
+
+         return processed_table

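To make the cell-splitting logic above easier to follow, here is the same idea on a plain string, with character offsets standing in for token indices. This is an illustration only, not the component's code.

```python
text = "¦A ¦B ¦\n¦1 ¦2 ¦\n"

rows, row, last = [], [], 0
for i, ch in enumerate(text):
    if ch in ("¦", "\n"):          # "¦" ~ sep_pattern match, "\n" ~ eol_pattern match
        row.append(text[last:i])   # cell = text between two consecutive delimiters
        last = i + 1
        if ch == "\n":             # an end of line closes the current row
            rows.append(row)
            row = []

# Leading/trailing "¦" used as table outlines produce empty edge cells
max_len = max(len(r) for r in rows)
if all(r[0] == "" for r in rows):
    rows = [r[1:] for r in rows]
if all(r[-1] == "" for r in rows if len(r) == max_len):
    rows = [r[:-1] for r in rows]

print(rows)  # [['A ', 'B '], ['1 ', '2 ']]
```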
      def __call__(self, doc: Doc) -> Doc:
-         """Find spans that contain tables
+         """
+         Find spans that contain tables

          Parameters
          ----------
@@ -164,21 +235,40 @@ def __call__(self, doc: Doc) -> Doc:
          -------
          Doc
          """
-         matches = self.process(doc)
-         tables = get_spans(matches, "tables")
-         # parsed = self.parse(tables=tables)
+         matches = list(self.regex_matcher(doc, as_spans=True))
+         doc.spans["tables"] = matches
+         return doc

-         doc.spans["tables"] = tables
+     def to_pd_table(
+         self,
+         span,
+         as_spans=False,
+         header: bool = False,
+         index: bool = False,
+     ) -> pd.DataFrame:
+         """
+         Return the table as a pandas DataFrame.

-         return doc
+         Parameters
+         ----------
+         span : Span
+             The span containing the table
+         as_spans : bool
+             Whether to return the table cells as spans
+         header : bool
+             Whether the table has a header
+         index : bool
+             Whether the table has an index
+         """
+         table = self.get_table(span)
+         if not as_spans:
+             table = [[str(cell) for cell in data] for data in table]

-     def to_pd_table(self, span) -> pd.DataFrame:
-         table_str_io = StringIO(span.text)
-         parsed = pd.read_csv(
-             table_str_io,
-             sep=self.sep,
-             engine="python",
-             header=None,
-             on_bad_lines="skip",
-         )
-         return parsed
+         table = pd.DataFrame.from_records(table)
+         if header:
+             table.columns = [str(k) for k in table.iloc[0]]
+             table = table[1:]
+         if index:
+             table.index = [str(k) for k in table.iloc[:, 0]]
+             table = table.iloc[:, 1:]
+         return table
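
A quick check of the header promotion logic on plain lists, assuming the cells have already been converted to strings (the sample data is invented for the example):

```python
import pandas as pd

rows = [["Param", "Unit", "Value"], ["VMP", "fL", "11.5"]]

table = pd.DataFrame.from_records(rows)
# header=True: the first row becomes the column names and is dropped
table.columns = [str(k) for k in table.iloc[0]]
table = table[1:]
# index=True would likewise promote the first column to the index
print(table)
#   Param Unit Value
# 1   VMP   fL  11.5
```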