@@ -240,87 +240,101 @@ def __init__(
240240
def __call__(self, obj, tokenizer=None):
    """
    Convert a standoff-style annotation dict into an annotated document.

    Parameters
    ----------
    obj : dict
        Input record. Keys read here:
        - "text": the raw text (a falsy value is treated as "").
        - "doc_id": stored as `doc._.note_id`; falls back to the
          `FILENAME` key when absent.
        - "entities" (optional): iterable of dicts with "entity_id",
          "label", "fragments" (dicts with "begin"/"end" character
          offsets) and optionally "attributes" (a dict, or a list of
          {"label", "value"} dicts) and "notes" (dicts with "value").
        - "relations" (optional): iterable of dicts with
          "from_entity_id", "to_entity_id" and either "relation_label"
          or "label".
    tokenizer : callable, optional
        Tokenizer to use; defaults to `self.tokenizer`, then to
        `get_current_tokenizer()`.

    Returns
    -------
    The document produced by the tokenizer, with spans, span attributes
    and relations attached.

    Raises
    ------
    ValueError
        If anything fails while processing the record. The message
        carries the note id and the original exception is chained as
        `__cause__`.
    """
    note_id = obj.get("doc_id", obj.get(FILENAME))
    try:
        tok = tokenizer or self.tokenizer or get_current_tokenizer()
        doc = tok(obj["text"] or "")
        doc._.note_id = note_id

        # entity_id -> spans created for that entity (one per fragment)
        entities = {}
        spans = []

        # Make sure every destination attribute is registered on Span
        # before any value is written to it.
        for dst in (
            *(
                ()
                if self.span_attributes is None
                else self.span_attributes.values()
            ),
            *self.default_attributes,
        ):
            if not Span.has_extension(dst):
                Span.set_extension(dst, default=None)

        for ent in obj.get("entities") or ():
            # Unless fragments are split, a discontinuous entity is
            # collapsed into a single span covering all its fragments.
            fragments = (
                [
                    {
                        "begin": min(f["begin"] for f in ent["fragments"]),
                        "end": max(f["end"] for f in ent["fragments"]),
                    }
                ]
                if not self.split_fragments
                else ent["fragments"]
            )

            # Attributes do not depend on the fragment: build them once
            # per entity. A missing (or None) "attributes" key means no
            # attributes; the list form is normalized to a dict.
            raw_attributes = ent.get("attributes")
            if raw_attributes is None:
                attributes = {}
            elif isinstance(raw_attributes, list):
                attributes = {a["label"]: a["value"] for a in raw_attributes}
            else:
                attributes = raw_attributes

            # Write the joined notes into the normalized dict rather than
            # into ent["attributes"], which may be a list or be absent.
            notes = ent.get("notes")
            if self.notes_as_span_attribute and notes:
                attributes[self.notes_as_span_attribute] = "|".join(
                    note["value"] for note in notes
                )

            for fragment in fragments:
                span = doc.char_span(
                    fragment["begin"],
                    fragment["end"],
                    label=ent["label"],
                    alignment_mode="expand",
                )
                for label, value in attributes.items():
                    # With an explicit mapping, unmapped labels resolve to
                    # None and are skipped; without one, the label is used
                    # as is and registered on the fly.
                    new_name = (
                        self.span_attributes.get(label, None)
                        if self.span_attributes is not None
                        else label
                    )
                    if self.span_attributes is None and not Span.has_extension(
                        new_name
                    ):
                        Span.set_extension(new_name, default=None)

                    if new_name:
                        # A valueless attribute is treated as a flag.
                        value = True if value is None else value
                        if not self.keep_raw_attribute_values:
                            value = (
                                True
                                if value in ("True", "true")
                                else False
                                if value in ("False", "false")
                                else value
                            )
                        span._.set(new_name, value)

                entities.setdefault(ent["entity_id"], []).append(span)
                spans.append(span)

        set_spans(doc, spans, span_setter=self.span_setter)
        # Fill in defaults only where the annotation gave no value.
        for attr, value in self.default_attributes.items():
            for span in spans:
                if span._.get(attr) is None:
                    span._.set(attr, value)

        # `or ()` mirrors the entities handling: an explicit None is
        # treated like a missing key.
        for relation in obj.get("relations") or ():
            relation_label = (
                relation["relation_label"]
                if "relation_label" in relation
                else relation["label"]
            )
            from_entity_id = relation["from_entity_id"]
            to_entity_id = relation["to_entity_id"]

            # Relations pointing at unknown entity ids are ignored.
            # NOTE(review): assumes `span._.rel` is a dict-like extension
            # registered elsewhere - confirm.
            for head in entities.get(from_entity_id, ()):
                for tail in entities.get(to_entity_id, ()):
                    head._.rel.setdefault(relation_label, set()).add(tail)
    except Exception as err:
        # Tag the failure with the note id and keep the root cause
        # attached explicitly.
        raise ValueError(f"Error when processing {note_id}") from err

    return doc
326340
0 commit comments