Skip to content

Commit dd45f23

Browse files
committed
feat: new eds.relation_detector_ffn trainable component
1 parent f36c5b7 commit dd45f23

File tree

14 files changed

+336
-141
lines changed

14 files changed

+336
-141
lines changed

changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
- Added support for multiple loggers (`tensorboard`, `wandb`, `comet_ml`, `aim`, `mlflow`, `clearml`, `dvclive`, `csv`, `json`, `rich`) in `edsnlp.train` via the `logger` parameter. Default is [`json` and `rich`] for backward compatibility.
1515
- Added clickable snippets in the documentation for more registered functions
1616
- New trainable `eds.relation_detector_ffn` component to detect relations between entities. These relations are stored in each entity: `head._.rel[relation_label] = [tail1, tail2, ...]`.
17+
- Load "Status" annotator notes as `status` dict attribute
1718

1819
### Changed
1920

edsnlp/data/converters.py

Lines changed: 92 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -240,87 +240,101 @@ def __init__(
240240

241241
def __call__(self, obj, tokenizer=None):
242242
# tok = get_current_tokenizer() if self.tokenizer is None else self.tokenizer
243-
tok = tokenizer or self.tokenizer or get_current_tokenizer()
244-
doc = tok(obj["text"] or "")
245-
doc._.note_id = obj.get("doc_id", obj.get(FILENAME))
246-
247-
entities = {}
248-
spans = []
249-
250-
for dst in (
251-
*(() if self.span_attributes is None else self.span_attributes.values()),
252-
*self.default_attributes,
253-
):
254-
if not Span.has_extension(dst):
255-
Span.set_extension(dst, default=None)
256-
257-
for ent in obj.get("entities") or ():
258-
fragments = (
259-
[
260-
{
261-
"begin": min(f["begin"] for f in ent["fragments"]),
262-
"end": max(f["end"] for f in ent["fragments"]),
263-
}
264-
]
265-
if not self.split_fragments
266-
else ent["fragments"]
267-
)
268-
for fragment in fragments:
269-
span = doc.char_span(
270-
fragment["begin"],
271-
fragment["end"],
272-
label=ent["label"],
273-
alignment_mode="expand",
274-
)
275-
attributes = (
276-
{a["label"]: a["value"] for a in ent["attributes"]}
277-
if isinstance(ent["attributes"], list)
278-
else ent["attributes"]
243+
note_id = obj.get("doc_id", obj.get(FILENAME))
244+
try:
245+
tok = tokenizer or self.tokenizer or get_current_tokenizer()
246+
doc = tok(obj["text"] or "")
247+
doc._.note_id = note_id
248+
249+
entities = {}
250+
spans = []
251+
252+
for dst in (
253+
*(
254+
()
255+
if self.span_attributes is None
256+
else self.span_attributes.values()
257+
),
258+
*self.default_attributes,
259+
):
260+
if not Span.has_extension(dst):
261+
Span.set_extension(dst, default=None)
262+
263+
for ent in obj.get("entities") or ():
264+
fragments = (
265+
[
266+
{
267+
"begin": min(f["begin"] for f in ent["fragments"]),
268+
"end": max(f["end"] for f in ent["fragments"]),
269+
}
270+
]
271+
if not self.split_fragments
272+
else ent["fragments"]
279273
)
280-
if self.notes_as_span_attribute and ent["notes"]:
281-
ent["attributes"][self.notes_as_span_attribute] = "|".join(
282-
note["value"] for note in ent["notes"]
274+
for fragment in fragments:
275+
span = doc.char_span(
276+
fragment["begin"],
277+
fragment["end"],
278+
label=ent["label"],
279+
alignment_mode="expand",
283280
)
284-
for label, value in attributes.items():
285-
new_name = (
286-
self.span_attributes.get(label, None)
287-
if self.span_attributes is not None
288-
else label
281+
attributes = (
282+
{}
283+
if "attributes" not in ent
284+
else {a["label"]: a["value"] for a in ent["attributes"]}
285+
if isinstance(ent["attributes"], list)
286+
else ent["attributes"]
289287
)
290-
if self.span_attributes is None and not Span.has_extension(
291-
new_name
292-
):
293-
Span.set_extension(new_name, default=None)
294-
295-
if new_name:
296-
value = True if value is None else value
297-
if not self.keep_raw_attribute_values:
298-
value = (
299-
True
300-
if value in ("True", "true")
301-
else False
302-
if value in ("False", "false")
303-
else value
304-
)
305-
span._.set(new_name, value)
306-
307-
entities.setdefault(ent["entity_id"], []).append(span)
308-
spans.append(span)
309-
310-
set_spans(doc, spans, span_setter=self.span_setter)
311-
for attr, value in self.default_attributes.items():
312-
for span in spans:
313-
if span._.get(attr) is None:
314-
span._.set(attr, value)
315-
316-
for relation in obj.get("relations", []):
317-
relation_label = relation["relation_label"]
318-
from_entity_id = relation["from_entity_id"]
319-
to_entity_id = relation["to_entity_id"]
320-
321-
for head in entities[from_entity_id]:
322-
for tail in entities[to_entity_id]:
323-
head._.rel.setdefault(relation_label, set()).add(tail)
288+
if self.notes_as_span_attribute and ent["notes"]:
289+
ent["attributes"][self.notes_as_span_attribute] = "|".join(
290+
note["value"] for note in ent["notes"]
291+
)
292+
for label, value in attributes.items():
293+
new_name = (
294+
self.span_attributes.get(label, None)
295+
if self.span_attributes is not None
296+
else label
297+
)
298+
if self.span_attributes is None and not Span.has_extension(
299+
new_name
300+
):
301+
Span.set_extension(new_name, default=None)
302+
303+
if new_name:
304+
value = True if value is None else value
305+
if not self.keep_raw_attribute_values:
306+
value = (
307+
True
308+
if value in ("True", "true")
309+
else False
310+
if value in ("False", "false")
311+
else value
312+
)
313+
span._.set(new_name, value)
314+
315+
entities.setdefault(ent["entity_id"], []).append(span)
316+
spans.append(span)
317+
318+
set_spans(doc, spans, span_setter=self.span_setter)
319+
for attr, value in self.default_attributes.items():
320+
for span in spans:
321+
if span._.get(attr) is None:
322+
span._.set(attr, value)
323+
324+
for relation in obj.get("relations", []):
325+
relation_label = (
326+
relation["relation_label"]
327+
if "relation_label" in relation
328+
else relation["label"]
329+
)
330+
from_entity_id = relation["from_entity_id"]
331+
to_entity_id = relation["to_entity_id"]
332+
333+
for head in entities.get(from_entity_id, ()):
334+
for tail in entities.get(to_entity_id, ()):
335+
head._.rel.setdefault(relation_label, set()).add(tail)
336+
except Exception:
337+
raise ValueError(f"Error when processing {note_id}")
324338

325339
return doc
326340

edsnlp/data/standoff.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+?) ([TE]\d+)(?: (.+))?$")
3333
REGEX_EVENT = re.compile(r"^(E\d+)\t(.+)$")
3434
REGEX_EVENT_PART = re.compile(r"(\S+):([TE]\d+)")
35+
REGEX_STATUS = re.compile(r"^(#\d+)\tStatus ([^\t]+)\t(.*)$")
3536

3637

3738
class BratParsingError(ValueError):
@@ -71,6 +72,7 @@ def parse_standoff_file(
7172
entities = {}
7273
relations = []
7374
events = {}
75+
doc = {}
7476

7577
with fs.open(txt_path, "r", encoding="utf-8") as f:
7678
text = f.read()
@@ -178,6 +180,11 @@ def parse_standoff_file(
178180
"arguments": arguments,
179181
}
180182
elif line.startswith("#"):
183+
match = REGEX_STATUS.match(line)
184+
if match:
185+
comment = match.group(3)
186+
doc["status"] = comment
187+
continue
181188
match = REGEX_NOTE.match(line)
182189
if match is None:
183190
raise BratParsingError(ann_file, line)
@@ -201,6 +208,7 @@ def parse_standoff_file(
201208
"entities": list(entities.values()),
202209
"relations": relations,
203210
"events": list(events.values()),
211+
**doc,
204212
}
205213

206214

@@ -260,19 +268,19 @@ def dump_standoff_file(
260268
)
261269
attribute_idx += 1
262270

263-
# fmt: off
264-
if "relations" in doc:
265-
for i, relation in enumerate(doc["relations"]):
266-
entity_from = entities_ids[relation["from_entity_id"]]
267-
entity_to = entities_ids[relation["to_entity_id"]]
268-
print(
269-
"R{}\t{} Arg1:{} Arg2:{}\t".format(
270-
i + 1, str(relation["label"]), entity_from,
271-
entity_to
272-
),
273-
file=f,
274-
)
275-
# fmt: on
271+
# fmt: off
272+
if "relations" in doc:
273+
for i, relation in enumerate(doc["relations"]):
274+
entity_from = entities_ids[relation["from_entity_id"]]
275+
entity_to = entities_ids[relation["to_entity_id"]]
276+
print(
277+
"R{}\t{} Arg1:{} Arg2:{}\t".format(
278+
i + 1, str(relation["label"]), entity_from,
279+
entity_to
280+
),
281+
file=f,
282+
)
283+
# fmt: on
276284

277285

278286
class StandoffReader(FileBasedReader):

edsnlp/metrics/relations.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ def relations_scorer(
5151
head_getter = candidate["head"]
5252
tail_getter = candidate["tail"]
5353
labels = candidate["labels"]
54+
symmetric = candidate.get("symmetric") or False
55+
label_filter = candidate.get("label_filter")
5456
for eg_idx, eg in enumerate(examples):
5557
pred_heads = [
5658
((h.start, h.end, h.label_), h)
@@ -61,9 +63,21 @@ def relations_scorer(
6163
for t in get_spans(eg.predicted, tail_getter)
6264
]
6365
for (h_key, head), (t_key, tail) in product(pred_heads, pred_tails):
66+
if (
67+
label_filter is not None
68+
and head.label_ not in label_filter
69+
or tail.label_ not in label_filter
70+
):
71+
continue
6472
total_pred_count += 1
6573
for label in labels:
66-
if tail in head._.rel.get(label, ()):
74+
if (
75+
tail in head._.rel.get(label, ())
76+
or symmetric
77+
and head in tail._.rel.get(label, ())
78+
):
79+
if symmetric and h_key > t_key:
80+
h_key, t_key = t_key, h_key
6781
annotations[label][0].add((eg_idx, h_key, t_key, label))
6882
annotations[micro_key][0].add((eg_idx, h_key, t_key, label))
6983

@@ -78,7 +92,13 @@ def relations_scorer(
7892
for (h_key, head), (t_key, tail) in product(gold_heads, gold_tails):
7993
total_gold_count += 1
8094
for label in labels:
81-
if tail in head._.rel.get(label, ()):
95+
if (
96+
tail in head._.rel.get(label, ())
97+
or symmetric
98+
and head in tail._.rel.get(label, ())
99+
):
100+
if symmetric and h_key > t_key:
101+
h_key, t_key = t_key, h_key
82102
annotations[label][1].add((eg_idx, h_key, t_key, label))
83103
annotations[micro_key][1].add((eg_idx, h_key, t_key, label))
84104

edsnlp/pipes/base.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,10 +208,6 @@ def qualifiers(self, value): # pragma: no cover
208208

209209

210210
class BaseRelationDetectorComponent(BaseComponent, abc.ABC):
211-
head_getter: SpanGetter
212-
tail_getter: SpanGetter
213-
labels: List[str]
214-
215211
def __init__(
216212
self,
217213
nlp: PipelineProtocol = None,
@@ -226,6 +222,13 @@ def __init__(
226222
"head": validate_span_getter(candidate["head"]),
227223
"tail": validate_span_getter(candidate["tail"]),
228224
"labels": candidate["labels"],
225+
"label_filter": {
226+
head: set(tail_labels)
227+
for head, tail_labels in candidate["label_filter"].items()
228+
}
229+
if candidate.get("label_filter")
230+
else None,
231+
"symmetric": candidate.get("symmetric") or False,
229232
}
230233
for candidate in candidate_getter
231234
]

edsnlp/pipes/trainable/embeddings/span_pooler/span_pooler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,9 @@ def forward(self, batch: SpanPoolerBatchInput) -> SpanPoolerBatchOutput:
210210
"embeddings": batch["begins"].with_data(span_embeds),
211211
}
212212

213-
embeds = self.embedding(batch["embedding"])["embeddings"]
213+
embeds = self.embedding(batch["embedding"])["embeddings"].refold(
214+
["context", "word"]
215+
)
214216
_, n_words, dim = embeds.shape
215217
device = embeds.device
216218

0 commit comments

Comments (0)