Skip to content

Commit bb86cc0

Browse files
committed
feat: add ignore_space_tokens to relevant components and update docs & tests
1 parent 1186444 commit bb86cc0

File tree

25 files changed

+443
-126
lines changed

25 files changed

+443
-126
lines changed

changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Added
66

77
- Add `eds.spaces` (or `eds.normalizer` with `spaces=True`) to detect space tokens, and add `ignore_space_tokens` to `EDSPhraseMatcher` and `SimstringMatcher` to skip them
8+
- Add `ignore_space_tokens` option in most components
89

910
## v0.8.0 (2023-03-09)
1011

docs/pipelines/core/normalisation.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ Moreover, every span exposes a `normalized_variant` extension getter, which comp
6262
The pipeline can be configured using the following parameters :
6363

6464
::: edsnlp.pipelines.core.normalizer.factory.create_component
65-
options:
66-
only_parameters: true
65+
options:
66+
only_parameters: true
6767

6868
## Pipelines
6969

docs/pipelines/core/terminology.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,9 @@ This snippet is complete, and should run as is.
5555

5656
The pipeline can be configured using the following parameters :
5757

58-
| Parameter | Explanation | Default |
59-
| ----------------- | ------------------------------------------------ | ----------------------- |
60-
| `label` | Top-level label. | Required |
61-
| `terms` | Terms patterns. Expects a dictionary. | `None` (use regex only) |
62-
| `regex` | RegExp patterns. Expects a dictionary. | `None` (use terms only) |
63-
| `attr` | spaCy attribute to match on (eg `NORM`, `LOWER`) | `"TEXT"` |
64-
| `ignore_excluded` | Whether to skip excluded tokens during matching | `False` |
58+
::: edsnlp.pipelines.core.terminology.factory.create_component
59+
options:
60+
only_parameters: true
6561

6662
Patterns, be they `terms` or `regex`, are defined as dictionaries where keys become the `kb_id_` of the extracted entities.
6763
Dictionary values are either a single expression or a list of expressions that match the concept (see [example](#usage)).

edsnlp/pipelines/core/contextual_matcher/contextual_matcher.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ class ContextualMatcher(BaseComponent):
5151
Attribute to match on, eg `TEXT`, `NORM`, etc.
5252
ignore_excluded : bool
5353
Whether to skip excluded tokens during matching.
54+
ignore_space_tokens: bool
55+
Whether to skip space tokens during matching.
5456
alignment_mode : str
5557
Overwrite alignment mode.
5658
regex_flags : Union[re.RegexFlag, int]
@@ -65,12 +67,13 @@ def __init__(
6567
nlp: Language,
6668
name: str,
6769
patterns: Union[Dict[str, Any], List[Dict[str, Any]]],
68-
assign_as_span: bool,
69-
alignment_mode: str,
70-
attr: str,
71-
regex_flags: Union[re.RegexFlag, int],
72-
ignore_excluded: bool,
73-
include_assigned: bool,
70+
assign_as_span: bool = False,
71+
alignment_mode: str = "expand",
72+
attr: str = "NORM",
73+
regex_flags: Union[re.RegexFlag, int] = 0,
74+
ignore_excluded: bool = False,
75+
ignore_space_tokens: bool = False,
76+
include_assigned: bool = False,
7477
):
7578
self.name = name
7679
self.nlp = nlp
@@ -160,6 +163,7 @@ def __init__(
160163
attr=p["regex_attr"] or self.attr,
161164
flags=p["regex_flags"] or self.regex_flags,
162165
ignore_excluded=ignore_excluded,
166+
ignore_space_tokens=ignore_space_tokens,
163167
alignment_mode=alignment_mode,
164168
span_from_group=True,
165169
)
@@ -290,8 +294,9 @@ def assign_one(self, span: Span) -> Span:
290294
end_char=match.end(0),
291295
key=matcher["matcher"].regex[0][0],
292296
attr=matcher["matcher"].regex[0][2],
293-
alignment_mode=matcher["matcher"].regex[0][4],
297+
alignment_mode=matcher["matcher"].regex[0][5],
294298
ignore_excluded=matcher["matcher"].regex[0][3],
299+
ignore_space_tokens=matcher["matcher"].regex[0][4],
295300
),
296301
)
297302
for (span, match) in assigned_list

edsnlp/pipelines/core/contextual_matcher/factory.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
DEFAULT_CONFIG = dict(
1010
attr="NORM",
1111
ignore_excluded=False,
12+
ignore_space_tokens=False,
1213
regex_flags=0,
1314
alignment_mode="expand",
1415
assign_as_span=False,
@@ -28,6 +29,7 @@ def create_component(
2829
alignment_mode: str,
2930
attr: str,
3031
ignore_excluded: bool,
32+
ignore_space_tokens: bool,
3133
regex_flags: Union[re.RegexFlag, int],
3234
include_assigned: bool,
3335
):
@@ -68,6 +70,7 @@ def create_component(
6870
alignment_mode,
6971
attr=attr,
7072
ignore_excluded=ignore_excluded,
73+
ignore_space_tokens=ignore_space_tokens,
7174
regex_flags=regex_flags,
7275
include_assigned=include_assigned,
7376
)

edsnlp/pipelines/core/endlines/endlines.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def __init__(
4949
new_line=r"\n+",
5050
),
5151
ignore_excluded=False,
52+
ignore_space_tokens=False,
5253
**kwargs,
5354
)
5455

edsnlp/pipelines/core/matcher/factory.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
regex=None,
1212
attr="TEXT",
1313
ignore_excluded=False,
14+
ignore_space_tokens=False,
1415
term_matcher=GenericTermMatcher.exact,
1516
term_matcher_config={},
1617
)
@@ -27,14 +28,45 @@
2728
)
2829
def create_component(
2930
nlp: Language,
30-
name: str,
31-
terms: Optional[Dict[str, Union[str, List[str]]]],
32-
attr: Union[str, Dict[str, str]],
33-
regex: Optional[Dict[str, Union[str, List[str]]]],
34-
ignore_excluded: bool,
35-
term_matcher: GenericTermMatcher,
36-
term_matcher_config: Dict[str, Any],
31+
name: str = "eds.matcher",
32+
terms: Optional[Dict[str, Union[str, List[str]]]] = None,
33+
attr: Union[str, Dict[str, str]] = None,
34+
regex: Optional[Dict[str, Union[str, List[str]]]] = "TEXT",
35+
ignore_excluded: bool = False,
36+
ignore_space_tokens: bool = False,
37+
term_matcher: GenericTermMatcher = GenericTermMatcher.exact,
38+
term_matcher_config: Dict[str, Any] = {},
3739
):
40+
"""
41+
Provides a generic matcher component.
42+
43+
Parameters
44+
----------
45+
nlp : Language
46+
The spaCy object.
47+
name: str
48+
The name of the component.
49+
terms : Optional[Patterns]
50+
A dictionary of terms.
51+
regex : Optional[Patterns]
52+
A dictionary of regular expressions.
53+
attr : str
54+
The default attribute to use for matching.
55+
Can be overridden using the `terms` and `regex` configurations.
56+
ignore_excluded : bool
57+
Whether to skip excluded tokens (requires an upstream
58+
pipeline to mark excluded tokens).
59+
ignore_space_tokens: bool
60+
Whether to skip space tokens during matching.
61+
62+
You won't be able to match on newlines if this is enabled and
63+
the "spaces"/"newline" option of `eds.normalizer` is enabled (by default).
64+
term_matcher: GenericTermMatcher
65+
The matcher to use for matching phrases.
66+
One of (exact, simstring)
67+
term_matcher_config: Dict[str,Any]
68+
Parameters of the matcher class
69+
"""
3870
assert not (terms is None and regex is None)
3971

4072
if terms is None:
@@ -48,6 +80,7 @@ def create_component(
4880
attr=attr,
4981
regex=regex,
5082
ignore_excluded=ignore_excluded,
83+
ignore_space_tokens=ignore_space_tokens,
5184
term_matcher=term_matcher,
5285
term_matcher_config=term_matcher_config,
5386
)

edsnlp/pipelines/core/matcher/matcher.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ class GenericMatcher(BaseComponent):
3535
ignore_excluded : bool
3636
Whether to skip excluded tokens (requires an upstream
3737
pipeline to mark excluded tokens).
38-
ignore_excluded : bool
39-
Whether to skip space tokens (requires an upstream
40-
pipeline to mark space tokens).
38+
ignore_space_tokens: bool
39+
Whether to skip space tokens during matching.
40+
41+
You won't be able to match on newlines if this is enabled and
42+
the "spaces"/"newline" option of `eds.normalizer` is enabled (by default).
4143
term_matcher: GenericTermMatcher
4244
The matcher to use for matching phrases.
4345
One of (exact, simstring)
@@ -86,6 +88,7 @@ def __init__(
8688
self.regex_matcher = RegexMatcher(
8789
attr=attr,
8890
ignore_excluded=ignore_excluded,
91+
ignore_space_tokens=ignore_space_tokens,
8992
)
9093

9194
self.phrase_matcher.build_patterns(nlp=nlp, terms=terms)

edsnlp/pipelines/core/terminology/factory.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66

77
DEFAULT_CONFIG = dict(
88
terms=None,
9-
regex=None,
109
attr="TEXT",
10+
regex=None,
1111
ignore_excluded=False,
12+
ignore_space_tokens=False,
1213
term_matcher="exact",
1314
term_matcher_config={},
1415
)
@@ -21,29 +22,59 @@
2122
)
2223
def create_component(
2324
nlp: Language,
24-
name: str,
2525
label: str,
2626
terms: Optional[Dict[str, Union[str, List[str]]]],
27-
attr: Union[str, Dict[str, str]],
28-
regex: Optional[Dict[str, Union[str, List[str]]]],
29-
ignore_excluded: bool,
30-
term_matcher: TerminologyTermMatcher,
31-
term_matcher_config: Dict[str, Any],
27+
name: str = "eds.terminology",
28+
attr: Union[str, Dict[str, str]] = "TEXT",
29+
regex: Optional[Dict[str, Union[str, List[str]]]] = None,
30+
ignore_excluded: bool = False,
31+
ignore_space_tokens: bool = False,
32+
term_matcher: TerminologyTermMatcher = "exact",
33+
term_matcher_config: Dict[str, Any] = {},
3234
):
33-
assert not (terms is None and regex is None)
35+
"""
36+
Provides a terminology matching component.
3437
35-
if terms is None:
36-
terms = dict()
37-
if regex is None:
38-
regex = dict()
38+
The terminology matching component differs from the simple matcher component in that
39+
the `regex` and `terms` keys are used as spaCy's `kb_id`. All matched entities
40+
have the same label, defined in the top-level constructor (argument `label`).
41+
42+
Parameters
43+
----------
44+
nlp : Language
45+
The spaCy object.
46+
name: str
47+
The name of the component.
48+
label : str
49+
Top-level label
50+
terms : Optional[Patterns]
51+
A dictionary of terms.
52+
regex : Optional[Patterns]
53+
A dictionary of regular expressions.
54+
attr : str
55+
The default attribute to use for matching.
56+
Can be overridden using the `terms` and `regex` configurations.
57+
ignore_excluded : bool
58+
Whether to skip excluded tokens (requires an upstream
59+
pipeline to mark excluded tokens).
60+
ignore_space_tokens: bool
61+
Whether to skip space tokens during matching.
62+
term_matcher: TerminologyTermMatcher
63+
The matcher to use for matching phrases.
64+
One of (exact, simstring)
65+
term_matcher_config: Dict[str,Any]
66+
Parameters of the matcher class
67+
"""
68+
assert not (terms is None and regex is None)
3969

4070
return TerminologyMatcher(
4171
nlp,
4272
label=label,
43-
terms=terms,
73+
terms=terms or dict(),
4474
attr=attr,
45-
regex=regex,
75+
regex=regex or dict(),
4676
ignore_excluded=ignore_excluded,
77+
ignore_space_tokens=ignore_space_tokens,
4778
term_matcher=term_matcher,
4879
term_matcher_config=term_matcher_config,
4980
)

edsnlp/pipelines/core/terminology/terminology.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ class TerminologyMatcher(BaseComponent):
4242
ignore_excluded : bool
4343
Whether to skip excluded tokens (requires an upstream
4444
pipeline to mark excluded tokens).
45+
ignore_space_tokens: bool
46+
Whether to skip space tokens during matching.
4547
term_matcher: TerminologyTermMatcher
4648
The matcher to use for matching phrases ?
4749
One of (exact, simstring)
@@ -57,6 +59,7 @@ def __init__(
5759
regex: Optional[Patterns],
5860
attr: str,
5961
ignore_excluded: bool,
62+
ignore_space_tokens: bool = False,
6063
term_matcher: TerminologyTermMatcher = TerminologyTermMatcher.exact,
6164
term_matcher_config=None,
6265
):
@@ -72,13 +75,15 @@ def __init__(
7275
self.nlp.vocab,
7376
attr=attr,
7477
ignore_excluded=ignore_excluded,
78+
ignore_space_tokens=ignore_space_tokens,
7579
**(term_matcher_config or {}),
7680
)
7781
elif term_matcher == TerminologyTermMatcher.simstring:
7882
self.phrase_matcher = SimstringMatcher(
7983
vocab=self.nlp.vocab,
8084
attr=attr,
8185
ignore_excluded=ignore_excluded,
86+
ignore_space_tokens=ignore_space_tokens,
8287
**(term_matcher_config or {}),
8388
)
8489
else:
@@ -90,6 +95,7 @@ def __init__(
9095
self.regex_matcher = RegexMatcher(
9196
attr=attr,
9297
ignore_excluded=ignore_excluded,
98+
ignore_space_tokens=ignore_space_tokens,
9399
)
94100

95101
self.phrase_matcher.build_patterns(nlp=nlp, terms=terms, progress=True)

0 commit comments

Comments
 (0)