Skip to content

Commit 55433ab

Browse files
committed
adding possibility to select cim10 and atc in eds.cim10 and eds.drugs
1 parent fa135e6 commit 55433ab

File tree

4 files changed

+33
-7
lines changed

4 files changed

+33
-7
lines changed

edsnlp/pipes/ner/cim10/factory.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Dict
1+
from typing import Any, Dict, List
22

33
from typing_extensions import Literal
44

@@ -28,6 +28,7 @@ def create_component(
2828
name: str = "cim10",
2929
*,
3030
attr: str = "NORM",
31+
cim10: List[str] = None,
3132
ignore_excluded: bool = False,
3233
ignore_space_tokens: bool = False,
3334
term_matcher: Literal["exact", "simstring"] = "exact",
@@ -75,6 +76,9 @@ def create_component(
7576
The pipeline object
7677
name : str
7778
The name of the component
79+
cim10 : Optional[List[str]]
80+
List of CIM-10 codes or code prefixes to retrieve. If None or empty, all codes will be searched,
81+
resulting in higher computation time.
7882
attr : str
7983
The default attribute to use for matching.
8084
ignore_excluded : bool
@@ -104,7 +108,7 @@ def create_component(
104108
nlp=nlp,
105109
name=name,
106110
regex=dict(),
107-
terms=get_patterns(),
111+
terms=get_patterns(cim10),
108112
attr=attr,
109113
ignore_excluded=ignore_excluded,
110114
ignore_space_tokens=ignore_space_tokens,

edsnlp/pipes/ner/cim10/patterns.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,14 @@
55
from edsnlp import BASE_DIR
66

77

8-
def get_patterns() -> Dict[str, List[str]]:
8+
def filter_dict_by_keys(
    D: Dict[str, List[str]], L: List[str]
) -> Dict[str, List[str]]:
    """
    Keep only the entries of a dictionary whose key starts with a given prefix.

    Parameters
    ----------
    D : Dict[str, List[str]]
        Dictionary to filter; keys are matched against the prefixes.
    L : List[str]
        Prefixes to keep. A key is retained when it starts with at least one
        of them. An empty list retains nothing.

    Returns
    -------
    Dict[str, List[str]]
        A new dictionary holding the retained entries; `D` is left untouched.
    """
    # `str.startswith` accepts a tuple of prefixes, so the "any prefix
    # matches" test is a single call per key. An empty tuple never matches,
    # which is the same result as `any()` over an empty list.
    prefixes = tuple(L)
    return {k: v for k, v in D.items() if k.startswith(prefixes)}
13+
14+
15+
def get_patterns(cim10: List[str] = None) -> Dict[str, List[str]]:
916
df = pd.read_csv(BASE_DIR / "resources" / "cim10.csv.gz")
1017

1118
df["code_pattern"] = df["code"]
@@ -30,4 +37,6 @@ def get_patterns() -> Dict[str, List[str]]:
3037

3138
patterns = df.groupby("code")["patterns"].agg(list).to_dict()
3239

40+
patterns = filter_dict_by_keys(patterns, cim10) if cim10 else patterns
41+
3342
return patterns

edsnlp/pipes/ner/drugs/factory.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Any, Dict
1+
from typing import Any, Dict, List
22

33
from typing_extensions import Literal
44

@@ -28,6 +28,7 @@ def create_component(
2828
name: str = "drugs",
2929
*,
3030
attr: str = "NORM",
31+
atc: List[str] = None,
3132
ignore_excluded: bool = False,
3233
ignore_space_tokens: bool = False,
3334
term_matcher: Literal["exact", "simstring"] = "exact",
@@ -83,6 +84,9 @@ def create_component(
8384
The name of the component
8485
attr : str
8586
The default attribute to use for matching.
87+
atc : Optional[List[str]]
88+
List of ATC codes or code prefixes to retrieve. If None or empty, all codes will be searched,
89+
resulting in higher computation time.
8690
ignore_excluded : bool
8791
Whether to skip excluded tokens (requires an upstream
8892
pipeline to mark excluded tokens).
@@ -111,7 +115,7 @@ def create_component(
111115
nlp=nlp,
112116
name=name,
113117
regex=dict(),
114-
terms=get_patterns(),
118+
terms=get_patterns(atc),
115119
attr=attr,
116120
ignore_excluded=ignore_excluded,
117121
ignore_space_tokens=ignore_space_tokens,

edsnlp/pipes/ner/drugs/patterns.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@
66
drugs_file = BASE_DIR / "resources" / "drugs.json"
77

88

9-
def get_patterns() -> Dict[str, List[str]]:
9+
def filter_dict_by_keys(
    D: Dict[str, List[str]], L: List[str]
) -> Dict[str, List[str]]:
    """
    Keep only the entries of a dictionary whose key starts with a given prefix.

    Parameters
    ----------
    D : Dict[str, List[str]]
        Dictionary to filter; keys are matched against the prefixes.
    L : List[str]
        Prefixes to keep. A key is retained when it starts with at least one
        of them. An empty list retains nothing.

    Returns
    -------
    Dict[str, List[str]]
        A new dictionary holding the retained entries; `D` is left untouched.
    """
    # `str.startswith` accepts a tuple of prefixes, so the "any prefix
    # matches" test is a single call per key. An empty tuple never matches,
    # which is the same result as `any()` over an empty list.
    prefixes = tuple(L)
    return {k: v for k, v in D.items() if k.startswith(prefixes)}
14+
15+
16+
def get_patterns(atc: List[str] = None) -> Dict[str, List[str]]:
    """
    Load the drug patterns, optionally restricted to some codes.

    Parameters
    ----------
    atc : List[str], optional
        Code prefixes to keep. When None or empty, every entry of the
        resource file is returned. The default keeps the previous
        zero-argument call `get_patterns()` working.

    Returns
    -------
    Dict[str, List[str]]
        The content of the drugs resource file, filtered on key prefix when
        `atc` is provided.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(drugs_file, "r", encoding="utf-8") as f:
        patterns = json.load(f)

    # Only filter when at least one prefix was requested.
    if atc:
        patterns = filter_dict_by_keys(patterns, atc)
    return patterns

0 commit comments

Comments
 (0)