# Using an LLM as a span qualifier

In this tutorial, we will learn how to use the `LLMSpanClassifier` pipe to qualify spans.
You should first install the extra dependencies in a Python environment (python>=3.8):

```bash
pip install edsnlp[llm]
```

## Using a local LLM server
We assume that an LLM server compatible with the OpenAI API is available.
For example, using the vLLM library, you can launch an LLM server from the command line as follows:
```bash
vllm serve Qwen/Qwen3-8B --port 8000 --enable-prefix-caching --tensor-parallel-size 1 --max-num-seqs=10 --max-num-batched-tokens=35000
```
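
Before going further, you can check that the server answers. Here is a minimal sketch, assuming the `openai` Python client is installed and the server runs on the port used above:

```{ .python .no-check }
from openai import OpenAI

# List the models served locally to make sure the endpoint is reachable
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY_API_KEY")
print([model.id for model in client.models.list()])
```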

## Using an external API
You can also use the [OpenAI API](https://openai.com/index/openai-api/) or the [Groq API](https://groq.com/).

!!! warning

    As you are probably working with sensitive medical data, please check whether you can use an external API or whether you need to expose an API within your own infrastructure.

## Import dependencies
```{ .python .no-check }
from datetime import datetime

import pandas as pd

import edsnlp
import edsnlp.pipes as eds
from edsnlp.pipes.qualifiers.llm.llm_qualifier import LLMSpanClassifier
from edsnlp.utils.span_getters import make_span_context_getter
```
## Define prompt and examples
```{ .python .no-check }
task_prompts = {
    0: {
        "normalized_task_name": "biopsy_procedure",
        "system_prompt": "You are a medical assistant and you will help answering questions about dates present in clinical notes. Don't answer reasoning. "
        + "We are interested in detecting biopsy dates (either procedure, analysis or result). "
        + "You should answer in a JSON object following this schema {'biopsy':bool}. "
        + "If there is not enough information, answer {'biopsy':'False'}."
        + "\n\n#### Examples:\n",
        "examples": [
            (
                "07/12/2020",
                "07/12/2020 : Anapath / biopsies rectales : Muqueuse rectale normale sous réserve de fragments de petite taille.",
                "{'biopsy':'True'}",
            ),
            (
                "24/12/2021",
                "Chirurgie 24/12/2021 : Colectomie gauche + anastomose colo rectale + clearance hépatique gauche (une méta posée sur",
                "{'biopsy':'False'}",
            ),
        ],
        "prefix_prompt": "\nDetermine if '{span}' corresponds to a biopsy date. The text is as follows:\n<<< ",
        "suffix_prompt": " >>>",
        "json_schema": {
            "properties": {
                "biopsy": {"title": "Biopsy", "type": "boolean"},
            },
            "required": [
                "biopsy",
            ],
            "title": "DateModel",
            "type": "object",
        },
        "response_mapping": {
            "(?i)(oui)|(yes)|(true)": "1",
            "(?i)(non)|(no)|(false)|(don't)|(not)": "0",
        },
    },
}
```
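
The `response_mapping` field associates regular expressions over the raw model answer with normalized labels. The exact matching logic is handled by the pipe itself when a mapping is provided; the snippet below is only an illustrative sketch of how such patterns behave on a raw answer:

```{ .python .no-check }
import re

raw_answer = "{'biopsy':'True'}"
mapping = task_prompts[0]["response_mapping"]
# Illustration only: take the value of the first pattern matching the raw answer
label = next(
    (value for pattern, value in mapping.items() if re.search(pattern, raw_answer)),
    None,
)
print(label)  # "1"
```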

## Format these examples for few-shot learning
```{ .python .no-check }
def format_examples(raw_examples, prefix_prompt, suffix_prompt):
    examples = []

    for date, context, answer in raw_examples:
        prompt = prefix_prompt.format(span=date) + context + suffix_prompt
        examples.append((prompt, answer))

    return examples
```
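
For instance, applying this helper to the task defined above yields the (prompt, answer) pairs used as few-shot examples:

```{ .python .no-check }
demo_task = task_prompts[0]
demo_examples = format_examples(
    demo_task["examples"],
    demo_task["prefix_prompt"],
    demo_task["suffix_prompt"],
)
# Each element is a (formatted prompt, expected answer) pair
print(demo_examples[0][0])
print(demo_examples[0][1])
```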

## Set parameters and prompts
```{ .python .no-check }
# Set prompt
prompt_id = 0
raw_examples = task_prompts.get(prompt_id).get("examples")
prefix_prompt = task_prompts.get(prompt_id).get("prefix_prompt")
user_prompt = task_prompts.get(prompt_id).get("user_prompt")  # not defined for this task, stays None
system_prompt = task_prompts.get(prompt_id).get("system_prompt")
suffix_prompt = task_prompts.get(prompt_id).get("suffix_prompt")
examples = format_examples(raw_examples, prefix_prompt, suffix_prompt)

# Define the JSON schema used to constrain the model output
response_format = {
    "type": "json_schema",
    "json_schema": {
        "name": "DateModel",
        # "strict": True,
        "schema": task_prompts.get(prompt_id)["json_schema"],
    },
}

# Set parameters
response_mapping = None
max_tokens = 200
extra_body = {
    # "chat_template_kwargs": {"enable_thinking": False},
}
temperature = 0
```
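
As a sanity check, you can verify that a sample answer complies with the schema passed in `response_format`. A small sketch, assuming the `jsonschema` package is available in your environment:

```{ .python .no-check }
from jsonschema import validate

# Raises a ValidationError if the sample answer does not match the schema
validate(instance={"biopsy": True}, schema=task_prompts[prompt_id]["json_schema"])
```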

=== "For local serving"

    ```{ .python .no-check }
    ### For local serving
    model_name = "Qwen/Qwen3-8B"
    api_url = "http://localhost:8000/v1"
    api_key = "EMPTY_API_KEY"
    ```

=== "Using the Groq API"

    !!! warning

        This section involves the use of an external API. Please ensure you have the necessary credentials and understand the potential risks associated with external API usage.

    ```{ .python .no-check }
    ### Using the Groq API
    model_name = "openai/gpt-oss-20b"
    api_url = "https://api.groq.com/openai/v1"
    api_key = "TOKEN"  # your API key
    ```

## Define the pipeline
```{ .python .no-check }
nlp = edsnlp.blank("eds")
nlp.add_pipe("sentencizer")
nlp.add_pipe(eds.dates())
nlp.add_pipe(
    LLMSpanClassifier(
        name="llm",
        model=model_name,
        span_getter=["dates"],
        attributes={"_.biopsy_procedure": True},
        context_getter=make_span_context_getter(
            context_sents=(3, 3),
            context_words=(1, 1),
        ),
        prompt=dict(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            prefix_prompt=prefix_prompt,
            suffix_prompt=suffix_prompt,
            examples=examples,
        ),
        api_params=dict(
            max_tokens=max_tokens,
            temperature=temperature,
            response_format=response_format,
            extra_body=extra_body,
        ),
        api_url=api_url,
        api_key=api_key,
        response_mapping=response_mapping,
        n_concurrent_tasks=4,
    )
)
```

## Apply it on a document

```{ .python .no-check }
# Let's try with a fake LLM-generated text
text = """
Centre Hospitalier Départemental – RCP Prostate – 20/02/2025

M. Bernard P., 69 ans, retraité, consulte après avoir noté une faiblesse du jet urinaire et des levers nocturnes répétés depuis un an. PSA à 15,2 ng/mL (05/02/2025). TR : nodule ferme sur lobe gauche.

IRM multiparamétrique du 10/02/2025 : lésion PIRADS 5, 2,1 cm, atteinte de la capsule suspectée.
Biopsies du 12/02/2025 : adénocarcinome Gleason 4+4=8, toutes les carottes gauches positives.
Scanner TAP et scintigraphie osseuse du 14/02 : absence de métastases viscérales ou osseuses.

En RCP du 20/02/2025, patient classé cT3a N0 M0, haut risque. Décision : radiothérapie externe + hormonothérapie longue (24 mois). Planification de la simulation scanner le 25/02.
"""
```

```{ .python .no-check }
t0 = datetime.now()
doc = nlp(text)
t1 = datetime.now()
print("Execution time", t1 - t0)

for span in doc.spans["dates"]:
    print(span, span._.biopsy_procedure)
```

Let's check the type of the returned attribute:
```{ .python .no-check }
type(span._.biopsy_procedure)
```
## Apply on multiple documents
```{ .python .no-check }
texts = [
    text,
] * 2

notes = pd.DataFrame({"note_id": range(len(texts)), "note_text": texts})
docs = edsnlp.data.from_pandas(notes, nlp=nlp, converter="omop")
predicted_docs = docs.map_pipeline(nlp, 2)
```

```{ .python .no-check }
t0 = datetime.now()
note_nlp = edsnlp.data.to_pandas(
    predicted_docs,
    converter="ents",
    span_getter="dates",
    span_attributes=[
        "biopsy_procedure",
    ],
)
t1 = datetime.now()
print("Execution time", t1 - t0)
note_nlp.head()
```