Skip to content

Commit 827e105

Browse files
authored
patient intake structured extraction example (#680)
1 parent 7105afc commit 827e105

File tree

9 files changed

+207
-0
lines changed

9 files changed

+207
-0
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Postgres database address for cocoindex
2+
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
3+
4+
OPENAI_API_KEY=
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Extract structured data from patient intake forms with LLM
2+
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
3+
4+
5+
This example shows how to use the OpenAI API to extract structured data from patient intake forms in different formats (PDF, DOCX, etc.) stored in a local directory (`data/patient_forms`).
6+
7+
We appreciate a star ⭐ at [CocoIndex GitHub](https://github.com/cocoindex-io/cocoindex) if this is helpful.
8+
9+
![Structured Data From Patient Intake Forms](https://github.com/user-attachments/assets/1f6afb69-d26d-4a08-8774-13982d6aec1e)
10+
11+
12+
## Tutorials
13+
- Step-by-step tutorial - Check out the [blog](https://cocoindex.io/blogs/patient-intake-form-extraction-with-llm).
14+
- Video tutorial - [YouTube](https://youtu.be/_mjlwVtnBn0?si=cpH-4kkOAYm2HhK6).
15+
16+
## Prerequisites
17+
1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
18+
19+
2. Install CocoIndex
20+
```bash
21+
pip install -U cocoindex
22+
```
23+
24+
3. Install MarkItDown
25+
```bash
26+
pip install 'markitdown[all]'
27+
```
28+
4. Create a `.env` file from `.env.example`, and fill in `OPENAI_API_KEY`.
29+
30+
## Run
31+
32+
Setup index:
33+
34+
```bash
35+
cocoindex setup main.py
36+
```
37+
38+
Update index:
39+
40+
```bash
41+
cocoindex update main.py
42+
```
43+
44+
Run query:
45+
46+
```bash
47+
python main.py
48+
```
49+
50+
Run with CocoInsight:
51+
```bash
52+
cocoindex server -ci main.py
53+
```
54+
<img width="1405" alt="Screenshot 2025-07-02 at 11 59 24 AM" src="https://github.com/user-attachments/assets/6f5154cd-8a53-4baa-b914-cd60ffecf3d4" />
55+
56+
57+
58+
View results at https://cocoindex.io/cocoinsight
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
## Note:
2+
The example files here are entirely synthetic (they contain no real patient data) and are for testing purposes only.
3+
Please do not use these examples for any other purpose.
4+
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import datetime
2+
import tempfile
3+
import dataclasses
4+
import os
5+
6+
from markitdown import MarkItDown
7+
from openai import OpenAI
8+
9+
import cocoindex
10+
11+
# A contact person; used as the type of Patient.emergency_contact.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Contact:
    name: str
    phone: str
    # Relationship of this contact to the patient (free text).
    relationship: str
16+
17+
# A postal address; used by Patient.address and Pharmacy.address.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Address:
    street: str
    city: str
    state: str
    # Kept as a string (not an int) so the value is stored exactly as written.
    zip_code: str
23+
24+
# A pharmacy record; used as the type of Patient.pharmacy.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Pharmacy:
    name: str
    phone: str
    address: Address
29+
30+
@dataclasses.dataclass
31+
class Insurance:
32+
provider: str
33+
policy_number: str
34+
group_number: str | None
35+
policyholder_name: str
36+
relationship_to_patient: str
37+
38+
# A medical condition; element type of Patient.past_conditions.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Condition:
    name: str
    # Whether the condition has been diagnosed.
    diagnosed: bool
42+
43+
# A medication entry; element type of Patient.current_medications.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Medication:
    name: str
    # Free-text dosage.
    dosage: str
47+
48+
# An allergy entry; element type of Patient.allergies.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Allergy:
    name: str
51+
52+
# A surgery entry; element type of Patient.surgeries.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Surgery:
    name: str
    # Stored as a string, unlike Patient.dob, which is a datetime.date.
    date: str
56+
57+
# Top-level record for one intake form; passed as `output_type` to
# ExtractByLlm in the flow definition.
# NOTE: documented with comments rather than a docstring so the
# auto-generated dataclass __doc__ stays unchanged.
@dataclasses.dataclass
class Patient:
    # --- Identity and contact details ---
    name: str
    dob: datetime.date
    gender: str
    address: Address
    phone: str
    email: str
    preferred_contact_method: str
    emergency_contact: Contact
    # Optional: may be None.
    insurance: Insurance | None

    # --- Visit information ---
    reason_for_visit: str
    symptoms_duration: str

    # --- Medical history ---
    past_conditions: list[Condition]
    current_medications: list[Medication]
    allergies: list[Allergy]
    surgeries: list[Surgery]

    # --- Other optional details ---
    occupation: str | None
    pharmacy: Pharmacy | None

    # --- Consent ---
    consent_given: bool
    # Optional: may be None.
    consent_date: datetime.date | None
78+
79+
80+
# Function spec with no fields; the conversion itself is implemented by
# ToMarkdownExecutor, which declares `spec: ToMarkdown`.
class ToMarkdown(cocoindex.op.FunctionSpec):
    """Convert a document to markdown."""
82+
83+
# NOTE(review): `cache=True` / `behavior_version=1` presumably let cocoindex
# reuse earlier results until the version is bumped, and `gpu=True` presumably
# affects how calls are scheduled -- confirm against the cocoindex docs.
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class ToMarkdownExecutor:
    """Executor for ToMarkdown.

    Converts raw document bytes to markdown text using MarkItDown, backed by
    an OpenAI client.
    """

    spec: ToMarkdown
    _converter: MarkItDown

    def prepare(self):
        """Create the MarkItDown converter used by every later call.

        The converter is built once here rather than per document.
        `OpenAI()` is constructed without arguments, so it presumably picks
        up its credentials from the environment (e.g. `OPENAI_API_KEY`).
        """
        client = OpenAI()
        self._converter = MarkItDown(llm_client=client, llm_model="gpt-4o")

    def __call__(self, content: bytes, filename: str) -> str:
        """Convert one document to markdown.

        Args:
            content: Raw bytes of the document.
            filename: Original file name; only its extension is used, so the
                temporary file keeps the same suffix as the source document.

        Returns:
            The `text_content` of MarkItDown's conversion result.
        """
        suffix = os.path.splitext(filename)[1]
        # Write into a private temporary directory instead of keeping a
        # NamedTemporaryFile open: on Windows a NamedTemporaryFile created
        # with delete=True cannot be opened a second time by name while it is
        # still open, which is exactly what the converter needs to do.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, f"document{suffix}")
            with open(temp_path, "wb") as temp_file:
                temp_file.write(content)
            # The file is closed (and therefore fully flushed) before it is
            # handed to the converter; the directory is removed on exit.
            return self._converter.convert(temp_path).text_content
101+
102+
@cocoindex.flow_def(name="PatientIntakeExtraction")
def patient_intake_extraction_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    """
    Define a flow that extracts patient information from intake forms.

    Steps:
      1. Read files in binary mode from the local `data/patient_forms`
         directory.
      2. Convert each file to markdown with `ToMarkdown`.
      3. Extract a `Patient` record from the markdown with an OpenAI model.
      4. Export one row per file to the Postgres table `patients_info`,
         keyed by `filename`.
    """
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.LocalFile(path="data/patient_forms", binary=True)
    )

    patients_index = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # The filename is passed along so the converter can keep the
        # original file extension.
        doc["markdown"] = doc["content"].transform(
            ToMarkdown(), filename=doc["filename"]
        )
        doc["patient_info"] = doc["markdown"].transform(
            cocoindex.functions.ExtractByLlm(
                llm_spec=cocoindex.LlmSpec(
                    api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"
                ),
                output_type=Patient,
                instruction="Please extract patient information from the intake form.",
            )
        )
        patients_index.collect(
            filename=doc["filename"],
            patient_info=doc["patient_info"],
        )

    patients_index.export(
        "patients",
        cocoindex.storages.Postgres(table_name="patients_info"),
        primary_key_fields=["filename"],
    )
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[project]
2+
name = "patient-intake-extraction"
3+
version = "0.1.0"
4+
description = "Extract structured information from patient intake forms using LLM."
5+
requires-python = ">=3.10"
6+
dependencies = [
7+
"cocoindex>=0.1.45",
8+
"python-dotenv>=1.0.1",
9+
"markitdown>=0.1.2",
10+
"openai>=1.68.2"
11+
]

0 commit comments

Comments
 (0)