diff --git a/evaluation/README.md b/evaluation/README.md
index a1d0ad70..669141d8 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -1,40 +1,178 @@
-
 # Evaluations
-## LLM Output Evaluator
+## `evals`: LLM evaluations to test and improve model outputs
+
+### Evaluation Metrics
+
+Natural Language Generation Performance:
+
+[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks):
+
+* Extractiveness Coverage: Extent to which a summary is derivative of a text
+* Extractiveness Density: How well the word sequence can be described as a series of extractions
+* Extractiveness Compression: Word ratio between the article and the summary
+
+API Performance:
+
+* Token Usage (input/output)
+* Estimated Cost in USD
+* Duration (in seconds)
+
+### Test Data
+
+Generate the dataset file by connecting to a database of research papers:
 
-The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.
+Connect to the Postgres database of your local Balancer instance:
 
-It supports batch evalaution via a configuration CSV and produces a detailed metrics report in CSV format.
+```
+from sqlalchemy import create_engine
 
-### Usage
+engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev")
+```
 
-This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
+Connect to the Postgres database of the production Balancer instance using a SQL file:
 
-Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly.
+```
+# Add Postgres.app binaries to the PATH
+echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc
+
+createdb 
+pg_restore -v -d  .sql
+```
 
+Generate the dataset CSV file:
 
-```bash
-python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv
 ```
+from sqlalchemy import create_engine
+import pandas as pd
 
-The arguments to the script are:
+engine = create_engine("postgresql://@localhost:5432/")
 
-- Path to the config CSV file: Must include the columns "Model Name" and "Query"
-- Path to the reference CSV file: Must include the columns "Context" and "Reference"
-- Path where the evaluation resuls will be saved
+query = "SELECT * FROM api_embeddings;"
+df = pd.read_sql(query, engine)
+
+df['INPUT'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1)
+
+# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining
+df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number'])
+df_grouped = df.groupby(['name', 'upload_file_id'])['INPUT'].apply(lambda chunks: "\n".join(chunks)).reset_index()
+
+df_grouped.to_csv('', index=False)
+```
 
+### Running an Evaluation
 
-The script outputs a CSV with the following columns:
+#### Bulk Model and Prompt Experimentation
 
-* Evaluates LLM outputs for:
+Compare the results of many different prompts and models at once:
 
-  * Extractiveness Coverage
-  * Extractiveness Density
-  * Extractiveness Compression
+```
+import pandas as pd
+
+data = [
+    {
+        "MODEL": "",
+        "INSTRUCTIONS": """"""
+    },
+    {
+        "MODEL": "",
+        "INSTRUCTIONS": """"""
+    },
+]
+
+df = pd.DataFrame.from_records(data)
+
+df.to_csv("", index=False)
+```
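+
+For example, each record pairs one of the model keys defined in `ModelFactory.HANDLERS` (currently `GPT_4O_MINI` and `GPT_41_NANO`) with a prompt. This is only an illustrative sketch; the instruction text below is a placeholder, not a recommended prompt:
+
+```
+data = [
+    {
+        "MODEL": "GPT_4O_MINI",
+        "INSTRUCTIONS": """Extract the bipolar medication decision points from the text."""
+    },
+    {
+        "MODEL": "GPT_41_NANO",
+        "INSTRUCTIONS": """Extract the bipolar medication decision points from the text."""
+    },
+]
+```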
+
+
+#### Execute on the Command Line
+
+
+Execute [using `uv` to manage dependencies](https://docs.astral.sh/uv/guides/scripts/) without manually managing environments:
+
+```sh
+uv run evals.py --experiments path/to/ --dataset path/to/ --results path/to/
+```
+
+Execute without `uv run` by making the script executable (for example with `chmod +x evals.py`); its shebang line still runs it through `uv`:
+
+```sh
+./evals.py --experiments path/to/ --dataset path/to/ --results path/to/
+```
+
+### Analyzing Test Results
+
+```
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+df = pd.read_csv("")
+
+# Define the metrics of interest
+extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression']
+token_cols = ['Input Token Usage', 'Output Token Usage']
+other_metrics = ['Cost (USD)', 'Duration (s)']
+all_metrics = extractiveness_cols + token_cols + other_metrics
+
+# Metric Histograms by Model
+plt.style.use('default')
+fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics)))
+
+models = df['MODEL'].unique()
+colors = plt.cm.Set3(np.linspace(0, 1, len(models)))
+
+for i, metric in enumerate(all_metrics):
+    ax = axes[i] if len(all_metrics) > 1 else axes
+
+    # Create histogram for each model
+    for j, model in enumerate(models):
+        model_data = df[df['MODEL'] == model][metric]
+        ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)),
+                color=colors[j], edgecolor='black', linewidth=0.5)
+
+    ax.set_title(f'{metric} Distribution by Model', fontsize=14, fontweight='bold')
+    ax.set_xlabel(metric, fontsize=12)
+    ax.set_ylabel('Frequency', fontsize=12)
+    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
+    ax.grid(True, alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+
+# Metric Statistics by Model
+for metric in all_metrics:
+    print(f"\n{metric.upper()}:")
+    desc_stats = df.groupby('MODEL')[metric].agg([
+        'count', 'mean', 'std', 'min', 'median', 'max'
+    ])
+
+    print(desc_stats)
+
+
+# Calculate Efficiency Metrics by Model
+df_analysis = df.copy()
+df_analysis['Total Token Usage'] = df_analysis['Input Token Usage'] + df_analysis['Output Token Usage']
+df_analysis['Cost per Token'] = df_analysis['Cost (USD)'] / df_analysis['Total Token Usage']
+df_analysis['Tokens per Second'] = df_analysis['Total Token Usage'] / df_analysis['Duration (s)']
+df_analysis['Cost per Second'] = df_analysis['Cost (USD)'] / df_analysis['Duration (s)']
+
+efficiency_metrics = ['Cost per Token', 'Tokens per Second', 'Cost per Second']
+
+for metric in efficiency_metrics:
+    print(f"\n{metric.upper()}:")
+    eff_stats = df_analysis.groupby('MODEL')[metric].agg([
+        'count', 'mean', 'std', 'min', 'median', 'max'
+    ])
+
+    for col in ['mean', 'std', 'min', 'median', 'max']:
+        eff_stats[col] = eff_stats[col].apply(lambda x: f"{x:.3g}")
+    print(eff_stats)
+
+
+```
 
-* Computes:
+### Contributing
 
-  * Token usage (input/output)
-  * Estimated cost in USD
-  * Duration (in seconds)
+You're welcome to add LLM models to test in `server/api/services/llm_services.py`.
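+
+A new handler follows the pattern of the existing `GPT4OMiniHandler`: subclass `BaseModelHandler`, set the model name and its pricing in dollars per million tokens, implement `handle_request`, and register the class in `ModelFactory.HANDLERS`. A minimal sketch (the model name, pricing, and handler key below are placeholders, not a real configuration):
+
+```
+class MyNewModelHandler(BaseModelHandler):
+    MODEL = "my-new-model"
+    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40}
+
+    def handle_request(self, query: str, context: str):
+        # Call the provider's API here, then return the generated text, the
+        # token usage object, the pricing dict, and the duration in seconds
+        ...
+
+
+# Register the handler so the experiments CSV can reference it by key:
+# HANDLERS = {..., "MY_NEW_MODEL": MyNewModelHandler}
+```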
\ No newline at end of file
diff --git a/evaluation/evals.py b/evaluation/evals.py
old mode 100644
new mode 100755
index f6e9bb3d..08eda2bc
--- a/evaluation/evals.py
+++ b/evaluation/evals.py
@@ -1,9 +1,20 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = "==3.11.11"
+# dependencies = [
+#     "pandas==2.2.3",
+#     "lighteval==0.10.0",
+#     "openai==1.83.0",
+#     "spacy==3.8.7",
+#     "pip"
+#
+# ]
+# ///
+
 """
 Evaluate LLM outputs using multiple metrics and compute associated costs
 """
 
-# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
-
 import sys
 import os
@@ -14,6 +25,8 @@
 import logging
 
 import pandas as pd
+
+# lighteval depends on `sentencepiece`, which only has prebuilt wheels for Python 3.11 or below
 from lighteval.tasks.requests import Doc
 from lighteval.metrics.metrics_sample import Extractiveness
@@ -24,130 +37,161 @@
 )
 
 
-def evaluate_response(
-    model_name: str, query: str, context: str, reference: str
-) -> pd.DataFrame:
+def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
+    """
+    Generate a model response for a single dataset item and score it, returning a one-row DataFrame of metrics
     """
-    Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost
-    Args:
-        model_name (str): The name of the model to be used for evaluation.
-        query (str): The user query to be processed.
-        context (str): The context or document content to be used.
-        reference (str): The reference text for comparison (not used in this function, but can be used for further evaluations).
+    try:
+        handler = ModelFactory.get_handler(model)
 
-    Returns:
-        pd.DataFrame: A DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
-    """
+        generated_text, token_usage, pricing, duration = handler.handle_request(
+            instructions, input
+        )
 
-    handler = ModelFactory.get_handler(model_name)
+        doc = Doc(query="", choices=[], gold_index=0, specific={"text": input})
+        extractiveness = Extractiveness().compute(
+            formatted_doc=doc, predictions=[generated_text]
+        )
 
-    # TODO: Add error handling for unsupported models
+        cost_metrics = calculate_cost_metrics(token_usage, pricing)
 
-    output_text, token_usage, pricing, duration = handler.handle_request(query, context)
+        result = pd.DataFrame(
+            [
+                {
+                    "Generated Text": generated_text,
+                    "Extractiveness Coverage": extractiveness["summarization_coverage"],
+                    "Extractiveness Density": extractiveness["summarization_density"],
+                    "Extractiveness Compression": extractiveness[
+                        "summarization_compression"
+                    ],
+                    "Input Token Usage": token_usage.input_tokens,
+                    "Output Token Usage": token_usage.output_tokens,
+                    "Cost (USD)": cost_metrics["total_cost"],
+                    "Duration (s)": duration,
+                }
+            ]
+        )
 
-    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
-    extractiveness = Extractiveness().compute(
-        formatted_doc=doc, predictions=[output_text]
-    )
+    except Exception as e:
+        logging.error(f"Error evaluating response for model {model}: {e}")
+        result = pd.DataFrame(
+            [
+                {
+                    "Generated Text": None,
+                    "Extractiveness Coverage": None,
+                    "Extractiveness Density": None,
+                    "Extractiveness Compression": None,
+                    "Input Token Usage": None,
+                    "Output Token Usage": None,
+                    "Cost (USD)": None,
+                    "Duration (s)": None,
+                }
+            ]
+        )
 
-    input_cost_dollars = (pricing["input"] / 1000000) * token_usage.input_tokens
-    output_cost_dollars = (pricing["output"] / 1000000) * token_usage.output_tokens
+    return result
+
+
+def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
+    """
+    Calculate cost metrics based on token usage and pricing
+    """
+
+    TOKENS_PER_MILLION = 1_000_000
+
+    # Pricing is in dollars per million tokens
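+    # For example, at gpt-4o-mini's input price of $0.15 per million tokens,
+    # 10,000 input tokens cost (0.15 / 1_000_000) * 10_000 = $0.0015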
extractiveness["summarization_density"], - "Extractiveness Compression": extractiveness[ - "summarization_compression" - ], - "Input Token Usage": token_usage.input_tokens, - "Output Token Usage": token_usage.output_tokens, - "Cost (USD)": total_cost_dollars, - "Duration (s)": duration, - } - ] - ) + return { + "input_cost": input_cost_dollars, + "output_cost": output_cost_dollars, + "total_cost": total_cost_dollars, + } -if __name__ == "__main__": - # TODO: Add CLI argument to specify the metrics to be computed - parser = argparse.ArgumentParser( - description="Evaluate LLM outputs using multiple metrics and compute associated costs" - ) - parser.add_argument("--config", "-c", required=True, help="Path to config CSV file") - parser.add_argument( - "--reference", "-r", required=True, help="Path to reference CSV file" - ) - parser.add_argument("--output", "-o", required=True, help="Path to output CSV file") +def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: + """ + Load a CSV file and validate that it contains the required columns - args = parser.parse_args() + Args: + file_path (str): Path to the CSV file + required_columns (list): List of required column names - df_config = pd.read_csv(args.config) - logging.info(f"Config DataFrame shape: {df_config.shape}") - logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}") + Returns: + pd.DataFrame + """ + + df = pd.read_csv(file_path) - # Remove the trailing whitespace from column names - df_config.columns = df_config.columns.str.strip() + # Remove trailing whitespace from column names + df.columns = df.columns.str.strip() + + # Uppercase the column names to match the expected format + df.columns = df.columns.str.upper() # Check if the required columns are present - required_columns = ["Model Name", "Query"] - if not all(col in df_config.columns for col in required_columns): + if not all(col in df.columns for col in required_columns): raise ValueError( - f"Config DataFrame must contain the following columns: {required_columns}" + f"{file_path} must contain the following columns: {required_columns}" ) - # Check if all models in the config are supported by ModelFactory + return df + + +if __name__ == "__main__": + # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file + + parser = argparse.ArgumentParser() + parser.add_argument( + "--experiments", "-e", required=True, help="Path to experiments CSV file" + ) + parser.add_argument( + "--dataset", "-d", required=True, help="Path to dataset CSV file" + ) + parser.add_argument( + "--results", "-r", required=True, help="Path to results CSV file" + ) + + args = parser.parse_args() + + df_experiment = load_csv( + args.experiments, required_columns=["MODEL", "INSTRUCTIONS"] + ) + # Check if all models are supported by ModelFactory if not all( model in ModelFactory.HANDLERS.keys() - for model in df_config["Model Name"].unique() + for model in df_experiment["MODEL"].unique() ): raise ValueError( - f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}" + f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}" ) + df_dataset = load_csv(args.dataset, required_columns=["INPUT"]) - df_reference = pd.read_csv(args.reference) - logging.info(f"Reference DataFrame shape: {df_reference.shape}") - logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}") - - # Remove the trailing whitespace from column names - 
-    df_reference.columns = df_reference.columns.str.strip()
-    # Check if the required columns are present
-    required_columns = ["Context", "Reference"]
-    if not all(col in df_reference.columns for col in required_columns):
-        raise ValueError(
-            f"Reference DataFrame must contain the following columns: {required_columns}"
-        )
+    # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
+    df_in = df_experiment.merge(df_dataset, how="cross")
 
-    # Cross join the config and reference DataFrames
-    df_in = df_config.merge(df_reference, how="cross")
-
-    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
-    df_evals = pd.DataFrame()
-    for index, row in df_in.iterrows():
-        df_evals = pd.concat(
-            [
-                df_evals,
-                evaluate_response(
-                    row["Model Name"], row["Query"], row["Context"], row["Reference"]
-                ),
-            ],
-            axis=0,
-        )
+    # Evaluate each row in the input DataFrame
+    results = []
+    for index, row in enumerate(df_in.itertuples(index=False)):
+        result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
+        results.append(result)
+        # TODO: Use tqdm or similar library to show progress bar
         logging.info(f"Processed row {index + 1}/{len(df_in)}")
 
-    # Concatenate the input and evaluations DataFrames
+    df_evals = pd.concat(results, axis=0, ignore_index=True)
+    # Concatenate the input and evaluations DataFrames
     df_out = pd.concat(
         [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
     )
-
-    df_out.to_csv(args.output, index=False)
-    logging.info(f"Output DataFrame shape: {df_out.shape}")
-    logging.info(f"Results saved to {args.output}")
+    df_out.to_csv(args.results, index=False)
+    logging.info(f"Results saved to {args.results}")
     logging.info("Evaluation completed successfully.")
diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py
index 89eb2659..18c6e58f 100644
--- a/server/api/services/llm_services.py
+++ b/server/api/services/llm_services.py
@@ -7,7 +7,6 @@
 import logging
 from abc import ABC, abstractmethod
 
-import anthropic
 import openai
 
 
@@ -19,138 +18,15 @@ def handle_request(
         pass
 
 
-class ClaudeHaiku35CitationsHandler(BaseModelHandler):
-    MODEL = "claude-3-5-haiku-20241022"
-    # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
-    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00}
+# LLM Pricing Calculator: https://www.llm-prices.com/
+# TODO: Add support for more models and their pricing
 
-    def __init__(self) -> None:
-        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
-
-    def handle_request(
-        self, query: str, context: str
-    ) -> tuple[str, dict[str, int], dict[str, float], float]:
-        """
-        Handles the request to the Claude Haiku 3.5 model with citations enabled
-
-        Args:
-            query: The user query to be processed
-            context: The context or document content to be used for citations
-
-        """
-
-        start_time = time.time()
-        # TODO: Add error handling for API requests and invalid responses
-        message = self.client.messages.create(
-            model=self.MODEL,
-            max_tokens=1024,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": query},
-                        {
-                            "type": "document",
-                            "source": {"type": "content", "content": context},
-                            "citations": {"enabled": True},
-                        },
-                    ],
-                }
-            ],
-        )
-        duration = time.time() - start_time
-
-        # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure
-
-        text = []
-        cited_text = []
-        for content in message.to_dict()["content"]:
-            text.append(content["text"])
-            if "citations" in content.keys():
-                text.append(
-                    " ".join(
-                        [
-                            f"<{citation['start_block_index']} - {citation['end_block_index']}>"
-                            for citation in content["citations"]
-                        ]
-                    )
-                )
-                cited_text.append(
-                    " ".join(
-                        [
-                            f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}"
-                            for citation in content["citations"]
-                        ]
-                    )
-                )
-
-        full_text = " ".join(text)
-
-        return (
-            full_text,
-            message.usage,
-            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
-            duration,
-        )
-
-
-class ClaudeHaiku3Handler(BaseModelHandler):
-    MODEL = "claude-3-haiku-20240307"
-    # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
-    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25}
-
-    def __init__(self) -> None:
-        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
-
-    def handle_request(
-        self, query: str, context: str
-    ) -> tuple[str, dict[str, int], dict[str, float], float]:
-        """
-        Handles the request to the Claude Haiku 3 model with citations disabled
-
-        Args:
-            query: The user query to be processed
-            context: The context or document content to be used
-
-        """
-
-        start_time = time.time()
-        # TODO: Add error handling for API requests and invalid responses
-        message = self.client.messages.create(
-            model=self.MODEL,
-            max_tokens=1024,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": query},
-                        {
-                            "type": "document",
-                            "source": {"type": "content", "content": context},
-                            "citations": {"enabled": False},
-                        },
-                    ],
-                }
-            ],
-        )
-        duration = time.time() - start_time
-
-        text = []
-        for content in message.to_dict()["content"]:
-            text.append(content["text"])
-
-        full_text = " ".join(text)
-
-        return (
-            full_text,
-            message.usage,
-            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
-            duration,
-        )
+# Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
 
 
 class GPT4OMiniHandler(BaseModelHandler):
     MODEL = "gpt-4o-mini"
+    # TODO: Get the latest model pricing from OpenAI's API or documentation
     # Model Pricing: https://platform.openai.com/docs/pricing
     PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60}
@@ -171,9 +47,7 @@ def handle_request(
         start_time = time.time()
         # TODO: Add error handling for API requests and invalid responses
         response = self.client.responses.create(
-            model=self.MODEL,
-            instructions=query,
-            input=context,
+            model=self.MODEL, instructions=query, input=context, temperature=0.0
         )
         duration = time.time() - start_time
@@ -187,9 +61,67 @@ def handle_request(
 
 
 class GPT41NanoHandler(BaseModelHandler):
     MODEL = "gpt-4.1-nano"
+    # Model Pricing: https://platform.openai.com/docs/pricing
     PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40}
 
+    # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide
+
+    # Long context performance can degrade as more items must be retrieved,
+    # or as the task requires complex reasoning over the state of the entire context
+
+    #
+
+    INSTRUCTIONS = """
+
+    # Role and Objective
+
+    - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithm
+
+    - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID
+
+    # Instructions
+
+    - Identify decision points for bipolar medications
+
+    - For each decision point you find, return a JSON object using the following format:
+
+    {
+        "criterion": "",
+        "decision": "INCLUDE" or "EXCLUDE",
+        "medications": ["", "", ...],
+        "reason": "",
+        "sources": [""]
+    }
+
+
+    - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context, and never rely on your own knowledge
+
+    # Output Format
+
+    - Return the extracted bipolar medication decision points as a JSON array, and if no decision points are found in the context, return an empty array
+
+    # Example
+
+    [
+        {
+            "criterion": "History of suicide attempts",
+            "decision": "INCLUDE",
+            "medications": ["Lithium"],
+            "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder",
+            "sources": ["ID-0"]
+        },
+        {
+            "criterion": "Weight gain concerns",
+            "decision": "EXCLUDE",
+            "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"],
+            "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain",
+            "sources": ["ID-0", "ID-1", "ID-2"]
+        }
+    ]
+
+    """
+
     def __init__(self) -> None:
         self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
@@ -204,12 +136,16 @@ def handle_request(
             context: The context or document content to be used
 
         """
+
+        # If no query is provided, use the default instructions
+        if not query:
+            query = self.INSTRUCTIONS
+
         start_time = time.time()
         # TODO: Add error handling for API requests and invalid responses
+
         response = self.client.responses.create(
-            model=self.MODEL,
-            instructions=query,
-            input=context,
+            model=self.MODEL, instructions=query, input=context, temperature=0.0
         )
         duration = time.time() - start_time
@@ -222,9 +158,10 @@ def handle_request(
 
 
 class ModelFactory:
+    # TODO: Define structured fields to extract from unstructured input data
+    # https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples
+
     HANDLERS = {
-        "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler,
-        "CLAUDE_HAIKU_3": ClaudeHaiku3Handler,
         "GPT_4O_MINI": GPT4OMiniHandler,
         "GPT_41_NANO": GPT41NanoHandler,
     }
diff --git a/server/api/views/text_extraction/views.py b/server/api/views/text_extraction/views.py
index e0110a8e..e4122851 100644
--- a/server/api/views/text_extraction/views.py
+++ b/server/api/views/text_extraction/views.py
@@ -1,5 +1,7 @@
 import os
-from ...services.openai_services import openAIServices
+import json
+import re
+
 from rest_framework.views import APIView
 from rest_framework.permissions import IsAuthenticated
 from rest_framework.response import Response
@@ -7,8 +9,8 @@
 from django.utils.decorators import method_decorator
 from django.views.decorators.csrf import csrf_exempt
 import anthropic
-import json
-import re
+
+from ...services.openai_services import openAIServices
 from api.models.model_embeddings import Embeddings
 
 USER_PROMPT = """
@@ -30,11 +32,23 @@
 """
 
 
-# TODO: Add docstrings and type hints
-def anthropic_citations(client, user_prompt, content_chunks):
+def anthropic_citations(client: anthropic.Client, user_prompt: str, content_chunks: list) -> tuple:
     """
+    Sends a message to the Anthropic Citations API and extracts and formats the response
+
+    Parameters
+    ----------
+    client: An instance of the Anthropic API client used to make the request
+    user_prompt: The user's question or instruction to be processed by the model
+    content_chunks: A list of text chunks that provide context for the model to use during generation
+
+    Returns
+    -------
+    tuple
+        The response texts and the cited texts
+    """
+
     message = client.messages.create(
model="claude-3-5-haiku-20241022", max_tokens=1024, @@ -92,6 +106,7 @@ def get(self, request): query = Embeddings.objects.filter(upload_file__guid=guid) + # TODO: Format into the Anthropic API"s expected input format in the anthropic_citations function chunks = [{"type": "text", "text": chunk.text} for chunk in query] texts, cited_texts = anthropic_citations(client, USER_PROMPT, chunks)