diff --git a/evaluation/README.md b/evaluation/README.md
index a1d0ad70..669141d8 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -1,40 +1,178 @@
-
 # Evaluations
-## LLM Output Evaluator
+## `evals`: LLM evaluations to test and improve model outputs
+
+### Evaluation Metrics
+
+Natural Language Generation Performance:
+
+[Extractiveness](https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks):
+
+* Extractiveness Coverage: Extent to which a summary is derivative of a text
+* Extractiveness Density: How well the word sequence can be described as a series of extractions
+* Extractiveness Compression: Word ratio between the article and the summary
+
+API Performance:
+
+* Token Usage (input/output)
+* Estimated Cost in USD
+* Duration (in seconds)
+
+### Test Data
+
+Generate the dataset file by connecting to a database of research papers:
 
-The `evals` script evaluates the outputs of Large Language Models (LLMs) and estimates the associated token usage and cost.
+Connect to the Postgres database of your local Balancer instance:
 
-It supports batch evalaution via a configuration CSV and produces a detailed metrics report in CSV format.
+```
+from sqlalchemy import create_engine
 
-### Usage
+engine = create_engine("postgresql+psycopg2://balancer:balancer@localhost:5433/balancer_dev")
+```
 
-This script evaluates LLM outputs using the `lighteval` library: https://huggingface.co/docs/lighteval/en/metric-list#automatic-metrics-for-generative-tasks
+Connect to the Postgres database of the production Balancer instance using a SQL file:
 
-Ensure you have the `lighteval` library and any model SDKs (e.g., OpenAI, Anthropic) configured properly.
+```
+# Add Postgres.app binaries to the PATH
+echo 'export PATH="/Applications/Postgres.app/Contents/Versions/latest/bin:$PATH"' >> ~/.zshrc
+
+createdb 
+pg_restore -v -d  .sql
+```
 
+Generate the dataset CSV file:
 
-```bash
-python evals.py --config path/to/config.csv --reference path/to/reference.csv --output path/to/results.csv
 ```
+from sqlalchemy import create_engine
+import pandas as pd
 
-The arguments to the script are:
+engine = create_engine("postgresql://@localhost:5432/")
 
-- Path to the config CSV file: Must include the columns "Model Name" and "Query"
-- Path to the reference CSV file: Must include the columns "Context" and "Reference"
-- Path where the evaluation resuls will be saved
+query = "SELECT * FROM api_embeddings;"
+df = pd.read_sql(query, engine)
+
+df['INPUT'] = df.apply(lambda row: f"ID: {row['chunk_number']} | CONTENT: {row['text']}", axis=1)
+
+# Ensure the chunks are joined in order of chunk_number by sorting the DataFrame before grouping and joining
+df = df.sort_values(by=['name', 'upload_file_id', 'chunk_number'])
+df_grouped = df.groupby(['name', 'upload_file_id'])['INPUT'].apply(lambda chunks: "\n".join(chunks)).reset_index()
+
+df_grouped.to_csv('', index=False)
+```
 
+### Running an Evaluation
 
-The script outputs a CSV with the following columns:
+#### Bulk Model and Prompt Experimentation
 
-* Evaluates LLM outputs for:
+Compare the results of many different prompts and models at once:
 
-  * Extractiveness Coverage
-  * Extractiveness Density
-  * Extractiveness Compression
+```
+import pandas as pd
+
+data = [
+    {
+        "MODEL": "",
+        "INSTRUCTIONS": """"""
+    },
+    {
+        "MODEL": "",
+        "INSTRUCTIONS": """"""
+    },
+]
+
+df = pd.DataFrame.from_records(data)
+
+df.to_csv("", index=False)
+```
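+
+For example, each record pairs one of the model keys defined in `ModelFactory.HANDLERS` (currently `GPT_4O_MINI` and `GPT_41_NANO`) with a prompt. This is only an illustrative sketch; the instruction text below is a placeholder, not a recommended prompt:
+
+```
+data = [
+    {
+        "MODEL": "GPT_4O_MINI",
+        "INSTRUCTIONS": """Extract the bipolar medication decision points from the text."""
+    },
+    {
+        "MODEL": "GPT_41_NANO",
+        "INSTRUCTIONS": """Extract the bipolar medication decision points from the text."""
+    },
+]
+```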
+
+
+#### Execute on the Command Line
+
+
+Execute [using `uv` to manage dependencies](https://docs.astral.sh/uv/guides/scripts/) without manually managing environments:
+
+```sh
+uv run evals.py --experiments path/to/ --dataset path/to/ --results path/to/
+```
+
+Execute without `uv run` by making the script executable (for example with `chmod +x evals.py`); its shebang line still runs it through `uv`:
+
+```sh
+./evals.py --experiments path/to/ --dataset path/to/ --results path/to/
+```
+
+### Analyzing Test Results
+
+```
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+df = pd.read_csv("")
+
+# Define the metrics of interest
+extractiveness_cols = ['Extractiveness Coverage', 'Extractiveness Density', 'Extractiveness Compression']
+token_cols = ['Input Token Usage', 'Output Token Usage']
+other_metrics = ['Cost (USD)', 'Duration (s)']
+all_metrics = extractiveness_cols + token_cols + other_metrics
+
+# Metric Histograms by Model
+plt.style.use('default')
+fig, axes = plt.subplots(len(all_metrics), 1, figsize=(12, 4 * len(all_metrics)))
+
+models = df['MODEL'].unique()
+colors = plt.cm.Set3(np.linspace(0, 1, len(models)))
+
+for i, metric in enumerate(all_metrics):
+    ax = axes[i] if len(all_metrics) > 1 else axes
+
+    # Create histogram for each model
+    for j, model in enumerate(models):
+        model_data = df[df['MODEL'] == model][metric]
+        ax.hist(model_data, alpha=0.7, label=model, bins=min(8, len(model_data)),
+                color=colors[j], edgecolor='black', linewidth=0.5)
+
+    ax.set_title(f'{metric} Distribution by Model', fontsize=14, fontweight='bold')
+    ax.set_xlabel(metric, fontsize=12)
+    ax.set_ylabel('Frequency', fontsize=12)
+    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
+    ax.grid(True, alpha=0.3)
+
+plt.tight_layout()
+plt.show()
+
+# Metric Statistics by Model
+for metric in all_metrics:
+    print(f"\n{metric.upper()}:")
+    desc_stats = df.groupby('MODEL')[metric].agg([
+        'count', 'mean', 'std', 'min', 'median', 'max'
+    ])
+
+    print(desc_stats)
+
+
+# Calculate Efficiency Metrics by Model
+df_analysis = df.copy()
+df_analysis['Total Token Usage'] = df_analysis['Input Token Usage'] + df_analysis['Output Token Usage']
+df_analysis['Cost per Token'] = df_analysis['Cost (USD)'] / df_analysis['Total Token Usage']
+df_analysis['Tokens per Second'] = df_analysis['Total Token Usage'] / df_analysis['Duration (s)']
+df_analysis['Cost per Second'] = df_analysis['Cost (USD)'] / df_analysis['Duration (s)']
+
+efficiency_metrics = ['Cost per Token', 'Tokens per Second', 'Cost per Second']
+
+for metric in efficiency_metrics:
+    print(f"\n{metric.upper()}:")
+    eff_stats = df_analysis.groupby('MODEL')[metric].agg([
+        'count', 'mean', 'std', 'min', 'median', 'max'
+    ])
+
+    for col in ['mean', 'std', 'min', 'median', 'max']:
+        eff_stats[col] = eff_stats[col].apply(lambda x: f"{x:.3g}")
+    print(eff_stats)
+
+
+```
 
-* Computes:
+### Contributing
 
-  * Token usage (input/output)
-  * Estimated cost in USD
-  * Duration (in seconds)
+You're welcome to add LLM models to test in `server/api/services/llm_services.py`.
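+
+A new handler follows the pattern of the existing `GPT4OMiniHandler`: subclass `BaseModelHandler`, set the model name and its pricing in dollars per million tokens, implement `handle_request`, and register the class in `ModelFactory.HANDLERS`. A minimal sketch (the model name, pricing, and handler key below are placeholders, not a real configuration):
+
+```
+class MyNewModelHandler(BaseModelHandler):
+    MODEL = "my-new-model"
+    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40}
+
+    def handle_request(self, query: str, context: str):
+        # Call the provider's API here, then return the generated text, the
+        # token usage object, the pricing dict, and the duration in seconds
+        ...
+
+
+# Register the handler so the experiments CSV can reference it by key:
+# HANDLERS = {..., "MY_NEW_MODEL": MyNewModelHandler}
+```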
\ No newline at end of file
diff --git a/evaluation/evals.py b/evaluation/evals.py
old mode 100644
new mode 100755
index f6e9bb3d..08eda2bc
--- a/evaluation/evals.py
+++ b/evaluation/evals.py
@@ -1,9 +1,20 @@
+#!/usr/bin/env -S uv run --script
+# /// script
+# requires-python = "==3.11.11"
+# dependencies = [
+#     "pandas==2.2.3",
+#     "lighteval==0.10.0",
+#     "openai==1.83.0",
+#     "spacy==3.8.7",
+#     "pip"
+#
+# ]
+# ///
+
 """
 Evaluate LLM outputs using multiple metrics and compute associated costs
 """
 
-# TODO: Add tests on a small dummy dataset to confirm it handles errors gracefully and produces expected outputs
-
 import sys
 import os
@@ -14,6 +25,8 @@
 import logging
 
 import pandas as pd
+
+# lighteval depends on `sentencepiece`, which only has prebuilt wheels for Python 3.11 or below
 from lighteval.tasks.requests import Doc
 from lighteval.metrics.metrics_sample import Extractiveness
@@ -24,130 +37,161 @@
 )
 
 
-def evaluate_response(
-    model_name: str, query: str, context: str, reference: str
-) -> pd.DataFrame:
+def evaluate_response(model: str, instructions: str, input: str) -> pd.DataFrame:
+    """
+    Generate a model response for a single dataset item and score it, returning a one-row DataFrame of metrics
     """
-    Evaluates the response of a model to a given query and context, computes extractiveness metrics, token usage, and cost
-    Args:
-        model_name (str): The name of the model to be used for evaluation.
-        query (str): The user query to be processed.
-        context (str): The context or document content to be used.
-        reference (str): The reference text for comparison (not used in this function, but can be used for further evaluations).
+    try:
+        handler = ModelFactory.get_handler(model)
 
-    Returns:
-        pd.DataFrame: A DataFrame containing the output text, extractiveness metrics, token usage, cost, and duration.
-    """
+        generated_text, token_usage, pricing, duration = handler.handle_request(
+            instructions, input
+        )
 
-    handler = ModelFactory.get_handler(model_name)
+        doc = Doc(query="", choices=[], gold_index=0, specific={"text": input})
+        extractiveness = Extractiveness().compute(
+            formatted_doc=doc, predictions=[generated_text]
+        )
 
-    # TODO: Add error handling for unsupported models
+        cost_metrics = calculate_cost_metrics(token_usage, pricing)
 
-    output_text, token_usage, pricing, duration = handler.handle_request(query, context)
+        result = pd.DataFrame(
+            [
+                {
+                    "Generated Text": generated_text,
+                    "Extractiveness Coverage": extractiveness["summarization_coverage"],
+                    "Extractiveness Density": extractiveness["summarization_density"],
+                    "Extractiveness Compression": extractiveness[
+                        "summarization_compression"
+                    ],
+                    "Input Token Usage": token_usage.input_tokens,
+                    "Output Token Usage": token_usage.output_tokens,
+                    "Cost (USD)": cost_metrics["total_cost"],
+                    "Duration (s)": duration,
+                }
+            ]
+        )
 
-    doc = Doc(query="", choices=[], gold_index=0, specific={"text": context})
-    extractiveness = Extractiveness().compute(
-        formatted_doc=doc, predictions=[output_text]
-    )
+    except Exception as e:
+        logging.error(f"Error evaluating response for model {model}: {e}")
+        result = pd.DataFrame(
+            [
+                {
+                    "Generated Text": None,
+                    "Extractiveness Coverage": None,
+                    "Extractiveness Density": None,
+                    "Extractiveness Compression": None,
+                    "Input Token Usage": None,
+                    "Output Token Usage": None,
+                    "Cost (USD)": None,
+                    "Duration (s)": None,
+                }
+            ]
+        )
 
-    input_cost_dollars = (pricing["input"] / 1000000) * token_usage.input_tokens
-    output_cost_dollars = (pricing["output"] / 1000000) * token_usage.output_tokens
+    return result
+
+
+def calculate_cost_metrics(token_usage: dict, pricing: dict) -> dict:
+    """
+    Calculate cost metrics based on token usage and pricing
+    """
+
+    TOKENS_PER_MILLION = 1_000_000
+
+    # Pricing is in dollars per million tokens
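+    # For example, at gpt-4o-mini's input price of $0.15 per million tokens,
+    # 10,000 input tokens cost (0.15 / 1_000_000) * 10_000 = $0.0015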
extractiveness["summarization_density"], - "Extractiveness Compression": extractiveness[ - "summarization_compression" - ], - "Input Token Usage": token_usage.input_tokens, - "Output Token Usage": token_usage.output_tokens, - "Cost (USD)": total_cost_dollars, - "Duration (s)": duration, - } - ] - ) + return { + "input_cost": input_cost_dollars, + "output_cost": output_cost_dollars, + "total_cost": total_cost_dollars, + } -if __name__ == "__main__": - # TODO: Add CLI argument to specify the metrics to be computed - parser = argparse.ArgumentParser( - description="Evaluate LLM outputs using multiple metrics and compute associated costs" - ) - parser.add_argument("--config", "-c", required=True, help="Path to config CSV file") - parser.add_argument( - "--reference", "-r", required=True, help="Path to reference CSV file" - ) - parser.add_argument("--output", "-o", required=True, help="Path to output CSV file") +def load_csv(file_path: str, required_columns: list) -> pd.DataFrame: + """ + Load a CSV file and validate that it contains the required columns - args = parser.parse_args() + Args: + file_path (str): Path to the CSV file + required_columns (list): List of required column names - df_config = pd.read_csv(args.config) - logging.info(f"Config DataFrame shape: {df_config.shape}") - logging.info(f"Config DataFrame columns: {df_config.columns.tolist()}") + Returns: + pd.DataFrame + """ + + df = pd.read_csv(file_path) - # Remove the trailing whitespace from column names - df_config.columns = df_config.columns.str.strip() + # Remove trailing whitespace from column names + df.columns = df.columns.str.strip() + + # Uppercase the column names to match the expected format + df.columns = df.columns.str.upper() # Check if the required columns are present - required_columns = ["Model Name", "Query"] - if not all(col in df_config.columns for col in required_columns): + if not all(col in df.columns for col in required_columns): raise ValueError( - f"Config DataFrame must contain the following columns: {required_columns}" + f"{file_path} must contain the following columns: {required_columns}" ) - # Check if all models in the config are supported by ModelFactory + return df + + +if __name__ == "__main__": + # TODO: Add test evaluation argument to run on the first 10 rows of the dataset file + + parser = argparse.ArgumentParser() + parser.add_argument( + "--experiments", "-e", required=True, help="Path to experiments CSV file" + ) + parser.add_argument( + "--dataset", "-d", required=True, help="Path to dataset CSV file" + ) + parser.add_argument( + "--results", "-r", required=True, help="Path to results CSV file" + ) + + args = parser.parse_args() + + df_experiment = load_csv( + args.experiments, required_columns=["MODEL", "INSTRUCTIONS"] + ) + # Check if all models are supported by ModelFactory if not all( model in ModelFactory.HANDLERS.keys() - for model in df_config["Model Name"].unique() + for model in df_experiment["MODEL"].unique() ): raise ValueError( - f"Unsupported model(s) found in config: {set(df_config['Model Name'].unique()) - set(ModelFactory.HANDLERS.keys())}" + f"Unsupported model(s) found: {set(df_experiment['MODEL'].unique()) - set(ModelFactory.HANDLERS.keys())}" ) + df_dataset = load_csv(args.dataset, required_columns=["INPUT"]) - df_reference = pd.read_csv(args.reference) - logging.info(f"Reference DataFrame shape: {df_reference.shape}") - logging.info(f"Reference DataFrame columns: {df_reference.columns.tolist()}") - - # Remove the trailing whitespace from column names - 
-    df_reference.columns = df_reference.columns.str.strip()
-    # Check if the required columns are present
-    required_columns = ["Context", "Reference"]
-    if not all(col in df_reference.columns for col in required_columns):
-        raise ValueError(
-            f"Reference DataFrame must contain the following columns: {required_columns}"
-        )
+    # Bulk model and prompt experimentation: Cross join the experiment and dataset DataFrames
+    df_in = df_experiment.merge(df_dataset, how="cross")
 
-    # Cross join the config and reference DataFrames
-    df_in = df_config.merge(df_reference, how="cross")
-
-    # TODO: Parallelize the evaluation process for each row in df_in using concurrent.futures or similar libraries
-    df_evals = pd.DataFrame()
-    for index, row in df_in.iterrows():
-        df_evals = pd.concat(
-            [
-                df_evals,
-                evaluate_response(
-                    row["Model Name"], row["Query"], row["Context"], row["Reference"]
-                ),
-            ],
-            axis=0,
-        )
+    # Evaluate each row in the input DataFrame
+    results = []
+    for index, row in enumerate(df_in.itertuples(index=False)):
+        result = evaluate_response(row.MODEL, row.INSTRUCTIONS, row.INPUT)
+        results.append(result)
+        # TODO: Use tqdm or similar library to show progress bar
         logging.info(f"Processed row {index + 1}/{len(df_in)}")
 
-    # Concatenate the input and evaluations DataFrames
+    df_evals = pd.concat(results, axis=0, ignore_index=True)
+    # Concatenate the input and evaluations DataFrames
     df_out = pd.concat(
         [df_in.reset_index(drop=True), df_evals.reset_index(drop=True)], axis=1
     )
-
-    df_out.to_csv(args.output, index=False)
-    logging.info(f"Output DataFrame shape: {df_out.shape}")
-    logging.info(f"Results saved to {args.output}")
+    df_out.to_csv(args.results, index=False)
+    logging.info(f"Results saved to {args.results}")
     logging.info("Evaluation completed successfully.")
diff --git a/server/api/services/llm_services.py b/server/api/services/llm_services.py
index 89eb2659..18c6e58f 100644
--- a/server/api/services/llm_services.py
+++ b/server/api/services/llm_services.py
@@ -7,7 +7,6 @@
 import logging
 from abc import ABC, abstractmethod
 
-import anthropic
 import openai
 
 
@@ -19,138 +18,15 @@ def handle_request(
         pass
 
 
-class ClaudeHaiku35CitationsHandler(BaseModelHandler):
-    MODEL = "claude-3-5-haiku-20241022"
-    # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
-    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.80, "output": 4.00}
+# LLM Pricing Calculator: https://www.llm-prices.com/
+# TODO: Add support for more models and their pricing
 
-    def __init__(self) -> None:
-        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
-
-    def handle_request(
-        self, query: str, context: str
-    ) -> tuple[str, dict[str, int], dict[str, float], float]:
-        """
-        Handles the request to the Claude Haiku 3.5 model with citations enabled
-
-        Args:
-            query: The user query to be processed
-            context: The context or document content to be used for citations
-
-        """
-
-        start_time = time.time()
-        # TODO: Add error handling for API requests and invalid responses
-        message = self.client.messages.create(
-            model=self.MODEL,
-            max_tokens=1024,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": query},
-                        {
-                            "type": "document",
-                            "source": {"type": "content", "content": context},
-                            "citations": {"enabled": True},
-                        },
-                    ],
-                }
-            ],
-        )
-        duration = time.time() - start_time
-
-        # Response Structure: https://docs.anthropic.com/en/docs/build-with-claude/citations#response-structure
-
-        text = []
-        cited_text = []
-        for content in message.to_dict()["content"]:
-            text.append(content["text"])
-            if "citations" in content.keys():
-                text.append(
-                    " ".join(
-                        [
-                            f"<{citation['start_block_index']} - {citation['end_block_index']}>"
-                            for citation in content["citations"]
-                        ]
-                    )
-                )
-                cited_text.append(
-                    " ".join(
-                        [
-                            f"<{citation['start_block_index']} - {citation['end_block_index']}> {citation['cited_text']}"
-                            for citation in content["citations"]
-                        ]
-                    )
-                )
-
-        full_text = " ".join(text)
-
-        return (
-            full_text,
-            message.usage,
-            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
-            duration,
-        )
-
-
-class ClaudeHaiku3Handler(BaseModelHandler):
-    MODEL = "claude-3-haiku-20240307"
-    # Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
-    PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.25, "output": 1.25}
-
-    def __init__(self) -> None:
-        self.client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
-
-    def handle_request(
-        self, query: str, context: str
-    ) -> tuple[str, dict[str, int], dict[str, float], float]:
-        """
-        Handles the request to the Claude Haiku 3 model with citations disabled
-
-        Args:
-            query: The user query to be processed
-            context: The context or document content to be used
-
-        """
-
-        start_time = time.time()
-        # TODO: Add error handling for API requests and invalid responses
-        message = self.client.messages.create(
-            model=self.MODEL,
-            max_tokens=1024,
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": query},
-                        {
-                            "type": "document",
-                            "source": {"type": "content", "content": context},
-                            "citations": {"enabled": False},
-                        },
-                    ],
-                }
-            ],
-        )
-        duration = time.time() - start_time
-
-        text = []
-        for content in message.to_dict()["content"]:
-            text.append(content["text"])
-
-        full_text = " ".join(text)
-
-        return (
-            full_text,
-            message.usage,
-            self.PRICING_DOLLARS_PER_MILLION_TOKENS,
-            duration,
-        )
+# Anthropic Model Pricing: https://docs.anthropic.com/en/docs/about-claude/pricing#model-pricing
 
 
 class GPT4OMiniHandler(BaseModelHandler):
     MODEL = "gpt-4o-mini"
+    # TODO: Get the latest model pricing from OpenAI's API or documentation
     # Model Pricing: https://platform.openai.com/docs/pricing
     PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.15, "output": 0.60}
@@ -171,9 +47,7 @@ def handle_request(
         start_time = time.time()
         # TODO: Add error handling for API requests and invalid responses
         response = self.client.responses.create(
-            model=self.MODEL,
-            instructions=query,
-            input=context,
+            model=self.MODEL, instructions=query, input=context, temperature=0.0
         )
         duration = time.time() - start_time
@@ -187,9 +61,67 @@ def handle_request(
 
 
 class GPT41NanoHandler(BaseModelHandler):
     MODEL = "gpt-4.1-nano"
+    # Model Pricing: https://platform.openai.com/docs/pricing
     PRICING_DOLLARS_PER_MILLION_TOKENS = {"input": 0.10, "output": 0.40}
 
+    # GPT 4.1 Prompting Guide: https://cookbook.openai.com/examples/gpt4-1_prompting_guide
+
+    # Long context performance can degrade as more items must be retrieved,
+    # or as the task requires complex reasoning over the state of the entire context
+
+    #
+
+    INSTRUCTIONS = """
+
+    # Role and Objective
+
+    - You are a seasoned physician or medical professional who is developing a bipolar disorder treatment algorithm
+
+    - You are extracting bipolar medication decision points from a research paper that is chunked into multiple parts each labeled with an ID
+
+    # Instructions
+
+    - Identify decision points for bipolar medications
+
+    - For each decision point you find, return a JSON object using the following format:
+
+    {
+        "criterion": "",
+        "decision": "INCLUDE" or "EXCLUDE",
+        "medications": ["", "", ...],
+        "reason": "",
+        "sources": [""]
+    }
+
+
+    - Only extract bipolar medication decision points that are explicitly stated or strongly implied in the context, and never rely on your own knowledge
+
+    # Output Format
+
+    - Return the extracted bipolar medication decision points as a JSON array, and if no decision points are found in the context, return an empty array
+
+    # Example
+
+    [
+        {
+            "criterion": "History of suicide attempts",
+            "decision": "INCLUDE",
+            "medications": ["Lithium"],
+            "reason": "Lithium is the only medication on the market that has been proven to reduce suicidality in patients with bipolar disorder",
+            "sources": ["ID-0"]
+        },
+        {
+            "criterion": "Weight gain concerns",
+            "decision": "EXCLUDE",
+            "medications": ["Quetiapine", "Aripiprazole", "Olanzapine", "Risperidone"],
+            "reason": "Seroquel, Risperdal, Abilify, and Zyprexa are known for causing weight gain",
+            "sources": ["ID-0", "ID-1", "ID-2"]
+        }
+    ]
+
+    """
+
     def __init__(self) -> None:
         self.client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
@@ -204,12 +136,16 @@ def handle_request(
             context: The context or document content to be used
 
         """
+
+        # If no query is provided, use the default instructions
+        if not query:
+            query = self.INSTRUCTIONS
+
         start_time = time.time()
         # TODO: Add error handling for API requests and invalid responses
+
         response = self.client.responses.create(
-            model=self.MODEL,
-            instructions=query,
-            input=context,
+            model=self.MODEL, instructions=query, input=context, temperature=0.0
         )
         duration = time.time() - start_time
@@ -222,9 +158,10 @@ def handle_request(
 
 
 class ModelFactory:
+    # TODO: Define structured fields to extract from unstructured input data
+    # https://platform.openai.com/docs/guides/structured-outputs?api-mode=responses&example=structured-data#examples
+
     HANDLERS = {
-        "CLAUDE_HAIKU_3_5_CITATIONS": ClaudeHaiku35CitationsHandler,
-        "CLAUDE_HAIKU_3": ClaudeHaiku3Handler,
         "GPT_4O_MINI": GPT4OMiniHandler,
         "GPT_41_NANO": GPT41NanoHandler,
     }
diff --git a/server/api/views/text_extraction/views.py b/server/api/views/text_extraction/views.py
index e0110a8e..e4122851 100644
--- a/server/api/views/text_extraction/views.py
+++ b/server/api/views/text_extraction/views.py
@@ -1,5 +1,7 @@
 import os
-from ...services.openai_services import openAIServices
+import json
+import re
+
 from rest_framework.views import APIView
 from rest_framework.permissions import IsAuthenticated
 from rest_framework.response import Response
@@ -7,8 +9,8 @@
 from django.utils.decorators import method_decorator
 from django.views.decorators.csrf import csrf_exempt
 import anthropic
-import json
-import re
+
+from ...services.openai_services import openAIServices
 from api.models.model_embeddings import Embeddings
 
 USER_PROMPT = """
@@ -30,11 +32,23 @@
 """
 
 
-# TODO: Add docstrings and type hints
-def anthropic_citations(client, user_prompt, content_chunks):
+def anthropic_citations(client: anthropic.Client, user_prompt: str, content_chunks: list) -> tuple:
     """
+    Sends a message to the Anthropic Citations API and extracts and formats the response
+
+    Parameters
+    ----------
+    client: An instance of the Anthropic API client used to make the request
+    user_prompt: The user's question or instruction to be processed by the model
+    content_chunks: A list of text chunks that provide context for the model to use during generation
+
+    Returns
+    -------
+    tuple
+        The response texts and the cited texts
+    """
+
     message = client.messages.create(
model="claude-3-5-haiku-20241022", max_tokens=1024, @@ -92,6 +106,7 @@ def get(self, request): query = Embeddings.objects.filter(upload_file__guid=guid) + # TODO: Format into the Anthropic API"s expected input format in the anthropic_citations function chunks = [{"type": "text", "text": chunk.text} for chunk in query] texts, cited_texts = anthropic_citations(client, USER_PROMPT, chunks)