diff --git a/CHANGELOG.md b/CHANGELOG.md
index df33baf..7cb4b77 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - `Session` class `from_dataframe` method
 - `Session` class `reset_render_counter` method
 - `xml_usage.py` example
+- `dataframe_usage` example
 ### Changed
 - `LLMModel` enum updated
 - Test system modified
diff --git a/examples/README.md b/examples/README.md
index 39d1e2d..4ed7a4e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -16,3 +16,7 @@ It leverages the Memor library to manage independent conversation histories for
 
 ## XML Usage
 This example shows how XML can serve as a structured contract between a user, program logic, and an LLM using Memor. By converting XML to a tree, modifying it programmatically, and having the LLM refine it, the workflow becomes both dynamic and model-friendly.
+
+## Dataframe Usage
+This example demonstrates how Memor can serialize an entire chat session into a DataFrame, clean it with pandas, and rebuild a sanitized version for use with another LLM.
+A simulated multi-turn conversation is first exported as a DataFrame. In a second script, sensitive information (emails, passport-like IDs, card numbers) and oversized messages are automatically filtered out using regex rules and length constraints. The trimmed DataFrame is then converted back into a Memor Session and sent to another model, such as Mistral.
diff --git a/examples/dataframe_usage/1_secure_chat.py b/examples/dataframe_usage/1_secure_chat.py
new file mode 100644
index 0000000..b46526e
--- /dev/null
+++ b/examples/dataframe_usage/1_secure_chat.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+"""Step 1: Run an example conversation in a private session and export it as a DataFrame."""
+
+from memor import Prompt, Response, Session, Role, RenderFormat
+from mistralai import Mistral
+
+
+sample_messages = [
+    "Hey, I need help planning my 3-month Europe trip. I'll be visiting France, Italy, and Germany.",
+    "My budget is around $7,200. I'm trying to keep track of flight costs, hotels, and food.",
+    "Also here is my email just in case: personal.email@example.com",
+    "I'm thinking of booking a multi-city flight. Found one for $1240 on Lufthansa.",
+    "Here's a chunk of my notes: " + "lorem ipsum " * 400,  # intentionally long
+    "My passport number is X12345678. Please remind me to renew it.",
+    "Can you help me create a daily itinerary for France first?",
+]
+
+system_instruction = "You are a helpful assistant. Provide concise and accurate answers."
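+# Wrap the system instruction in a SYSTEM-role Memor prompt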
+system_prompt = Prompt(message=system_instruction, role=Role.SYSTEM)
+
+MISTRAL_API_KEY = "YOUR_MISTRAL_API_KEY"
+MISTRAL_MODEL = "mistral-large-latest"
+mistral_client = Mistral(api_key=MISTRAL_API_KEY)
+
+session = Session(title="Private Chat")
+session.add_message(system_prompt)
+
+for msg in sample_messages:
+    p = Prompt(message=msg, role=Role.USER)
+    session.add_message(p)
+
+    response = mistral_client.chat.complete(
+        model=MISTRAL_MODEL,
+        messages=session.render(RenderFormat.OPENAI)
+    ).choices[0].message.content
+
+    r = Response(
+        message=response,
+        role=Role.ASSISTANT,
+    )
+    session.add_message(r)
+
+df = session.to_dataframe()
+df.to_pickle('2_session_df.pkl')
diff --git a/examples/dataframe_usage/2_session_df.pkl b/examples/dataframe_usage/2_session_df.pkl
new file mode 100644
index 0000000..2410155
Binary files /dev/null and b/examples/dataframe_usage/2_session_df.pkl differ
diff --git a/examples/dataframe_usage/3_trimmed_chat.py b/examples/dataframe_usage/3_trimmed_chat.py
new file mode 100644
index 0000000..f26773e
--- /dev/null
+++ b/examples/dataframe_usage/3_trimmed_chat.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+"""Step 3: Rebuild a trimmed, sanitized session from the DataFrame and continue it with Mistral."""
+
+from memor import Prompt, Session, Role, RenderFormat
+from mistralai import Mistral
+import pandas as pd
+import re
+
+
+SENSITIVE_REGEX = re.compile(
+    r"(?:[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})|"  # email
+    r"(?:\bX\d{8}\b)|"  # passport-like
+    r"(?:\b(?:\d[ -]*?){13,16}\b)",  # credit card
+    flags=re.IGNORECASE
+)
+MAX_CHARS = 1500  # length cap so the trimmed session fits a smaller context window
+
+
+df = pd.read_pickle('2_session_df.pkl')
+print("Original session size:", len(df))
+print(df.head())
+contains_sensitive = df["message"].astype(str).str.contains(SENSITIVE_REGEX)
+
+# Disable messages that contain sensitive information or are too long:
+df.loc[contains_sensitive, "status"] = False
+df.loc[df["message"].astype(str).str.len() > MAX_CHARS, "status"] = False
+
+# Anonymize full names inside the messages
+NAME_REGEX = re.compile(r"\b[A-Z][a-z]{1,20}\s[A-Z][a-z]{1,20}\b")
+
+def anonymize_names(text: str) -> str:
+    """
+    Anonymize the text by replacing names with [REDACTED_NAME].
+
+    :param text: the text to be modified
+    """
+    return NAME_REGEX.sub("[REDACTED_NAME]", text)
+
+
+df["message"] = df["message"].astype(str).apply(anonymize_names)
+
+
+# Keep only active messages
+df = df[df["status"]].reset_index(drop=True)
+print("Trimmed session size:", len(df))
+print(df.head())
+
+MISTRAL_API_KEY = "YOUR_MISTRAL_API_KEY"
+MISTRAL_MODEL = "mistral-large-latest"
+mistral_client = Mistral(api_key=MISTRAL_API_KEY)
+
+session = Session(title="Trimmed Conversation")
+session.from_dataframe(df)
+
+p = Prompt(message="Can you summarize my plan?", role=Role.USER)
+session.add_message(p)
+
+mistral_response = mistral_client.chat.complete(
+    model=MISTRAL_MODEL,
+    messages=session.render(RenderFormat.OPENAI),
+).choices[0].message.content
+
+print(mistral_response)