Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions introspector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import streamlit as st
import urllib.parse
import requests

def resolver(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may wait forever on a stalled server.

    Returns:
        The response body decoded as text (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    # NOTE(review): get_input() passes values taken from the page's query
    # string straight to this function, so the URL is user-controlled.
    # Consider restricting the allowed hosts before deploying publicly.
    response = requests.get(url, timeout=timeout)
    return response.text

def get_input():
    """Build the default input text from the page's query parameters.

    If ``text-input`` is present it wins and its values are joined into a
    single string. Otherwise every ``messages`` value is URL-unquoted and
    concatenated in order: values starting with ``"http"`` are replaced by
    the content returned from ``resolver``; any other value is echoed with
    ``st.write("OTHER", ...)`` and used verbatim.

    Returns:
        The assembled text, or ``""`` when neither parameter is supplied.
    """
    q = st.experimental_get_query_params()

    if "text-input" in q:
        # Query parameter values come back as a list of strings; return a
        # plain string so both branches hand the caller the same type.
        return "".join(q["text-input"])

    parts = []
    for item in q.get("messages", []):
        new1 = urllib.parse.unquote(item)
        if new1.startswith("http"):
            parts.append(resolver(new1))
        else:
            st.write("OTHER", new1)
            parts.append(new1)
    return "".join(parts)
115 changes: 103 additions & 12 deletions splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,108 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language
import code_snippets as code_snippets
import tiktoken

import introspector
import urllib
import urllib.parse
# Keep only the first value of every query parameter so later lookups can
# treat each parameter as a single value.
oparams = st.experimental_get_query_params()
params = {name: values[0] for name, values in oparams.items()}

# Streamlit UI
st.title("Text Splitter Playground")
st.title("Introspector Text Splitter Playground")
st.info("""Split a text into chunks using a **Text Splitter**. Parameters include:
Fork of the amazing https://langchain-text-splitter.streamlit.app

## URL Specification

This specification outlines the structure of URLs used in the application, detailing the query parameters and their expected values.

### General URL Structure
- URLs should follow the standard format: `http://example.com/path/to/resource?query_parameter=value`

### Query Parameters

1. `text-input` (Optional)
- Description: Represents text input for the application.
- Value: A URL-encoded string containing the text input data.
- Example: `http://example.com/app?text-input=This+is+an+example+text`

2. `messages` (Optional)
- Description: Represents a list of messages or data items.
- Value: A list of URL-encoded strings, where each string represents a message or data item.
- Example: `http://example.com/app?messages=http%3A%2F%2Fmessage1.com&messages=http%3A%2F%2Fmessage2.com`

3. `chunk-size`: Max size of the resulting chunks (in either characters or tokens, as selected)
4. `chunk-overlap`: Overlap between the resulting chunks (in either characters or tokens, as selected)
5. `length-function`: How to measure lengths of chunks, examples are included for either characters or tokens
- The type of the text splitter, this largely controls the separators used to split on
6. `base-url`: what url to use as base
7. `text-splitter`: what algo to use; splitter_choices = ["RecursiveCharacter", "Character"] + [str(v) for v in Language]

### Processing Logic

1. If the `text-input` parameter is present in the URL, the application should use the value associated with `text-input` as the text input data.

2. If the `messages` parameter is present in the URL, the application should iterate through each item in the list of messages.

3. For each message (item) in the list, the application should:
- Decode URL-encoded characters in the message.
- Check if the decoded message starts with "http" (indicating a URL).
- If the message starts with "http," the application should resolve the URL using the `resolver` function and use the resolved content.
- If the message doesn't start with "http," the application should handle it as other content.

### Handling Other Content

- If a message doesn't start with "http" (indicating other content), the application should:
- Use the original content as input


- `chunk_size`: Max size of the resulting chunks (in either characters or tokens, as selected)
- `chunk_overlap`: Overlap between the resulting chunks (in either characters or tokens, as selected)
- `length_function`: How to measure lengths of chunks, examples are included for either characters or tokens
- The type of the text splitter, this largely controls the separators used to split on
""")
col1, col2, col3, col4 = st.columns([1, 1, 1, 2])

with col1:
chunk_size = st.number_input(min_value=1, label="Chunk Size", value=1000)
chunk_size = st.number_input(
min_value=1,
label="Chunk Size",
value=int(params.get("chunk-size",1000)),
key="chunk-size")

with col2:
# Setting the max value of chunk_overlap based on chunk_size
chunk_overlap = st.number_input(
min_value=1,
max_value=chunk_size - 1,
label="Chunk Overlap",
value=int(chunk_size * 0.2),
value=int(params.get("chunk-overlap",int(chunk_size * 0.2))),
key="chunk-overlap"
)

# Display a warning if chunk_overlap is not less than chunk_size
if chunk_overlap >= chunk_size:
st.warning("Chunk Overlap should be less than Chunk Length!")

with col3:
opts =["Characters", "Tokens"]
length_function = st.selectbox(
"Length Function", ["Characters", "Tokens"]
"Length Function", opts,
key="length-function",
index=opts.index(params.get("length-function","Characters"))
)

splitter_choices = ["RecursiveCharacter", "Character"] + [str(v) for v in Language]

with col4:
# Pre-select the splitter named in the URL. The documented parameter (and
# the widget key used for share links) is "text-splitter"; the older
# "text_splitter" spelling is still accepted as a fallback.
choice = params.get("text-splitter", params.get("text_splitter", splitter_choices[0]))
# Fall back to the first entry when the requested splitter is unknown.
opt_index = splitter_choices.index(choice) if choice in splitter_choices else 0

splitter_choice = st.selectbox(
"Select a Text Splitter", splitter_choices
"Select a Text Splitter", splitter_choices,
key="text-splitter",
index=opt_index,
)

if length_function == "Characters":
Expand Down Expand Up @@ -84,10 +147,31 @@ def length_function(text: str) -> int:

st.info(import_text)

#for x in oparams:
# if x in st.session_state:
# fixme validate thise
#if x in ("mode","input_id","workflow"):
#st.write("DEBUG",x,st.session_state[x],oparams[x][0])
#st.session_state[x] = oparams[x][0]

# Box for pasting text
doc = st.text_area("Paste your text here:")
default_text = introspector.get_input()
#st.code(default_text)
base_url = st.text_input("base_url", key="base-url", value=params.get("base-url",""), help="for the target")

doc = st.text_area("Paste your text here:", key="text-input", value=default_text, height=400)

## Build a link back to this page that carries the current widget values.
q = st.experimental_get_query_params()
# Session-state entries overwrite URL parameters of the same name.
q.update({key: st.session_state[key] for key in st.session_state})
# Only the first 256 characters of the text go into the link.
q["text-input"] = q["text-input"][:256]
encoded_query = urllib.parse.urlencode(q, doseq=True)
st.markdown(f"* share [input_link full]({base_url}/?{encoded_query})")

# Split text button
#if (len(default_text ) >10) or
if st.button("Split Text"):
# Choose splitter
if splitter_choice == "Character":
Expand All @@ -113,5 +197,12 @@ def length_function(text: str) -> int:
# Display the splits
for idx, split in enumerate(splits, start=1):
st.text_area(
f"Split {idx}", split, height=200
f"Split {idx}", split, height=200,
)
# Per-split share link: the chunk becomes the input text and "idx" records
# its position from enumerate() (previously the chunk text was stored in
# "idx" as well, duplicating "text-input").
q["text-input"] = split
q["idx"] = idx
encoded_query = urllib.parse.urlencode(q, doseq=True)
st.markdown(f"* share [input_link {split[0:50]}]({base_url}/?{encoded_query})")