diff --git a/introspector.py b/introspector.py
new file mode 100644
index 0000000..c60fd70
--- /dev/null
+++ b/introspector.py
@@ -0,0 +1,49 @@
+"""Build the playground's initial text from the page's URL query parameters."""
+import streamlit as st
+import urllib.parse
+import requests
+
+# Seconds to wait for a remote message before giving up on it.
+REQUEST_TIMEOUT = 10
+
+
+def resolver(url):
+    """Fetch ``url`` over HTTP(S) and return the response body as text.
+
+    Raises ``requests.RequestException`` on a network failure, a timeout,
+    or a non-2xx response status.
+    """
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response.text
+
+
+def get_input():
+    """Return the default text for the input box, taken from the query string.
+
+    A ``text-input`` parameter wins and is returned as-is (first value).
+    Otherwise every ``messages`` value is URL-decoded; values that look like
+    an http(s) URL are downloaded with ``resolver`` and the rest are used
+    verbatim.  The pieces are concatenated in order.  Returns ``""`` when
+    neither parameter is present.
+    """
+    q = st.experimental_get_query_params()
+    if "text-input" in q:
+        # Query parameter values arrive as lists, but the text area that
+        # consumes this result needs a single string.
+        return q["text-input"][0]
+
+    parts = []
+    for item in q.get("messages", []):
+        message = urllib.parse.unquote(item)
+        if not message.startswith(("http://", "https://")):
+            parts.append(message)
+            continue
+        # NOTE(security): the URL comes straight from the query string, so
+        # this is a server-side fetch of an attacker-chosen address.  Add a
+        # host allow-list before deploying somewhere with internal services.
+        try:
+            parts.append(resolver(message))
+        except requests.RequestException as err:
+            st.warning(f"Could not fetch {message}: {err}")
+    return "".join(parts)
diff --git a/splitter.py b/splitter.py
index 02ad4d9..2f1d363 100644
--- a/splitter.py
+++ b/splitter.py
@@ -2,21 +2,72 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language
 import code_snippets as code_snippets
 import tiktoken
-
+import introspector
+import urllib
+import urllib.parse
+oparams = st.experimental_get_query_params()
+params = {
+    x: oparams[x][0] for x in oparams
+}
 # Streamlit UI
-st.title("Text Splitter Playground")
+st.title("Introspector Text Splitter Playground")
 st.info("""Split a text into chunks using a **Text Splitter**. Parameters include:
+Fork of the amazing https://langchain-text-splitter.streamlit.app
+
+## URL Specification
+
+This specification outlines the structure of URLs used in the application, detailing the query parameters and their expected values.
+
+### General URL Structure
+- URLs should follow the standard format: `http://example.com/path/to/resource?query_parameter=value`
+
+### Query Parameters
+
+1. `text-input` (Optional)
+   - Description: Represents text input for the application.
+   - Value: A URL-encoded string containing the text input data.
+   - Example: `http://example.com/app?text-input=This+is+an+example+text`
+
+2. `messages` (Optional)
+   - Description: Represents a list of messages or data items.
+   - Value: A list of URL-encoded strings, where each string represents a message or data item.
+   - Example: `http://example.com/app?messages=http%3A%2F%2Fmessage1.com&messages=http%3A%2F%2Fmessage2.com`
+
+3. `chunk-size`: Max size of the resulting chunks (in either characters or tokens, as selected)
+4. `chunk-overlap`: Overlap between the resulting chunks (in either characters or tokens, as selected)
+5. `length-function`: How to measure lengths of chunks, examples are included for either characters or tokens
+6. `base-url`: URL to use as the base of the generated share links
+7. `text-splitter`: Which splitter algorithm to use, one of `["RecursiveCharacter", "Character"] + [str(v) for v in Language]`
+   - The type of the text splitter, this largely controls the separators used to split on
+
+### Processing Logic
+
+1. If the `text-input` parameter is present in the URL, the application should use the value associated with `text-input` as the text input data.
+
+2. If the `messages` parameter is present in the URL, the application should iterate through each item in the list of messages.
+
+3. For each message (item) in the list, the application should:
+   - Decode URL-encoded characters in the message.
+   - Check if the decoded message starts with "http://" or "https://" (indicating a URL).
+   - If it does, the application should resolve the URL using the `resolver` function and use the resolved content.
+   - If it doesn't, the application should handle it as other content.
+
+### Handling Other Content
+
+- If a message is not an http(s) URL (indicating other content), the application should:
+  - Use the original content as input
+
 
-- `chunk_size`: Max size of the resulting chunks (in either characters or tokens, as selected)
-- `chunk_overlap`: Overlap between the resulting chunks (in either characters or tokens, as selected)
-- `length_function`: How to measure lengths of chunks, examples are included for either characters or tokens
-- The type of the text splitter, this largely controls the separators used to split on
 """)
 
 col1, col2, col3, col4 = st.columns([1, 1, 1, 2])
 
 with col1:
-    chunk_size = st.number_input(min_value=1, label="Chunk Size", value=1000)
+    chunk_size = st.number_input(
+        min_value=1,
+        label="Chunk Size",
+        value=int(params.get("chunk-size",1000)),
+        key="chunk-size")
 
 with col2:
     # Setting the max value of chunk_overlap based on chunk_size
@@ -24,7 +75,8 @@
         min_value=1,
         max_value=chunk_size - 1,
         label="Chunk Overlap",
-        value=int(chunk_size * 0.2),
+        value=int(params.get("chunk-overlap",int(chunk_size * 0.2))),
+        key="chunk-overlap"
     )
 
     # Display a warning if chunk_overlap is not less than chunk_size
@@ -32,15 +84,26 @@
         st.warning("Chunk Overlap should be less than Chunk Length!")
 
 with col3:
+    opts =["Characters", "Tokens"]
     length_function = st.selectbox(
-        "Length Function", ["Characters", "Tokens"]
+        "Length Function", opts,
+        key="length-function",
+        index=opts.index(params["length-function"]) if params.get("length-function") in opts else 0,
     )
 
 splitter_choices = ["RecursiveCharacter", "Character"] + [str(v) for v in Language]
 
 with col4:
+    # Preselect the splitter named by the `text-splitter` query parameter, if valid
+    choice = params.get("text-splitter",splitter_choices[0])
+    opt_index = 0
+    if choice in splitter_choices:
+        opt_index = splitter_choices.index(choice)
+
     splitter_choice = st.selectbox(
-        "Select a Text Splitter", splitter_choices
+        "Select a Text Splitter", splitter_choices,
+        key="text-splitter",
+        index=opt_index,
     )
 
 if length_function == "Characters":
@@ -84,10 +147,31 @@ def length_function(text: str) -> int:
 
 st.info(import_text)
 
+#for x in oparams:
+#    if x in st.session_state:
+#        # fixme validate these
+#        #if x in ("mode","input_id","workflow"):
+#            #st.write("DEBUG",x,st.session_state[x],oparams[x][0])
+#        #st.session_state[x] = oparams[x][0]
+
 # Box for pasting text
-doc = st.text_area("Paste your text here:")
+default_text = introspector.get_input()
+#st.code(default_text)
+base_url = st.text_input("base_url", key="base-url", value=params.get("base-url",""), help="for the target")
+
+doc = st.text_area("Paste your text here:", key="text-input", value=default_text, height=400)
+
+## create self link
+q= st.experimental_get_query_params()
+for x in st.session_state:
+    v = st.session_state[x]
+    q[x]= v
+q["text-input"]=q["text-input"][0:256] #truncate
+encoded_query = urllib.parse.urlencode(q, doseq=True)
+st.markdown(f"* share [input_link full]({base_url}/?{encoded_query})")
 
 # Split text button
+#if (len(default_text ) >10) or
 if st.button("Split Text"):
     # Choose splitter
     if splitter_choice == "Character":
@@ -113,5 +197,12 @@ def length_function(text: str) -> int:
     # Display the splits
     for idx, split in enumerate(splits, start=1):
         st.text_area(
-            f"Split {idx}", split, height=200
+            f"Split {idx}", split, height=200,
         )
+        q["text-input"] = split
+        q["idx"] = idx
+        encoded_query = urllib.parse.urlencode(q, doseq=True)
+        st.markdown(f"* share [input_link {split[0:50]}]({base_url}/?{encoded_query})")
+
+
+