From 2a5dcd619828b264644173a0ce19eb82ca22c42d Mon Sep 17 00:00:00 2001 From: mike dupont Date: Fri, 15 Sep 2023 14:58:10 -0400 Subject: [PATCH 1/4] v1 --- introspector.py | 31 ++++++++++++++++++++++++ splitter.py | 64 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 introspector.py diff --git a/introspector.py b/introspector.py new file mode 100644 index 0000000..2bc994b --- /dev/null +++ b/introspector.py @@ -0,0 +1,31 @@ +import streamlit as st +import urllib.parse +import requests + +def resolver(url): + data2 = requests.get(url) + value = data2.text + return value + +def get_input(): + total = "" + q= st.experimental_get_query_params() + new_messages = [] + if "text" in q: + return q["text"] + if "messages" in q: + for item in q["messages"]: + new1 = urllib.parse.unquote(item) + + if new1.startswith("http"): + #st.write("DEBUG1",new1) + new2 = resolver(new1) + #st.write("DEBUG2",new2) + else: + st.write("OTHER",new1) + new2 = new1 + pass + total = total + new2 + #st.session_state['text-input'] = total + #st.write("DEBUG",total) + return total diff --git a/splitter.py b/splitter.py index 02ad4d9..2e9db4b 100644 --- a/splitter.py +++ b/splitter.py @@ -2,7 +2,13 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language import code_snippets as code_snippets import tiktoken - +import introspector +import urllib +import urllib.parse +oparams = st.experimental_get_query_params() +params = { + x: oparams[x][0] for x in oparams +} # Streamlit UI st.title("Text Splitter Playground") @@ -16,7 +22,11 @@ col1, col2, col3, col4 = st.columns([1, 1, 1, 2]) with col1: - chunk_size = st.number_input(min_value=1, label="Chunk Size", value=1000) + chunk_size = st.number_input( + min_value=1, + label="Chunk Size", + value=int(params.get("chunk-size",1000)), + key="chunk-size") with col2: # Setting the max value of chunk_overlap based on chunk_size @@ -24,7 +34,8 @@ 
min_value=1, max_value=chunk_size - 1, label="Chunk Overlap", - value=int(chunk_size * 0.2), + value=int(params.get("chunk-overlap",int(chunk_size * 0.2))), + key="chunk-overlap" ) # Display a warning if chunk_overlap is not less than chunk_size @@ -32,15 +43,26 @@ st.warning("Chunk Overlap should be less than Chunk Length!") with col3: + opts =["Characters", "Tokens"] length_function = st.selectbox( - "Length Function", ["Characters", "Tokens"] + "Length Function", opts, + key="LengthFunction", + index=opts.index(params.get("LengthFunction","Characters")) ) splitter_choices = ["RecursiveCharacter", "Character"] + [str(v) for v in Language] with col4: +# splitter_choices + choice = params.get("text_splitter",splitter_choices[0]) + opt_index = 0 + if choice in splitter_choices: + opt_index = splitter_choices.index(choice) + splitter_choice = st.selectbox( - "Select a Text Splitter", splitter_choices + "Select a Text Splitter", splitter_choices, + key="text_splitter", + index=opt_index, ) if length_function == "Characters": @@ -84,10 +106,29 @@ def length_function(text: str) -> int: st.info(import_text) +#for x in oparams: +# if x in st.session_state: + # fixme validate thise + #if x in ("mode","input_id","workflow"): + #st.write("DEBUG",x,st.session_state[x],oparams[x][0]) + #st.session_state[x] = oparams[x][0] + # Box for pasting text -doc = st.text_area("Paste your text here:") +default_text = introspector.get_input() +#st.code(default_text) +doc = st.text_area("Paste your text here:", key="text-input", value=default_text, height=400) + +## create self link +q= st.experimental_get_query_params() +for x in st.session_state: + v = st.session_state[x] + q[x]= v +q["text-input"]=q["text-input"][0:256] #truncate +encoded_query = urllib.parse.urlencode(q, doseq=True) +st.markdown(f"* share [input_link full](/?{encoded_query})") # Split text button +#if (len(default_text ) >10) or if st.button("Split Text"): # Choose splitter if splitter_choice == "Character": @@ -113,5 
+154,14 @@ def length_function(text: str) -> int: # Display the splits for idx, split in enumerate(splits, start=1): st.text_area( - f"Split {idx}", split, height=200 + + f"Split {idx}", split, height=200, + #key= ) + q["text"] = split + q["idx"] = split + encoded_query = urllib.parse.urlencode(q, doseq=True) + st.markdown(f"* share [input_link {split[0:50]}](/?{encoded_query})") + + + From 80530f45a8c503c657f8b6b6973d62175401f97f Mon Sep 17 00:00:00 2001 From: mike dupont Date: Fri, 15 Sep 2023 15:31:11 -0400 Subject: [PATCH 2/4] working better --- introspector.py | 4 ++-- splitter.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/introspector.py b/introspector.py index 2bc994b..c60fd70 100644 --- a/introspector.py +++ b/introspector.py @@ -11,8 +11,8 @@ def get_input(): total = "" q= st.experimental_get_query_params() new_messages = [] - if "text" in q: - return q["text"] + if "text-input" in q: + return q["text-input"] if "messages" in q: for item in q["messages"]: new1 = urllib.parse.unquote(item) diff --git a/splitter.py b/splitter.py index 2e9db4b..d16e795 100644 --- a/splitter.py +++ b/splitter.py @@ -116,6 +116,8 @@ def length_function(text: str) -> int: # Box for pasting text default_text = introspector.get_input() #st.code(default_text) +base_url = st.text_input("base_url", key="base-url", value=params.get("base-url",""), help="for the target") + doc = st.text_area("Paste your text here:", key="text-input", value=default_text, height=400) ## create self link @@ -125,7 +127,7 @@ def length_function(text: str) -> int: q[x]= v q["text-input"]=q["text-input"][0:256] #truncate encoded_query = urllib.parse.urlencode(q, doseq=True) -st.markdown(f"* share [input_link full](/?{encoded_query})") +st.markdown(f"* share [input_link full]({base_url}/?{encoded_query})") # Split text button #if (len(default_text ) >10) or @@ -158,10 +160,10 @@ def length_function(text: str) -> int: f"Split {idx}", split, height=200, #key= ) - q["text"] = 
split + q["text-input"] = split q["idx"] = split encoded_query = urllib.parse.urlencode(q, doseq=True) - st.markdown(f"* share [input_link {split[0:50]}](/?{encoded_query})") + st.markdown(f"* share [input_link {split[0:50]}]({base_url}/?{encoded_query})") From 4b3b4265f58c6f2f208ebecea22567ea11305df3 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Fri, 15 Sep 2023 15:42:53 -0400 Subject: [PATCH 3/4] next version --- splitter.py | 58 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/splitter.py b/splitter.py index d16e795..84739ae 100644 --- a/splitter.py +++ b/splitter.py @@ -11,13 +11,53 @@ } # Streamlit UI -st.title("Text Splitter Playground") +st.title("Introspector Text Splitter Playground") st.info("""Split a text into chunks using a **Text Splitter**. Parameters include: -- `chunk_size`: Max size of the resulting chunks (in either characters or tokens, as selected) -- `chunk_overlap`: Overlap between the resulting chunks (in either characters or tokens, as selected) -- `length_function`: How to measure lengths of chunks, examples are included for either characters or tokens -- The type of the text splitter, this largely controls the separators used to split on +## URL Specification + +This specification outlines the structure of URLs used in the application, detailing the query parameters and their expected values. + +### General URL Structure +- URLs should follow the standard format: `http://example.com/path/to/resource?query_parameter=value` + +### Query Parameters + +1. `text-input` (Optional) + - Description: Represents text input for the application. + - Value: A URL-encoded string containing the text input data. + - Example: `http://example.com/app?text-input=This+is+an+example+text` + +2. `messages` (Optional) + - Description: Represents a list of messages or data items. + - Value: A list of URL-encoded strings, where each string represents a message or data item. 
+ - Example: `http://example.com/app?messages=http%3A%2F%2Fmessage1.com&messages=http%3A%2F%2Fmessage2.com` + +3. `chunk-size`: Max size of the resulting chunks (in either characters or tokens, as selected) +4. `chunk-overlap`: Overlap between the resulting chunks (in either characters or tokens, as selected) +5. `length-function`: How to measure lengths of chunks, examples are included for either characters or tokens + - The type of the text splitter, this largely controls the separators used to split on +6. `base-url`: the URL to use as the base of the generated share links +7. `text-splitter`: which splitting algorithm to use, one of splitter_choices = ["RecursiveCharacter", "Character"] + [str(v) for v in Language] + +### Processing Logic + +1. If the `text-input` parameter is present in the URL, the application should use the value associated with `text-input` as the text input data. + +2. If the `messages` parameter is present in the URL, the application should iterate through each item in the list of messages. + +3. For each message (item) in the list, the application should: + - Decode URL-encoded characters in the message. + - Check if the decoded message starts with "http" (indicating a URL). + - If the message starts with "http," the application should resolve the URL using the `resolver` function and use the resolved content. + - If the message doesn't start with "http," the application should handle it as other content. 
+ +### Handling Other Content + +- If a message doesn't start with "http" (indicating other content), the application should: + - Use the original content as input + + """) col1, col2, col3, col4 = st.columns([1, 1, 1, 2]) @@ -46,8 +86,8 @@ opts =["Characters", "Tokens"] length_function = st.selectbox( "Length Function", opts, - key="LengthFunction", - index=opts.index(params.get("LengthFunction","Characters")) + key="length-function", + index=opts.index(params.get("length-function","Characters")) ) splitter_choices = ["RecursiveCharacter", "Character"] + [str(v) for v in Language] @@ -61,7 +101,7 @@ splitter_choice = st.selectbox( "Select a Text Splitter", splitter_choices, - key="text_splitter", + key="text-splitter", index=opt_index, ) @@ -156,9 +196,7 @@ def length_function(text: str) -> int: # Display the splits for idx, split in enumerate(splits, start=1): st.text_area( - f"Split {idx}", split, height=200, - #key= ) q["text-input"] = split q["idx"] = split From c726a6787aef56966b952603e17b734d2ee678b3 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Fri, 15 Sep 2023 15:46:57 -0400 Subject: [PATCH 4/4] shout out --- splitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/splitter.py b/splitter.py index 84739ae..2f1d363 100644 --- a/splitter.py +++ b/splitter.py @@ -13,6 +13,7 @@ # Streamlit UI st.title("Introspector Text Splitter Playground") st.info("""Split a text into chunks using a **Text Splitter**. Parameters include: +Fork of the amazing https://langchain-text-splitter.streamlit.app ## URL Specification