lisadunlap · nazcol · Jan 29, 2025 · Feb 7, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/jupyter_notebook_naz/cache/llm_cache/data.mdb b/jupyter_notebook_naz/cache/llm_cache/data.mdb
diff --git a/jupyter_notebook_naz/cache/llm_cache/lock.mdb b/jupyter_notebook_naz/cache/llm_cache/lock.mdb
diff --git a/jupyter_notebook_naz/cache/llm_embed_cache/data.mdb b/jupyter_notebook_naz/cache/llm_embed_cache/data.mdb
diff --git a/jupyter_notebook_naz/cache/llm_embed_cache/lock.mdb b/jupyter_notebook_naz/cache/llm_embed_cache/lock.mdb
diff --git a/jupyter_notebook_naz/jupyter_classifier.ipynb b/jupyter_notebook_naz/jupyter_classifier.ipynb
diff --git a/llama_vs_not_llama_with_categories_pref.csv b/llama_vs_not_llama_with_categories_pref.csv
diff --git a/main_naz.py b/main_naz.py
diff --git a/no_iteration_working_main_naz.py b/no_iteration_working_main_naz.py
diff --git a/preset_new_vibes_naz.yaml b/preset_new_vibes_naz.yaml
@@ -0,0 +1,51 @@
+project: VibeCheck_Trigger
+entity: clipinvariance
+wandb: True # Set to True to log to Weights and Biases
+
+num_samples: False
+output_name: preset_gpt
+filter_mm_only: False
+filter: False
+num_final_eval: 10
+
+save_dir: pipeline_results
+
+
+# NOTE: absolute paths cause errors when this runs in a different environment or on a remote server, so the paths below are kept relative.
+
+data_path: data/gpt_vs_human/multisource_testing_train.csv
+#data_path: data/gpt_vs_human/vibecheck+llama.csv
+
+test_data_path: data/gpt_vs_human/multisource_testing_test.csv
+
+
+models : [human_answers,chatgpt_answers]
+
+judges: [gpt-4o-mini]
+eval_only: True
+
+axes: [
+    "Cognitive Tension - High: Complex, abstract language requiring significant cognitive effort to interpret. Low: Simple, easy-to-process language with minimal cognitive demand.",
+    "Surprise - High: Unexpected word choices, twists in phrasing, or novel expressions that violate typical expectations. Low: Predictable, conventional language patterns.",
+    "Affective Arousal - High: Intense, stimulating language that evokes strong emotional activation. Low: Calm, relaxed language with minimal emotional stimulation.",
+    "Trustworthiness - High: Direct, clear, and transparent language increasing perceived honesty. Low: Language feels evasive, vague, or exaggerated, reducing perceived credibility.",
+    "Ambiguity Tolerance - High: Intentionally ambiguous or poetic language allowing multiple interpretations. Low: Clear, definitive statements with minimal room for alternative interpretations.",
+    "Temporal Urgency - High: Rapid pacing, short sentence bursts, and urgent framing of ideas. Low: Relaxed pacing with longer pauses and slower information delivery.",
+    "Cognitive Ease - High: Clear, simple language with a natural flow, easy to process in real time. Low: Dense, technical language requiring re-reading or specialized knowledge.",
+    "Emotional Volatility - High: Frequent shifts in emotional tone or intensity within a single output. Low: Consistent emotional tone throughout the output.",
+    "Rhythmic Cadence - High: Smooth, rhythmic sentence patterns with a musical quality. Low: Choppy, fragmented sentence structures lacking flow.",
+    "Intimacy - High: Warm, personal language that builds emotional connection. Low: Distant, formal language that maintains emotional detachment."
+]
+
+proposer_prompt: proposer_prompt_freeform
+
+# models
+rubric_generation_model: "gpt-4o"
+proposer_model: "gpt-4o"
+
+#tf-idf
+use_tfidf: True
+num_tfidf_features: 1000
+idf_min_df: 2
+idf_max_df: 0.95
+idf_norm: l2
diff --git a/requirements_naz.txt b/requirements_naz.txt
@@ -0,0 +1,22 @@
+click
+pandas
+omegaconf
+tqdm
+wandb
+lmdb
+openai
+anthropic
+Pillow
+seaborn
+scipy
+scikit-learn
+fuzzywuzzy
+plotly
+sentence_transformers
+torch
+clip
+torchvision
+InstructorEmbedding
+open_clip_torch
+flask
+python-Levenshtein
diff --git a/updated_pref 2.py b/updated_pref 2.py
@@ -0,0 +1,130 @@
+from utils import parse_bullets
+from utils_llm import get_llm_output
+from utils_general import get_from_cache, save_to_cache
+import numpy as np
+import pandas as pd
+import re
+import argparse
+import openai
+import os
+from tqdm import tqdm
+
+# Keep a key that is already exported in the environment; the placeholder is
+# only a fallback. The previous unconditional assignment replaced a real key
+# with the placeholder text on every import.
+os.environ.setdefault("OPENAI_API_KEY", "YOUR OPENAI KEY")
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+# Instruction template for the pairwise LLM judge. ``{judge_input}`` is the
+# only ``str.format`` placeholder; the doubled braces on the last two lines
+# render as the literal text ``{reasoning}`` and ``{A, B, tie}``. The requested
+# "Model: ..." line is the format that extract_scores() parses.
+judge_prompt = """You are an impartial judge and evaluate the quality of the responses provided by two AI assistants (A and B) to the user question displayed below. You should choose the assistant that follows the user’s instructions and answers the user’s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
+
+Here is the prompt and the outputs of A and B respectively:
+
+{judge_input}
+
+Please respond with the model which contains a higher quality response. Based on your analysis, please explain your reasoning before assigning a score. Use the following format for your response:
+Analysis: {{reasoning}}
+Model: {{A, B, tie}}
+"""
+
+# Judge template for the swapped (B-first) presentation. It is the same text as
+# ``judge_prompt`` with the placeholder renamed to ``{judge_input_reversed}``,
+# so it is derived from ``judge_prompt`` instead of being kept as a second
+# copy that could silently drift. ``{judge_input}`` occurs exactly once in the
+# template, and the escaped ``{{...}}`` fields are not touched by the replace.
+judge_prompt_reversed = judge_prompt.replace("{judge_input}", "{judge_input_reversed}")
+
+
+# Explicit verdict line. The trailing \b keeps "Model: Assistant B" or
+# "Model: Both ..." from being read as A / B via their first letter.
+_VERDICT_LINE = re.compile(r"Model:\s*(A|B|tie)\b", re.IGNORECASE)
+# Fallback: a standalone A / B / tie token near the end of the output.
+_VERDICT_TAIL = re.compile(r"\b(A|B|tie)\b", re.IGNORECASE)
+_VERDICT_SCORES = {"a": 1, "b": -1, "tie": 0}
+
+
+def extract_scores(output):
+    """Parse the judge verdict out of an LLM response.
+
+    The response is expected to follow the format
+        Analysis: {{reasoning}}
+        Model: {{A, B, tie}}
+
+    Args:
+        output: Raw judge response text.
+
+    Returns:
+        1 when the verdict is A, -1 when it is B, and 0 for a tie or when no
+        verdict can be found (including non-string input such as None/NaN).
+    """
+    if not isinstance(output, str):
+        print(f"Invalid score: {output}")
+        return 0
+
+    # Drop "Output " prefixes ("Model: Output A") and markdown emphasis or
+    # heading markers ("**Model:** A") before matching.
+    cleaned = output.replace("Output ", "").replace("output ", "")
+    cleaned = re.sub(r"[#*]", "", cleaned)
+
+    verdicts = _VERDICT_LINE.findall(cleaned)
+    if not verdicts:
+        # No explicit verdict line: look for a bare token in the last 20 chars.
+        verdicts = _VERDICT_TAIL.findall(cleaned[-20:])
+    if not verdicts:
+        print(f"Invalid score: {verdicts}")
+        return 0
+
+    # Matching is case-insensitive, so normalise before the lookup.
+    return _VERDICT_SCORES[verdicts[0].lower()]
+
+
+import argparse
+
+
+def __main__():
+    """Ask an LLM judge which of two response columns it prefers for each row.
+
+    Reads the CSV at ``--data_path``, builds one judge input per row in both
+    presentation orders (A/B and B/A), sends both through ``get_llm_output``
+    to ``--judge_model``, parses the verdicts with ``extract_scores`` and
+    writes the frame with the added columns to ``--output_path``.
+
+    Added columns:
+        judge_input / judge_input_reversed: text shown to the judge.
+        preference_reversed: score of the swapped run (1 = response shown as
+            A, which is the second model in that run).
+        position_bias: True when both runs returned the same score.
+        preference_feature: 1 (first model), -1 (second model) or 0.
+        preference: name of the preferred model column, or "equal".
+        preference_model: the judge model used.
+
+    Raises:
+        KeyError: if the CSV lacks ``question`` or one of the model columns.
+    """
+    parser = argparse.ArgumentParser()
+    # NOTE(review): the path defaults are absolute paths from the original
+    # author's machine; pass --data_path / --output_path when running elsewhere.
+    parser.add_argument("--data_path", type=str, default="/home/nazcol/VibeCheck/data/llama3-70b-arena/llama_vs_not_llama_with_categories.csv")
+    parser.add_argument("--models", nargs="+", default=["human_answers", "chatgpt_answers"])
+    parser.add_argument("--output_path", type=str, default="/home/nazcol/VibeCheck/data/llama3-70b-arena/llama_vs_not_llama_with_categories_pref.csv")
+    parser.add_argument("--judge_model", type=str, default="gpt-4o", help="Model to use for judging preference")
+    parser.add_argument("--test", action="store_true")
+    args = parser.parse_args()
+
+    # nargs="+" accepts any count, but the comparison is strictly pairwise.
+    if len(args.models) != 2:
+        parser.error("--models expects exactly two column names")
+    model_a, model_b = args.models
+
+    print(f"Loading data from {args.data_path}...")
+    df = pd.read_csv(args.data_path)
+    print(f"Data loaded: {len(df)} rows.")
+
+    missing = [col for col in ("question", model_a, model_b) if col not in df.columns]
+    if missing:
+        raise KeyError(f"{args.data_path} is missing required column(s): {missing}")
+
+    if args.test:
+        # Copy so the column assignments below do not write into a slice.
+        df = df.head(10).copy()
+        print("Running in test mode: Using first 10 rows.")
+
+    print("Generating judge inputs...")
+    judge_inputs = [
+        f"Prompt: {question}\nOutput A: {first}\nOutput B: {second}"
+        for question, first, second in zip(df["question"], df[model_a], df[model_b])
+    ]
+    judge_inputs_reversed = [
+        f"Prompt: {question}\nOutput A: {second}\nOutput B: {first}"
+        for question, first, second in zip(df["question"], df[model_a], df[model_b])
+    ]
+    df["judge_input"] = judge_inputs
+    df["judge_input_reversed"] = judge_inputs_reversed
+
+    # Wrap every input in the judge instructions: extract_scores() looks for
+    # the "Model: ..." line that only these templates ask the judge to write.
+    # NOTE(review): assumes get_llm_output does not add the instructions
+    # itself -- confirm against utils_llm.
+    prompts = [judge_prompt.format(judge_input=text) for text in judge_inputs]
+    prompts_reversed = [
+        judge_prompt_reversed.format(judge_input_reversed=text)
+        for text in judge_inputs_reversed
+    ]
+
+    print("Getting preferences from LLM using 32 threads...")
+    preferences = get_llm_output(prompts, args.judge_model)
+    preferences_reversed = get_llm_output(prompts_reversed, args.judge_model)
+
+    print("Extracting scores...")
+    df["preference"] = [extract_scores(x) for x in tqdm(preferences, desc="Extracting Scores")]
+    df["preference_reversed"] = [
+        extract_scores(x) for x in tqdm(preferences_reversed, desc="Extracting Reversed Scores")
+    ]
+
+    # The swapped run shows the second model as A, so a judge that follows the
+    # content flips its score between runs. Identical scores mean it followed
+    # the position (or called a tie twice); those rows are scored 0.
+    df["position_bias"] = df["preference_reversed"] == df["preference"]
+    df["preference_feature"] = df["preference"].where(~df["position_bias"], 0)
+    df["preference"] = df["preference_feature"].map({1: model_a, -1: model_b, 0: "equal"})
+    df["preference_model"] = args.judge_model
+    print("Final preference counts:", df["preference"].value_counts().to_dict())
+
+    df.to_csv(args.output_path, index=False)
+    print(f"Saved to {args.output_path}")
+
+if __name__ == "__main__":
+    __main__()
diff --git a/utils_general_naz.py b/utils_general_naz.py
@@ -0,0 +1,46 @@
+import hashlib
+from typing import Dict, List, Optional
+
+import lmdb
+from PIL import Image
+import pickle
+
+
+def hash_key(key) -> str:
+    """Return the hexadecimal SHA-256 digest of ``key``.
+
+    ``key`` is encoded with ``key.encode()`` (default encoding) before hashing.
+    """
+    digest = hashlib.sha256()
+    digest.update(key.encode())
+    return digest.hexdigest()
+
+
+def get_from_cache(key: str, env: lmdb.Environment) -> Optional[str]:
+    """Look up the text cached under ``key``.
+
+    The entry is stored under the encoded ``hash_key(key)``. Returns the
+    decoded value, or None when nothing usable is stored.
+    """
+    lookup = hash_key(key).encode()
+    with env.begin(write=False) as txn:
+        raw = txn.get(lookup)
+    # Truthiness test: a missing entry (None) and an empty stored value both
+    # come back as None.
+    return raw.decode() if raw else None
+
+
+def save_to_cache(key: str, value: str, env: lmdb.Environment):
+    """Store the text ``value`` under ``key`` in the LMDB environment.
+
+    The entry key is the encoded ``hash_key(key)``; the value is written as
+    ``value.encode()`` inside a single write transaction.
+    """
+    lookup = hash_key(key).encode()
+    payload = value.encode()
+    with env.begin(write=True) as txn:
+        txn.put(lookup, payload)
+
+
+def save_emb_to_cache(key: str, value, env: lmdb.Environment):
+    """Store an arbitrary Python object (e.g. an embedding) under ``key``.
+
+    The object is serialized with ``pickle.dumps`` and written under the
+    encoded ``hash_key(key)`` inside a single write transaction.
+    """
+    lookup = hash_key(key).encode()
+    blob = pickle.dumps(value)
+    with env.begin(write=True) as txn:
+        txn.put(lookup, blob)
+
+
+def get_emb_from_cache(key: str, env: lmdb.Environment):
+    """Load the pickled object cached under ``key``.
+
+    Returns the deserialized object, or None when the key is not in the cache.
+    """
+    with env.begin(write=False) as txn:
+        blob = txn.get(hash_key(key).encode())
+        # NOTE(security): pickle.loads runs arbitrary code from the stored
+        # bytes; only point this at cache files you created yourself.
+        return None if blob is None else pickle.loads(blob)