Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added jupyter_notebook_naz/cache/llm_cache/data.mdb
Binary file not shown.
Binary file added jupyter_notebook_naz/cache/llm_cache/lock.mdb
Binary file not shown.
Binary file not shown.
Binary file not shown.
307 changes: 307 additions & 0 deletions jupyter_notebook_naz/jupyter_classifier.ipynb

Large diffs are not rendered by default.

332,043 changes: 332,043 additions & 0 deletions llama_vs_not_llama_with_categories_pref.csv

Large diffs are not rendered by default.

434 changes: 434 additions & 0 deletions main_naz.py

Large diffs are not rendered by default.

599 changes: 599 additions & 0 deletions no_iteration_working_main_naz.py

Large diffs are not rendered by default.

51 changes: 51 additions & 0 deletions preset_new_vibes_naz.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
project: VibeCheck_Trigger
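entitiy : clipinvariance  # NOTE(review): key looks like a typo for "entity" — confirm which key the config loader reads before renaming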
wandb: True # Set to True to log to Weights and Biases

num_samples: False
output_name: preset_gpt
filter_mm_only: False
filter: False
num_final_eval: 10

save_dir: pipeline_results


# NOTE: keep the paths below relative to the repo root. Absolute paths caused
# errors when this config was run in a different environment or on a remote server.

data_path: data/gpt_vs_human/multisource_testing_train.csv
#data_path: data/gpt_vs_human/vibecheck+llama.csv

test_data_path: data/gpt_vs_human/multisource_testing_test.csv


models : [human_answers,chatgpt_answers]

judges: [gpt-4o-mini]
eval_only: True

axes: [
"Cognitive Tension - High: Complex, abstract language requiring significant cognitive effort to interpret. Low: Simple, easy-to-process language with minimal cognitive demand.",
"Surprise - High: Unexpected word choices, twists in phrasing, or novel expressions that violate typical expectations. Low: Predictable, conventional language patterns.",
"Affective Arousal - High: Intense, stimulating language that evokes strong emotional activation. Low: Calm, relaxed language with minimal emotional stimulation.",
"Trustworthiness - High: Direct, clear, and transparent language increasing perceived honesty. Low: Language feels evasive, vague, or exaggerated, reducing perceived credibility.",
"Ambiguity Tolerance - High: Intentionally ambiguous or poetic language allowing multiple interpretations. Low: Clear, definitive statements with minimal room for alternative interpretations.",
"Temporal Urgency - High: Rapid pacing, short sentence bursts, and urgent framing of ideas. Low: Relaxed pacing with longer pauses and slower information delivery.",
"Cognitive Ease - High: Clear, simple language with a natural flow, easy to process in real time. Low: Dense, technical language requiring re-reading or specialized knowledge.",
"Emotional Volatility - High: Frequent shifts in emotional tone or intensity within a single output. Low: Consistent emotional tone throughout the output.",
"Rhythmic Cadence - High: Smooth, rhythmic sentence patterns with a musical quality. Low: Choppy, fragmented sentence structures lacking flow.",
"Intimacy - High: Warm, personal language that builds emotional connection. Low: Distant, formal language that maintains emotional detachment."
]

proposer_prompt: proposer_prompt_freeform

# models
rubric_generation_model: "gpt-4o"
proposer_model: "gpt-4o"

#tf-idf
use_tfidf: True
num_tfidf_features: 1000
idf_min_df: 2
idf_max_df: 0.95
idf_norm: l2
22 changes: 22 additions & 0 deletions requirements_naz.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
click
pandas
omegaconf
tqdm
wandb
lmdb
openai
anthropic
Pillow
seaborn
scipy
scikit-learn
fuzzywuzzy
plotly
sentence_transformers
torch
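clip  # NOTE(review): the PyPI package named "clip" is not OpenAI CLIP; if OpenAI CLIP is intended, install it from git+https://github.com/openai/CLIP.git — confirm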
torchvision
InstructorEmbedding
open_clip_torch
flask
python-Levenshtein
130 changes: 130 additions & 0 deletions updated_pref 2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
from utils import parse_bullets
from utils_llm import get_llm_output
from utils_general import get_from_cache, save_to_cache
import numpy as np
import pandas as pd
import re
import argparse
import openai
import os
from tqdm import tqdm

# Take the API key from the environment only. Assigning a literal to
# os.environ["OPENAI_API_KEY"] here would overwrite a real key exported in the
# shell with a placeholder, and invites committing secrets to the repository.
# Export OPENAI_API_KEY before running this script.
openai.api_key = os.getenv("OPENAI_API_KEY")


# Instruction template for the pairwise judge. It has a single `{judge_input}`
# placeholder; the doubled braces (`{{...}}`) render as literal braces when the
# template is filled with `str.format`. It asks the judge to answer in the
# "Analysis: ... / Model: ..." layout.
judge_prompt = """You are an impartial judge and evaluate the quality of the responses provided by two AI assistants (A and B) to the user question displayed below. You should choose the assistant that follows the user’s instructions and answers the user’s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.

Here is the prompt and the outputs of A and B respectively:

{judge_input}

Please respond with the model which contains a higher quality response. Based on your analysis, please explain your reasoning before assigning a score. Use the following format for your response:
Analysis: {{reasoning}}
Model: {{A, B, tie}}
"""

# Same instruction text as `judge_prompt`; the only difference is the
# placeholder name, `{judge_input_reversed}`.
judge_prompt_reversed = """You are an impartial judge and evaluate the quality of the responses provided by two AI assistants (A and B) to the user question displayed below. You should choose the assistant that follows the user’s instructions and answers the user’s question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible.

Here is the prompt and the outputs of A and B respectively:

{judge_input_reversed}

Please respond with the model which contains a higher quality response. Based on your analysis, please explain your reasoning before assigning a score. Use the following format for your response:
Analysis: {{reasoning}}
Model: {{A, B, tie}}
"""


# Primary verdict line, e.g. "Model: A". Whitespace after the colon is optional
# and the trailing \b stops "Model: Assistant B" from being read as verdict "A".
_MODEL_VERDICT_RE = re.compile(r"Model:\s*(A|B|tie)\b", re.IGNORECASE)
# Fallback: a stand-alone A / B / tie token near the end of the output.
_TRAILING_VERDICT_RE = re.compile(r"\b(A|B|tie)\b", re.IGNORECASE)
# Verdicts are compared lower-cased so "TIE", "Tie" and "tie" all map to 0.
_VERDICT_TO_SCORE = {"a": 1, "b": -1, "tie": 0}


def extract_scores(output):
    """Parse the judge's verdict out of an LLM response.

    The expected response format is::

        Analysis: {{reasoning}}
        Model: {{A, B, tie}}

    Markdown markers (``#``, ``*``) and the words "Output "/"output " are
    stripped first, so "**Model:** Output B" is read as "Model: B". If no
    "Model:" line is found, the last 20 characters are searched for a
    stand-alone ``A``, ``B`` or ``tie`` token. The first match wins.

    Args:
        output: Raw text returned by the judge model.

    Returns:
        1 if the verdict is A, -1 if it is B, and 0 for a tie. 0 is also
        returned (after printing "Invalid score: ...") when ``output`` is not
        a string or contains no recognizable verdict.
    """
    # Failed or missing LLM responses may arrive as None/NaN; treat them as
    # "no verdict" instead of crashing on str methods.
    if not isinstance(output, str):
        print(f"Invalid score: {output}")
        return 0

    cleaned = output.replace("Output ", "").replace("output ", "")
    cleaned = re.sub(r"[#*]", "", cleaned)

    verdicts = _MODEL_VERDICT_RE.findall(cleaned)
    if not verdicts:
        # Apply the end-of-output parse only when the structured line is absent.
        verdicts = _TRAILING_VERDICT_RE.findall(cleaned[-20:])
    if not verdicts:
        print(f"Invalid score: {verdicts}")
        return 0

    return _VERDICT_TO_SCORE[verdicts[0].lower()]


import argparse


def __main__():
    """Judge pairwise preferences between two answer columns of a CSV.

    Reads ``--data_path``, builds one judge prompt per row in both
    presentation orders (A/B and B/A), sends them to ``--judge_model``,
    parses the verdicts with ``extract_scores`` and writes the input frame
    plus these columns to ``--output_path``:

    * ``judge_input`` / ``judge_input_reversed``: the per-row comparison text.
    * ``preference_reversed``: parsed score for the swapped order.
    * ``position_bias``: True when both orders produced the same score,
      i.e. the judge picked the same slot regardless of content (or tied
      both times).
    * ``preference_feature``: forward-order score, forced to 0 on position bias.
    * ``preference``: ``models[0]``, ``models[1]`` or ``"equal"``.
    * ``preference_model``: the judge model name.

    Raises:
        KeyError: if ``question`` or either model column is missing from the CSV.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="/home/nazcol/VibeCheck/data/llama3-70b-arena/llama_vs_not_llama_with_categories.csv")
    parser.add_argument("--models", nargs="+", default=["human_answers", "chatgpt_answers"])
    parser.add_argument("--output_path", type=str, default="/home/nazcol/VibeCheck/data/llama3-70b-arena/llama_vs_not_llama_with_categories_pref.csv")
    parser.add_argument("--judge_model", type=str, default="gpt-4o", help="Model to use for judging preference")
    parser.add_argument("--test", action="store_true")
    args = parser.parse_args()

    # Only the first two entries are compared; fewer than two cannot work.
    if len(args.models) < 2:
        parser.error("--models needs two column names to compare")
    model_a, model_b = args.models[0], args.models[1]

    print(f"Loading data from {args.data_path}...")
    df = pd.read_csv(args.data_path)
    print(f"Data loaded: {len(df)} rows.")

    if args.test:
        # Copy so the column assignments below do not write into a slice.
        df = df.head(10).copy()
        print("Running in test mode: Using first 10 rows.")

    # Fail early with a readable message instead of a bare KeyError mid-loop.
    missing = [col for col in ("question", model_a, model_b) if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {args.data_path}: {missing}")

    print("Generating judge inputs...")
    judge_inputs = [
        f"Prompt: {question}\nOutput A: {first}\nOutput B: {second}"
        for question, first, second in zip(df["question"], df[model_a], df[model_b])
    ]
    judge_inputs_reversed = [
        f"Prompt: {question}\nOutput A: {second}\nOutput B: {first}"
        for question, first, second in zip(df["question"], df[model_a], df[model_b])
    ]
    df["judge_input"] = judge_inputs
    df["judge_input_reversed"] = judge_inputs_reversed

    # Wrap every comparison in the judge instructions. Without the template the
    # model never sees the task description or the "Model: ..." answer format
    # that extract_scores parses.
    prompts = [judge_prompt.format(judge_input=text) for text in judge_inputs]
    prompts_reversed = [
        judge_prompt_reversed.format(judge_input_reversed=text)
        for text in judge_inputs_reversed
    ]

    print("Getting preferences from LLM using 32 threads...")
    # NOTE(review): assumes get_llm_output accepts a list of prompts and returns
    # the responses in the same order — confirm against utils_llm.
    preferences = get_llm_output(prompts, args.judge_model)
    preferences_reversed = get_llm_output(prompts_reversed, args.judge_model)

    print("Extracting scores...")
    df["preference"] = [
        extract_scores(text) for text in tqdm(preferences, desc="Extracting Scores")
    ]
    df["preference_reversed"] = [
        extract_scores(text)
        for text in tqdm(preferences_reversed, desc="Extracting Reversed Scores")
    ]

    # In the reversed run slot A holds models[1], so a consistent judge flips
    # sign between runs. Equal scores mean the same slot (or a tie) won twice.
    df["position_bias"] = df["preference_reversed"] == df["preference"]
    df["preference_feature"] = df["preference"].where(~df["position_bias"], 0)

    label_by_score = {-1: model_b, 1: model_a, 0: "equal"}
    df["preference"] = df["preference_feature"].map(label_by_score)
    df["preference_model"] = args.judge_model
    print("Final preference counts:", df["preference"].value_counts().to_dict())

    df.to_csv(args.output_path, index=False)
    print(f"Saved to {args.output_path}")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    __main__()
46 changes: 46 additions & 0 deletions utils_general_naz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import hashlib
from typing import Dict, List, Optional

import lmdb
from PIL import Image
import pickle


def hash_key(key) -> str:
    """Return the hexadecimal SHA-256 digest of ``key``.

    ``key`` is encoded with ``str.encode()`` defaults before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(key.encode())
    return hasher.hexdigest()


def get_from_cache(key: str, env: lmdb.Environment) -> Optional[str]:
    """Look up the text stored for ``key`` in the LMDB environment ``env``.

    The entry is addressed by the encoded ``hash_key(key)`` digest. Returns the
    decoded value, or ``None`` when the stored value is missing or empty.
    """
    lookup = hash_key(key).encode()
    with env.begin(write=False) as txn:
        raw = txn.get(lookup)
        # Truthiness check: both a missing entry and an empty value yield None.
        return raw.decode() if raw else None


def save_to_cache(key: str, value: str, env: lmdb.Environment):
    """Store the text ``value`` for ``key`` in the LMDB environment ``env``.

    The entry is written under the encoded ``hash_key(key)`` digest inside a
    write transaction; ``value`` is stored as its encoded bytes.
    """
    lookup = hash_key(key).encode()
    payload = value.encode()
    with env.begin(write=True) as txn:
        txn.put(lookup, payload)


def save_emb_to_cache(key: str, value, env: lmdb.Environment):
    """Pickle ``value`` and store it for ``key`` in the LMDB environment ``env``.

    The entry is written under the encoded ``hash_key(key)`` digest inside a
    write transaction.
    """
    lookup = hash_key(key).encode()
    # Arbitrary Python objects are serialized with pickle before storage.
    blob = pickle.dumps(value)
    with env.begin(write=True) as txn:
        txn.put(lookup, blob)


def get_emb_from_cache(key: str, env: lmdb.Environment):
    """Load the pickled object stored for ``key`` in the LMDB environment ``env``.

    Returns the unpickled object, or ``None`` when no entry exists for the
    encoded ``hash_key(key)`` digest.
    """
    lookup = hash_key(key).encode()
    with env.begin(write=False) as txn:
        blob = txn.get(lookup)
        if blob is None:
            # No entry under this key.
            return None
        # NOTE(review): pickle.loads executes code embedded in the payload, so
        # only open cache files that come from a trusted source.
        return pickle.loads(blob)
Loading