diff --git a/ai/gemini.py b/ai/gemini.py index 241a72e..fa48bb7 100644 --- a/ai/gemini.py +++ b/ai/gemini.py @@ -23,7 +23,17 @@ def _execute_model_request(self, text): This image contains a multiple choice question. Using the latest accurate information from search results, tell me which answer is correct. Only tell me the correct answer letter (A, B, C, D, etc.), no explanation needed. - Question: {text}""" + Question: {text} + + IMPORTANT OUTPUT FORMATTING INSTRUCTIONS: + - If the question has multiple choice options (A, B, C, D, etc.), respond with ONLY the letter (e.g., 'A' or 'B') + - If the question asks for a number, respond with ONLY the number (e.g., '4' not 'four' or '4 times') + - If the question asks for a time period, respond with the most concise standard form (e.g., 'Quarterly' for questions about reporting frequency) + - If the question asks for a percentage, respond with ONLY the number and % symbol (e.g., '15%') + - If the question asks for a dollar amount, respond with ONLY the number and $ symbol (e.g., '$100') + - Do not include periods, explanatory text, or elaboration + - Do not include phrases like 'The answer is' or 'The correct answer is' + - Respond with the most standardized, concise form possible""" # Send the request with Google Search grounding enabled response = self.client.models.generate_content( diff --git a/ai/gpt4.py b/ai/gpt4.py index 264f75a..f42effe 100644 --- a/ai/gpt4.py +++ b/ai/gpt4.py @@ -27,7 +27,7 @@ def _execute_model_request(self, text): }, { "role": "user", - "content": f"This contains a multiple choice question. Tell me which answer is correct. Only tell me the correct answer, no explanation needed.\n\n{text}" + "content": f"This contains a multiple choice question. Tell me which answer is correct. 
Only tell me the correct answer, no explanation needed.\n\n{text}\n\nIMPORTANT OUTPUT FORMATTING INSTRUCTIONS:\n- If the question has multiple choice options (A, B, C, D, etc.), respond with ONLY the letter (e.g., 'A' or 'B')\n- If the question asks for a number, respond with ONLY the number (e.g., '4' not 'four' or '4 times')\n- If the question asks for a time period, respond with the most concise standard form (e.g., 'Quarterly' for questions about reporting frequency)\n- If the question asks for a percentage, respond with ONLY the number and % symbol (e.g., '15%')\n- If the question asks for a dollar amount, respond with ONLY the number and $ symbol (e.g., '$100')\n- Do not include periods, explanatory text, or elaboration\n- Do not include phrases like 'The answer is' or 'The correct answer is'\n- Respond with the most standardized, concise form possible" } ], "max_tokens": 100 diff --git a/ai/perplexity.py b/ai/perplexity.py index 8949a91..a4f10d1 100644 --- a/ai/perplexity.py +++ b/ai/perplexity.py @@ -27,7 +27,7 @@ def _execute_model_request(self, text): }, { "role": "user", - "content": f"This image contains a multiple choice question. Using the latest information tell me which answer is correct. Only tell me the correct answer, no explanation needed.\n\n{text}" + "content": f"This image contains a multiple choice question. Using the latest information tell me which answer is correct. 
Only tell me the correct answer, no explanation needed.\n\n{text}\n\nIMPORTANT OUTPUT FORMATTING INSTRUCTIONS:\n- If the question has multiple choice options (A, B, C, D, etc.), respond with ONLY the letter (e.g., 'A' or 'B')\n- If the question asks for a number, respond with ONLY the number (e.g., '4' not 'four' or '4 times')\n- If the question asks for a time period, respond with the most concise standard form (e.g., 'Quarterly' for questions about reporting frequency)\n- If the question asks for a percentage, respond with ONLY the number and % symbol (e.g., '15%')\n- If the question asks for a dollar amount, respond with ONLY the number and $ symbol (e.g., '$100')\n- Do not include periods, explanatory text, or elaboration\n- Do not include phrases like 'The answer is' or 'The correct answer is'\n- Respond with the most standardized, concise form possible" } ] } diff --git a/core/app.py b/core/app.py index 3f42f4b..8697f7e 100644 --- a/core/app.py +++ b/core/app.py @@ -346,29 +346,36 @@ def get_model_result(model_name): sonar_result = results["sonar"]["result"] gemini_result = results["gemini"]["result"] + # Normalize all answers for semantic comparison + from core.utils import normalize_answer + gpt4_normalized = normalize_answer(gpt4_result) + sonar_pro_normalized = normalize_answer(sonar_pro_result) + sonar_normalized = normalize_answer(sonar_result) + gemini_normalized = normalize_answer(gemini_result) + print("\n" + "="*60) - if gpt4_result == sonar_pro_result == sonar_result == gemini_result: - print("All models agree on the answer!") - elif gpt4_result == sonar_pro_result == sonar_result: - print("GPT-4, Sonar Pro, and Sonar agree, but Gemini differs") - elif gpt4_result == sonar_pro_result == gemini_result: - print("GPT-4, Sonar Pro, and Gemini agree, but Sonar differs") - elif gpt4_result == sonar_result == gemini_result: - print("GPT-4, Sonar, and Gemini agree, but Sonar Pro differs") - elif sonar_pro_result == sonar_result == gemini_result: - 
print("Sonar Pro, Sonar, and Gemini agree, but GPT-4 differs") - elif gpt4_result == sonar_pro_result: - print("GPT-4 and Sonar Pro agree, but Sonar and Gemini differ") - elif gpt4_result == sonar_result: - print("GPT-4 and Sonar agree, but Sonar Pro and Gemini differ") - elif gpt4_result == gemini_result: - print("GPT-4 and Gemini agree, but Sonar Pro and Sonar differ") - elif sonar_pro_result == sonar_result: - print("Sonar Pro and Sonar agree, but GPT-4 and Gemini differ") - elif sonar_pro_result == gemini_result: - print("Sonar Pro and Gemini agree, but GPT-4 and Sonar differ") - elif sonar_result == gemini_result: - print("Sonar and Gemini agree, but GPT-4 and Sonar Pro differ") + if gpt4_normalized == sonar_pro_normalized == sonar_normalized == gemini_normalized: + print("✅ All models agree on the answer!") + elif gpt4_normalized == sonar_pro_normalized == sonar_normalized: + print("⚠️ GPT-4, Sonar Pro, and Sonar agree, but Gemini differs") + elif gpt4_normalized == sonar_pro_normalized == gemini_normalized: + print("⚠️ GPT-4, Sonar Pro, and Gemini agree, but Sonar differs") + elif gpt4_normalized == sonar_normalized == gemini_normalized: + print("⚠️ GPT-4, Sonar, and Gemini agree, but Sonar Pro differs") + elif sonar_pro_normalized == sonar_normalized == gemini_normalized: + print("⚠️ Sonar Pro, Sonar, and Gemini agree, but GPT-4 differs") + elif gpt4_normalized == sonar_pro_normalized: + print("⚠️ GPT-4 and Sonar Pro agree, but Sonar and Gemini differ") + elif gpt4_normalized == sonar_normalized: + print("⚠️ GPT-4 and Sonar agree, but Sonar Pro and Gemini differ") + elif gpt4_normalized == gemini_normalized: + print("⚠️ GPT-4 and Gemini agree, but Sonar Pro and Sonar differ") + elif sonar_pro_normalized == sonar_normalized: + print("⚠️ Sonar Pro and Sonar agree, but GPT-4 and Gemini differ") + elif sonar_pro_normalized == gemini_normalized: + print("⚠️ Sonar Pro and Gemini agree, but GPT-4 and Sonar differ") + elif sonar_normalized == gemini_normalized: + 
import re

# Reporting-frequency phrases mapped to their "times per year" count.
_FREQUENCY_MAPPINGS = {
    'quarterly': '4',
    'four times a year': '4',
    '4 times a year': '4',
    'four times per year': '4',
    '4 times per year': '4',
    'every quarter': '4',
    'every 3 months': '4',
    'semi-annually': '2',
    'semiannually': '2',
    'twice a year': '2',
    '2 times a year': '2',
    'twice per year': '2',
    '2 times per year': '2',
    'every 6 months': '2',
    'annually': '1',
    'once a year': '1',
    '1 time a year': '1',
    'yearly': '1',
    'monthly': '12',
    '12 times a year': '12',
    'twelve times a year': '12',
}

# Spelled-out percentages mapped to their numeric form.
_PERCENTAGE_MAPPINGS = {
    'fifty percent': '50%',
    'twenty-five percent': '25%',
    'ten percent': '10%',
    'five percent': '5%',
    'one percent': '1%',
    'zero percent': '0%',
}

# Number words (only applied when the whole answer is the word).
_NUMBER_MAPPINGS = {
    'zero': '0',
    'one': '1',
    'two': '2',
    'three': '3',
    'four': '4',
    'five': '5',
    'six': '6',
    'seven': '7',
    'eight': '8',
    'nine': '9',
    'ten': '10',
    'eleven': '11',
    'twelve': '12',
    'thirteen': '13',
    'fourteen': '14',
    'fifteen': '15',
    'sixteen': '16',
    'seventeen': '17',
    'eighteen': '18',
    'nineteen': '19',
    'twenty': '20',
}

# Lead-in phrases that carry no answer content; each is tried once, in order.
_LEAD_IN_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'^the answer is\s*',
    r'^the correct answer is\s*',
    r'^answer:\s*',
    r'^correct answer:\s*',
    r'^based on.*?the.*?answer is\s*',
    r'^according to.*?the.*?answer is\s*',
))

# A period ends the answer unless it is a decimal point (followed by a digit).
_SENTENCE_TAIL = re.compile(r'\.(?!\d).*$')

# A choice letter must lead the answer and be followed by a delimiter or the
# end of the text, so the article "a" or the "e" in "e-commerce" is not
# mistaken for an option.
_LETTER_CHOICE = re.compile(
    r'^(?:(?:option|choice)\s+)?[(\[]?([a-e])(?:\s*[)\]:,]|\s+-\s|\s*$)'
)

# "15 percent" -> "15%".
_DIGIT_PERCENT = re.compile(r'(\d)\s*percent\b')

# A number with optional "$" prefix, thousands separators, decimals and "%".
# The look-arounds play the role of word boundaries without losing "$"/"%".
_NUMBER = re.compile(r'(?<![\w$])\$?\d+(?:,\d{3})*(?:\.\d+)?%?(?!\w)')

_EDGE_PUNCTUATION = '.,!?;:'


def _compile_phrase_table(mapping):
    """Return (pattern, canonical) pairs that match each phrase as whole words."""
    return tuple(
        (re.compile(r'(?<![\w-])' + re.escape(phrase) + r'(?![\w-])'), canonical)
        for phrase, canonical in mapping.items()
    )


_FREQUENCY_TABLE = _compile_phrase_table(_FREQUENCY_MAPPINGS)
_PERCENTAGE_TABLE = _compile_phrase_table(_PERCENTAGE_MAPPINGS)


def _map_first_phrase(text, table):
    """Return the canonical value of the first phrase found in text, else text."""
    for pattern, canonical in table:
        if pattern.search(text):
            return canonical
    return text


def normalize_answer(answer):
    """Reduce a model's answer to a canonical string for equality comparison.

    Steps, in order:
      1. Strip markdown emphasis, collapse whitespace, trim edge punctuation
         and lowercase.
      2. Drop lead-in phrases ("the answer is ...") and anything after the
         first sentence-ending period; decimal points are kept.
      3. Replace known frequency / percentage phrases (matched as whole
         words) and bare number words with their numeric form.
      4. If the answer starts with a choice letter (a-e) followed by a
         delimiter or nothing, return just that letter.
      5. Otherwise, if a number is present in a multi-word answer (or the
         answer is itself a number), return the number, keeping "$" / "%"
         and dropping thousands separators.

    Args:
        answer: Raw answer text; any falsy value (None, "") is accepted.

    Returns:
        The normalized answer, or "" when ``answer`` is falsy.
    """
    if not answer:
        return ""

    # Remove markdown formatting: **text**, *text*, _text_
    text = re.sub(r'\*\*([^*]+)\*\*', r'\1', answer)
    text = re.sub(r'\*([^*]+)\*', r'\1', text)
    text = re.sub(r'_([^_]+)_', r'\1', text)

    text = re.sub(r'\s+', ' ', text).strip()
    text = text.strip(_EDGE_PUNCTUATION).lower()

    # Lead-ins must go before the mappings below: the number-word mapping
    # needs the bare word ("the answer is four" -> "four" -> "4").
    for lead_in in _LEAD_IN_PATTERNS:
        text = lead_in.sub('', text)
    text = _SENTENCE_TAIL.sub('', text)
    text = text.strip(_EDGE_PUNCTUATION + ' ')

    text = _map_first_phrase(text, _FREQUENCY_TABLE)
    text = _map_first_phrase(text, _PERCENTAGE_TABLE)
    text = _NUMBER_MAPPINGS.get(text, text)
    text = _DIGIT_PERCENT.sub(r'\1%', text)

    letter = _LETTER_CHOICE.match(text)
    if letter:
        return letter.group(1)

    number = _NUMBER.search(text)
    if number and (len(text.split()) > 1 or number.group(0) == text):
        # Same canonical form whether or not extra words surrounded the number.
        text = number.group(0).replace(',', '')

    return text