From 5baafdb9b9a9709a345e3389fba8eacc2a6a5114 Mon Sep 17 00:00:00 2001 From: Shrey Bansal Date: Fri, 19 Sep 2025 11:24:15 -0400 Subject: [PATCH 1/4] Adding NLI Classification and reorganizing code base --- Makefile | 2 +- tests/conftest.py | 2 +- tests/metrics/__init__.py | 19 +++++ tests/{utils => }/metrics/base.py | 0 tests/{utils => }/metrics/bleu.py | 15 +--- tests/{utils => }/metrics/keyword.py | 2 +- tests/metrics/nli.py | 76 +++++++++++++++++ tests/{utils => }/metrics/registry.py | 14 ++-- tests/{utils => }/metrics/scorer.py | 2 +- tests/{utils => }/metrics/semantic.py | 2 +- tests/{utils => }/metrics/text.py | 2 +- tests/test_benchmarks.py | 115 +------------------------- tests/utils/__init__.py | 8 -- tests/utils/metrics/__init__.py | 17 ---- 14 files changed, 114 insertions(+), 162 deletions(-) create mode 100644 tests/metrics/__init__.py rename tests/{utils => }/metrics/base.py (100%) rename tests/{utils => }/metrics/bleu.py (65%) rename tests/{utils => }/metrics/keyword.py (92%) create mode 100644 tests/metrics/nli.py rename tests/{utils => }/metrics/registry.py (83%) rename tests/{utils => }/metrics/scorer.py (97%) rename tests/{utils => }/metrics/semantic.py (97%) rename tests/{utils => }/metrics/text.py (92%) delete mode 100644 tests/utils/metrics/__init__.py diff --git a/Makefile b/Makefile index f4099fef..6d023527 100644 --- a/Makefile +++ b/Makefile @@ -83,7 +83,7 @@ test-benchmarks-keyword: conda run -n tokensmith pytest tests/test_benchmarks.py --metric=keyword -v test-benchmarks: - @echo "Running with custom arguments, E.g. conda run -n tokensmith pytest tests/test_benchmarks.py --metric=text --metric=semantic --metric=keyword --threshold=0.75 -v" + @echo "Running with custom CLI args: $(ARGS)" conda run -n tokensmith pytest tests/test_benchmarks.py $(ARGS) # List available metrics diff --git a/tests/conftest.py b/tests/conftest.py index a599de28..9c0a269f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ def pytest_addoption(parser): # New metric selection options group.addoption("--metric", action="append", dest="metrics", - help="Select specific metrics to evaluate. Options: text, semantic, keyword, bleu, all") + help="Select specific metrics to evaluate. Options: text, semantic, keyword, bleu, nli, all") group.addoption("--threshold", type=float, default=None, help="Override threshold for all tests") group.addoption("--list_metrics", action="store_true", diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py new file mode 100644 index 00000000..e5bb95d1 --- /dev/null +++ b/tests/metrics/__init__.py @@ -0,0 +1,19 @@ +from tests.metrics.base import MetricBase +from tests.metrics.registry import MetricRegistry +from tests.metrics.scorer import SimilarityScorer +from tests.metrics.text import TextSimilarityMetric +from tests.metrics.semantic import SemanticSimilarityMetric +from tests.metrics.keyword import KeywordMatchMetric +from tests.metrics.bleu import BleuScoreMetric +from tests.metrics.nli import NLIClassification + +__all__ = [ + 'MetricBase', + 'MetricRegistry', + 'SimilarityScorer', + 'TextSimilarityMetric', + 'SemanticSimilarityMetric', + 'KeywordMatchMetric', + 'BleuScoreMetric', + 'NLIClassification' +] diff --git a/tests/utils/metrics/base.py b/tests/metrics/base.py similarity index 100% rename from tests/utils/metrics/base.py rename to tests/metrics/base.py diff --git a/tests/utils/metrics/bleu.py b/tests/metrics/bleu.py similarity index 65% rename from tests/utils/metrics/bleu.py rename to tests/metrics/bleu.py index 7c88840e..7b1805c9 100644 --- a/tests/utils/metrics/bleu.py +++ b/tests/metrics/bleu.py @@ -1,5 +1,5 @@ from typing import List, Optional -from .base import MetricBase +from tests.metrics.base import MetricBase class BleuScoreMetric(MetricBase): """BLEU score similarity metric.""" @@ -12,19 +12,8 @@ def name(self) -> str: def weight(self) -> float: return 0.3 - def is_available(self) -> bool: - """Check if NLTK is available.""" - try: - import nltk - return True - except ImportError: - return False - def calculate(self, answer: str, expected: str, keywords: Optional[List[str]] = None) -> float: - """Calculate BLEU score between answer and expected.""" - if not self.is_available(): - return 0.0 - + """Calculate BLEU score between answer and expected.""" try: from nltk.translate.bleu_score import sentence_bleu reference = [expected.split()] diff --git a/tests/utils/metrics/keyword.py b/tests/metrics/keyword.py similarity index 92% rename from tests/utils/metrics/keyword.py rename to tests/metrics/keyword.py index fd5615d2..4fbf0d56 100644 --- a/tests/utils/metrics/keyword.py +++ b/tests/metrics/keyword.py @@ -1,5 +1,5 @@ from typing import List, Optional -from tests.utils.metrics.base import MetricBase +from tests.metrics.base import MetricBase class KeywordMatchMetric(MetricBase): """Keyword matching metric.""" diff --git a/tests/metrics/nli.py b/tests/metrics/nli.py new file mode 100644 index 00000000..7cdef92e --- /dev/null +++ b/tests/metrics/nli.py @@ -0,0 +1,76 @@ +import os +import warnings +from typing import List, Optional +from tests.metrics.base import MetricBase +import torch +from transformers import AutoTokenizer, AutoModelForSequenceClassification + +class NLIClassification(MetricBase): + """NLI-based entailment metric using DeBERTa model.""" + + def __init__(self): + self._pipeline = None + self._available = self._initialize() + + @property + def name(self) -> str: + return "nli" + + @property + def weight(self) -> float: + return 1.0 + + def _initialize(self) -> bool: + """Initialize the NLI pipeline with the best available model.""" + try: + # Suppress CUDA warnings if running on CPU + os.environ.setdefault('CUDA_VISIBLE_DEVICES', '') + warnings.filterwarnings("ignore", message=".*CUDA capability.*") + + model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli" + self._tokenizer = AutoTokenizer.from_pretrained(model_name) + self._model = AutoModelForSequenceClassification.from_pretrained(model_name) + + print(f"NLI metric initialized with model: {model_name}") + return True + + except Exception as e: + print(f"NLI metric initialization failed: {e}") + return False + + def is_available(self) -> bool: + """Check if NLI pipeline is available.""" + return self._available + + def calculate(self, answer: str, expected: str, keywords: Optional[List[str]] = None) -> float: + """ Calculate NLI entailment score between answer and expected text.""" + + if not self.is_available(): + return 0.0 + + if not answer.strip() or not expected.strip(): + return 0.0 + + try: + # Format input for NLI: premise (expected) and hypothesis (answer) + input = self._tokenizer(expected, answer, truncation=True, return_tensors="pt") + output = self._model(input["input_ids"].to('cpu')) + + # Calculate entailment score + prediction = torch.softmax(output["logits"][0], -1).tolist() + label_names = ["entailment", "neutral", "contradiction"] + prediction = {name: pred for pred, name in zip(prediction, label_names)} + + # Weighted scoring + final_score = ( + prediction['entailment'] * 1.0 + + prediction['neutral'] * 0.5 + + prediction['contradiction'] * -1.0 + ) + + return min(max(final_score, 0.0), 1.0) + + except Exception as e: + print(f"NLI calculation failed: {e}") + return 0.0 + \ No newline at end of file diff --git a/tests/utils/metrics/registry.py b/tests/metrics/registry.py similarity index 83% rename from tests/utils/metrics/registry.py rename to tests/metrics/registry.py index 01f58c9b..707081c5 100644 --- a/tests/utils/metrics/registry.py +++ b/tests/metrics/registry.py @@ -1,5 +1,5 @@ from typing import Dict, List, Optional -from .base import MetricBase +from tests.metrics.base import MetricBase class MetricRegistry: """Registry for managing available metrics.""" @@ -10,15 +10,19 @@ def __init__(self): def _auto_register(self): """Automatically register all available metrics.""" - from tests.utils.metrics.text import TextSimilarityMetric - from tests.utils.metrics.semantic import SemanticSimilarityMetric - from tests.utils.metrics.keyword import KeywordMatchMetric - from tests.utils.metrics.bleu import BleuScoreMetric + from tests.metrics import ( + TextSimilarityMetric, + SemanticSimilarityMetric, + KeywordMatchMetric, + BleuScoreMetric, + NLIClassification, + ) self.register(TextSimilarityMetric()) self.register(SemanticSimilarityMetric()) self.register(KeywordMatchMetric()) self.register(BleuScoreMetric()) + self.register(NLIClassification()) def register(self, metric: MetricBase): """Register a new metric.""" diff --git a/tests/utils/metrics/scorer.py b/tests/metrics/scorer.py similarity index 97% rename from tests/utils/metrics/scorer.py rename to tests/metrics/scorer.py index 4e91968f..13b8298c 100644 --- a/tests/utils/metrics/scorer.py +++ b/tests/metrics/scorer.py @@ -1,5 +1,5 @@ from typing import Dict, List, Any, Optional -from .registry import MetricRegistry +from tests.metrics.registry import MetricRegistry class SimilarityScorer: diff --git a/tests/utils/metrics/semantic.py b/tests/metrics/semantic.py similarity index 97% rename from tests/utils/metrics/semantic.py rename to tests/metrics/semantic.py index d9b01c2c..8673e497 100644 --- a/tests/utils/metrics/semantic.py +++ b/tests/metrics/semantic.py @@ -1,7 +1,7 @@ import os import warnings from typing import List, Optional -from .base import MetricBase +from tests.metrics.base import MetricBase class SemanticSimilarityMetric(MetricBase): """Semantic similarity using sentence transformers.""" diff --git a/tests/utils/metrics/text.py b/tests/metrics/text.py similarity index 92% rename from tests/utils/metrics/text.py rename to tests/metrics/text.py index a08de235..a03272c3 100644 --- a/tests/utils/metrics/text.py +++ b/tests/metrics/text.py @@ -1,6 +1,6 @@ import difflib from typing import List, Optional -from .base import MetricBase +from tests.metrics.base import MetricBase class TextSimilarityMetric(MetricBase): """Text similarity using sequence matching.""" diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index f4c3b828..79fa14a5 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -1,121 +1,10 @@ -# import subprocess -# import pytest -# import json -# import sys -# from pathlib import Path -# from .utils.metrics import SimilarityScorer -# from .utils.answer_parser import extract_answer_from_output - -# def test_tokensmith_benchmark(benchmarks, test_config, results_dir): -# """Test TokenSmith with all benchmark questions using selected metrics.""" - -# if test_config["skip_slow"]: -# pytest.skip("Skipping slow end-to-end test") - -# # Initialize scorer with selected metrics -# scorer = SimilarityScorer(enabled_metrics=test_config["metrics"]) - -# print(f"\nUsing metrics: {test_config['metrics']}") -# print(f"Available metrics: {scorer.registry.list_metric_names()}") - -# for benchmark in benchmarks: -# _run_single_benchmark(benchmark, test_config, results_dir, scorer) - -# def _run_single_benchmark(benchmark, test_config, results_dir, scorer): -# """Run a single benchmark test with selected metrics.""" -# question = benchmark["question"] -# expected_answer = benchmark["expected_answer"] -# keywords = benchmark.get("keywords", []) - -# # Use threshold override if provided -# threshold = test_config["threshold_override"] or benchmark.get("similarity_threshold", 0.6) - -# # Run TokenSmith subprocess -# cmd = [ -# sys.executable, "-m", "src.main", "chat", -# "--index_prefix", test_config["index_prefix"], -# "--model_path", test_config["model_path"] -# ] - -# input_text = f"{question}\nexit\n" - -# try: -# proc = subprocess.run( -# cmd, -# input=input_text, -# text=True, -# capture_output=True, -# timeout=test_config["timeout"], -# cwd=Path(__file__).parent.parent -# ) -# except subprocess.TimeoutExpired: -# pytest.fail(f"Test timed out after {test_config['timeout']} seconds for: {question}") - -# if proc.returncode != 0: -# pytest.fail(f"TokenSmith failed for '{question}' with exit code {proc.returncode}\n" -# f"STDERR: {proc.stderr}\n" -# f"STDOUT: {proc.stdout}") - -# # Extract answer -# retrieved_answer = extract_answer_from_output(proc.stdout) - -# # Calculate scores using selected metrics -# scores = scorer.calculate_scores(retrieved_answer, expected_answer, keywords) - -# # Determine if test passed -# passed = scores.get("final_score", 0) >= threshold - -# # Save detailed results -# result_data = { -# "test_id": benchmark["id"], -# "question": question, -# "expected_answer": expected_answer, -# "retrieved_answer": retrieved_answer, -# "keywords": keywords, -# "threshold": threshold, -# "scores": scores, -# "passed": passed, -# "active_metrics": scores.get("active_metrics", []), -# "stdout": proc.stdout, -# "stderr": proc.stderr -# } - -# # Append to results file -# results_file = results_dir / "benchmark_results.json" -# with open(results_file, "a") as f: -# json.dump(result_data, f) -# f.write("\n") - -# # Assert based on results -# if not passed: -# fail_msg = ( -# f"Benchmark failed for question: '{question}'\n" -# f"Expected: {expected_answer}\n" -# f"Retrieved: {retrieved_answer}\n" -# f"Final Score: {scores.get('final_score', 0):.3f} (threshold: {threshold})\n" -# f"Active Metrics: {', '.join(scores.get('active_metrics', []))}" -# ) - -# # Log failed test -# failed_log = results_dir / "failed_tests.log" -# with open(failed_log, "a") as f: -# f.write(f"\n{'='*50}\n{fail_msg}\n{'='*50}\n") - -# print(f"\n❌ Failed: {question}") -# print(f"Score: {scores.get('final_score', 0):.3f} (threshold: {threshold})") -# else: -# print(f"\n✅ Passed: {question}") -# print(f"Score: {scores.get('final_score', 0):.3f} (threshold: {threshold})") - - - import subprocess import pytest import json import sys from pathlib import Path -from .utils.metrics import SimilarityScorer -from .utils.answer_parser import extract_answer_from_output +from tests.metrics import SimilarityScorer +from tests.utils import extract_answer_from_output def test_tokensmith_benchmark(benchmarks, test_config, results_dir): diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index 9be31899..f06e19a8 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -1,15 +1,7 @@ from tests.utils.answer_parser import extract_answer_from_output -from tests.utils.metrics import * from tests.utils.generate_report import generate_summary_report __all__ = [ - 'MetricBase', - 'MetricRegistry', - 'SimilarityScorer', - 'TextSimilarityMetric', - 'SemanticSimilarityMetric', - 'KeywordMatchMetric', - 'BleuScoreMetric', 'extract_answer_from_output', 'generate_summary_report' ] diff --git a/tests/utils/metrics/__init__.py b/tests/utils/metrics/__init__.py deleted file mode 100644 index 861e4299..00000000 --- a/tests/utils/metrics/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -from tests.utils.metrics.base import MetricBase -from tests.utils.metrics.registry import MetricRegistry -from tests.utils.metrics.scorer import SimilarityScorer -from tests.utils.metrics.text import TextSimilarityMetric -from tests.utils.metrics.semantic import SemanticSimilarityMetric -from tests.utils.metrics.keyword import KeywordMatchMetric -from tests.utils.metrics.bleu import BleuScoreMetric - -__all__ = [ - 'MetricBase', - 'MetricRegistry', - 'SimilarityScorer', - 'TextSimilarityMetric', - 'SemanticSimilarityMetric', - 'KeywordMatchMetric', - 'BleuScoreMetric' -] From f9aa39637f5c0a5e1a7b152e77434cbae9519e66 Mon Sep 17 00:00:00 2001 From: Priya-753 Date: Wed, 1 Oct 2025 10:56:14 -0400 Subject: [PATCH 2/4] Simplified feedback --- .gitignore | 5 ++ requirements.txt | 2 + scripts/feedback_cli.py | 54 +++++++++++++ src/feedback.db | Bin 0 -> 102400 bytes src/feedback_db.py | 174 ++++++++++++++++++++++++++++++++++++++++ src/generator.py | 16 +++- src/main.py | 63 ++++++++++++++- 7 files changed, 308 insertions(+), 6 deletions(-) create mode 100644 scripts/feedback_cli.py create mode 100644 src/feedback.db create mode 100644 src/feedback_db.py diff --git a/.gitignore b/.gitignore index 1d3b48d3..c33d7f77 100644 --- a/.gitignore +++ b/.gitignore @@ -243,3 +243,8 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + +# --- Project-specific ignores --- +# Local data and external repos cloned into src +src/data/ +src/llama.cpp/ diff --git a/requirements.txt b/requirements.txt index c7befab4..96f1a1e4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ tqdm nltk sentence-transformers rank_bm25 +scikit-learn +matplotlib diff --git a/scripts/feedback_cli.py b/scripts/feedback_cli.py new file mode 100644 index 00000000..684e537a --- /dev/null +++ b/scripts/feedback_cli.py @@ -0,0 +1,54 @@ +import argparse +from feedback_db import FeedbackDB + + +def list_feedback(limit: int) -> None: + db = FeedbackDB() + rows = db.get_recent_feedback(limit=limit) + if not rows: + print("No feedback found.") + return + for i, fb in enumerate(rows, 1): + if fb.get("thumbs_up") is True: + thumbs = "THUMBS_UP" + elif fb.get("thumbs_up") is False: + thumbs = "THUMBS_DOWN" + else: + thumbs = "NO_FEEDBACK" + rating = f"RATING_{fb.get('rating')}" if fb.get("rating") else "NO_RATING" + print(f"{i}. {thumbs} {rating} | {fb.get('timestamp','')[:19]} | {fb.get('query','')[:80]}") + if fb.get("comment"): + print(f" COMMENT: {fb['comment'][:120]}") + + +def show_stats() -> None: + db = FeedbackDB() + stats = db.get_feedback_stats() + print("Total Interactions:", stats.get("total_feedback", 0)) + print("Success Rate:", f"{(stats.get('thumbs_up_rate') or 0)*100:.1f}%") + print("Average Rating:", f"{(stats.get('avg_rating') or 0):.2f}/5.00") + print("Comments Count:", stats.get("comments_count", 0)) + + +def main() -> None: + parser = argparse.ArgumentParser(description="TokenSmith Feedback CLI") + sub = parser.add_subparsers(dest="command") + + p_list = sub.add_parser("list", help="List recent feedback") + p_list.add_argument("--limit", type=int, default=20, help="Number of entries to list") + + sub.add_parser("stats", help="Show aggregate feedback stats") + + args = parser.parse_args() + if args.command == "list": + list_feedback(limit=args.limit) + elif args.command == "stats": + show_stats() + else: + parser.print_help() + + +if __name__ == "__main__": + main() + + diff --git a/src/feedback.db b/src/feedback.db new file mode 100644 index 0000000000000000000000000000000000000000..1decf495d43c14cf77d81fe34e3b3284e0363fec GIT binary patch literal 102400 zcmeI5&vRVab>C++vSh=a*s+r+u9T}@mFxh}0Btlt00dbo2mCQJR7eho;7CzfF3v;u z1JGjsfckY4gd#5-Cn?*T?6S=!8@sZ~zmbJ^`2+MPWSd3u`JQv{eXsFj$eFPv*}9a5 z?0)_3uXE4ubMCnZA3q$-s^Vxe9hI|U>#g5<>+QGS`Zq=K)?06VjsJd^|Kgu_`0&;E zg@4~}eZKDF*WT*<)o;JcuV4MOzkTc7zk2uPum4XzRnynLbyQXT!?O3{d2ccw&mxGf?y)zm%30Naj=p^l@IQO-=zekU{`>o%JbYH% z{p9IWAoyGl51#EmdU6SrtAGB^J753C>gwD7Y2UWEKCfrh==rFcO$WVt@!MAyHhA&3 zpYQ(qbf1sQ(Hm`Tc-{93q<#1r^x^%oU(QOygTI$uc=DC6iO#<7d+?(nM%44V`q8`^ z_o{mF>pP3Pu=wMOK(I#3)&E$({q=AB-tWErmwVP{#Hjf1R~CASPk+9<$a1IWZ$fM3 zxIU||7{PZ<+HyMc$eg;lV%h_Oj{Mo&PivfXu`4z?H z-~P@2^A`Wle=-Cz1Tq9N1Tq9N1Tq9N1Tq9N1Tq9N1Tq9N1pd)L;5XiW=l9-@`6KVh z{wL`_|H%-@5Xcb75Xcb75Xcb75Xcb75Xcb75Xcb75co$7fo%W(Bi`J+vH+;VKU|gF90zt$H(*Xxat(o=;h$!hedo+oRoD@Q_f+^%ctdZR^M7$ zFYZy9W1Xr7gYm$MwWPOK?%jL1x>lT>400+}e$NgGG3JY&Wkp|6T-I~|E zlWJ7fYy38>tbjwDA1~f3U zz9JF^?BwOiv`=*uC#GptQ|zNI4$q5vGBtE&&dnY+oyBR?^V3t_q)Ay1dXU?!n4Cg! zUYCT%tpOcqB8B-_b&}R>f2OrBSSzhw^{P{?-l?!RnKGl(iI8F9T%rU=C>A>CLDLro z`{`f&&06u(zxZnh+IpNJaOsN_TAPsOB(#`;6q?g`J{%TSQFHm}&GPbPXsfQYgp81VRmn)hQm#oLW<#H9J^N+kIMRaBH=#UY)Y@ zs$9e3XY;A;P#N|BrUQzN1~Vq5xjupY7v1i*EisYA1sa^6!o&cw;4KbmA2svDxDh^9 zh?YS+OH1GUX7RLoIjGKxXN*-ZMP_}4=!idE$;ZQw;;4ZIwx2MUSxMCqRsoxrNXzT)e($h+!HEAl(CNeaZ7sg^l?IWzI<%8lp_1$fw0wC0$dY5 zHlJ#WfjRj$^WAt<`qpMs{P+k?G7yTzsX!*>@VGjhmi!9)O%0s`!E@4+XR05^+k>*^ z-MeA~K|C5`TQSBEo{0bI^);7c;D~=;rRBi_JK}*c9{!bJFvi1YXF7b?4^&A-;`z8| z9E_*sOK@g181q;8Y7K-%cv_(~elvQ3Tk`Rwd;s?OPa~yWby)?=x|*hI&4b;dy;kl zo&ymV*&wcoKk9n52NuH^PDW@roSjUV4HKP}^W&45q&%!v%tcE_PZ4qTi~D*~4aT=Y_!rtCwJ;S#iyR}D&@F{N z&4gWV5HIevCvh@3M!}gCwOGArNagsvHJ}>wV*^dKnM;lAa;`5(UFc}Vr!y+gi$hrs zHbF_8G2oWgsWxnn{?f>+8VC!a0ev-|g0luzz;9wwYKY{GV-fsmawNXl*QUI$$!%_Q zch%8RyCZnQU^E{UO^4t@Z#t>bVFi2d6ls02m*aLX@iia(q;&ka-1ntkwmO@|fmASP z#YhNH97AX#m&7xVr)9tT!SmCJwBHr`wpRGJV%3LCQnQpwWP}PRjK9MiB*(2;!q6F* zQ8y!?m5k?DYtjJW%~gHd^xo=~Q6~@?7t8z0-iMHV*w@B2dL``_b-f#5exd2Qsaso} z-R5zzT-+(%3(vPMJom@VLkJE_0l97#jV&eoXlTYJixwiK*xUsO5v8MnxY$bArB#`% zrUuXsJQ1?7`a^&uQg+I;KJ-iIWh7|`+4xYQi2*EZgzN-8_Q@F%>T4igK;5g@SM;5w z4{U$+BRWzKKF|{Pf*1T}KE2FuQL2q$8?pyhcpc;J6yJv*i@wSlfoCQ9;i0hFLC=&^ zaauTmqk-9)W^Mbrzk;h8Pc+As;)5sq5JKDmXWLB;L4B|9p4cN1l1cL1@ETr0uhSNPC(!)h;tA{4fUe!wU@B9MN0$mxzs2?CNYHR!(j(y z;B<)HB-50ZVcdE$oVx_2EG0;AHkrQAnqYoMV9N}q<8GBAFc{Gez;z4FdX`Zx`*C>Q zS=#QPW^7%lW*{d};n@>Y-sKJe(@ev6ner=<*3=S%e8Z=rVIp^m6D-S%5P~$Lj@Vi1Zgiq$xOgPlOeoU8&mdr5Sp|pzPLuPwA5OlHs=Be#QMa9TBvajO|H{=!-qPohLiEJ zY0B>AfC8`+!k)qySYu#)ouy5|9a}d@{V+}09}BJmc}(k-2lud8B$457;x1^%XM-9A z3BMWU^|4HTlmp`x0h7#?!(vSpf-0;v_~Ls|-|`m4`yy)p(M*^d8*Aa0yZYtEM)A|X z{O|1-cG3%X^uo@i7jE)`=IA_nUoYKkz0?Vupp5}61IiKRB@B^|SAyo=poT@cH1HmU zF0Sc0$*jp|)hyHimQX3ub5hnD37_C(v|&Bk0D)ueB(Ldkfcos-FotFidtxz&X7rKiBndV-|jzo-U8N3!7$-f&#p4c(iqz+BxKi*%8b%qHfOD3Eebed)iRgHy)e3ataG_~vr*#B#rJ~d_(hW4->fBs}$zeHukvP~Q z=e+qd+dhEiaPuHb1C7T;8;1+;`aS_yigvE*4PV;OJS1ug8V9@Wbb zssMEYrkVXVn^yQv#1ai)I44LMWxMT7L-Yj}>frfn5EG`2V)ELL8ZL`gMRt!-C zkiXi}xW+5i73N^eHg=jO%`eLzmQ(ZZEKA+>F;>D*BBa}ee?dT_L*Etyu4eUWGa%eL zefbUz@H2(E{D9ir0z~_XI$1UN;?bB+#V`u>Ac+6qxw9*ne%KLcdji`5g;xq*3XkJ^ zZ4q{}xC0tW7&WrgkacenS+5YfizvJI#waU`M(ES>`T|dJh{Utmx&<~LJww8+1-n8L zPtbN*Vu(UpOj3z$6qASpkl}E0Ub)zprt12{^814+h#(NK+$pgQbCa>9mvH#+WgJ6w z89}Jr$*+OVWh&QYi{Z-E7`qesnF1~le%7jz14i&|QRV`GF3!kx&-4n@SUwoZ;c+G4 zzd;8Wa_&c5+^$-Yy(DLrHV z|JoI*+an8^8;m0hlKaN?S`do>0Iq@uINRvQh@G<_N5qmJS7$Rt>mvEu%uHn?ScCCz zlI%K2NTt|NsHvx$8wegV*nDj2jiwPmd5HC`koAI%|y)QcGs=!-&(>SCT+&_|h zs1lz!9WhS#CCORs!@~#M7wq@;Bv-Jh7|^|~t5Khc-bWvQ^x-xJbYpY}Zz?NqnqKm} z8P+>bYu<8ws|~THSE2d>PnRuGqdK(Q(oIitoj8vJ*P6#LX7_7{c?-!1=O!;#<9Rei zT4Ta0He1dbNpE7CTn}W{@@E8sWQ;Pn88Oa$@kk~v`C#rPm4i9cA9&B!KLK*4}I|F4|JCb zZ(^O83K-)JCV~_|h`q#Kg;3`xvB}*nXbE%fTM(jH^y35sZc#vMSHh3ML+P=FxZlCO z&;%lqvwq?AMi@oc(Q+MFs5w{Fo#b`sP8$-y;b6uLH^+z^d?%zJ;LE~LSM0%~c7U&M z0mWwb4g6#SKo!!h>}TnhMpS0fM%OypaD<)@`bW%0QaQ{<8b-_KteuJ(BI;0H4QMt3 zo~~LXY&f9hpcR25W&fodRsCSaD$3?#<3v+j4%U4 zK8@Y8qq+}T7a zUygQZoRq&l#oPj+!km})pKf;db}cIP`&&Vk2l(5Is+F0QQ9=tD8;tllng1r&q7j+| zHkMj6E0JB!zJi_G(!-bUOo^`#&l}|{8RH&B0hzT((X?#V0Q$tW1Pvjn$=E1PCAm{z z+sEX)$w+sAnoYf^i_J_+)JScC`ML?SJv)orlQgrdx9DeY?p~$Mq)AG2Ly|`Ig~x{#UFGBnjyieVn1X^-K}laggV&CF(N@(e zE|@<7?c&-DqgulTm0H6tXj0*M#5XqsYuUZZayTu|l~kZsAkPJNXB;XyRTk-+V9J{s zO>G4!q{nDTIYGP&=qZk73;P`AjXKiyrjEh}e35=cw{5W<6OxUuy2B;9klf7^Y!n~s zGrS1^_hy2YshO68jl@Ii_O#;2z9Ss2hx*-7ZjB)xQe!Npy+_us5veGswbvdQkobvv}j>3 z%e$pF>9ha{Gg3)ZD}(xQe9QGF6cMszCsR^DPUnY1f=$(QHF_xfE2C<3Sd4pgcsTB0 z|1W7$)oRND$!s1D;LgN0oB0||ZFUzK^6v;dN{cr%%nAN4+%mS^i9qBSSwTa^$!!nc)bjhTp5lEOO;$;W)uU{cHAZ#TP=shPY94caeTBHKj{Xl}-k z2K%OyT6$)|JsDO6kHrlY`hy>*WorRfZ-{~6$#eosCPQGT-BIeokLCk9$By^tW$d_0JfI)|AP&V*iO-B4dy@o*1 zm8-P=(U*yC!Rkes7X1RA4w~Z$*gGKkpALN}F1{8~@Eu-EDkaC((&&IA`No{Wycgeg z<3D6XdccgJI}`?OwHb7z%N4goS-bBykdJ$Y`$QO-zV49AOklqrqEOaGN5MSO{NStc zf^;WiPgDeM8g&uW*UUwy1kz}mn+C833B#JSi!7xX+4_g{q_FcNfQe+sIW6dSXo~=~ z!qG-DmHp@ewzgt4uBY#i^>Tx3J0YpL6WP-BM^RLRr1tK^Amo=MrdnrsBAH1QSYZEP zMIaXUsv;f+V;ud=K@mhbTTM?`A$g!495Q)BaCXeo$;#nbN%Ec_(UO{BON4!>8O&NN zK%7ybeLtun7V4pWM+HM&>^9Pmv=IybohGJX{6W8e2%ILe4AQZPTTzQ@-sU`qMR-i= zw;sFp)p}sZD@d5Mx4%sShxfjG+u-aIj9&XdWiD~awjfsBDaXWGQBe1Q#S zx8M^zSPc$4=mkfa+6j-?w*h6tw6wI)9bY01Xh&MVVgLVnhnz=SfcYBsw(*YNj!1GL%IMj+ z7ZJvIL-3R*j0sxZnH@FJCV)rI>w{7voxPa}eiViA$~YH*xfsrOc&^{|Fz8fgvn-oP z3&M^U0^Am=hThE{ZXg)hms3SJ&qZ#_vy(aWbq$6u(5$BG!+~Of+_VDy)Fly_m#_Ni=GIO6Sz5Z{H!Xc-cW3*?Rx=JF zOVSBh=+zSl!6+|{3`u~kc$6C_P8(4wxt~T%OV5=RB=w4JU|i8q80pA^vDr0 zHiu%MM4eXC64Pib>9u}3CxHu~GaoW^M?`~jB-uAUXRnmCrcF( zwp+*GWM4`DcO_L%1GS^vPHhDwO(%46VdF-Cfd(VPc79r(QX2%NNX@*vqq0}6H%snI zHkh~wOJMI%v9Jz;o1sByL#(J}^&m=`;!<)&K%gt(z8a_Ec2Cq)2+R+l7m` z^Z>~L(*OqGH={%+xZ>V9GM}@FYJCfp_nNt3P>ZpQ z$C!3DEF6xNjqukgJu$8<*|4AiS~BS1hz5UF?T%o{$IN~BHL8RdOLVzJJXW%U9Gwsk z+ONGeIs73#tYg=K3_!nmwX?7YGr(L03%pJ9O`bvRq1I}6Rd^VqhLGsTkuafDo&+I zhP8ywY`@zJv9c4IU0jd^L^Ju$(w5m}rth*y+4sg$hVqN^p0G(8TAz~%p5hyJSU?bN zIjGX6O00!!B)t*1l@Tsn@02P)zJ;Jif6oDy@FTB z#|TiIoh0n@6h@50yYIb+p771J;_3YGe69F^+Coo0eBzybcVLxJ?#L}_4jS`t5b(ff zAzosB%e2ca-W0^gMdCAtXTGAw1s-0#7raO~z@I!=+=I>4PB2sq2j%f_|B0~{j<62U zLbOEN0D(CXDr$~bJlenaQRngfX9b;0_0=({77*L(EbKj@%-wG$*uunFrYk_Hb3--P zBZO*zEV}6W6^CnZI#DJHKdqxdYX-)Y43zQAk1<#~n@N^FI1L?P)UU_T1p-PVkTE+a zaTTV1FATtYvEPt(eMA@|dERm6P<;XPeaUnz-v~h?rmp3QFH5fXsAwQp^3i+>07Lu) zoDY;|r_+l2lH|f9=eG}a464NfuqfLvBv?gx!huRRHPIejaWfc1`F#x=Zs~J=qP0DU zE7_V(a?!F-*5pA=t4{^Kh&K&jU*cSet)s;-4TVQ0;w)AktmS8t10PSX4%D{vo%Z0O zd_2qXB$vFl?8<(N{{l+1)WtW%&7&X^t|kT*#LZaf2^KT$1i==N3se&o5)!x6X`Ha* z!JE;G6p(tgj@ZaU3O`Vv2`6VKxj4n!Zg6;IP#u~33KU?utvy7J?=!*=5oaI*QgBQr z6Aqk;RB_YLz2OSSia+t@Yj9OAem+s%3?Ey)=i4@qfnfO^(rc1!P1T zMhn{NNFTud0wVYkJ>_LLFB#UmU#tBPtl_JQWO2Ck3adi?HGK7AzpY+XxX6prG?LMo zrn{5kukCzByc;;E=_l|iKT*$uBwk`2G=T|0Yckh9#zjWkQbZ$7&V-#|ndD&#Ry8m+ z98I2~me$8|Ekb1{vterLNtDV)tKOKV7r>7^dP6)}A(sVatYN8|A?=-OOG_&(XdmT#pXveO?@* zFO1zPjXGzf{9uxZk)@_gXCVrJEmsxLhj8cA%24gyY_!^4GzAM%rS?(x;B~qJMaB_p z%|bO`Kzs_&09ctN7m+ofL+UYjI;3L;@1K{spnFYL7Rk#2F+}&!$x$Jy&?aDyb5Qdj z=4j&z+HZYxRaAY5@9X^qs$qsWla4%=0fI>wFh*!)V229iCRJ0@bi$@fLOu8Vq?jN3lLdArcj(vSkSE3kxpUw*=Rub*#}(M@KUK>q~@S9P<#*< z_cVKV$Qpk|nTV8$y#T;EVWDX);k)fr=)D=%cG!m!VFx5`JY=k4Xc0~ZQ^hcqe6ya6 zfdLE-&IBG7b902nF{eyRkQd#@N*v`ULmSD_GrdxmHR`;dADF}e>~XVb))k6~H>S?IHJ5aN$KGOR2f zhAN*=x*V^S_F_DD{U)c@HQ3_o$x2E~6f*)& z7G++)Nft00%Bsld)Gm%`&_BKQatd!63}@U_xxj346H zTdY`X>@`jrT)HZw%KdgVgis%mLE(H`SDI? zZQo=O2?tbE(-O&_DOw+LED*Kis}7O2Le1BLL2Es%@N{@#Pla_5doDCBXg;22leYqcJXt}(;d_l|aUt>mf=4NQ7jKGODjHWh zl3=@U>9dhzy$NJn=n;}W5Jo461h6&|kA!KTeKyc9Ex~y&dCX+KO1nbGnWH>gy*?N= zep8aktht~rB0r*<5XX$n^#EKDvGD=kk{%_R;a(OCGcUK*J6aHd4oa60DaZjAG)?1f zNYc4s3Rd7JOv-`*?ku5ub3{nt4$7>y$B8+L7ct^kSa+pdH4}{7q~eE^!9$fXV;!-< zZmb;gt02S2UPvMd0R^7)$)HTs3YyBB#RI)e7KE9Zu7mqFt{fqq+FQ7=C?ar}E;e~z zP&|gV08_zP3%YJ8$@;P_Ho9z?LTVX&v+s!J?+7AGnFz#0WiH9T(M236V<7u5+33~TJ@p^jEI5p3%z;zfEpEjZ9! zC?{t1AA(tv#91&Hjqj^(%uVzj7KBs7AZp<)WUF}tHG3;6cd5OSC7C?tdQTdLrK_er zz;Jntjk;^u>$(0v5w2YSKiB_Pu>y0|lXn=e9zafGTm)&H4!3*Vm0bTnR?5NvlDoO_ z`peysj)r2!wtmX>{|S#>x?2DMH6@7zy=6zs+B$0LI`u_y2aYjZBAHN>b1_Z%!d(B~as~~}a{Yh193a>KC&<#L&b{4OlI#EH z`u`Y%x&FV7xmjgD=_;vcoTLnz9Jb));)1#Uf2#PO>;GE>T%Ox_{#=-|c)pr2v~Vle z|0fTnb(=vtEo$*!f$+K#fYFNJ2(wN|FHq(}mom#pyw`b50>=avI0HZeU>p@{zOKZ+ zh9I*$*Z&uM0UwE`IvOWb_7q=bIsEe0|NqSvO~)eNx&FWICCT;w?E+k{1D@;uFBHA8 zlwI;Rn;R3ZajJ`_=lcKd#mM#lEy1WMP87VWx&A*Axt*Vy>;H2|Eam&5+ckCQf0h?` zUQq0tB7!72Cm&p{|Bp`jcYASa3+|Uqbe;b}|7Q#Ao&;o&n*y&Zr@vtT=lcI-Be#># zL(IZ#f;TsJLme@zXBPb_$t275|D&xc9VphEZsHh9H;BNpgA6)yiAkoqd8z|Irotub zI6RjLtXr$YS+`vO|I#&Sf?O}x|JNDfT>t+w>;M0j6pdW}|BKu?5Cm_o|F7Gft>~Ox zgKIfvx&A*6BtHo29>;X7#AH{l|Brym_5V3zZ}$Y``v0!Y<@*1sJtKXYstUROf3E+3 zq|>2(1U=XPUpUB}>;LQQRIdMT%+jhQHW3(Aa5CqErBcQs-S`!9{eNgE*Z&WGyC%){ zm$?4_Kh5?3#X>^iHkDzBH(ccU|8UG)|KHN!Qbt!eVrV*guK(ZMC=tr_a*Pi}&e5RxDg<4yTkx&D8u`|40J zznq=;6`XSYe`R^fXRFfSUb&L8iPSd&I`Z!4`u|=5kwTNX{(r9jpX>kU`u}q~mX+)O z=lcJ-{(r9jA8z$d3b))qx&D8y|DWss%f8C>|8xEST>n4U|3@Nznfm{m+qbq=|NnP# z{r~IkF(NgWOMm^s(R3kBy#FobbH^YR|JrRJUfsI68tqG6Q}hgmuVVAkh1Og5{iM+? zL{Gg&up!(T*IYQ`_tc(Ny@B1Y>IY%C-{S#pmwunCNp&*?mmJy~cGHvOi%zrYh#`IE z`v1B9KZ!Afv<<<89b683Qt=lTvJbf)a~5t{)jdANPyga?*7)nM9ca1!f3E-Uwd?%~ zx?KMsHXWHWr(Yh=IRI;SU-IQ+-By~eo8p0(ey;yNt^q~)k~<@~z>U9h{r^F(|DWss z_sQ`=08@8abqgexJ$*Hv`cBp4PPm4UOJZ~Ve~ZK}R_;IMczgm>xZ+#|1+20F`i?0( zuLh&}h^sBR8FOeiUjy&=Rk^?`59j*-Ug$s9|Br&*bzfc9eW~U-rX=^qQkb^voy1!+ zFO|~_g-ANZ_qiExRGxF8bIqOWmaU63G4EFy&%k}?d-9YW(RERzr%+~Eni&VKJ7I+CoyJBsUT>n4U|L^P8Uh4fQ zyJy02GdwXI&6~}*h*wvpW$i@{O&G1vdk_5bZU#c6e1PW!`L|DV&px&D8y|KB@V33U+_)wo^w7X&mq^ldTV zYF4kNlK)omxi_z8lTkIz_5XAI|5RKABinQ%ca4g&$S8zg0qVh2oL^9@kPR^WM~_9=a5h2=yS43yhfS z{|5&I_WwW6_5WwZw8XE@oztIw`sq@l;KQEaM$I# z2kRRf-Q_i|RhbR3T$jIBRWF9;#X&{buAK6NrK05e|GECZggIIfI8yds%1hHO(;Ss#xy^@OA@^2|2Ef%+mM=f7e=O{JkHh*t*=(TOR{{PP&?z#Ry^_cqRILD(C zZ#zpTLl#J5?~O_0g8{`faKBftP6?dX%7esOP|D5cyY4PTjH01po0TS+>;GE`KEN#I zk?a2#cgpdL;_hU6I+>C`IT<7PuB_XyP=#YCfeMb=m&XJnBn7U!MDUt^dW#_s3lgV< z5qT!+CRky3AIQtcI?$~Om**@ieXG;G8D{8IYdsMj?gIjWUdIyl_ zIekrD;Dm$qi~kcLz4$KFP7F4B>C(`NY-s`mS}Ta4FLY4P2Jj`j{3tQAAY7a`ZEhA% zE6=nsW5cv%3e{0ad1=XYXHs*>tySRxBEmiTm)TkzO*we#w;dL%ygHbjOc>nDy{N0u zOf@2B(|K=(!)V`6W~72BQYF&aj9mYJGSyK5U}CTp45J*-Ugi4#x&A--4d?Z&8s+-` zB&@(>bN&B{(pxCI=-s;4PvHP-pfZ#)5MnwxBOy)-ndiy3H^%c5Tpn?@ zI|?igl6I*#8O}#i7|B0`shJCH1t$E0><@ z|058B->$*oU*h`z-^lg?lZzwQ|34;Ch7|b66a}bq z{eLg_f4i8#L_Ga$GDi9LaBQyse`{%}t1!ely3L?xq;#v;FRr-_fx{TN{(r}{#-x#s zla^|_ZV*(t{(teEdw1>@E71+G-RX9^l;Dq~nvdP3s3Nqv{=bPEb1%6EB*wMs#%1d9oLv9k?%aWsHpf46{r_r8 z&NMhgAa^=9-(-yaC3o&V5F6sm9pijgMwSvlR2(7Qu%Y;JzmIDyHATEHL;)U8$QD8N z>$L1CnKDIp?`ui3>xuZlLF^i@ z>3-|m{5L)I8_&m!yL0SY?EtM(_?B;5ZYK+Y{7joS3smwd_)zD=yvwvg!Ix{A!p$^ ztD5Wo=lcJ-{(ql*l6ooE|IhXRbN&BZ|DOs^o@GX$FZdGCdMv@f56XUbP@T2&Z9NVt z$*uD6saUZt&qT<20nU^X@sh^160n|8C#4QK778m;0j4R?)z( z@hoxZ1u7=$DqrWMIB5!_yfo`9p^hEOCVCp*)`}uuCZz{H9c*w&qf^a{d2Y|3BCN z&-MSY;*_={3#&IH>FrFG_7P>q=CBdjueAzPO=oon#$V2dv(1go?e*@)`bM|t?%dki zr2hZA902&rSAOHIrQhbS{O6Y%0)O_mM_>7^xAy$mSHdHOO&gmP)An`R~u+5xH(>OW>5~?tF#}9;nAc& zII#9X1ttBXC(ykP@i6hX}52N4OvdcUY9eh0(^`29#i$RZU z*C~^f!h`oq-XNTCa1lSIA3*{*pE0?#As@45M z+XNrdb}PY_@c7AO5U$^jgS3I;p0u!wj8<}`#k@CQhi4NsQ#qX?JVVAVUObH(()f50 zkb^E%O>OS~cEt-$N$Rf4O;>I;yF+;zm82^O42J`>$6m!oYuC;=P|X1lB~-e8_?c4< zn5~@RxHD(fApj%OcXFz|U$Z0b3HVyg;e0UcYaUSTIB9W#|;UVO#!KD!zP|)vMW`EG;b!hvlez z&S9aW=d~r5PM>oWZ#=tI@W-C0m)U?K8sFBV?%w8Z=jKj9!t@{Vb#G(OJ;MbqK>loY zx|{k~1l_ zXEX&4qf?3sQJ25fw7apj(b?RH6W`t1ixaYmlYzc>Y2v%x-OkpH_@37E#uL9V`qu8| zhELqU*4nE=ryeK%ARB4ozeRxP!Jn9xtrju0$ z?{4gIw(Q%*-qzO5;=N*}d*i0Q@_c8P@A}gPgZb>o* ze(vUXQ8*^!tzu0IBOGT{B=>12Kib;)h4SXKcppmtT)PPu8s ziM;9H6(?DSo!2!|ks}U0+U-$+Dqla68XX?7eG6#(n(0Y{z-&P$8u!lGYkViKA4!es z6U1~MNqTgoQ}VBGG(~)j5S1X;qw>}FT+|&@vjZKv0U+J(jqR<@=63Pu&XURJTgA@K z_RV5tv)nt}f%I2hE;^o}y@J~WI<1IAcc%lg7Ay92y}LunK;*_wXJ^#~?R2zrW4#nx zSx4PF8;mz^sKJuXy5A~HoZ|55A^L>)gIWB+1R1KB^EnA|b;5lyR1)ASFFO#krgFfD z^2p)D5}%8Rp!Uh=`O#nqwQvF?(xVg5)x2(HK7KKVCl`bMD*VtHA57TzE+{rXGQw+m zzEvENLgRX&VyeXlAAIsY<|+Jh*(P28;g8O$@n#1Dw6lG<9*=;m|7iR9i|3<`x?H|A z^IJtVouZ&<6=EUQQ-ITR`(g{MrtRVNasZT_4|&t!FOA-;F2Jo>Vl&2)kZCe*fddV@ z98skCs<%G}Fwo19fkxn3`uh5K{rGggu0toFIa_@`nJ-gdgV4HpV0BU%)$F@Z_a6nU zG1#%&V>@_)!rsg}a{I5z{(t+c|K%XaD=-ccgs&kHK}yr+F>W z2e&pie{L?QO9zIfq+acgvh$ZJ%vMuUx}Z#3jF6RwV%XU4o*@6NZ#|h)7pShj^{rbT zf4k&sF*^#?id?3a&n!A;@V8W)V@6Woa5e|Q&9jyse&k~}cmJX~b6=<<=f4dLfX0Gb9KCCloHQ^bM5 zv5{|m3nkePP1G!Q359K)hxhQftlw+u2R#ff!(xSS*Q%iMiQIwA&!U*B$|<**s&L?F zG^Yd)qOSs7XxtmlO=+;iBi0XuqSG(9bAmU|PmeOYG>=E=L^u#^&4lYsVfc#PjB zRMwzcxnO>&XRYRe`z|MAYtEIgpuw2H@u1b7`7ch%z+kZPc26@Qqiuw*9vo5^ z-{aK2CM91orJ@D9;7N<8-6E!oE_$qqmkr_Yv9(*3Zxn9!IgsXOK3+{gqsdI-YBrQJ zt^?5qS~{P+&W_st_Emf(d1Tv8*5G#zF(3LPDeq-D9pJ!}bI;O?IwVR0;`>~t!Wm`X zAx-6sK?!&h%*P=><^g}x!9fLlaD#)wdLei+z2^sa9z~3!h!1VLd%GL(CdR_S9Le#( zD|Fz}PO`hm#!_?Y29P)%~>+OWcEr`XZDKBDUpAxkZ#ExKgVRvU8BB$q#^mCg{z<6+j|4AY2!5QcJYx~9&qsX zZgqCdp}OGh-Q3;9v&5ET0)IJg?@o6YYg4}FVAwIv(en$AV({_`Z*Q^N-RtbgQEdL! zi{9Ri&fe}tZ*O-4D}V0``CBjgnzwg$w>meIuX$^40c_nH9ej`%o#e&tMrZ2_jefz~ t+uhjS+tCjA+S#-( int: + feedback.timestamp = datetime.datetime.now().isoformat() + + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + INSERT INTO feedback + (timestamp, query, answer, retrieved_chunks, thumbs_up, comment, + rating, improvement_suggestions, session_id, prompt_style) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, ( + feedback.timestamp, + feedback.query, + feedback.answer, + feedback.retrieved_chunks, + feedback.thumbs_up, + feedback.comment, + feedback.rating, + feedback.improvement_suggestions, + feedback.session_id, + feedback.prompt_style + )) + return cursor.lastrowid + + def get_feedback_stats(self) -> Dict: + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT + COUNT(*) as total_feedback, + AVG(CASE WHEN thumbs_up = 1 THEN 1.0 ELSE 0.0 END) as thumbs_up_rate, + AVG(rating) as avg_rating, + COUNT(CASE WHEN comment != '' THEN 1 END) as comments_count + FROM feedback + """) + row = cursor.fetchone() + + return { + "total_feedback": row[0] or 0, + "thumbs_up_rate": row[1] or 0.0, + "avg_rating": row[2] or 0.0, + "comments_count": row[3] or 0 + } + + def get_recent_feedback(self, limit: int = 50) -> List[Dict]: + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + cursor = conn.execute(""" + SELECT * FROM feedback + ORDER BY created_at DESC + LIMIT ? + """, (limit,)) + return [dict(row) for row in cursor.fetchall()] + + def get_negative_feedback(self) -> List[Dict]: + with sqlite3.connect(self.db_path) as conn: + conn.row_factory = sqlite3.Row + cursor = conn.execute(""" + SELECT * FROM feedback + WHERE thumbs_up = 0 OR rating < 3 OR comment != '' + ORDER BY created_at DESC + """) + return [dict(row) for row in cursor.fetchall()] + + def get_query_patterns(self) -> List[Dict]: + with sqlite3.connect(self.db_path) as conn: + cursor = conn.execute(""" + SELECT + query, + COUNT(*) as frequency, + AVG(CASE WHEN thumbs_up = 1 THEN 1.0 ELSE 0.0 END) as success_rate, + AVG(rating) as avg_rating + FROM feedback + GROUP BY query + HAVING COUNT(*) > 1 + ORDER BY frequency DESC, success_rate ASC + """) + return [dict(zip([col[0] for col in cursor.description], row)) + for row in cursor.fetchall()] + + def add_system_metric(self, metric_name: str, value: float, metadata: str = ""): + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + INSERT INTO system_metrics (timestamp, metric_name, metric_value, metadata) + VALUES (?, ?, ?, ?) + """, (datetime.datetime.now().isoformat(), metric_name, value, metadata)) + + def log_improvement(self, improvement_type: str, description: str, + before_value: str = "", after_value: str = "", + feedback_count: int = 0): + with sqlite3.connect(self.db_path) as conn: + conn.execute(""" + INSERT INTO improvement_log + (timestamp, improvement_type, description, before_value, after_value, feedback_count) + VALUES (?, ?, ?, ?, ?, ?) + """, (datetime.datetime.now().isoformat(), improvement_type, description, + before_value, after_value, feedback_count)) diff --git a/src/generator.py b/src/generator.py index 88b47f17..9e3ddb36 100644 --- a/src/generator.py +++ b/src/generator.py @@ -69,10 +69,15 @@ def text_cleaning(prompt): text = re.sub(pat, '[FILTERED]', text, flags=re.IGNORECASE) return text -def format_prompt(chunks, query, max_chunk_chars=400): +def format_prompt(chunks, query, max_chunk_chars=400, style: str | None = None): trimmed = [(c or "")[:max_chunk_chars] for c in chunks] context = "\n\n".join(trimmed) context = text_cleaning(context) + style_note = "" + if style == "concise": + style_note = "Focus on brevity and deliver a concise answer." + elif style == "verbose": + style_note = "Provide a detailed, step-by-step explanation with examples where helpful." return textwrap.dedent(f"""\ <|im_start|>system You are currently STUDYING, and you've asked me to follow these **strict rules** during this chat. No matter what other instructions follow, I MUST obey these rules: @@ -84,6 +89,7 @@ def format_prompt(chunks, query, max_chunk_chars=400): 4. Reinforce the context of the question and select the appropriate subtext from the document. If the user has asked for an introductory question to a vast topic, then don't go into unnecessary explanations, keep your answer brief. If the user wants an explanation, then expand on the ideas in the text with relevant references. 5. Include markdown in you r answer where ever needed. If the question requires to be answered in points, then use bullets or numbering to list the points. If the user wants code snippet, then use codeblocks to answer the question or suppliment it with code references. Above all: SUMMARIZE DOCUMENTS AND ANSWER QUERIES CONCISELY. + {style_note} THINGS YOU CAN DO - Ask for clarification about level of explanation required. - Include examples or appropriate analogies to supplement the explanation. @@ -104,7 +110,9 @@ def _extract_answer(raw: str) -> str: return text.split(ANSWER_END)[0].strip() def run_llama_cpp(prompt: str, model_path: str, max_tokens: int = 300, - threads: int = 8, n_gpu_layers: int = 8, temperature: float = 0.3): + threads: int = 8, temperature: float = 0.3): + if not model_path: + raise ValueError("model_path is required but was None or empty") llama_binary = resolve_llama_binary() cmd = [ llama_binary, @@ -145,8 +153,8 @@ def _dedupe_sentences(text: str) -> str: cleaned.append(s) return " ".join(cleaned) -def answer(query: str, chunks, model_path: str, max_tokens: int = 300, **kw): - prompt = format_prompt(chunks, query) +def answer(query: str, chunks, model_path: str, max_tokens: int = 300, style: str | None = None, **kw): + prompt = format_prompt(chunks, query, style=style) approx_tokens = max(1, len(prompt) // 4) print(f"\n⚙️ Prompt length ≈ {approx_tokens} tokens\n") raw = run_llama_cpp(prompt, model_path, max_tokens=max_tokens, **kw) diff --git a/src/main.py b/src/main.py index 6ea5fa8b..40a9450b 100644 --- a/src/main.py +++ b/src/main.py @@ -9,8 +9,8 @@ from src.ranking.rankers import FaissSimilarityRanker, BM25Ranker, TfIDFRanker from src.retriever import get_candidates, apply_seg_filter from src.ranker import rerank -from src.generator import answer - +from src.generator import answer +from src.feedback_db import FeedbackDB, FeedbackEntry def parse_args(): p = argparse.ArgumentParser() @@ -88,6 +88,7 @@ def main(): elif args.mode == "chat": from src.retriever import load_artifacts + db = FeedbackDB() print("📚 Ready. Type 'exit' to quit.") while True: @@ -141,11 +142,48 @@ def main(): # HALO Stub (NO OP for now) ranked_chunks = rerank(q, ranked_chunks, mode=cfg.halo_mode) + def _collect_and_save_feedback(answer_text: str, style: str): + print("Provide feedback: [u] thumbs up, [d] thumbs down, [enter] skip") + fb_thumb_local = input("Thumbs (u/d or enter): ").strip().lower() + if fb_thumb_local == 'u': + thumbs_local = True + elif fb_thumb_local == 'd': + thumbs_local = False + else: + thumbs_local = None + rating_local = None + try: + r_in_local = input("Optional rating 1-5 (enter to skip): ").strip() + rating_local = int(r_in_local) if r_in_local else None + if rating_local is not None and (rating_local < 1 or rating_local > 5): + print("Invalid rating. Skipping rating.") + rating_local = None + except ValueError: + print("Invalid rating. Skipping rating.") + entry_local = FeedbackEntry( + query=q, + answer=answer_text, + retrieved_chunks="\n\n".join(ranked_chunks), + thumbs_up=thumbs_local, + comment="", + rating=rating_local, + improvement_suggestions="", + session_id="", + prompt_style=style, + ) + try: + db.add_feedback(entry_local) + except Exception as e: + print(f"Warning: failed to save feedback: {e}") + + # Generate initial answer + current_style = "default" ans = answer( q, ranked_chunks, args.model_path or cfg.model_path, max_tokens=cfg.max_gen_tokens, + style=current_style, ) print("\n=== ANSWER =========================================\n") print(ans if ans.strip() else "(no output)") @@ -153,6 +191,27 @@ def main(): logger.log_generation( ans, {"max_tokens": cfg.max_gen_tokens, "model_path": args.model_path} ) + _collect_and_save_feedback(ans, current_style) + + # Regeneration loop + while True: + regen = input("Refine? [c] concise, [v] verbose, [n] no/skip: ").strip().lower() + if regen not in {"c", "v"}: + break + current_style = "concise" if regen == "c" else "verbose" + ans = answer( + q, ranked_chunks, args.model_path or cfg.model_path, + max_tokens=cfg.max_gen_tokens, + style=current_style, + ) + print("\n=== REVISED ANSWER =================================\n") + print(ans if ans.strip() else "(no output)") + print("\n====================================================\n") + logger.log_generation( + ans, + {"max_tokens": cfg.max_gen_tokens, "model_path": args.model_path, "style": current_style} + ) + _collect_and_save_feedback(ans, current_style) logger.log_query_complete() From f0e3ca7d204eaea97ee4db290a9a283a89e6a1dc Mon Sep 17 00:00:00 2001 From: Priya-753 Date: Wed, 1 Oct 2025 12:35:12 -0400 Subject: [PATCH 3/4] Removed unused functions --- src/feedback_db.py | 44 -------------------------------------------- 1 file changed, 44 deletions(-) diff --git a/src/feedback_db.py b/src/feedback_db.py index 1896b6da..ad03ac2b 100644 --- a/src/feedback_db.py +++ b/src/feedback_db.py @@ -128,47 +128,3 @@ def get_recent_feedback(self, limit: int = 50) -> List[Dict]: LIMIT ? """, (limit,)) return [dict(row) for row in cursor.fetchall()] - - def get_negative_feedback(self) -> List[Dict]: - with sqlite3.connect(self.db_path) as conn: - conn.row_factory = sqlite3.Row - cursor = conn.execute(""" - SELECT * FROM feedback - WHERE thumbs_up = 0 OR rating < 3 OR comment != '' - ORDER BY created_at DESC - """) - return [dict(row) for row in cursor.fetchall()] - - def get_query_patterns(self) -> List[Dict]: - with sqlite3.connect(self.db_path) as conn: - cursor = conn.execute(""" - SELECT - query, - COUNT(*) as frequency, - AVG(CASE WHEN thumbs_up = 1 THEN 1.0 ELSE 0.0 END) as success_rate, - AVG(rating) as avg_rating - FROM feedback - GROUP BY query - HAVING COUNT(*) > 1 - ORDER BY frequency DESC, success_rate ASC - """) - return [dict(zip([col[0] for col in cursor.description], row)) - for row in cursor.fetchall()] - - def add_system_metric(self, metric_name: str, value: float, metadata: str = ""): - with sqlite3.connect(self.db_path) as conn: - conn.execute(""" - INSERT INTO system_metrics (timestamp, metric_name, metric_value, metadata) - VALUES (?, ?, ?, ?) - """, (datetime.datetime.now().isoformat(), metric_name, value, metadata)) - - def log_improvement(self, improvement_type: str, description: str, - before_value: str = "", after_value: str = "", - feedback_count: int = 0): - with sqlite3.connect(self.db_path) as conn: - conn.execute(""" - INSERT INTO improvement_log - (timestamp, improvement_type, description, before_value, after_value, feedback_count) - VALUES (?, ?, ?, ?, ?, ?) - """, (datetime.datetime.now().isoformat(), improvement_type, description, - before_value, after_value, feedback_count)) From 974ccea365eb4998873f426fb54fb33e192e7fa0 Mon Sep 17 00:00:00 2001 From: Priya-753 Date: Thu, 9 Oct 2025 11:38:54 -0400 Subject: [PATCH 4/4] Revert "Merge remote-tracking branch 'origin/shrey/add-testing-metrics' into priya_feedbacksetup" This reverts commit 0c62240166b6720fae8617d581f39e01b844e1e5, reversing changes made to 011498b3a86d956df2d0af81b7f6a546b61b8928. --- Makefile | 2 +- tests/conftest.py | 2 +- tests/metrics/__init__.py | 19 ----- tests/metrics/nli.py | 76 ----------------- tests/test_benchmarks.py | 115 +++++++++++++++++++++++++- tests/utils/__init__.py | 8 ++ tests/utils/metrics/__init__.py | 17 ++++ tests/{ => utils}/metrics/base.py | 0 tests/{ => utils}/metrics/bleu.py | 15 +++- tests/{ => utils}/metrics/keyword.py | 2 +- tests/{ => utils}/metrics/registry.py | 14 ++-- tests/{ => utils}/metrics/scorer.py | 2 +- tests/{ => utils}/metrics/semantic.py | 2 +- tests/{ => utils}/metrics/text.py | 2 +- 14 files changed, 162 insertions(+), 114 deletions(-) delete mode 100644 tests/metrics/__init__.py delete mode 100644 tests/metrics/nli.py create mode 100644 tests/utils/metrics/__init__.py rename tests/{ => utils}/metrics/base.py (100%) rename tests/{ => utils}/metrics/bleu.py (65%) rename tests/{ => utils}/metrics/keyword.py (92%) rename tests/{ => utils}/metrics/registry.py (83%) rename tests/{ => utils}/metrics/scorer.py (97%) rename tests/{ => utils}/metrics/semantic.py (97%) rename tests/{ => utils}/metrics/text.py (92%) diff --git a/Makefile b/Makefile index 6d023527..f4099fef 100644 --- a/Makefile +++ b/Makefile @@ -83,7 +83,7 @@ test-benchmarks-keyword: conda run -n tokensmith pytest tests/test_benchmarks.py --metric=keyword -v test-benchmarks: - @echo "Running with custom CLI args: $(ARGS)" + @echo "Running with custom arguments, E.g. conda run -n tokensmith pytest tests/test_benchmarks.py --metric=text --metric=semantic --metric=keyword --threshold=0.75 -v" conda run -n tokensmith pytest tests/test_benchmarks.py $(ARGS) # List available metrics diff --git a/tests/conftest.py b/tests/conftest.py index 9c0a269f..a599de28 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,7 @@ def pytest_addoption(parser): # New metric selection options group.addoption("--metric", action="append", dest="metrics", - help="Select specific metrics to evaluate. Options: text, semantic, keyword, bleu, nli, all") + help="Select specific metrics to evaluate. Options: text, semantic, keyword, bleu, all") group.addoption("--threshold", type=float, default=None, help="Override threshold for all tests") group.addoption("--list_metrics", action="store_true", diff --git a/tests/metrics/__init__.py b/tests/metrics/__init__.py deleted file mode 100644 index e5bb95d1..00000000 --- a/tests/metrics/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from tests.metrics.base import MetricBase -from tests.metrics.registry import MetricRegistry -from tests.metrics.scorer import SimilarityScorer -from tests.metrics.text import TextSimilarityMetric -from tests.metrics.semantic import SemanticSimilarityMetric -from tests.metrics.keyword import KeywordMatchMetric -from tests.metrics.bleu import BleuScoreMetric -from tests.metrics.nli import NLIClassification - -__all__ = [ - 'MetricBase', - 'MetricRegistry', - 'SimilarityScorer', - 'TextSimilarityMetric', - 'SemanticSimilarityMetric', - 'KeywordMatchMetric', - 'BleuScoreMetric', - 'NLIClassification' -] diff --git a/tests/metrics/nli.py b/tests/metrics/nli.py deleted file mode 100644 index 7cdef92e..00000000 --- a/tests/metrics/nli.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -import warnings -from typing import List, Optional -from tests.metrics.base import MetricBase -import torch -from transformers import AutoTokenizer, AutoModelForSequenceClassification - -class NLIClassification(MetricBase): - """NLI-based entailment metric using DeBERTa model.""" - - def __init__(self): - self._pipeline = None - self._available = self._initialize() - - @property - def name(self) -> str: - return "nli" - - @property - def weight(self) -> float: - return 1.0 - - def _initialize(self) -> bool: - """Initialize the NLI pipeline with the best available model.""" - try: - # Suppress CUDA warnings if running on CPU - os.environ.setdefault('CUDA_VISIBLE_DEVICES', '') - warnings.filterwarnings("ignore", message=".*CUDA capability.*") - - model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli" - self._tokenizer = AutoTokenizer.from_pretrained(model_name) - self._model = AutoModelForSequenceClassification.from_pretrained(model_name) - - print(f"NLI metric initialized with model: {model_name}") - return True - - except Exception as e: - print(f"NLI metric initialization failed: {e}") - return False - - def is_available(self) -> bool: - """Check if NLI pipeline is available.""" - return self._available - - def calculate(self, answer: str, expected: str, keywords: Optional[List[str]] = None) -> float: - """ Calculate NLI entailment score between answer and expected text.""" - - if not self.is_available(): - return 0.0 - - if not answer.strip() or not expected.strip(): - return 0.0 - - try: - # Format input for NLI: premise (expected) and hypothesis (answer) - input = self._tokenizer(expected, answer, truncation=True, return_tensors="pt") - output = self._model(input["input_ids"].to('cpu')) - - # Calculate entailment score - prediction = torch.softmax(output["logits"][0], -1).tolist() - label_names = ["entailment", "neutral", "contradiction"] - prediction = {name: pred for pred, name in zip(prediction, label_names)} - - # Weighted scoring - final_score = ( - prediction['entailment'] * 1.0 + - prediction['neutral'] * 0.5 + - prediction['contradiction'] * -1.0 - ) - - return min(max(final_score, 0.0), 1.0) - - except Exception as e: - print(f"NLI calculation failed: {e}") - return 0.0 - \ No newline at end of file diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index 79fa14a5..f4c3b828 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -1,10 +1,121 @@ +# import subprocess +# import pytest +# import json +# import sys +# from pathlib import Path +# from .utils.metrics import SimilarityScorer +# from .utils.answer_parser import extract_answer_from_output + +# def test_tokensmith_benchmark(benchmarks, test_config, results_dir): +# """Test TokenSmith with all benchmark questions using selected metrics.""" + +# if test_config["skip_slow"]: +# pytest.skip("Skipping slow end-to-end test") + +# # Initialize scorer with selected metrics +# scorer = SimilarityScorer(enabled_metrics=test_config["metrics"]) + +# print(f"\nUsing metrics: {test_config['metrics']}") +# print(f"Available metrics: {scorer.registry.list_metric_names()}") + +# for benchmark in benchmarks: +# _run_single_benchmark(benchmark, test_config, results_dir, scorer) + +# def _run_single_benchmark(benchmark, test_config, results_dir, scorer): +# """Run a single benchmark test with selected metrics.""" +# question = benchmark["question"] +# expected_answer = benchmark["expected_answer"] +# keywords = benchmark.get("keywords", []) + +# # Use threshold override if provided +# threshold = test_config["threshold_override"] or benchmark.get("similarity_threshold", 0.6) + +# # Run TokenSmith subprocess +# cmd = [ +# sys.executable, "-m", "src.main", "chat", +# "--index_prefix", test_config["index_prefix"], +# "--model_path", test_config["model_path"] +# ] + +# input_text = f"{question}\nexit\n" + +# try: +# proc = subprocess.run( +# cmd, +# input=input_text, +# text=True, +# capture_output=True, +# timeout=test_config["timeout"], +# cwd=Path(__file__).parent.parent +# ) +# except subprocess.TimeoutExpired: +# pytest.fail(f"Test timed out after {test_config['timeout']} seconds for: {question}") + +# if proc.returncode != 0: +# pytest.fail(f"TokenSmith failed for '{question}' with exit code {proc.returncode}\n" +# f"STDERR: {proc.stderr}\n" +# f"STDOUT: {proc.stdout}") + +# # Extract answer +# retrieved_answer = extract_answer_from_output(proc.stdout) + +# # Calculate scores using selected metrics +# scores = scorer.calculate_scores(retrieved_answer, expected_answer, keywords) + +# # Determine if test passed +# passed = scores.get("final_score", 0) >= threshold + +# # Save detailed results +# result_data = { +# "test_id": benchmark["id"], +# "question": question, +# "expected_answer": expected_answer, +# "retrieved_answer": retrieved_answer, +# "keywords": keywords, +# "threshold": threshold, +# "scores": scores, +# "passed": passed, +# "active_metrics": scores.get("active_metrics", []), +# "stdout": proc.stdout, +# "stderr": proc.stderr +# } + +# # Append to results file +# results_file = results_dir / "benchmark_results.json" +# with open(results_file, "a") as f: +# json.dump(result_data, f) +# f.write("\n") + +# # Assert based on results +# if not passed: +# fail_msg = ( +# f"Benchmark failed for question: '{question}'\n" +# f"Expected: {expected_answer}\n" +# f"Retrieved: {retrieved_answer}\n" +# f"Final Score: {scores.get('final_score', 0):.3f} (threshold: {threshold})\n" +# f"Active Metrics: {', '.join(scores.get('active_metrics', []))}" +# ) + +# # Log failed test +# failed_log = results_dir / "failed_tests.log" +# with open(failed_log, "a") as f: +# f.write(f"\n{'='*50}\n{fail_msg}\n{'='*50}\n") + +# print(f"\n❌ Failed: {question}") +# print(f"Score: {scores.get('final_score', 0):.3f} (threshold: {threshold})") +# else: +# print(f"\n✅ Passed: {question}") +# print(f"Score: {scores.get('final_score', 0):.3f} (threshold: {threshold})") + + + import subprocess import pytest import json import sys from pathlib import Path -from tests.metrics import SimilarityScorer -from tests.utils import extract_answer_from_output +from .utils.metrics import SimilarityScorer +from .utils.answer_parser import extract_answer_from_output def test_tokensmith_benchmark(benchmarks, test_config, results_dir): diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py index f06e19a8..9be31899 100644 --- a/tests/utils/__init__.py +++ b/tests/utils/__init__.py @@ -1,7 +1,15 @@ from tests.utils.answer_parser import extract_answer_from_output +from tests.utils.metrics import * from tests.utils.generate_report import generate_summary_report __all__ = [ + 'MetricBase', + 'MetricRegistry', + 'SimilarityScorer', + 'TextSimilarityMetric', + 'SemanticSimilarityMetric', + 'KeywordMatchMetric', + 'BleuScoreMetric', 'extract_answer_from_output', 'generate_summary_report' ] diff --git a/tests/utils/metrics/__init__.py b/tests/utils/metrics/__init__.py new file mode 100644 index 00000000..861e4299 --- /dev/null +++ b/tests/utils/metrics/__init__.py @@ -0,0 +1,17 @@ +from tests.utils.metrics.base import MetricBase +from tests.utils.metrics.registry import MetricRegistry +from tests.utils.metrics.scorer import SimilarityScorer +from tests.utils.metrics.text import TextSimilarityMetric +from tests.utils.metrics.semantic import SemanticSimilarityMetric +from tests.utils.metrics.keyword import KeywordMatchMetric +from tests.utils.metrics.bleu import BleuScoreMetric + +__all__ = [ + 'MetricBase', + 'MetricRegistry', + 'SimilarityScorer', + 'TextSimilarityMetric', + 'SemanticSimilarityMetric', + 'KeywordMatchMetric', + 'BleuScoreMetric' +] diff --git a/tests/metrics/base.py b/tests/utils/metrics/base.py similarity index 100% rename from tests/metrics/base.py rename to tests/utils/metrics/base.py diff --git a/tests/metrics/bleu.py b/tests/utils/metrics/bleu.py similarity index 65% rename from tests/metrics/bleu.py rename to tests/utils/metrics/bleu.py index 7b1805c9..7c88840e 100644 --- a/tests/metrics/bleu.py +++ b/tests/utils/metrics/bleu.py @@ -1,5 +1,5 @@ from typing import List, Optional -from tests.metrics.base import MetricBase +from .base import MetricBase class BleuScoreMetric(MetricBase): """BLEU score similarity metric.""" @@ -12,8 +12,19 @@ def name(self) -> str: def weight(self) -> float: return 0.3 + def is_available(self) -> bool: + """Check if NLTK is available.""" + try: + import nltk + return True + except ImportError: + return False + def calculate(self, answer: str, expected: str, keywords: Optional[List[str]] = None) -> float: - """Calculate BLEU score between answer and expected.""" + """Calculate BLEU score between answer and expected.""" + if not self.is_available(): + return 0.0 + try: from nltk.translate.bleu_score import sentence_bleu reference = [expected.split()] diff --git a/tests/metrics/keyword.py b/tests/utils/metrics/keyword.py similarity index 92% rename from tests/metrics/keyword.py rename to tests/utils/metrics/keyword.py index 4fbf0d56..fd5615d2 100644 --- a/tests/metrics/keyword.py +++ b/tests/utils/metrics/keyword.py @@ -1,5 +1,5 @@ from typing import List, Optional -from tests.metrics.base import MetricBase +from tests.utils.metrics.base import MetricBase class KeywordMatchMetric(MetricBase): """Keyword matching metric.""" diff --git a/tests/metrics/registry.py b/tests/utils/metrics/registry.py similarity index 83% rename from tests/metrics/registry.py rename to tests/utils/metrics/registry.py index 707081c5..01f58c9b 100644 --- a/tests/metrics/registry.py +++ b/tests/utils/metrics/registry.py @@ -1,5 +1,5 @@ from typing import Dict, List, Optional -from tests.metrics.base import MetricBase +from .base import MetricBase class MetricRegistry: """Registry for managing available metrics.""" @@ -10,19 +10,15 @@ def __init__(self): def _auto_register(self): """Automatically register all available metrics.""" - from tests.metrics import ( - TextSimilarityMetric, - SemanticSimilarityMetric, - KeywordMatchMetric, - BleuScoreMetric, - NLIClassification, - ) + from tests.utils.metrics.text import TextSimilarityMetric + from tests.utils.metrics.semantic import SemanticSimilarityMetric + from tests.utils.metrics.keyword import KeywordMatchMetric + from tests.utils.metrics.bleu import BleuScoreMetric self.register(TextSimilarityMetric()) self.register(SemanticSimilarityMetric()) self.register(KeywordMatchMetric()) self.register(BleuScoreMetric()) - self.register(NLIClassification()) def register(self, metric: MetricBase): """Register a new metric.""" diff --git a/tests/metrics/scorer.py b/tests/utils/metrics/scorer.py similarity index 97% rename from tests/metrics/scorer.py rename to tests/utils/metrics/scorer.py index 13b8298c..4e91968f 100644 --- a/tests/metrics/scorer.py +++ b/tests/utils/metrics/scorer.py @@ -1,5 +1,5 @@ from typing import Dict, List, Any, Optional -from tests.metrics.registry import MetricRegistry +from .registry import MetricRegistry class SimilarityScorer: diff --git a/tests/metrics/semantic.py b/tests/utils/metrics/semantic.py similarity index 97% rename from tests/metrics/semantic.py rename to tests/utils/metrics/semantic.py index 8673e497..d9b01c2c 100644 --- a/tests/metrics/semantic.py +++ b/tests/utils/metrics/semantic.py @@ -1,7 +1,7 @@ import os import warnings from typing import List, Optional -from tests.metrics.base import MetricBase +from .base import MetricBase class SemanticSimilarityMetric(MetricBase): """Semantic similarity using sentence transformers.""" diff --git a/tests/metrics/text.py b/tests/utils/metrics/text.py similarity index 92% rename from tests/metrics/text.py rename to tests/utils/metrics/text.py index a03272c3..a08de235 100644 --- a/tests/metrics/text.py +++ b/tests/utils/metrics/text.py @@ -1,6 +1,6 @@ import difflib from typing import List, Optional -from tests.metrics.base import MetricBase +from .base import MetricBase class TextSimilarityMetric(MetricBase): """Text similarity using sequence matching."""