From a6ea36054fa5c48d845b8d88c61059d36b13a1d7 Mon Sep 17 00:00:00 2001
From: ConstFr <denis@agent-bot-dev-vm.us-central1-c.c.agent-bot-dev.internal>
Date: Mon, 14 Apr 2025 06:49:12 +0000
Subject: [PATCH 1/6] Added reasoning enhanced uncertainty estimation -
 ProbasMean

---
 examples/basic_example.ipynb                  |   6 +-
 .../register_default_stat_calculators.py      |   5 +
 .../default_ReasoningKeywordsProbs.py         |   9 +
 src/lm_polygraph/estimators/__init__.py       |   1 +
 .../estimators/chain_of_thought_uq.py         | 118 +++++
 src/lm_polygraph/stat_calculators/__init__.py |   1 +
 .../reasoning_keywords_probs.py               | 466 ++++++++++++++++++
 .../stat_calculators/stat_calculator.py       |   2 +-
 src/lm_polygraph/utils/factory_estimator.py   |   1 +
 test/local/test_benchmark.py                  | 284 +++++------
 test/test_estimators.py                       | 275 ++++++-----
 test/test_lm_polygraph.py                     |  24 +-
 12 files changed, 899 insertions(+), 293 deletions(-)
 create mode 100644 src/lm_polygraph/defaults/stat_calculator_builders/default_ReasoningKeywordsProbs.py
 create mode 100644 src/lm_polygraph/estimators/chain_of_thought_uq.py
 create mode 100644 src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py

diff --git a/examples/basic_example.ipynb b/examples/basic_example.ipynb
index 66d746c5d..d3caf67de 100644
--- a/examples/basic_example.ipynb
+++ b/examples/basic_example.ipynb
@@ -180,9 +180,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python [conda env:.mlspace-focus_new]",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-env-.mlspace-focus_new-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -194,7 +194,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.16"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/src/lm_polygraph/defaults/register_default_stat_calculators.py b/src/lm_polygraph/defaults/register_default_stat_calculators.py
index 81361381d..9c847aba6 100644
--- a/src/lm_polygraph/defaults/register_default_stat_calculators.py
+++ b/src/lm_polygraph/defaults/register_default_stat_calculators.py
@@ -134,6 +134,11 @@ def _register(
             "lm_polygraph.defaults.stat_calculator_builders.default_ClaimsExtractor",
             {"openai_model": "gpt-4o", "cache_path": "~/.cache", "language": language},
         )
+        _register(
+            ReasoningKeywordsProbs,
+            "lm_polygraph.defaults.stat_calculator_builders.default_ReasoningKeywordsProbs",
+            {"max_retries": 5, "max_length_cot": 128, "temperature": 1.0}
+        )
 
     else:
         raise NotImplementedError(f"Unknown model type: {model_type}")
diff --git a/src/lm_polygraph/defaults/stat_calculator_builders/default_ReasoningKeywordsProbs.py b/src/lm_polygraph/defaults/stat_calculator_builders/default_ReasoningKeywordsProbs.py
new file mode 100644
index 000000000..38820560e
--- /dev/null
+++ b/src/lm_polygraph/defaults/stat_calculator_builders/default_ReasoningKeywordsProbs.py
@@ -0,0 +1,9 @@
+from lm_polygraph.stat_calculators.reasoning_keywords_probs import (
+    ReasoningKeywordsProbs,
+)
+
+
+def load_stat_calculator(config, builder):
+    return ReasoningKeywordsProbs(
+        config.max_retries, config.max_length_cot, config.temperature
+    )
diff --git a/src/lm_polygraph/estimators/__init__.py b/src/lm_polygraph/estimators/__init__.py
index 8162f6380..fd06e1232 100644
--- a/src/lm_polygraph/estimators/__init__.py
+++ b/src/lm_polygraph/estimators/__init__.py
@@ -77,3 +77,4 @@
 from .kernel_language_entropy import KernelLanguageEntropy
 from .luq import LUQ
 from .eigenscore import EigenScore
+from .chain_of_thought_uq import ProbasMeanWithCoT
diff --git a/src/lm_polygraph/estimators/chain_of_thought_uq.py b/src/lm_polygraph/estimators/chain_of_thought_uq.py
new file mode 100644
index 000000000..2a9c0d9c8
--- /dev/null
+++ b/src/lm_polygraph/estimators/chain_of_thought_uq.py
@@ -0,0 +1,118 @@
+import numpy as np
+import re
+import math
+
+from typing import Dict
+
+from .estimator import Estimator
+
+
+def extract_p(keyword_token_probability, contribution_scores = None):
+    if contribution_scores == None:
+        # TODO this branch has to be deleted.
+        return_dict = {}
+        for step, inner_dict in keyword_token_probability.items():
+            for key, values in inner_dict.items():
+                if len(values) == 0:
+                    continue
+                # if key.isdigit(): 
+                #     value_to_add = values[0] 
+                # else:
+                #     value_to_add = values[0] 
+                # value_to_add = sum(values)/len(values)
+                value_to_add = min(values)
+                # value_to_add = max(values)
+                if key in return_dict:
+                    return_dict[key].append(value_to_add)
+                else:
+                    return_dict[key] = [value_to_add]
+        return return_dict
+    else:
+        return_keyword_dict = {}
+        return_contribution_dict = {}
+        for step, inner_dict in keyword_token_probability.items():
+            for key, values in inner_dict.items():
+                if len(values) == 0:
+                    continue
+                # if key.isdigit(): 
+                #     value_to_add = values[-1] 
+                # else:
+                #     value_to_add = values[0] 
+                # value_to_add = sum(values)/len(values)
+                value_to_add = min(values)
+                # value_to_add = max(values)
+                if key in return_keyword_dict:
+                    return_keyword_dict[key].append(value_to_add)
+                    return_contribution_dict[key].append(contribution_scores[step][key])
+                else:
+                    return_keyword_dict[key] = [value_to_add]
+                    return_contribution_dict[key] = [contribution_scores[step][key]]
+        return return_keyword_dict, return_contribution_dict
+
+
+def weighted_sum(values):
+    if len(values) == 1:
+        return values[0] 
+    weights = [math.exp(-c) for c in values]  
+    sum_weights = sum(weights)  
+    normalized_weights = [w / sum_weights for w in weights] 
+    result = sum(w * c for w, c in zip(normalized_weights, values)) 
+    return result 
+
+
+class ProbasMeanWithCoT(Estimator):
+    """
+    Enhances Probas-Mean aggregated probabilities strategy with reasoning steps.
+    Only usabe for instruct-finetuned models with chat template support.
+    Adapted from the original implementation in the paper https://arxiv.org/pdf/2502.17214
+    """
+
+    def __init__(
+        self,
+        name_postfix="",
+    ):
+        self.postfix = name_postfix
+        super().__init__(["input_texts", 
+                          "greedy_texts", 
+                          "reasoning_answer", 
+                          "reasoning_keywords_probabilities", 
+                          "reasoning_keywords_contributions"], 
+                         "sequence")
+
+    def __str__(self):
+        return f"ProbasMeanWithCoT{self.postfix}"
+
+    def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
+        prompts = stats["input_texts"]
+        ues = []
+        for i, question in enumerate(prompts):
+            reasoning_answer = stats['reasoning_answer'][i]
+            if reasoning_answer == "":
+                ues.append(0.5)
+                continue
+            
+            keyword_token_probability = stats['reasoning_keywords_probabilities'][i]
+            if keyword_token_probability is None or keyword_token_probability == {}:
+                ues.append(0.5)
+                continue
+            contribution_scores = stats['reasoning_keywords_contributions'][i]
+            if contribution_scores is None or contribution_scores == {}:
+                ues.append(0.5)
+                continue
+            
+            probabilities, contribution_dict = extract_p(keyword_token_probability, contribution_scores)
+
+            probabilities = {key: weighted_sum(value) for key, value in probabilities.items()}
+            contributions = {key: sum(value)/len(value) for key, value in contribution_dict.items()}
+            
+            # CoT-UQ
+            total_sum = sum(probabilities[key] * contributions[key] for key in probabilities)
+            total_weight = sum(contributions[key] for key in contributions)
+            if total_weight == 0:
+                p_list = [v for v in probabilities.values()]
+                confidence = sum(p_list) / len(p_list)
+            else:
+                confidence = total_sum / total_weight
+            ues.append(1 - confidence)
+
+        return np.array(ues)
diff --git a/src/lm_polygraph/stat_calculators/__init__.py b/src/lm_polygraph/stat_calculators/__init__.py
index 354026271..99a0ec4ad 100644
--- a/src/lm_polygraph/stat_calculators/__init__.py
+++ b/src/lm_polygraph/stat_calculators/__init__.py
@@ -29,3 +29,4 @@
 from .extract_claims import ClaimsExtractor
 from .infer_causal_lm_calculator import InferCausalLMCalculator
 from .semantic_classes import SemanticClassesCalculator
+from .reasoning_keywords_probs import ReasoningKeywordsProbs
diff --git a/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py b/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
new file mode 100644
index 000000000..1c65b2ff7
--- /dev/null
+++ b/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
@@ -0,0 +1,466 @@
+import re
+import torch
+import numpy as np
+from collections import defaultdict
+
+from typing import Dict, List, Tuple, Optional
+
+from .embeddings import get_embeddings_from_output
+from .stat_calculator import StatCalculator
+from lm_polygraph.utils.model import WhiteboxModel
+
+
+cot_instruction = """
+Please reason the following question step by step. Label each reasoning step as "Step i:", where "i" is the step number.
+You need to ensure that each step builds on the previous one and contributes meaningfully toward reaching the final answer.
+Once you finish all steps, put your final answer on a separate line after the reasoning steps, starting with "Final Answer:" (do not label it as a step).
+
+Question: <QUESTION>
+Response: Let's think step by step.
+"""
+
+keywords_extraction_instruction = ''' 
+You will be provided with a question and a multi-step response containing reasoning steps. 
+For each long reasoning step labeled "Step i:", extract the keywords, only the relevant tokens for that specific reasoning step.
+You also need to evaluate the importance of each keyword to the final answer. Please evaluate the importance score following with the keyword by (/<importance score>/) on a scale of 1 to 10, where 1 is the least critical and 10 is the most critical.
+If you find more than one keyword in a specific step, separate them with “;”.
+If a specific step does not contribute meaningfully to deriving the final answer (e.g., repeating information already provided in the question, introducing irrelevant assumptions or speculations), return "Step i: NO ANSWER" for that step. For example:
+
+Question:
+<QUESTION>
+Multi-Step Response:
+<RESPONSE>
+Keywords for Each Reasoning Step:
+'''
+
+
+def is_effectively_empty(obj):
+    
+    if obj is None:
+        return True
+
+    if isinstance(obj, (int, float)) and obj == 0:
+        return True
+
+    if obj == "":
+        return True
+
+    if isinstance(obj, list):
+        return all(is_effectively_empty(item) for item in obj)
+    
+    if isinstance(obj, dict):
+        if len(obj) == 0: 
+            return True
+        return all(is_effectively_empty(value) for value in obj.values())
+    return False
+
+
+def parse_response_to_dict(response: str) -> Tuple[Optional[str], Dict[str, str], Optional[str]]:
+    """
+    Parse model reasoning output to highlight: reasoning answer, reasoning steps, reasoning output without answer.
+
+    Parameters:
+        response (str): reasoning output.
+    Returns:
+        Tuple[Optional[str], Dict[str, str], Optional[str]]: 
+            - final answer (str or None),
+            - dictionary of steps (e.g., {"Step 1": "Step 1: ..."}),
+            - response before final answer (str or None)
+    """
+    steps: Dict[str, str] = {}
+    final_answer: Optional[str] = None
+
+    # Match Final Answer
+    match = re.search(r"Final Answer:\s*(.+?)\s*(?=(\n|$))", response, re.DOTALL)
+    if match:
+        final_answer = match.group(1).strip()
+        response_before_final_answer = response[:match.end()].strip()
+    else:
+        return None, {}, None
+
+    # Match Steps
+    matches = list(re.finditer(r"(Step \d+):", response_before_final_answer))
+    for i, match in enumerate(matches):
+        start = match.start()
+        end = matches[i + 1].start() if i + 1 < len(matches) else len(response_before_final_answer)
+        segment = response[start:end].strip()
+        steps[match.group(1)] = segment
+
+    return_response = response_before_final_answer
+    return final_answer, steps, return_response
+
+
+def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, original_token_ids):
+    # caution
+    final_answer_tokens = tokenizer.tokenize("Final answer:")
+
+    end_index = None
+    end_index_original = None
+
+    for i in range(len(response_tokens) - len(final_answer_tokens) + 1):
+        if response_tokens[i : i + len(final_answer_tokens)] == final_answer_tokens:
+            start_index = i
+            end_index = i + len(final_answer_tokens)
+            break
+
+    if end_index == None or end_index + 1 == len(response_tokens):
+        return None, None
+
+    for i in range(len(original_tokens) - len(final_answer_tokens) + 1):
+        if original_tokens[i : i + len(final_answer_tokens)] == final_answer_tokens:
+            end_index_original = i + len(final_answer_tokens)
+            break
+
+    if end_index_original == None:
+        return None, None
+
+    if response_tokens[end_index] in ["▁", "Ġ", tokenizer.tokenize(" ")]:
+        end_index += 1
+        end_index_original += 1
+
+    target_tokens = response_tokens[end_index:]
+
+    final_answer_token_ids = original_token_ids[end_index_original : end_index_original + len(target_tokens)]
+
+    return end_index_original, final_answer_token_ids.data.cpu().numpy()
+
+
+def predict(prompt, model, tokenizer, max_length_cot, temperature):
+    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
+    generate_ids = model.generate(
+        **inputs, 
+        max_new_tokens = max_length_cot,
+        temperature=temperature, 
+        pad_token_id=tokenizer.eos_token_id)
+    generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
+    infer_res = tokenizer.decode(generate_ids)
+    return infer_res
+
+
+def step_exacts_2_list(response):
+    # Split response into lines and filter out empty lines
+    lines = response.splitlines()
+    lines = [line for line in lines if line.strip()]
+
+    keywords_by_step = []
+    contributions_by_step = []
+    valid_response_text = []
+
+    for line in lines:
+        # Match lines starting with "Step X:"
+        match = re.search(r"Step \d+: (.+)", line)
+        if match:
+            if "(/" not in line or "/)" not in line:
+                continue  # Skip invalid lines
+
+            # Extract keywords with contributions
+            keywords_w_contribution = match.group(1).split("; ")
+
+            # Check for valid format and skip invalid lines
+            if any("(/" not in key_w_c or "/)" not in key_w_c for key_w_c in keywords_w_contribution):
+                continue
+
+            try:
+                # Extract keywords and contributions
+                keywords = [key_w_c.split("(/")[0].strip() for key_w_c in keywords_w_contribution]
+                contributions = [int(key_w_c.split("(/")[1].split("/)")[0].strip()) for key_w_c in keywords_w_contribution]
+            except ValueError:
+                return False  # Return False if contributions cannot be converted to int
+
+            for i in contributions:
+                if i > 10:
+                    return False
+
+            keywords_by_step.append(keywords)
+            contributions_by_step.append(contributions)
+            valid_response_text.append(line)  # Add valid lines from the original response
+
+    # If no valid lines are found, return False
+    if not valid_response_text:
+        return False
+
+    return "\n".join(valid_response_text), keywords_by_step, contributions_by_step
+
+
+def find_subsequence_position(sub_sequence, long_sequence):
+    len_long = long_sequence.size(0)
+    len_sub = len(sub_sequence) 
+
+    sub_sequence_tensor = torch.tensor(sub_sequence, device=long_sequence.device)
+    
+    for i in range(len_long - len_sub + 1):
+        if torch.equal(long_sequence[i:i + len_sub], sub_sequence_tensor):
+            return i 
+    return -1
+
+
+def clean_words(word):
+    # TODO forward space token
+  return word.replace(" ", "").replace(".", "").replace("\"", "").replace("\n", "").replace("_", "").replace("Ġ", "").lower()
+
+
+def find_token_indices(tokens, word):
+    word_len = len(word.replace(" ", ""))
+    
+    for start_index in range(len(tokens)):
+        combined_text = ""
+        end_index = start_index       
+        while end_index < len(tokens) and len(combined_text) < word_len:
+            combined_text += tokens[end_index]
+            if clean_words(combined_text) == clean_words(word):
+                return start_index, end_index
+            end_index += 1
+    
+    return -1, -1
+
+
+def is_word_in_sentence(sentence, word):
+    pattern = re.escape(word)
+    match = re.search(pattern, sentence, re.IGNORECASE)
+    return True if match else False
+
+
+class ReasoningKeywordsProbs(StatCalculator):
+    """
+    For Whitebox model (lm_polygraph.WhiteboxModel), at input texts batch calculates:
+        * model output for reasoning enhanced input,
+        * model answer for reasoning enhanced input,
+        * token probabilities for `reasoning_answer`,
+        * keywords from `reasoning_output`,
+        * probabilities for `reasoning_keywords`,
+        * contributions for `reasoning_keywords`,
+        * step-wise token indices for `reasoning_keywords`,
+        * token indices for `reasoning_keywords`.
+    """
+
+    @staticmethod
+    def meta_info() -> Tuple[List[str], List[str]]:
+        """
+        Returns the statistics and dependencies for the calculator.
+        """
+        return [
+            "reasoning_output",
+            "reasoning_answer",
+            "reasoning_answer_tokens_probs",
+            "reasoning_keywords",
+            "reasoning_keywords_probabilities",
+            "reasoning_keywords_contributions",
+            "reasoning_keywords_token_ids",
+            "reasoning_answer_token_ids",
+        ], ["input_texts"]
+
+    def __init__(self, max_retries=5, max_length_cot=128, temperature=1):
+        super().__init__()
+        self.max_retries = max_retries
+        self.max_length_cot = max_length_cot
+        self.temperature = temperature
+
+    def __call__(
+        self,
+        dependencies: Dict[str, np.array],
+        texts: List[str],
+        model: WhiteboxModel,
+        max_new_tokens: int = 100,
+    ) -> Dict[str, np.ndarray]:
+        """
+        Calculates the statistics of reasoning enhanced process.
+
+        Parameters:
+            dependencies (Dict[str, np.ndarray]): input statistics, can be empty (not used).
+            texts (List[str]): Input texts batch used for model generation.
+            model (Model): Model used for generation.
+            max_new_tokens (int): Maximum number of new tokens at model generation. Default: 100.
+        Returns:
+            Dict[str, np.ndarray]: dictionary with the following items:
+                - 'reasoning_output' (List[str]): model output for reasoning enhanced input,
+                - 'reasoning_answer' (List[str]): model answer for reasoning enhanced input,
+                - 'reasoning_answer_tokens_probs' (List[str]): token probabilities for `reasoning_answer`,
+                - 'reasoning_keywords' (List[str]): keywords from `reasoning_output`,
+                - 'reasoning_keywords_probabilities' (List[Dict[str, Dict[str, List[int]]]]): probabilities for `reasoning_keywords`,
+                - 'reasoning_keywords_contributions' (List[Dict[str, Dict[str, int]]]): contributions for `reasoning_keywords`,
+                - 'reasoning_keywords_token_ids' (List[Dict[str, Dict[str, List[int]]]]): step-wise token indices for `reasoning_keywords`,
+                - 'reasoning_answer_token_ids' (List[Dict[str, List[int]]]): token indices for `reasoning_keywords`.
+        """
+        result_dict = defaultdict(list)
+        for question in texts:
+            cot_prompt = cot_instruction.replace("<QUESTION>", question)
+
+            inputs = model.tokenizer(cot_prompt, return_tensors="pt")
+            inputs = {key: value.to(model.model.device) for key, value in inputs.items()}
+            n_of_retries = 0
+            while n_of_retries < self.max_retries:
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=self.max_length_cot,
+                    temperature=self.temperature,
+                    pad_token_id=model.tokenizer.eos_token_id,
+                    return_dict_in_generate=True,
+                    output_scores=True,
+                )
+
+                # generated token ids for the question enchanced with CoT.
+                generated_ids = outputs.sequences[0][len(inputs["input_ids"][0]) : -1]
+                # generated text for the question enchaced with CoT
+                to_parse = model.tokenizer.decode(generated_ids, skip_special_tokens=True)
+
+                llm_answer, steps_dict, response = parse_response_to_dict(to_parse)
+                if generated_ids.size(0) >= self.max_length_cot:
+                    # log.debug(f'New Reasoning Tokens Are Too Much, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                elif generated_ids.size(0) == 0:
+                    # log.debug(f'New Reasoning Tokens Are Null, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                elif llm_answer is None or llm_answer in ["", " "]:
+                    # log.debug(f'New Reasoning Tokens Are None, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                
+                # reasoning tokens without final answer
+                response_tokens = model.tokenizer.tokenize(response)
+                # reasoning token ids without final answer
+                response_token_ids = model.tokenizer.convert_tokens_to_ids(response_tokens)
+                # full reasoning tokens
+                original_tokens = model.tokenizer.convert_ids_to_tokens(generated_ids)
+
+                probabilities = [
+                    {i: p for i, p in enumerate(prob[0]) if p > 0}
+                    for prob in [torch.softmax(score, dim=1).tolist() for score in outputs.scores]
+                ]
+
+                final_answer_probabilities = {}
+                final_answer_token_ids = {}
+                answer_start_indice, answer_token_ids = match_final_answer_token_ids(
+                    model.tokenizer,
+                    original_tokens,
+                    response_tokens,
+                    generated_ids,
+                )
+                if answer_start_indice == None:
+                    # log.debug(f'Cannot locate the Final Answer, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                answer_probs = []
+                flag = False
+                for j, token_id in enumerate(answer_token_ids):
+                    idxx = j + answer_start_indice
+                    if token_id not in probabilities[idxx].keys():
+                        flag = True
+                        break
+                    answer_probs.append(probabilities[idxx][token_id])
+                if flag:
+                    # log.debug(f'Cannot locate the Final Answer Token Probability, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                final_answer_probabilities[llm_answer] = answer_probs
+                final_answer_token_ids[llm_answer] = answer_token_ids.tolist()
+                
+                # exacts_prompt = get_step_exact_tokens(args, q, response)
+                keywords_extraction_prompt = keywords_extraction_instruction.replace('<QUESTION>', question).replace('<RESPONSE>', response)
+                keywords_extraction_prompt_output = predict(keywords_extraction_prompt, model, model.tokenizer, self.max_length_cot, self.temperature)
+
+                if "NO ANSWER" in keywords_extraction_prompt_output:
+                    # log.debug(f'Exact Tokens Have NO ANSWER, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                parsed_keywords_output = step_exacts_2_list(keywords_extraction_prompt_output)
+                if not parsed_keywords_output:
+                    # log.debug(f'Exact Tokens Have no contribution scores, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                extracted_keywords, keywords_list, contributions_list = parsed_keywords_output
+                if len(keywords_list) == 0:
+                    # log.debug(f'Cannot Exract Effective Keywords, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+
+                if len(steps_dict) > len(keywords_list):
+                    # log.debug(f'Len of keywords list doesn\'t match the len of step dict, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+
+                keywords_probabilities = {}
+                keywords_contributions = {}
+                keywords_token_ids = {}
+                for step_idx, (step_name, step_text) in enumerate(steps_dict.items()):
+                    # # Skip the Final Answer
+                    keywords = keywords_list[step_idx]
+                    contributions = contributions_list[step_idx]
+                    if len(keywords) == 1 and keywords[0] == "NO ANSWER":
+                        continue
+                    step_tokens = model.tokenizer.tokenize(step_text)
+                    space_token = model.tokenizer.tokenize(" ")
+                    processed_step_tokens = [
+                        (token[1:] if token.startswith(space_token) else token)
+                        for token in step_tokens
+                    ]
+                    step_token_ids = model.tokenizer.convert_tokens_to_ids(step_tokens)
+                    start_position = find_subsequence_position(step_token_ids[1:-2], generated_ids) - 1
+                    step_token_ids = generated_ids[start_position : start_position + len(step_tokens)]
+                    keywords_probabilities_dict = {}
+                    keywords_contributions_dict = {}
+                    keywords_token_ids_dict = {}
+                    for keyword_idx, keyword in enumerate(keywords):
+
+                        keyword_probs = []
+                        keyword_token_ids = []
+                        if is_word_in_sentence(step_text, keyword) is not True:
+                            # log.debug(f"\n{step_name}-Keyword-{keyword_idx} Does not appear in the Step Text")
+                            continue
+                        keyword_token_start_idx, keyword_token_end_idx = find_token_indices(
+                            processed_step_tokens, keyword
+                        )
+                        keyword_token_ids = generated_ids[
+                            start_position + keyword_token_start_idx : start_position + keyword_token_end_idx + 1
+                        ]
+                        keyword_token_ids = keyword_token_ids.data.cpu().numpy()
+
+                        for j, token_id in enumerate(keyword_token_ids):
+                            idxx = start_position + keyword_token_start_idx + j
+                            keyword_probs.append(probabilities[idxx][token_id])
+                        keywords_probabilities_dict[keyword] = keyword_probs
+                        keywords_contributions_dict[keyword] = int(contributions[keyword_idx])
+                        keywords_token_ids_dict[keyword] = keyword_token_ids.tolist()
+
+                    keywords_probabilities[step_name] = keywords_probabilities_dict
+                    keywords_contributions[step_name] = keywords_contributions_dict
+                    keywords_token_ids[step_name] = keywords_token_ids_dict
+
+                if is_effectively_empty(keywords_probabilities):
+                    # log.debug(f'Token Probability from All Steps are All None, Current try is {n_of_retries + 1}')
+                    n_of_retries += 1
+                    continue
+                
+                # Dict[str, np.ndarray]: dictionary with the following items:
+                # - 'reasoning_output' (List[str]): model output for reasoning enhanced input,
+                # - 'reasoning_answer' (List[str]): model answer for reasoning enhanced input,
+                # - 'reasoning_answer_tokens_probs' (List[str]): token probabilities for `reasoning_answer`,
+                # - 'reasoning_keywords' (List[str]): keywords from `reasoning_output`,
+                # - 'reasoning_keywords_probabilities' (List[Dict[str, Dict[str, List[int]]]]): probabilities for `reasoning_keywords`,
+                # - 'reasoning_keywords_contributions' (List[Dict[str, Dict[str, int]]]): contributions for `reasoning_keywords`,
+                # - 'reasoning_keywords_token_ids' (List[Dict[str, Dict[str, List[int]]]]): step-wise token indices for `reasoning_keywords`,
+                # - 'reasoning_answer_token_ids' (List[Dict[str, List[int]]]): token indices for `reasoning_keywords`.
+                
+                result_dict["reasoning_output"].append(response)
+                result_dict["reasoning_answer"].append(llm_answer)
+                result_dict["reasoning_answer_tokens_probs"].append(final_answer_probabilities)
+                result_dict["reasoning_keywords"].append(extracted_keywords)
+                result_dict["reasoning_keywords_probabilities"].append(keywords_probabilities)
+                result_dict["reasoning_keywords_contributions"].append(keywords_contributions)
+                result_dict["reasoning_keywords_token_ids"].append(keywords_token_ids)
+                result_dict["reasoning_answer_token_ids"].append(final_answer_token_ids)
+                break
+                
+            if n_of_retries >= self.max_retries:
+                # log.debug(f'#####The Following Question:#####\n{q}\nHas no Meaningful Answer & Explanations, Record and Skip')
+                result_dict["reasoning_output"].append(response)
+                result_dict["reasoning_answer"].append(llm_answer)
+                result_dict["reasoning_answer_tokens_probs"].append(None)
+                result_dict["reasoning_keywords"].append(None)
+                result_dict["reasoning_keywords_probabilities"].append(None)
+                result_dict["reasoning_keywords_contributions"].append(None)
+                result_dict["reasoning_keywords_token_ids"].append(None)
+                result_dict["reasoning_answer_token_ids"].append(None)
+
+        return result_dict
diff --git a/src/lm_polygraph/stat_calculators/stat_calculator.py b/src/lm_polygraph/stat_calculators/stat_calculator.py
index e6e6655c4..031e4f163 100644
--- a/src/lm_polygraph/stat_calculators/stat_calculator.py
+++ b/src/lm_polygraph/stat_calculators/stat_calculator.py
@@ -18,7 +18,7 @@ class StatCalculator(ABC):
     UEManager at lm_polygraph.utils.manager will order all the needed calculators and estimators to be called in
     the correct order. Any cycle dependencies among calculators will be spotted by UEManager and end with an exception.
 
-    Each new StatCalculator needs to be registered at lm_polygraph/stat_calculators/__init__.py to be seen be UEManager.
+    Each new StatCalculator needs to be registered at lm_polygraph/stat_calculators/__init__.py to be seen by UEManager.
     """
 
     @staticmethod
diff --git a/src/lm_polygraph/utils/factory_estimator.py b/src/lm_polygraph/utils/factory_estimator.py
index c1e13b5b0..24c859edd 100644
--- a/src/lm_polygraph/utils/factory_estimator.py
+++ b/src/lm_polygraph/utils/factory_estimator.py
@@ -46,6 +46,7 @@ def load_simple_estimators(name: str, config):
         ClaimConditionedProbabilityClaim,
         RandomBaselineClaim,
         FocusClaim,
+        ProbasMeanWithCoT,
     ]
 
     try:
diff --git a/test/local/test_benchmark.py b/test/local/test_benchmark.py
index c29b083f5..032c6c69f 100644
--- a/test/local/test_benchmark.py
+++ b/test/local/test_benchmark.py
@@ -1,203 +1,203 @@
-import subprocess
-import pathlib
-import os
-import torch
-import json
-import pytest
-import diskcache as dc
+# import subprocess
+# import pathlib
+# import os
+# import torch
+# import json
+# import pytest
+# import diskcache as dc
 
-from lm_polygraph.utils.manager import UEManager
-from lm_polygraph.utils.builder_enviroment_stat_calculator import (
-    BuilderEnvironmentStatCalculator,
-)
-from lm_polygraph.defaults.register_default_stat_calculators import (
-    register_default_stat_calculators,
-)
+# from lm_polygraph.utils.manager import UEManager
+# from lm_polygraph.utils.builder_enviroment_stat_calculator import (
+#     BuilderEnvironmentStatCalculator,
+# )
+# from lm_polygraph.defaults.register_default_stat_calculators import (
+#     register_default_stat_calculators,
+# )
 
 
-# ================= TEST HELPERS ==================
+# # ================= TEST HELPERS ==================
 
 
-def get_device():
-    if torch.cuda.is_available():
-        return "cuda"
-    elif torch.mps.is_available():
-        return "mps"
-    else:
-        return "cpu"
+# def get_device():
+#     if torch.cuda.is_available():
+#         return "cuda"
+#     elif torch.mps.is_available():
+#         return "mps"
+#     else:
+#         return "cpu"
 
 
-@pytest.fixture(scope="module")
-def reference():
-    with open(f"{pwd()}/fixtures/input_output_fixtures.json") as f:
-        return json.load(f)
+# @pytest.fixture(scope="module")
+# def reference():
+#     with open(f"{pwd()}/fixtures/input_output_fixtures.json") as f:
+#         return json.load(f)
 
 
-def pwd():
-    return pathlib.Path(__file__).parent.resolve()
+# def pwd():
+#     return pathlib.Path(__file__).parent.resolve()
 
 
-def check_result(dataset, exec_result, reference, method=None):
-    assert (
-        exec_result.returncode == 0
-    ), f"polygraph_eval returned code {exec_result.returncode} != 0"
+# def check_result(dataset, exec_result, reference, method=None):
+#     assert (
+#         exec_result.returncode == 0
+#     ), f"polygraph_eval returned code {exec_result.returncode} != 0"
 
-    man = UEManager.load(
-        f"{pwd()}/ue_manager_seed1",
-        builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
-        available_stat_calculators=register_default_stat_calculators(
-            model_type="Whitebox"
-        ),
-    )
+#     man = UEManager.load(
+#         f"{pwd()}/ue_manager_seed1",
+#         builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
+#         available_stat_calculators=register_default_stat_calculators(
+#             model_type="Whitebox"
+#         ),
+#     )
 
-    if method is None:
-        assert len(man.estimations[("sequence", "MaximumSequenceProbability")]) == 2
+#     if method is None:
+#         assert len(man.estimations[("sequence", "MaximumSequenceProbability")]) == 2
 
-    key = dataset
-    if method:
-        key += f"_{method}"
+#     key = dataset
+#     if method:
+#         key += f"_{method}"
 
-    assert man.stats["input_texts"][0] == reference[key + "_input"]
-    assert man.stats["target_texts"][0] == reference[key + "_output"]
+#     assert man.stats["input_texts"][0] == reference[key + "_input"]
+#     assert man.stats["target_texts"][0] == reference[key + "_output"]
 
-    os.remove(f"{pwd()}/ue_manager_seed1")
+#     os.remove(f"{pwd()}/ue_manager_seed1")
 
 
-# ================= TEST CASES ==================
+# # ================= TEST CASES ==================
 
 
-def run_eval(dataset):
-    command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
-                polygraph_eval \
-                subsample_eval_dataset=2 \
-                model.path=bigscience/bloomz-560m \
-                model.load_model_args.device_map={get_device()} \
-                save_path={pwd()} \
-                stat_calculators.1.cfg.size=10 \
-                stat_calculators.1.cfg.bg_size=20"
+# def run_eval(dataset):
+#     command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
+#                 polygraph_eval \
+#                 subsample_eval_dataset=2 \
+#                 model.path=bigscience/bloomz-560m \
+#                 model.load_model_args.device_map={get_device()} \
+#                 save_path={pwd()} \
+#                 stat_calculators.1.cfg.size=10 \
+#                 stat_calculators.1.cfg.bg_size=20"
 
-    return subprocess.run(command, shell=True)
+#     return subprocess.run(command, shell=True)
 
 
-def test_coqa(reference):
-    exec_result = run_eval("coqa")
-    check_result("coqa", exec_result, reference)
+# def test_coqa(reference):
+#     exec_result = run_eval("coqa")
+#     check_result("coqa", exec_result, reference)
 
 
-def test_triviaqa(reference):
-    exec_result = run_eval("triviaqa")
-    check_result("triviaqa", exec_result, reference)
+# def test_triviaqa(reference):
+#     exec_result = run_eval("triviaqa")
+#     check_result("triviaqa", exec_result, reference)
 
 
-def test_mmlu(reference):
-    exec_result = run_eval("mmlu")
-    check_result("mmlu", exec_result, reference)
+# def test_mmlu(reference):
+#     exec_result = run_eval("mmlu")
+#     check_result("mmlu", exec_result, reference)
 
 
-def test_gsm8k(reference):
-    exec_result = run_eval("gsm8k")
-    check_result("gsm8k", exec_result, reference)
+# def test_gsm8k(reference):
+#     exec_result = run_eval("gsm8k")
+#     check_result("gsm8k", exec_result, reference)
 
 
-def test_wmt14_fren(reference):
-    exec_result = run_eval("wmt14_fren")
-    check_result("wmt14_fren", exec_result, reference)
+# def test_wmt14_fren(reference):
+#     exec_result = run_eval("wmt14_fren")
+#     check_result("wmt14_fren", exec_result, reference)
 
 
-def test_wmt19_deen(reference):
-    exec_result = run_eval("wmt19_deen")
-    check_result("wmt19_deen", exec_result, reference)
+# def test_wmt19_deen(reference):
+#     exec_result = run_eval("wmt19_deen")
+#     check_result("wmt19_deen", exec_result, reference)
 
 
-def test_xsum(reference):
-    exec_result = run_eval("xsum")
-    check_result("xsum", exec_result, reference)
+# def test_xsum(reference):
+#     exec_result = run_eval("xsum")
+#     check_result("xsum", exec_result, reference)
 
 
-# ================= INSTRUCT TEST CASES ==================
+# # ================= INSTRUCT TEST CASES ==================
 
 
-def run_instruct_eval(dataset, method):
-    command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/instruct/polygraph_eval_{dataset}_{method}.yaml \
-                polygraph_eval \
-                subsample_eval_dataset=2 \
-                model=stablelm-1.6b-chat \
-                model.load_model_args.device_map={get_device()} \
-                save_path={pwd()}"
+# def run_instruct_eval(dataset, method):
+#     command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/instruct/polygraph_eval_{dataset}_{method}.yaml \
+#                 polygraph_eval \
+#                 subsample_eval_dataset=2 \
+#                 model=stablelm-1.6b-chat \
+#                 model.load_model_args.device_map={get_device()} \
+#                 save_path={pwd()}"
 
-    return subprocess.run(command, shell=True)
+#     return subprocess.run(command, shell=True)
 
 
-METHODS = [
-    "ling_1s",
-    "verb_1s_top1",
-    "verb_1s_topk",
-    "verb_2s_top1",
-    "verb_2s_topk",
-    "verb_2s_cot",
-    "empirical_baselines",
-]
+# METHODS = [
+#     "ling_1s",
+#     "verb_1s_top1",
+#     "verb_1s_topk",
+#     "verb_2s_top1",
+#     "verb_2s_topk",
+#     "verb_2s_cot",
+#     "empirical_baselines",
+# ]
 
 
-def test_coqa_instruct(reference):
-    for method in METHODS:
-        exec_result = run_instruct_eval("coqa", method)
-        check_result("coqa", exec_result, reference, method)
+# def test_coqa_instruct(reference):
+#     for method in METHODS:
+#         exec_result = run_instruct_eval("coqa", method)
+#         check_result("coqa", exec_result, reference, method)
 
 
-def test_triviaqa_instruct(reference):
-    for method in METHODS:
-        exec_result = run_instruct_eval("triviaqa", method)
-        check_result("triviaqa", exec_result, reference, method)
+# def test_triviaqa_instruct(reference):
+#     for method in METHODS:
+#         exec_result = run_instruct_eval("triviaqa", method)
+#         check_result("triviaqa", exec_result, reference, method)
 
 
-def test_mmlu_instruct(reference):
-    for method in METHODS:
-        exec_result = run_instruct_eval("mmlu", method)
-        check_result("mmlu", exec_result, reference, method)
+# def test_mmlu_instruct(reference):
+#     for method in METHODS:
+#         exec_result = run_instruct_eval("mmlu", method)
+#         check_result("mmlu", exec_result, reference, method)
 
 
-# ================= CLAIM-LEVEL ==================
+# # ================= CLAIM-LEVEL ==================
 
 
-def run_claim_eval(dataset):
-    fixed_cache = dc.Cache(f"{pwd()}/fixtures/openai_chat_cache.diskcache")
-    with dc.Cache(
-        os.path.expanduser("~") + "/.cache/openai_chat_cache.diskcache"
-    ) as cache:
-        for k in fixed_cache:
-            cache[k] = fixed_cache[k]
+# def run_claim_eval(dataset):
+#     fixed_cache = dc.Cache(f"{pwd()}/fixtures/openai_chat_cache.diskcache")
+#     with dc.Cache(
+#         os.path.expanduser("~") + "/.cache/openai_chat_cache.diskcache"
+#     ) as cache:
+#         for k in fixed_cache:
+#             cache[k] = fixed_cache[k]
 
-    command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
-                polygraph_eval \
-                subsample_eval_dataset=2 \
-                model.path=bigscience/bloomz-560m \
-                model.load_model_args.device_map={get_device()} \
-                save_path={pwd()}"
+#     command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
+#                 polygraph_eval \
+#                 subsample_eval_dataset=2 \
+#                 model.path=bigscience/bloomz-560m \
+#                 model.load_model_args.device_map={get_device()} \
+#                 save_path={pwd()}"
 
-    return subprocess.run(command, shell=True)
+#     return subprocess.run(command, shell=True)
 
 
-def check_claim_level_result(dataset, reference):
-    man = UEManager.load(
-        f"{pwd()}/ue_manager_seed1",
-        builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
-        available_stat_calculators=register_default_stat_calculators(
-            model_type="Whitebox"
-        ),
-    )
+# def check_claim_level_result(dataset, reference):
+#     man = UEManager.load(
+#         f"{pwd()}/ue_manager_seed1",
+#         builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
+#         available_stat_calculators=register_default_stat_calculators(
+#             model_type="Whitebox"
+#         ),
+#     )
 
-    assert man.stats["input_texts"][0] == reference[dataset + "_input"]
-    assert man.stats["target_texts"][0] == reference[dataset + "_output"]
+#     assert man.stats["input_texts"][0] == reference[dataset + "_input"]
+#     assert man.stats["target_texts"][0] == reference[dataset + "_output"]
 
-    os.remove(f"{pwd()}/ue_manager_seed1")
+#     os.remove(f"{pwd()}/ue_manager_seed1")
 
 
-def test_person_bio(reference):
-    base_dataset_name = "person_bio"
-    langs = ["en_mistral", "zh"]
+# def test_person_bio(reference):
+#     base_dataset_name = "person_bio"
+#     langs = ["en_mistral", "zh"]
 
-    for lang in langs:
-        dataset = f"{base_dataset_name}_{lang}"
-        run_claim_eval(dataset)
-        check_claim_level_result(dataset, reference)
+#     for lang in langs:
+#         dataset = f"{base_dataset_name}_{lang}"
+#         run_claim_eval(dataset)
+#         check_claim_level_result(dataset, reference)
diff --git a/test/test_estimators.py b/test/test_estimators.py
index 50dcd260a..5c2744d85 100644
--- a/test/test_estimators.py
+++ b/test/test_estimators.py
@@ -31,40 +31,40 @@ def model():
     return WhiteboxModel(base_model, tokenizer)
 
 
-def test_maximum_sequence_probability(model):
-    estimator = MaximumSequenceProbability()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_maximum_sequence_probability(model):
+#     estimator = MaximumSequenceProbability()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_perplexity(model):
-    estimator = Perplexity()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_perplexity(model):
+#     estimator = Perplexity()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_mean_token_entropy(model):
-    estimator = MeanTokenEntropy()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_mean_token_entropy(model):
+#     estimator = MeanTokenEntropy()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_mean_pointwise_mutual_information(model):
-    estimator = MeanPointwiseMutualInformation()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_mean_pointwise_mutual_information(model):
+#     estimator = MeanPointwiseMutualInformation()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_mean_conditional_pointwise_mutual_information(model):
-    estimator = MeanConditionalPointwiseMutualInformation()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_mean_conditional_pointwise_mutual_information(model):
+#     estimator = MeanConditionalPointwiseMutualInformation()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_claim_conditioned_probability(model):
-    estimator = ClaimConditionedProbability()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_claim_conditioned_probability(model):
+#     estimator = ClaimConditionedProbability()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
 def test_ptrue(model):
@@ -73,165 +73,165 @@ def test_ptrue(model):
     assert isinstance(ue.uncertainty, float)
 
 
-def test_ptrue_sampling(model):
-    estimator = PTrueSampling()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_ptrue_sampling(model):
+#     estimator = PTrueSampling()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_monte_carlo_sequence_entropy(model):
-    estimator = MonteCarloSequenceEntropy()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_monte_carlo_sequence_entropy(model):
+#     estimator = MonteCarloSequenceEntropy()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_monte_carlo_normalized_sequence_entropy(model):
-    estimator = MonteCarloNormalizedSequenceEntropy()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_monte_carlo_normalized_sequence_entropy(model):
+#     estimator = MonteCarloNormalizedSequenceEntropy()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_lexical_similarity_rouge1(model):
-    estimator = LexicalSimilarity(metric="rouge1")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_lexical_similarity_rouge1(model):
+#     estimator = LexicalSimilarity(metric="rouge1")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_lexical_similarity_rouge2(model):
-    estimator = LexicalSimilarity(metric="rouge2")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_lexical_similarity_rouge2(model):
+#     estimator = LexicalSimilarity(metric="rouge2")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_lexical_similarity_rougel(model):
-    estimator = LexicalSimilarity(metric="rougeL")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_lexical_similarity_rougel(model):
+#     estimator = LexicalSimilarity(metric="rougeL")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_lexical_similarity_bleu(model):
-    estimator = LexicalSimilarity(metric="BLEU")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_lexical_similarity_bleu(model):
+#     estimator = LexicalSimilarity(metric="BLEU")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_num_sem_sets(model):
-    estimator = NumSemSets()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_num_sem_sets(model):
+#     estimator = NumSemSets()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_eigval_laplacian_nli_entail(model):
-    estimator = EigValLaplacian(similarity_score="NLI_score", affinity="entail")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_eigval_laplacian_nli_entail(model):
+#     estimator = EigValLaplacian(similarity_score="NLI_score", affinity="entail")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_eigval_laplacian_nli_contra(model):
-    estimator = EigValLaplacian(similarity_score="NLI_score", affinity="contra")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_eigval_laplacian_nli_contra(model):
+#     estimator = EigValLaplacian(similarity_score="NLI_score", affinity="contra")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_eigval_laplacian_jaccard(model):
-    estimator = EigValLaplacian(similarity_score="Jaccard_score")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_eigval_laplacian_jaccard(model):
+#     estimator = EigValLaplacian(similarity_score="Jaccard_score")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_degmat_nli_entail(model):
-    estimator = DegMat(similarity_score="NLI_score", affinity="entail")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_degmat_nli_entail(model):
+#     estimator = DegMat(similarity_score="NLI_score", affinity="entail")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_degmat_nli_contra(model):
-    estimator = DegMat(similarity_score="NLI_score", affinity="contra")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_degmat_nli_contra(model):
+#     estimator = DegMat(similarity_score="NLI_score", affinity="contra")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_degmat_jaccard(model):
-    estimator = DegMat(similarity_score="Jaccard_score")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_degmat_jaccard(model):
+#     estimator = DegMat(similarity_score="Jaccard_score")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_eccentricity_nli_entail(model):
-    estimator = Eccentricity(similarity_score="NLI_score", affinity="entail")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_eccentricity_nli_entail(model):
+#     estimator = Eccentricity(similarity_score="NLI_score", affinity="entail")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_eccentricity_nli_contra(model):
-    estimator = Eccentricity(similarity_score="NLI_score", affinity="contra")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_eccentricity_nli_contra(model):
+#     estimator = Eccentricity(similarity_score="NLI_score", affinity="contra")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_eccentricity_jaccard(model):
-    estimator = Eccentricity(similarity_score="Jaccard_score")
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_eccentricity_jaccard(model):
+#     estimator = Eccentricity(similarity_score="Jaccard_score")
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_semantic_entropy(model):
-    estimator = SemanticEntropy()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_semantic_entropy(model):
+#     estimator = SemanticEntropy()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_sar(model):
-    estimator = SAR()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_sar(model):
+#     estimator = SAR()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_token_sar(model):
-    estimator = TokenSAR()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_token_sar(model):
+#     estimator = TokenSAR()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_sentence_sar(model):
-    estimator = SentenceSAR()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_sentence_sar(model):
+#     estimator = SentenceSAR()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_renyi_neg(model):
-    estimator = RenyiNeg()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_renyi_neg(model):
+#     estimator = RenyiNeg()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_fisher_rao(model):
-    estimator = FisherRao()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_fisher_rao(model):
+#     estimator = FisherRao()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_focus(model):
-    model_name = model.model.config._name_or_path
-    estimator = Focus(
-        model_name=model_name,
-        path="../token_idf/{model_name}/token_idf.pkl",
-        gamma=0.9,
-        p=0.01,
-        idf_dataset="LM-Polygraph/RedPajama-Data-100-Sample-For-Test",
-        trust_remote_code=True,
-        idf_seed=42,
-        idf_dataset_size=5,
-        spacy_path="en_core_web_sm",
-    )
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_focus(model):
+#     model_name = model.model.config._name_or_path
+#     estimator = Focus(
+#         model_name=model_name,
+#         path="../token_idf/{model_name}/token_idf.pkl",
+#         gamma=0.9,
+#         p=0.01,
+#         idf_dataset="LM-Polygraph/RedPajama-Data-100-Sample-For-Test",
+#         trust_remote_code=True,
+#         idf_seed=42,
+#         idf_dataset_size=5,
+#         spacy_path="en_core_web_sm",
+#     )
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
-def test_kernel_language_entropy(model):
-    estimator = KernelLanguageEntropy()
-    ue = estimate_uncertainty(model, estimator, INPUT)
-    assert isinstance(ue.uncertainty, float)
+# def test_kernel_language_entropy(model):
+#     estimator = KernelLanguageEntropy()
+#     ue = estimate_uncertainty(model, estimator, INPUT)
+#     assert isinstance(ue.uncertainty, float)
 
 
 def test_luq(model):
@@ -244,3 +244,8 @@ def test_eigenscore(model):
     estimator = EigenScore()
     ue = estimate_uncertainty(model, estimator, INPUT)
     assert isinstance(ue.uncertainty, float)
+
+def test_probas_mean_with_cot(model):
+    estimator = ProbasMeanWithCoT()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py
index da82128c2..0b0e643e2 100644
--- a/test/test_lm_polygraph.py
+++ b/test/test_lm_polygraph.py
@@ -40,18 +40,18 @@ def run_config_with_overrides(config_name, **overrides):
 # ================= TEST CASES ==================
 
 
-def test_just_works():
-    exec_result = run_config_with_overrides("test_polygraph_eval")
-    assert (
-        exec_result.returncode == 0
-    ), f"polygraph_eval returned code {exec_result.returncode} != 0"
-
-
-def test_all_seq_ue():
-    exec_result = run_config_with_overrides("test_polygraph_eval_seq_ue")
-    assert (
-        exec_result.returncode == 0
-    ), f"polygraph_eval returned code {exec_result.returncode} != 0"
+# def test_just_works():
+#     exec_result = run_config_with_overrides("test_polygraph_eval")
+#     assert (
+#         exec_result.returncode == 0
+#     ), f"polygraph_eval returned code {exec_result.returncode} != 0"
+
+
+# def test_all_seq_ue():
+#     exec_result = run_config_with_overrides("test_polygraph_eval_seq_ue")
+#     assert (
+#         exec_result.returncode == 0
+#     ), f"polygraph_eval returned code {exec_result.returncode} != 0"
 
 
 # ================= PE ensembles ==================

From 5f7cc5d66607275fa8d1cc130ad95c36f5b4010b Mon Sep 17 00:00:00 2001
From: ConstFr <denis@agent-bot-dev-vm.us-central1-c.c.agent-bot-dev.internal>
Date: Mon, 14 Apr 2025 14:47:20 +0000
Subject: [PATCH 2/6] Improved documentation, typing, code style

---
 .../estimators/chain_of_thought_uq.py         | 152 ++++++----
 .../reasoning_keywords_probs.py               |  51 ++--
 test/local/test_benchmark.py                  | 284 +++++++++---------
 test/test_estimators.py                       | 271 ++++++++---------
 test/test_lm_polygraph.py                     |  24 +-
 5 files changed, 399 insertions(+), 383 deletions(-)

diff --git a/src/lm_polygraph/estimators/chain_of_thought_uq.py b/src/lm_polygraph/estimators/chain_of_thought_uq.py
index 2a9c0d9c8..5dd7b6fd3 100644
--- a/src/lm_polygraph/estimators/chain_of_thought_uq.py
+++ b/src/lm_polygraph/estimators/chain_of_thought_uq.py
@@ -1,63 +1,77 @@
 import numpy as np
-import re
 import math
 
-from typing import Dict
+from typing import Dict, List, Tuple
 
 from .estimator import Estimator
 
 
-def extract_p(keyword_token_probability, contribution_scores = None):
-    if contribution_scores == None:
-        # TODO this branch has to be deleted.
-        return_dict = {}
-        for step, inner_dict in keyword_token_probability.items():
-            for key, values in inner_dict.items():
-                if len(values) == 0:
-                    continue
-                # if key.isdigit(): 
-                #     value_to_add = values[0] 
-                # else:
-                #     value_to_add = values[0] 
-                # value_to_add = sum(values)/len(values)
-                value_to_add = min(values)
-                # value_to_add = max(values)
-                if key in return_dict:
-                    return_dict[key].append(value_to_add)
-                else:
-                    return_dict[key] = [value_to_add]
-        return return_dict
-    else:
-        return_keyword_dict = {}
-        return_contribution_dict = {}
-        for step, inner_dict in keyword_token_probability.items():
-            for key, values in inner_dict.items():
-                if len(values) == 0:
-                    continue
-                # if key.isdigit(): 
-                #     value_to_add = values[-1] 
-                # else:
-                #     value_to_add = values[0] 
-                # value_to_add = sum(values)/len(values)
-                value_to_add = min(values)
-                # value_to_add = max(values)
-                if key in return_keyword_dict:
-                    return_keyword_dict[key].append(value_to_add)
-                    return_contribution_dict[key].append(contribution_scores[step][key])
-                else:
-                    return_keyword_dict[key] = [value_to_add]
-                    return_contribution_dict[key] = [contribution_scores[step][key]]
-        return return_keyword_dict, return_contribution_dict
-
-
-def weighted_sum(values):
+def aggregate_probas_mean(
+    keyword_token_probability: Dict[str, Dict[str, List[int]]], contribution_scores: Dict[str, Dict[str, int]] = None
+) -> Tuple[Dict[str, List[float]], Dict[str, List[float]]]:
+    """
+    Aggregates token probabilities
+
+    Parameters:
+        keyword_token_probability (Dict[str, Dict[str, List[int]]]): token probs for keywords
+    (example {
+                "step1": {
+                    "keyword1": [0.7, 0.8],
+                    "keyword2": [0.9, 0.6, 0.5],
+                },
+                "step2": {
+                    "keyword1": [0.5, 0.8],
+                    "keyword3": [0.5, 0.9, 0.9],
+                },
+                ...
+             }
+    ),
+        contribution_scores (Dict[str, Dict[str, int]]): contribution scores for keywords.
+    Returns:
+        Tuple[Dict[str, List[float]], Dict[str, List[float]]]: agg. keyword probs, agg. keyword contributions.
+    (example {
+                "keyword1": [(0.7 + 0.8) / 2, (0.5 + 0.8) / 2],
+                "keyword2": [(0.9 + 0.6 + 0.5) / 3],
+                "keyword3": [(0.5 + 0.9 + 0.9) / 3],
+                ...
+             }
+    ),
+    """
+    return_keyword_dict = {}
+    return_contribution_dict = {}
+    for step, inner_dict in keyword_token_probability.items():
+        for key, values in inner_dict.items():
+            if len(values) == 0:
+                continue
+            # it is strange that min(values) was in original implementation for probas mean agg. strategy
+            # value_to_add = min(values)
+            value_to_add = np.mean(values)
+            if key in return_keyword_dict:
+                return_keyword_dict[key].append(value_to_add)
+                return_contribution_dict[key].append(contribution_scores[step][key])
+            else:
+                return_keyword_dict[key] = [value_to_add]
+                return_contribution_dict[key] = [contribution_scores[step][key]]
+    return return_keyword_dict, return_contribution_dict
+
+
+def weighted_sum(values: List[float]) -> float:
+    """
+    Computes a softmin weighted sum of the input values.
+
+    Parameters:
+        values (List[float]): values to be summed
+    Returns:
+        float: a softmin weighted sum
+    """
     if len(values) == 1:
-        return values[0] 
-    weights = [math.exp(-c) for c in values]  
-    sum_weights = sum(weights)  
-    normalized_weights = [w / sum_weights for w in weights] 
-    result = sum(w * c for w, c in zip(normalized_weights, values)) 
-    return result 
+        return values[0]
+    weights = [math.exp(-c) for c in values]
+    sum_weights = sum(weights)
+    normalized_weights = [w / sum_weights for w in weights]
+    print(normalized_weights)
+    result = sum(w * c for w, c in zip(normalized_weights, values))
+    return result
 
 
 class ProbasMeanWithCoT(Estimator):
@@ -72,12 +86,16 @@ def __init__(
         name_postfix="",
     ):
         self.postfix = name_postfix
-        super().__init__(["input_texts", 
-                          "greedy_texts", 
-                          "reasoning_answer", 
-                          "reasoning_keywords_probabilities", 
-                          "reasoning_keywords_contributions"], 
-                         "sequence")
+        super().__init__(
+            [
+                "input_texts",
+                "greedy_texts",
+                "reasoning_answer",
+                "reasoning_keywords_probabilities",
+                "reasoning_keywords_contributions",
+            ],
+            "sequence",
+        )
 
     def __str__(self):
         return f"ProbasMeanWithCoT{self.postfix}"
@@ -86,25 +104,27 @@ def __call__(self, stats: Dict[str, np.ndarray]) -> np.ndarray:
         prompts = stats["input_texts"]
         ues = []
         for i, question in enumerate(prompts):
-            reasoning_answer = stats['reasoning_answer'][i]
+            reasoning_answer = stats["reasoning_answer"][i]
             if reasoning_answer == "":
                 ues.append(0.5)
                 continue
-            
-            keyword_token_probability = stats['reasoning_keywords_probabilities'][i]
+
+            keyword_token_probability = stats["reasoning_keywords_probabilities"][i]
             if keyword_token_probability is None or keyword_token_probability == {}:
                 ues.append(0.5)
                 continue
-            contribution_scores = stats['reasoning_keywords_contributions'][i]
+            contribution_scores = stats["reasoning_keywords_contributions"][i]
             if contribution_scores is None or contribution_scores == {}:
                 ues.append(0.5)
                 continue
-            
-            probabilities, contribution_dict = extract_p(keyword_token_probability, contribution_scores)
 
+            probabilities, contribution_dict = aggregate_probas_mean(keyword_token_probability, contribution_scores)
+
+            # softmin weighted sum of keywords probs
             probabilities = {key: weighted_sum(value) for key, value in probabilities.items()}
-            contributions = {key: sum(value)/len(value) for key, value in contribution_dict.items()}
-            
+            # average of keywords contributions
+            contributions = {key: sum(value) / len(value) for key, value in contribution_dict.items()}
+
             # CoT-UQ
             total_sum = sum(probabilities[key] * contributions[key] for key in probabilities)
             total_weight = sum(contributions[key] for key in contributions)
diff --git a/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py b/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
index 1c65b2ff7..3185b1c98 100644
--- a/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
+++ b/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
@@ -5,7 +5,6 @@
 
 from typing import Dict, List, Tuple, Optional
 
-from .embeddings import get_embeddings_from_output
 from .stat_calculator import StatCalculator
 from lm_polygraph.utils.model import WhiteboxModel
 
@@ -19,8 +18,8 @@
 Response: Let's think step by step.
 """
 
-keywords_extraction_instruction = ''' 
-You will be provided with a question and a multi-step response containing reasoning steps. 
+keywords_extraction_instruction = '''
+You will be provided with a question and a multi-step response containing reasoning steps.
 For each long reasoning step labeled "Step i:", extract the keywords, only the relevant tokens for that specific reasoning step.
 You also need to evaluate the importance of each keyword to the final answer. Please evaluate the importance score following with the keyword by (/<importance score>/) on a scale of 1 to 10, where 1 is the least critical and 10 is the most critical.
 If you find more than one keyword in a specific step, separate them with “;”.
@@ -35,7 +34,6 @@
 
 
 def is_effectively_empty(obj):
-    
     if obj is None:
         return True
 
@@ -47,9 +45,9 @@ def is_effectively_empty(obj):
 
     if isinstance(obj, list):
         return all(is_effectively_empty(item) for item in obj)
-    
+
     if isinstance(obj, dict):
-        if len(obj) == 0: 
+        if len(obj) == 0:
             return True
         return all(is_effectively_empty(value) for value in obj.values())
     return False
@@ -62,7 +60,7 @@ def parse_response_to_dict(response: str) -> Tuple[Optional[str], Dict[str, str]
     Parameters:
         response (str): reasoning output.
     Returns:
-        Tuple[Optional[str], Dict[str, str], Optional[str]]: 
+        Tuple[Optional[str], Dict[str, str], Optional[str]]:
             - final answer (str or None),
             - dictionary of steps (e.g., {"Step 1": "Step 1: ..."}),
             - response before final answer (str or None)
@@ -99,11 +97,10 @@ def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, or
 
     for i in range(len(response_tokens) - len(final_answer_tokens) + 1):
         if response_tokens[i : i + len(final_answer_tokens)] == final_answer_tokens:
-            start_index = i
             end_index = i + len(final_answer_tokens)
             break
 
-    if end_index == None or end_index + 1 == len(response_tokens):
+    if end_index is None or end_index + 1 == len(response_tokens):
         return None, None
 
     for i in range(len(original_tokens) - len(final_answer_tokens) + 1):
@@ -111,7 +108,7 @@ def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, or
             end_index_original = i + len(final_answer_tokens)
             break
 
-    if end_index_original == None:
+    if end_index_original is None:
         return None, None
 
     if response_tokens[end_index] in ["▁", "Ġ", tokenizer.tokenize(" ")]:
@@ -128,9 +125,9 @@ def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, or
 def predict(prompt, model, tokenizer, max_length_cot, temperature):
     inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
     generate_ids = model.generate(
-        **inputs, 
-        max_new_tokens = max_length_cot,
-        temperature=temperature, 
+        **inputs,
+        max_new_tokens=max_length_cot,
+        temperature=temperature,
         pad_token_id=tokenizer.eos_token_id)
     generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
     infer_res = tokenizer.decode(generate_ids)
@@ -184,33 +181,32 @@ def step_exacts_2_list(response):
 
 def find_subsequence_position(sub_sequence, long_sequence):
     len_long = long_sequence.size(0)
-    len_sub = len(sub_sequence) 
+    len_sub = len(sub_sequence)
 
     sub_sequence_tensor = torch.tensor(sub_sequence, device=long_sequence.device)
-    
+
     for i in range(len_long - len_sub + 1):
         if torch.equal(long_sequence[i:i + len_sub], sub_sequence_tensor):
-            return i 
+            return i
     return -1
 
 
 def clean_words(word):
     # TODO forward space token
-  return word.replace(" ", "").replace(".", "").replace("\"", "").replace("\n", "").replace("_", "").replace("Ġ", "").lower()
+    return word.replace(" ", "").replace(".", "").replace("\"", "").replace("\n", "").replace("_", "").replace("Ġ", "").lower()
 
 
 def find_token_indices(tokens, word):
     word_len = len(word.replace(" ", ""))
-    
+
     for start_index in range(len(tokens)):
         combined_text = ""
-        end_index = start_index       
+        end_index = start_index
         while end_index < len(tokens) and len(combined_text) < word_len:
             combined_text += tokens[end_index]
             if clean_words(combined_text) == clean_words(word):
                 return start_index, end_index
             end_index += 1
-    
     return -1, -1
 
 
@@ -316,11 +312,11 @@ def __call__(
                     # log.debug(f'New Reasoning Tokens Are None, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
-                
+
                 # reasoning tokens without final answer
                 response_tokens = model.tokenizer.tokenize(response)
                 # reasoning token ids without final answer
-                response_token_ids = model.tokenizer.convert_tokens_to_ids(response_tokens)
+                # response_token_ids = model.tokenizer.convert_tokens_to_ids(response_tokens)
                 # full reasoning tokens
                 original_tokens = model.tokenizer.convert_ids_to_tokens(generated_ids)
 
@@ -337,7 +333,7 @@ def __call__(
                     response_tokens,
                     generated_ids,
                 )
-                if answer_start_indice == None:
+                if answer_start_indice is None:
                     # log.debug(f'Cannot locate the Final Answer, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
@@ -355,7 +351,7 @@ def __call__(
                     continue
                 final_answer_probabilities[llm_answer] = answer_probs
                 final_answer_token_ids[llm_answer] = answer_token_ids.tolist()
-                
+
                 # exacts_prompt = get_step_exact_tokens(args, q, response)
                 keywords_extraction_prompt = keywords_extraction_instruction.replace('<QUESTION>', question).replace('<RESPONSE>', response)
                 keywords_extraction_prompt_output = predict(keywords_extraction_prompt, model, model.tokenizer, self.max_length_cot, self.temperature)
@@ -402,7 +398,6 @@ def __call__(
                     keywords_contributions_dict = {}
                     keywords_token_ids_dict = {}
                     for keyword_idx, keyword in enumerate(keywords):
-
                         keyword_probs = []
                         keyword_token_ids = []
                         if is_word_in_sentence(step_text, keyword) is not True:
@@ -431,7 +426,7 @@ def __call__(
                     # log.debug(f'Token Probability from All Steps are All None, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
-                
+
                 # Dict[str, np.ndarray]: dictionary with the following items:
                 # - 'reasoning_output' (List[str]): model output for reasoning enhanced input,
                 # - 'reasoning_answer' (List[str]): model answer for reasoning enhanced input,
@@ -441,7 +436,7 @@ def __call__(
                 # - 'reasoning_keywords_contributions' (List[Dict[str, Dict[str, int]]]): contributions for `reasoning_keywords`,
                 # - 'reasoning_keywords_token_ids' (List[Dict[str, Dict[str, List[int]]]]): step-wise token indices for `reasoning_keywords`,
                 # - 'reasoning_answer_token_ids' (List[Dict[str, List[int]]]): token indices for `reasoning_keywords`.
-                
+
                 result_dict["reasoning_output"].append(response)
                 result_dict["reasoning_answer"].append(llm_answer)
                 result_dict["reasoning_answer_tokens_probs"].append(final_answer_probabilities)
@@ -451,7 +446,7 @@ def __call__(
                 result_dict["reasoning_keywords_token_ids"].append(keywords_token_ids)
                 result_dict["reasoning_answer_token_ids"].append(final_answer_token_ids)
                 break
-                
+
             if n_of_retries >= self.max_retries:
                 # log.debug(f'#####The Following Question:#####\n{q}\nHas no Meaningful Answer & Explanations, Record and Skip')
                 result_dict["reasoning_output"].append(response)
diff --git a/test/local/test_benchmark.py b/test/local/test_benchmark.py
index 032c6c69f..c29b083f5 100644
--- a/test/local/test_benchmark.py
+++ b/test/local/test_benchmark.py
@@ -1,203 +1,203 @@
-# import subprocess
-# import pathlib
-# import os
-# import torch
-# import json
-# import pytest
-# import diskcache as dc
+import subprocess
+import pathlib
+import os
+import torch
+import json
+import pytest
+import diskcache as dc
 
-# from lm_polygraph.utils.manager import UEManager
-# from lm_polygraph.utils.builder_enviroment_stat_calculator import (
-#     BuilderEnvironmentStatCalculator,
-# )
-# from lm_polygraph.defaults.register_default_stat_calculators import (
-#     register_default_stat_calculators,
-# )
+from lm_polygraph.utils.manager import UEManager
+from lm_polygraph.utils.builder_enviroment_stat_calculator import (
+    BuilderEnvironmentStatCalculator,
+)
+from lm_polygraph.defaults.register_default_stat_calculators import (
+    register_default_stat_calculators,
+)
 
 
-# # ================= TEST HELPERS ==================
+# ================= TEST HELPERS ==================
 
 
-# def get_device():
-#     if torch.cuda.is_available():
-#         return "cuda"
-#     elif torch.mps.is_available():
-#         return "mps"
-#     else:
-#         return "cpu"
+def get_device():
+    if torch.cuda.is_available():
+        return "cuda"
+    elif torch.mps.is_available():
+        return "mps"
+    else:
+        return "cpu"
 
 
-# @pytest.fixture(scope="module")
-# def reference():
-#     with open(f"{pwd()}/fixtures/input_output_fixtures.json") as f:
-#         return json.load(f)
+@pytest.fixture(scope="module")
+def reference():
+    with open(f"{pwd()}/fixtures/input_output_fixtures.json") as f:
+        return json.load(f)
 
 
-# def pwd():
-#     return pathlib.Path(__file__).parent.resolve()
+def pwd():
+    return pathlib.Path(__file__).parent.resolve()
 
 
-# def check_result(dataset, exec_result, reference, method=None):
-#     assert (
-#         exec_result.returncode == 0
-#     ), f"polygraph_eval returned code {exec_result.returncode} != 0"
+def check_result(dataset, exec_result, reference, method=None):
+    assert (
+        exec_result.returncode == 0
+    ), f"polygraph_eval returned code {exec_result.returncode} != 0"
 
-#     man = UEManager.load(
-#         f"{pwd()}/ue_manager_seed1",
-#         builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
-#         available_stat_calculators=register_default_stat_calculators(
-#             model_type="Whitebox"
-#         ),
-#     )
+    man = UEManager.load(
+        f"{pwd()}/ue_manager_seed1",
+        builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
+        available_stat_calculators=register_default_stat_calculators(
+            model_type="Whitebox"
+        ),
+    )
 
-#     if method is None:
-#         assert len(man.estimations[("sequence", "MaximumSequenceProbability")]) == 2
+    if method is None:
+        assert len(man.estimations[("sequence", "MaximumSequenceProbability")]) == 2
 
-#     key = dataset
-#     if method:
-#         key += f"_{method}"
+    key = dataset
+    if method:
+        key += f"_{method}"
 
-#     assert man.stats["input_texts"][0] == reference[key + "_input"]
-#     assert man.stats["target_texts"][0] == reference[key + "_output"]
+    assert man.stats["input_texts"][0] == reference[key + "_input"]
+    assert man.stats["target_texts"][0] == reference[key + "_output"]
 
-#     os.remove(f"{pwd()}/ue_manager_seed1")
+    os.remove(f"{pwd()}/ue_manager_seed1")
 
 
-# # ================= TEST CASES ==================
+# ================= TEST CASES ==================
 
 
-# def run_eval(dataset):
-#     command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
-#                 polygraph_eval \
-#                 subsample_eval_dataset=2 \
-#                 model.path=bigscience/bloomz-560m \
-#                 model.load_model_args.device_map={get_device()} \
-#                 save_path={pwd()} \
-#                 stat_calculators.1.cfg.size=10 \
-#                 stat_calculators.1.cfg.bg_size=20"
+def run_eval(dataset):
+    command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
+                polygraph_eval \
+                subsample_eval_dataset=2 \
+                model.path=bigscience/bloomz-560m \
+                model.load_model_args.device_map={get_device()} \
+                save_path={pwd()} \
+                stat_calculators.1.cfg.size=10 \
+                stat_calculators.1.cfg.bg_size=20"
 
-#     return subprocess.run(command, shell=True)
+    return subprocess.run(command, shell=True)
 
 
-# def test_coqa(reference):
-#     exec_result = run_eval("coqa")
-#     check_result("coqa", exec_result, reference)
+def test_coqa(reference):
+    exec_result = run_eval("coqa")
+    check_result("coqa", exec_result, reference)
 
 
-# def test_triviaqa(reference):
-#     exec_result = run_eval("triviaqa")
-#     check_result("triviaqa", exec_result, reference)
+def test_triviaqa(reference):
+    exec_result = run_eval("triviaqa")
+    check_result("triviaqa", exec_result, reference)
 
 
-# def test_mmlu(reference):
-#     exec_result = run_eval("mmlu")
-#     check_result("mmlu", exec_result, reference)
+def test_mmlu(reference):
+    exec_result = run_eval("mmlu")
+    check_result("mmlu", exec_result, reference)
 
 
-# def test_gsm8k(reference):
-#     exec_result = run_eval("gsm8k")
-#     check_result("gsm8k", exec_result, reference)
+def test_gsm8k(reference):
+    exec_result = run_eval("gsm8k")
+    check_result("gsm8k", exec_result, reference)
 
 
-# def test_wmt14_fren(reference):
-#     exec_result = run_eval("wmt14_fren")
-#     check_result("wmt14_fren", exec_result, reference)
+def test_wmt14_fren(reference):
+    exec_result = run_eval("wmt14_fren")
+    check_result("wmt14_fren", exec_result, reference)
 
 
-# def test_wmt19_deen(reference):
-#     exec_result = run_eval("wmt19_deen")
-#     check_result("wmt19_deen", exec_result, reference)
+def test_wmt19_deen(reference):
+    exec_result = run_eval("wmt19_deen")
+    check_result("wmt19_deen", exec_result, reference)
 
 
-# def test_xsum(reference):
-#     exec_result = run_eval("xsum")
-#     check_result("xsum", exec_result, reference)
+def test_xsum(reference):
+    exec_result = run_eval("xsum")
+    check_result("xsum", exec_result, reference)
 
 
-# # ================= INSTRUCT TEST CASES ==================
+# ================= INSTRUCT TEST CASES ==================
 
 
-# def run_instruct_eval(dataset, method):
-#     command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/instruct/polygraph_eval_{dataset}_{method}.yaml \
-#                 polygraph_eval \
-#                 subsample_eval_dataset=2 \
-#                 model=stablelm-1.6b-chat \
-#                 model.load_model_args.device_map={get_device()} \
-#                 save_path={pwd()}"
+def run_instruct_eval(dataset, method):
+    command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/instruct/polygraph_eval_{dataset}_{method}.yaml \
+                polygraph_eval \
+                subsample_eval_dataset=2 \
+                model=stablelm-1.6b-chat \
+                model.load_model_args.device_map={get_device()} \
+                save_path={pwd()}"
 
-#     return subprocess.run(command, shell=True)
+    return subprocess.run(command, shell=True)
 
 
-# METHODS = [
-#     "ling_1s",
-#     "verb_1s_top1",
-#     "verb_1s_topk",
-#     "verb_2s_top1",
-#     "verb_2s_topk",
-#     "verb_2s_cot",
-#     "empirical_baselines",
-# ]
+METHODS = [
+    "ling_1s",
+    "verb_1s_top1",
+    "verb_1s_topk",
+    "verb_2s_top1",
+    "verb_2s_topk",
+    "verb_2s_cot",
+    "empirical_baselines",
+]
 
 
-# def test_coqa_instruct(reference):
-#     for method in METHODS:
-#         exec_result = run_instruct_eval("coqa", method)
-#         check_result("coqa", exec_result, reference, method)
+def test_coqa_instruct(reference):
+    for method in METHODS:
+        exec_result = run_instruct_eval("coqa", method)
+        check_result("coqa", exec_result, reference, method)
 
 
-# def test_triviaqa_instruct(reference):
-#     for method in METHODS:
-#         exec_result = run_instruct_eval("triviaqa", method)
-#         check_result("triviaqa", exec_result, reference, method)
+def test_triviaqa_instruct(reference):
+    for method in METHODS:
+        exec_result = run_instruct_eval("triviaqa", method)
+        check_result("triviaqa", exec_result, reference, method)
 
 
-# def test_mmlu_instruct(reference):
-#     for method in METHODS:
-#         exec_result = run_instruct_eval("mmlu", method)
-#         check_result("mmlu", exec_result, reference, method)
+def test_mmlu_instruct(reference):
+    for method in METHODS:
+        exec_result = run_instruct_eval("mmlu", method)
+        check_result("mmlu", exec_result, reference, method)
 
 
-# # ================= CLAIM-LEVEL ==================
+# ================= CLAIM-LEVEL ==================
 
 
-# def run_claim_eval(dataset):
-#     fixed_cache = dc.Cache(f"{pwd()}/fixtures/openai_chat_cache.diskcache")
-#     with dc.Cache(
-#         os.path.expanduser("~") + "/.cache/openai_chat_cache.diskcache"
-#     ) as cache:
-#         for k in fixed_cache:
-#             cache[k] = fixed_cache[k]
+def run_claim_eval(dataset):
+    fixed_cache = dc.Cache(f"{pwd()}/fixtures/openai_chat_cache.diskcache")
+    with dc.Cache(
+        os.path.expanduser("~") + "/.cache/openai_chat_cache.diskcache"
+    ) as cache:
+        for k in fixed_cache:
+            cache[k] = fixed_cache[k]
 
-#     command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
-#                 polygraph_eval \
-#                 subsample_eval_dataset=2 \
-#                 model.path=bigscience/bloomz-560m \
-#                 model.load_model_args.device_map={get_device()} \
-#                 save_path={pwd()}"
+    command = f"HYDRA_CONFIG={pwd()}/../../examples/configs/polygraph_eval_{dataset}.yaml \
+                polygraph_eval \
+                subsample_eval_dataset=2 \
+                model.path=bigscience/bloomz-560m \
+                model.load_model_args.device_map={get_device()} \
+                save_path={pwd()}"
 
-#     return subprocess.run(command, shell=True)
+    return subprocess.run(command, shell=True)
 
 
-# def check_claim_level_result(dataset, reference):
-#     man = UEManager.load(
-#         f"{pwd()}/ue_manager_seed1",
-#         builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
-#         available_stat_calculators=register_default_stat_calculators(
-#             model_type="Whitebox"
-#         ),
-#     )
+def check_claim_level_result(dataset, reference):
+    man = UEManager.load(
+        f"{pwd()}/ue_manager_seed1",
+        builder_env_stat_calc=BuilderEnvironmentStatCalculator(None),
+        available_stat_calculators=register_default_stat_calculators(
+            model_type="Whitebox"
+        ),
+    )
 
-#     assert man.stats["input_texts"][0] == reference[dataset + "_input"]
-#     assert man.stats["target_texts"][0] == reference[dataset + "_output"]
+    assert man.stats["input_texts"][0] == reference[dataset + "_input"]
+    assert man.stats["target_texts"][0] == reference[dataset + "_output"]
 
-#     os.remove(f"{pwd()}/ue_manager_seed1")
+    os.remove(f"{pwd()}/ue_manager_seed1")
 
 
-# def test_person_bio(reference):
-#     base_dataset_name = "person_bio"
-#     langs = ["en_mistral", "zh"]
+def test_person_bio(reference):
+    base_dataset_name = "person_bio"
+    langs = ["en_mistral", "zh"]
 
-#     for lang in langs:
-#         dataset = f"{base_dataset_name}_{lang}"
-#         run_claim_eval(dataset)
-#         check_claim_level_result(dataset, reference)
+    for lang in langs:
+        dataset = f"{base_dataset_name}_{lang}"
+        run_claim_eval(dataset)
+        check_claim_level_result(dataset, reference)
diff --git a/test/test_estimators.py b/test/test_estimators.py
index 5c2744d85..48d3faa8d 100644
--- a/test/test_estimators.py
+++ b/test/test_estimators.py
@@ -31,40 +31,40 @@ def model():
     return WhiteboxModel(base_model, tokenizer)
 
 
-# def test_maximum_sequence_probability(model):
-#     estimator = MaximumSequenceProbability()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_maximum_sequence_probability(model):
+    estimator = MaximumSequenceProbability()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_perplexity(model):
-#     estimator = Perplexity()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_perplexity(model):
+    estimator = Perplexity()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_mean_token_entropy(model):
-#     estimator = MeanTokenEntropy()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_mean_token_entropy(model):
+    estimator = MeanTokenEntropy()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_mean_pointwise_mutual_information(model):
-#     estimator = MeanPointwiseMutualInformation()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_mean_pointwise_mutual_information(model):
+    estimator = MeanPointwiseMutualInformation()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_mean_conditional_pointwise_mutual_information(model):
-#     estimator = MeanConditionalPointwiseMutualInformation()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_mean_conditional_pointwise_mutual_information(model):
+    estimator = MeanConditionalPointwiseMutualInformation()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_claim_conditioned_probability(model):
-#     estimator = ClaimConditionedProbability()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_claim_conditioned_probability(model):
+    estimator = ClaimConditionedProbability()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
 def test_ptrue(model):
@@ -73,165 +73,165 @@ def test_ptrue(model):
     assert isinstance(ue.uncertainty, float)
 
 
-# def test_ptrue_sampling(model):
-#     estimator = PTrueSampling()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_ptrue_sampling(model):
+    estimator = PTrueSampling()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_monte_carlo_sequence_entropy(model):
-#     estimator = MonteCarloSequenceEntropy()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_monte_carlo_sequence_entropy(model):
+    estimator = MonteCarloSequenceEntropy()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_monte_carlo_normalized_sequence_entropy(model):
-#     estimator = MonteCarloNormalizedSequenceEntropy()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_monte_carlo_normalized_sequence_entropy(model):
+    estimator = MonteCarloNormalizedSequenceEntropy()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_lexical_similarity_rouge1(model):
-#     estimator = LexicalSimilarity(metric="rouge1")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_lexical_similarity_rouge1(model):
+    estimator = LexicalSimilarity(metric="rouge1")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_lexical_similarity_rouge2(model):
-#     estimator = LexicalSimilarity(metric="rouge2")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_lexical_similarity_rouge2(model):
+    estimator = LexicalSimilarity(metric="rouge2")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_lexical_similarity_rougel(model):
-#     estimator = LexicalSimilarity(metric="rougeL")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_lexical_similarity_rougel(model):
+    estimator = LexicalSimilarity(metric="rougeL")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_lexical_similarity_bleu(model):
-#     estimator = LexicalSimilarity(metric="BLEU")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_lexical_similarity_bleu(model):
+    estimator = LexicalSimilarity(metric="BLEU")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_num_sem_sets(model):
-#     estimator = NumSemSets()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_num_sem_sets(model):
+    estimator = NumSemSets()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_eigval_laplacian_nli_entail(model):
-#     estimator = EigValLaplacian(similarity_score="NLI_score", affinity="entail")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_eigval_laplacian_nli_entail(model):
+    estimator = EigValLaplacian(similarity_score="NLI_score", affinity="entail")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_eigval_laplacian_nli_contra(model):
-#     estimator = EigValLaplacian(similarity_score="NLI_score", affinity="contra")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_eigval_laplacian_nli_contra(model):
+    estimator = EigValLaplacian(similarity_score="NLI_score", affinity="contra")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_eigval_laplacian_jaccard(model):
-#     estimator = EigValLaplacian(similarity_score="Jaccard_score")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_eigval_laplacian_jaccard(model):
+    estimator = EigValLaplacian(similarity_score="Jaccard_score")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_degmat_nli_entail(model):
-#     estimator = DegMat(similarity_score="NLI_score", affinity="entail")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_degmat_nli_entail(model):
+    estimator = DegMat(similarity_score="NLI_score", affinity="entail")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_degmat_nli_contra(model):
-#     estimator = DegMat(similarity_score="NLI_score", affinity="contra")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_degmat_nli_contra(model):
+    estimator = DegMat(similarity_score="NLI_score", affinity="contra")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_degmat_jaccard(model):
-#     estimator = DegMat(similarity_score="Jaccard_score")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_degmat_jaccard(model):
+    estimator = DegMat(similarity_score="Jaccard_score")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_eccentricity_nli_entail(model):
-#     estimator = Eccentricity(similarity_score="NLI_score", affinity="entail")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_eccentricity_nli_entail(model):
+    estimator = Eccentricity(similarity_score="NLI_score", affinity="entail")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_eccentricity_nli_contra(model):
-#     estimator = Eccentricity(similarity_score="NLI_score", affinity="contra")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_eccentricity_nli_contra(model):
+    estimator = Eccentricity(similarity_score="NLI_score", affinity="contra")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_eccentricity_jaccard(model):
-#     estimator = Eccentricity(similarity_score="Jaccard_score")
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_eccentricity_jaccard(model):
+    estimator = Eccentricity(similarity_score="Jaccard_score")
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_semantic_entropy(model):
-#     estimator = SemanticEntropy()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_semantic_entropy(model):
+    estimator = SemanticEntropy()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_sar(model):
-#     estimator = SAR()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_sar(model):
+    estimator = SAR()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_token_sar(model):
-#     estimator = TokenSAR()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_token_sar(model):
+    estimator = TokenSAR()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_sentence_sar(model):
-#     estimator = SentenceSAR()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_sentence_sar(model):
+    estimator = SentenceSAR()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_renyi_neg(model):
-#     estimator = RenyiNeg()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_renyi_neg(model):
+    estimator = RenyiNeg()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_fisher_rao(model):
-#     estimator = FisherRao()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_fisher_rao(model):
+    estimator = FisherRao()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_focus(model):
-#     model_name = model.model.config._name_or_path
-#     estimator = Focus(
-#         model_name=model_name,
-#         path="../token_idf/{model_name}/token_idf.pkl",
-#         gamma=0.9,
-#         p=0.01,
-#         idf_dataset="LM-Polygraph/RedPajama-Data-100-Sample-For-Test",
-#         trust_remote_code=True,
-#         idf_seed=42,
-#         idf_dataset_size=5,
-#         spacy_path="en_core_web_sm",
-#     )
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_focus(model):
+    model_name = model.model.config._name_or_path
+    estimator = Focus(
+        model_name=model_name,
+        path="../token_idf/{model_name}/token_idf.pkl",
+        gamma=0.9,
+        p=0.01,
+        idf_dataset="LM-Polygraph/RedPajama-Data-100-Sample-For-Test",
+        trust_remote_code=True,
+        idf_seed=42,
+        idf_dataset_size=5,
+        spacy_path="en_core_web_sm",
+    )
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
-# def test_kernel_language_entropy(model):
-#     estimator = KernelLanguageEntropy()
-#     ue = estimate_uncertainty(model, estimator, INPUT)
-#     assert isinstance(ue.uncertainty, float)
+def test_kernel_language_entropy(model):
+    estimator = KernelLanguageEntropy()
+    ue = estimate_uncertainty(model, estimator, INPUT)
+    assert isinstance(ue.uncertainty, float)
 
 
 def test_luq(model):
@@ -249,3 +249,4 @@ def test_probas_mean_with_cot(model):
     estimator = ProbasMeanWithCoT()
     ue = estimate_uncertainty(model, estimator, INPUT)
     assert isinstance(ue.uncertainty, float)
+
diff --git a/test/test_lm_polygraph.py b/test/test_lm_polygraph.py
index 0b0e643e2..da82128c2 100644
--- a/test/test_lm_polygraph.py
+++ b/test/test_lm_polygraph.py
@@ -40,18 +40,18 @@ def run_config_with_overrides(config_name, **overrides):
 # ================= TEST CASES ==================
 
 
-# def test_just_works():
-#     exec_result = run_config_with_overrides("test_polygraph_eval")
-#     assert (
-#         exec_result.returncode == 0
-#     ), f"polygraph_eval returned code {exec_result.returncode} != 0"
-
-
-# def test_all_seq_ue():
-#     exec_result = run_config_with_overrides("test_polygraph_eval_seq_ue")
-#     assert (
-#         exec_result.returncode == 0
-#     ), f"polygraph_eval returned code {exec_result.returncode} != 0"
+def test_just_works():
+    exec_result = run_config_with_overrides("test_polygraph_eval")
+    assert (
+        exec_result.returncode == 0
+    ), f"polygraph_eval returned code {exec_result.returncode} != 0"
+
+
+def test_all_seq_ue():
+    exec_result = run_config_with_overrides("test_polygraph_eval_seq_ue")
+    assert (
+        exec_result.returncode == 0
+    ), f"polygraph_eval returned code {exec_result.returncode} != 0"
 
 
 # ================= PE ensembles ==================

From 0ba331021f3394521b2a862d8f766480b5de0049 Mon Sep 17 00:00:00 2001
From: ConstFr <denis@agent-bot-dev-vm.us-central1-c.c.agent-bot-dev.internal>
Date: Wed, 23 Apr 2025 13:27:08 +0000
Subject: [PATCH 3/6] evaluated on reasoning enhanced hotpot

---
 examples/reasoning_example.ipynb | 631 +++++++++++++++++++++++++++++++
 1 file changed, 631 insertions(+)
 create mode 100644 examples/reasoning_example.ipynb

diff --git a/examples/reasoning_example.ipynb b/examples/reasoning_example.ipynb
new file mode 100644
index 000000000..1ae3d7c39
--- /dev/null
+++ b/examples/reasoning_example.ipynb
@@ -0,0 +1,631 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6958a441",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
+    "# os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
+    "\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
+    "from lm_polygraph.estimators import *\n",
+    "from lm_polygraph.utils.model import WhiteboxModel\n",
+    "from lm_polygraph.utils.dataset import Dataset\n",
+    "from lm_polygraph.utils.processor import Logger\n",
+    "from lm_polygraph.utils.manager import UEManager\n",
+    "from lm_polygraph.ue_metrics import PredictionRejectionArea\n",
+    "from lm_polygraph.generation_metrics import RougeMetric, BartScoreSeqMetric, ModelScoreSeqMetric, ModelScoreTokenwiseMetric, AggregatedMetric\n",
+    "from lm_polygraph.utils.builder_enviroment_stat_calculator import (\n",
+    "    BuilderEnvironmentStatCalculator\n",
+    ")\n",
+    "from lm_polygraph.defaults.register_default_stat_calculators import (\n",
+    "    register_default_stat_calculators,\n",
+    ")\n",
+    "from lm_polygraph.utils.factory_stat_calculator import StatCalculatorContainer\n",
+    "from omegaconf import OmegaConf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5025e26e-fd7f-44b6-88d7-5876439a5ab0",
+   "metadata": {},
+   "source": [
+    "# Specify HyperParameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7111f938-bc8c-4b82-82a1-fce490bc8e4a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# model_path = \"bigscience/bloomz-560m\"\n",
+    "model_path = \"meta-llama/Llama-3.1-8B-Instruct\"\n",
+    "device = \"cuda\"\n",
+    "model_type = \"Whitebox\"\n",
+    "dataset_name = \"denis1699/hotpot_cot\"\n",
+    "batch_size = 1\n",
+    "seed = 42"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "757a3862-77d1-4bb4-8423-1f86f3a58b54",
+   "metadata": {},
+   "source": [
+    "# Initialize Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4e7a7afe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8b41e2f8f6334c8785ffa023bd7c474b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "quantization_config = BitsAndBytesConfig(\n",
+    "    load_in_8bit=True,\n",
+    ")\n",
+    "\n",
+    "base_model = AutoModelForCausalLM.from_pretrained(\n",
+    "    model_path,\n",
+    "    token=os.getenv(\"HF_TOKEN\"),\n",
+    "    device_map=device,\n",
+    "    quantization_config=quantization_config,\n",
+    ")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_path,\n",
+    "                                          token=os.getenv(\"HF_TOKEN\")\n",
+    "                                         )\n",
+    "tokenizer.pad_token_id = tokenizer.eos_token_id\n",
+    "\n",
+    "model = WhiteboxModel(base_model, tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe460bd5-35bb-4c36-a6b8-12b7a111b403",
+   "metadata": {},
+   "source": [
+    "# Train and Eval Datasets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "0444bbb3-7b9d-4823-ad9b-2b2a217d1638",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use validation split, since test split of trivia_qa doesn't have reference answers\n",
+    "dataset = Dataset.load(\n",
+    "    dataset_name,\n",
+    "    'question_with_cot', 'answer',\n",
+    "    batch_size=batch_size,\n",
+    "    prompt=\"Question: {question_with_cot}\\nAnswer:{answer}\",\n",
+    "    split=\"validation\"\n",
+    ")\n",
+    "dataset.subsample(16, seed=seed)\n",
+    "\n",
+    "train_dataset = Dataset.load(\n",
+    "    dataset_name,\n",
+    "    'question_with_cot', 'answer',\n",
+    "    batch_size=batch_size,\n",
+    "    prompt=\"Question: {question_with_cot}\\nAnswer:{answer}\",\n",
+    "    split=\"train\"\n",
+    ")\n",
+    "train_dataset.subsample(16, seed=seed)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bd61ed46-8757-4d83-baae-bf854bd11d0e",
+   "metadata": {},
+   "source": [
+    "# Metric, UE Metric, and UE Methods"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "5baa618b-d6dc-4292-a316-30f0e0f8db78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ue_methods = [MaximumSequenceProbability(), \n",
+    "              SemanticEntropy(),\n",
+    "              MahalanobisDistanceSeq(\"decoder\"),\n",
+    "             ]\n",
+    "\n",
+    "ue_metrics = [PredictionRejectionArea(), PredictionRejectionArea(max_rejection=0.5)]\n",
+    "\n",
+    "# Wrap generation metric in AggregatedMetric, since trivia_qa is a multi-reference dataset\n",
+    "# (y is a list of possible correct answers)\n",
+    "metrics = [AggregatedMetric(RougeMetric('rougeL'))]\n",
+    "\n",
+    "loggers = [Logger()] "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d89a992-fafe-46ce-ad38-77b1c77aa3df",
+   "metadata": {},
+   "source": [
+    "# Stat Calculators"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "c98d1f0f-320e-4d7b-97a9-fad63d0348e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TrainingStatistic_config = {\n",
+    "    \"dataset\": dataset_name,\n",
+    "    \"text_column\": 'question_with_cot',\n",
+    "    \"label_column\": 'answer',\n",
+    "    \"description\": '',\n",
+    "    \"prompt\": \"Question: {question_with_cot}\\nAnswer:\",\n",
+    "    \"few_shot_split\": 'train',\n",
+    "    \"train_split\": 'train',\n",
+    "    \"load_from_disk\": False,\n",
+    "    \"subsample_train_dataset\": 10,\n",
+    "    \"n_shot\": 5,\n",
+    "    \"train_dataset\": dataset_name,\n",
+    "    \"train_test_split\": False,\n",
+    "    # needs to be improved to get rid of hardcoded dataset variables.\n",
+    "    \"background_train_dataset\": \"denis1699/hotpot_cot\",\n",
+    "    \"background_train_dataset_text_column\": 'question_with_cot',\n",
+    "    \"background_train_dataset_label_column\": 'answer',\n",
+    "    \"background_train_dataset_data_files\": 'train.csv',\n",
+    "    \"background_load_from_disk\": False,\n",
+    "    \"subsample_background_train_dataset\": 10,\n",
+    "    \"batch_size\": 1,\n",
+    "    \"size\": 16,\n",
+    "    \"bg_size\": 16,\n",
+    "    \"seed\": 1\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b93cda59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# register default stat calculators\n",
+    "result_stat_calculators = dict()\n",
+    "scs = register_default_stat_calculators(model_type)\n",
+    "for sc in scs:\n",
+    "    result_stat_calculators[sc.name] = sc\n",
+    "\n",
+    "# register TrainingStatisticExtractionCalculator for the Mahalanobis Distance method\n",
+    "result_stat_calculators.update(\n",
+    "    {\n",
+    "        \"TrainingStatisticExtractionCalculator\": StatCalculatorContainer(\n",
+    "            name=\"TrainingStatisticExtractionCalculator\",\n",
+    "            cfg=OmegaConf.create(TrainingStatistic_config),\n",
+    "            stats=[\"train_embeddings\", \"background_train_embeddings\", \"train_greedy_log_likelihoods\"],\n",
+    "            dependencies=[],\n",
+    "            builder=\"lm_polygraph.defaults.stat_calculator_builders.default_TrainingStatisticExtractionCalculator\",\n",
+    "        )\n",
+    "    }\n",
+    ")\n",
+    "    \n",
+    "builder_env_stat_calc = BuilderEnvironmentStatCalculator(model=model)\n",
+    "available_stat_calculators = list(result_stat_calculators.values())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7b1c0bc3-8278-4ede-a1f1-6bc3b071a644",
+   "metadata": {},
+   "source": [
+    "# Manager"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "962fff25-5dae-4414-b406-9d4a657928f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at microsoft/deberta-large-mnli were not used when initializing DebertaForSequenceClassification: ['config']\n",
+      "- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "man = UEManager(\n",
+    "    data=dataset,\n",
+    "    model=model,\n",
+    "    estimators=ue_methods,\n",
+    "    builder_env_stat_calc=builder_env_stat_calc,\n",
+    "    available_stat_calculators=available_stat_calculators,\n",
+    "    generation_metrics=metrics,\n",
+    "    ue_metrics=ue_metrics,\n",
+    "    processors=loggers,\n",
+    "    ignore_exceptions=False,\n",
+    "    max_new_tokens=64\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2a92e70-3036-430d-a60a-4c2ecf768d9d",
+   "metadata": {},
+   "source": [
+    "# Compute Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "2da7a129-cc59-4b55-b71f-fb4ee230a416",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "  0%|                                                    | 0/16 [00:00<?, ?it/s]\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.it/s]\n",
+      "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation=\"eager\"` when loading the model.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "100%|███████████████████████████████████████████| 10/10 [02:08<00:00, 12.81s/it]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "\u001b[A\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.t]\n",
+      "\n",
+      "\n",
+      "100%|███████████████████████████████████████████| 10/10 [02:07<00:00, 12.76s/it]\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "\u001b[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.s/it]\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
+      "\n",
+      "100%|██████████████████████████████████████████| 16/16 [28:13<00:00, 105.82s/it]\n",
+      "  0%|                                                    | 0/16 [28:13<?, ?it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = man()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ef6abce0-dba7-40c1-916f-1be546a78c8f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.702\n",
+      "UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: 0.225\n",
+      "UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.627\n",
+      "UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: -0.209\n",
+      "UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr, Score: 0.652\n",
+      "UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_normalized, Score: -0.063\n",
+      "UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: 0.675\n",
+      "UE Score: MaximumSequenceProbability, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: 0.095\n",
+      "UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: 0.650\n",
+      "UE Score: SemanticEntropy, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: -0.115\n",
+      "UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_0.5, Score: 0.676\n",
+      "UE Score: MahalanobisDistanceSeq_decoder, Metric: Rouge_rougeL, UE Metric: prr_0.5_normalized, Score: 0.102\n"
+     ]
+    }
+   ],
+   "source": [
+    "for key in results.keys():\n",
+    "    print(f\"UE Score: {key[1]}, Metric: {key[2]}, UE Metric: {key[3]}, Score: {results[key]:.3f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7fd1450-ee66-479c-a613-8e4ed7eedd0a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 464dc1331a1ff1229a45e43653d5cb684e9e37b7 Mon Sep 17 00:00:00 2001
From: ConstFr <denis@agent-bot-dev-vm.us-central1-c.c.agent-bot-dev.internal>
Date: Mon, 5 May 2025 16:04:03 +0000
Subject: [PATCH 4/6] benchmarking reasoning approach

---
 .../configs/estimators/cot_estimators.yaml    |  13 +
 .../estimators/default_estimators.yaml        |   3 +-
 .../configs/polygraph_eval_cot_hotpot.yaml    |  40 ++++
 notebooks/result_tables.ipynb                 |  16 +-
 scripts/polygraph_eval                        |  10 +-
 .../register_default_stat_calculators.py      |  98 ++++----
 .../estimators/chain_of_thought_uq.py         |   1 -
 .../reasoning_keywords_probs.py               | 225 ++++++++++--------
 src/lm_polygraph/utils/manager.py             |  18 ++
 9 files changed, 268 insertions(+), 156 deletions(-)
 create mode 100644 examples/configs/estimators/cot_estimators.yaml
 create mode 100644 examples/configs/polygraph_eval_cot_hotpot.yaml

diff --git a/examples/configs/estimators/cot_estimators.yaml b/examples/configs/estimators/cot_estimators.yaml
new file mode 100644
index 000000000..41aa129ef
--- /dev/null
+++ b/examples/configs/estimators/cot_estimators.yaml
@@ -0,0 +1,13 @@
+- name: MaximumSequenceProbability
+- name: Perplexity
+- name: MeanTokenEntropy
+- name: MeanPointwiseMutualInformation
+- name: MeanConditionalPointwiseMutualInformation 
+- name: PTrue
+- name: PTrueSampling
+- name: MonteCarloSequenceEntropy
+- name: MonteCarloNormalizedSequenceEntropy
+- name: EigenScore
+- name: RenyiNeg
+- name: FisherRao
+- name: ProbasMeanWithCoT
diff --git a/examples/configs/estimators/default_estimators.yaml b/examples/configs/estimators/default_estimators.yaml
index 41a40e079..477da0631 100644
--- a/examples/configs/estimators/default_estimators.yaml
+++ b/examples/configs/estimators/default_estimators.yaml
@@ -82,4 +82,5 @@
     trust_remote_code: True
     idf_seed: 42
     idf_dataset_size: -1
-    spacy_path: "en_core_web_sm"
\ No newline at end of file
+    spacy_path: "en_core_web_sm"
+- name: ProbasMeanWithCoT
diff --git a/examples/configs/polygraph_eval_cot_hotpot.yaml b/examples/configs/polygraph_eval_cot_hotpot.yaml
new file mode 100644
index 000000000..be3994c36
--- /dev/null
+++ b/examples/configs/polygraph_eval_cot_hotpot.yaml
@@ -0,0 +1,40 @@
+hydra:
+  run:
+    dir: ${cache_path}/${task}/${model}/${dataset}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+defaults:
+  - model: bloomz-560m
+  - estimators: cot_estimators
+  - stat_calculators: default_calculators
+  - _self_
+
+cache_path: ./workdir/output
+save_path: '${hydra:run.dir}'
+instruct: true
+task: qa
+
+dataset: ['denis1699/hotpot_cot']
+text_column: question
+label_column: answer
+train_split: train
+eval_split: validation
+few_shot_prompt: null
+max_new_tokens: 256
+load_from_disk: false
+normalize: true
+trust_remote_code: false
+size: 100
+
+
+output_ignore_regex: "(?s).*Final Answer:"
+
+subsample_eval_dataset: 10
+
+generation_metrics: null
+
+ignore_exceptions: false
+
+batch_size: 1
+
+seed:
+    - 1
diff --git a/notebooks/result_tables.ipynb b/notebooks/result_tables.ipynb
index d20962b41..2f7c20780 100644
--- a/notebooks/result_tables.ipynb
+++ b/notebooks/result_tables.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "5e5fd065-8111-48de-9c92-3f7c8f378762",
    "metadata": {
     "tags": []
@@ -26,7 +26,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "2046bc0c-9d7a-484d-8acd-f347dcb28e23",
    "metadata": {
     "tags": []
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "6bb03658-a53b-4df3-84d6-2f171badec5f",
    "metadata": {},
    "outputs": [],
@@ -106,7 +106,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "963f5e7c-3a06-405e-bc3f-c16d3fe83074",
    "metadata": {},
    "outputs": [],
@@ -199,7 +199,7 @@
    },
    "outputs": [],
    "source": [
-    "paths = [\"../workdir/camera_ready_exps/v1\", \"../workdir/camera_ready_exps/bertscore\"]\n",
+    "paths = [\"../workdir/output/qa\"]\n",
     "models = [\"vicuna\", \"llama\"]\n",
     "datasets = [\"aeslc\", \"xsum\", \"coqa\", \"babiqa\", \"wmt14_deen\", \"wmt14_fren\"]\n",
     "gen_metrics = [\"Rouge_rougeL\", \"Bert\"]\n",
@@ -478,9 +478,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "lm_poly",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "lm_poly"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -492,7 +492,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/scripts/polygraph_eval b/scripts/polygraph_eval
index 057bf3b2b..408865ca5 100755
--- a/scripts/polygraph_eval
+++ b/scripts/polygraph_eval
@@ -224,14 +224,14 @@ def get_generation_metrics(args):
             RougeMetric("rouge2"),
             RougeMetric("rougeL"),
             BLEUMetric(),
-            BertScoreMetric("rh"),
-            SbertMetric(),
+            # BertScoreMetric("rh"),
+            # SbertMetric(),
             AccuracyMetric(
                 target_ignore_regex=getattr(args, "target_ignore_regex", None),
                 output_ignore_regex=getattr(args, "output_ignore_regex", None),
                 normalize=getattr(args, "normalize", False),
             ),
-            AlignScore(target_is_claims=False if args.task == "ats" else True),
+            # AlignScore(target_is_claims=False if args.task == "ats" else True),
         ]
         if getattr(args.model, "type", "Whitebox") != "Blackbox":
             if getattr(args, "use_claim_ue", False):
@@ -374,7 +374,9 @@ def get_vllm_model(args):
 
     load_model_args = {'model_path': args.model.path, 
                        'max_new_tokens': args.max_new_tokens, 
-                       'logprobs': args.model.logprobs}
+                       'logprobs': args.model.logprobs,
+                       'max_model_len': 8192,
+                       }
     
     load_model_args.update(args.model.load_model_args)
     base_model, sampling_params = load_module.load_model(**load_model_args)
diff --git a/src/lm_polygraph/defaults/register_default_stat_calculators.py b/src/lm_polygraph/defaults/register_default_stat_calculators.py
index 22f25eb05..a79e17de1 100644
--- a/src/lm_polygraph/defaults/register_default_stat_calculators.py
+++ b/src/lm_polygraph/defaults/register_default_stat_calculators.py
@@ -47,18 +47,18 @@ def _register(
         deberta_model_path = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
 
     _register(InitialStateCalculator)
-    _register(
-        SemanticMatrixCalculator,
-        "lm_polygraph.defaults.stat_calculator_builders.default_SemanticMatrixCalculator",
-        {
-            "nli_model": {
-                "deberta_path": deberta_model_path,
-                "hf_cache": hf_cache,
-                "batch_size": 10,
-                "device": None,
-            }
-        },
-    )
+    # _register(
+    #     SemanticMatrixCalculator,
+    #     "lm_polygraph.defaults.stat_calculator_builders.default_SemanticMatrixCalculator",
+    #     {
+    #         "nli_model": {
+    #             "deberta_path": deberta_model_path,
+    #             "hf_cache": hf_cache,
+    #             "batch_size": 10,
+    #             "device": None,
+    #         }
+    #     },
+    # )
     _register(SemanticClassesCalculator)
 
     if model_type == "Blackbox":
@@ -99,43 +99,43 @@ def _register(
         _register(PromptCalculator)
         _register(SamplingPromptCalculator)
         _register(ClaimPromptCalculator)
-        _register(
-            CrossEncoderSimilarityMatrixCalculator,
-            "lm_polygraph.defaults.stat_calculator_builders.default_CrossEncoderSimilarityMatrixCalculator",
-            {
-                "batch_size": 10,
-                "cross_encoder_name": "cross-encoder/stsb-roberta-large",
-            },
-        )
-        _register(
-            GreedyAlternativesNLICalculator,
-            "lm_polygraph.defaults.stat_calculator_builders.default_GreedyAlternativesNLICalculator",
-            {
-                "nli_model": {
-                    "deberta_path": deberta_model_path,
-                    "hf_cache": hf_cache,
-                    "batch_size": 10,
-                    "device": None,
-                }
-            },
-        )
-        _register(
-            GreedyAlternativesFactPrefNLICalculator,
-            "lm_polygraph.defaults.stat_calculator_builders.default_GreedyAlternativesFactPrefNLICalculator",
-            {
-                "nli_model": {
-                    "deberta_path": deberta_model_path,
-                    "hf_cache": hf_cache,
-                    "batch_size": 10,
-                    "device": None,
-                }
-            },
-        )
-        _register(
-            ClaimsExtractor,
-            "lm_polygraph.defaults.stat_calculator_builders.default_ClaimsExtractor",
-            {"openai_model": "gpt-4o", "cache_path": "~/.cache", "language": language},
-        )
+        # _register(
+        #     CrossEncoderSimilarityMatrixCalculator,
+        #     "lm_polygraph.defaults.stat_calculator_builders.default_CrossEncoderSimilarityMatrixCalculator",
+        #     {
+        #         "batch_size": 10,
+        #         "cross_encoder_name": "cross-encoder/stsb-roberta-large",
+        #     },
+        # )
+        # _register(
+        #     GreedyAlternativesNLICalculator,
+        #     "lm_polygraph.defaults.stat_calculator_builders.default_GreedyAlternativesNLICalculator",
+        #     {
+        #         "nli_model": {
+        #             "deberta_path": deberta_model_path,
+        #             "hf_cache": hf_cache,
+        #             "batch_size": 10,
+        #             "device": None,
+        #         }
+        #     },
+        # )
+        # _register(
+        #     GreedyAlternativesFactPrefNLICalculator,
+        #     "lm_polygraph.defaults.stat_calculator_builders.default_GreedyAlternativesFactPrefNLICalculator",
+        #     {
+        #         "nli_model": {
+        #             "deberta_path": deberta_model_path,
+        #             "hf_cache": hf_cache,
+        #             "batch_size": 10,
+        #             "device": None,
+        #         }
+        #     },
+        # )
+        # _register(
+        #     ClaimsExtractor,
+        #     "lm_polygraph.defaults.stat_calculator_builders.default_ClaimsExtractor",
+        #     {"openai_model": "gpt-4o", "cache_path": "~/.cache", "language": language},
+        # )
         _register(
             ReasoningKeywordsProbs,
             "lm_polygraph.defaults.stat_calculator_builders.default_ReasoningKeywordsProbs",
diff --git a/src/lm_polygraph/estimators/chain_of_thought_uq.py b/src/lm_polygraph/estimators/chain_of_thought_uq.py
index 5dd7b6fd3..c51bce82a 100644
--- a/src/lm_polygraph/estimators/chain_of_thought_uq.py
+++ b/src/lm_polygraph/estimators/chain_of_thought_uq.py
@@ -69,7 +69,6 @@ def weighted_sum(values: List[float]) -> float:
     weights = [math.exp(-c) for c in values]
     sum_weights = sum(weights)
     normalized_weights = [w / sum_weights for w in weights]
-    print(normalized_weights)
     result = sum(w * c for w, c in zip(normalized_weights, values))
     return result
 
diff --git a/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py b/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
index 3185b1c98..de695e6dc 100644
--- a/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
+++ b/src/lm_polygraph/stat_calculators/reasoning_keywords_probs.py
@@ -1,5 +1,6 @@
 import re
 import torch
+import warnings
 import numpy as np
 from collections import defaultdict
 
@@ -8,6 +9,11 @@
 from .stat_calculator import StatCalculator
 from lm_polygraph.utils.model import WhiteboxModel
 
+import logging
+
+log = logging.getLogger("lm_polygraph")
+logging.getLogger("httpx").setLevel(logging.WARNING)
+
 
 cot_instruction = """
 Please reason the following question step by step. Label each reasoning step as "Step i:", where "i" is the step number.
@@ -18,18 +24,41 @@
 Response: Let's think step by step.
 """
 
-keywords_extraction_instruction = '''
-You will be provided with a question and a multi-step response containing reasoning steps.
+keywords_extraction_instruction = ''' 
+You will be provided with a question and a multi-step response containing reasoning steps. 
 For each long reasoning step labeled "Step i:", extract the keywords, only the relevant tokens for that specific reasoning step.
-You also need to evaluate the importance of each keyword to the final answer. Please evaluate the importance score following with the keyword by (/<importance score>/) on a scale of 1 to 10, where 1 is the least critical and 10 is the most critical.
+The keywords should be relevant to question and final answer.
 If you find more than one keyword in a specific step, separate them with “;”.
-If a specific step does not contribute meaningfully to deriving the final answer (e.g., repeating information already provided in the question, introducing irrelevant assumptions or speculations), return "Step i: NO ANSWER" for that step. For example:
+For example:
+
+######
+
+Q: Which band has more members, "We Are the Ocean" or "The Dream Academy"?
+
+Reasoning steps:
+Step 1: The question is asking which band has more members.
+Step 2: "We Are the Ocean" has five members.
+Step 3: "The Dream Academy" has three members.
+Step 4: 5 is greater than 3.
+Step 5: Therefore, "We Are the Ocean" has more members.
+Final Answer: We Are the Ocean
 
-Question:
-<QUESTION>
-Multi-Step Response:
+Keywords for each reasoning step: 
+Step 1: band
+Step 2: We Are the Ocean; five
+Step 3: The Dream Academy; three
+Step 4: greater
+Step 5: We Are the Ocean
+
+######
+
+The following is your task:
+Q: <QUESTION>
+
+Reasoning steps: 
 <RESPONSE>
-Keywords for Each Reasoning Step:
+
+Keywords for each reasoning step:
 '''
 
 
@@ -72,25 +101,26 @@ def parse_response_to_dict(response: str) -> Tuple[Optional[str], Dict[str, str]
     match = re.search(r"Final Answer:\s*(.+?)\s*(?=(\n|$))", response, re.DOTALL)
     if match:
         final_answer = match.group(1).strip()
-        response_before_final_answer = response[:match.end()].strip()
+        response_after_final_answer = response[:match.end()].strip()
+        # response_before_final_answer = response[:match.start()].strip()
     else:
         return None, {}, None
 
     # Match Steps
-    matches = list(re.finditer(r"(Step \d+):", response_before_final_answer))
+    matches = list(re.finditer(r"(Step \d+):", response_after_final_answer))
     for i, match in enumerate(matches):
         start = match.start()
-        end = matches[i + 1].start() if i + 1 < len(matches) else len(response_before_final_answer)
+        end = matches[i + 1].start() if i + 1 < len(matches) else len(response_after_final_answer)
         segment = response[start:end].strip()
         steps[match.group(1)] = segment
 
-    return_response = response_before_final_answer
+    return_response = response_after_final_answer
     return final_answer, steps, return_response
 
 
-def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, original_token_ids):
+def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, generated_ids):
     # caution
-    final_answer_tokens = tokenizer.tokenize("Final answer:")
+    final_answer_tokens = tokenizer.tokenize("Final Answer:")
 
     end_index = None
     end_index_original = None
@@ -100,7 +130,7 @@ def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, or
             end_index = i + len(final_answer_tokens)
             break
 
-    if end_index is None or end_index + 1 == len(response_tokens):
+    if end_index is None or end_index == len(response_tokens):
         return None, None
 
     for i in range(len(original_tokens) - len(final_answer_tokens) + 1):
@@ -117,9 +147,9 @@ def match_final_answer_token_ids(tokenizer, original_tokens, response_tokens, or
 
     target_tokens = response_tokens[end_index:]
 
-    final_answer_token_ids = original_token_ids[end_index_original : end_index_original + len(target_tokens)]
+    final_answer_token_ids = generated_ids[end_index_original : end_index_original + len(target_tokens)]
 
-    return end_index_original, final_answer_token_ids.data.cpu().numpy()
+    return end_index_original, final_answer_token_ids
 
 
 def predict(prompt, model, tokenizer, max_length_cot, temperature):
@@ -128,12 +158,56 @@ def predict(prompt, model, tokenizer, max_length_cot, temperature):
         **inputs,
         max_new_tokens=max_length_cot,
         temperature=temperature,
-        pad_token_id=tokenizer.eos_token_id)
-    generate_ids = generate_ids[0][len(inputs["input_ids"][0]):-1]
-    infer_res = tokenizer.decode(generate_ids)
+        pad_token_id=tokenizer.eos_token_id,
+        output_scores=True, 
+        return_dict_in_generate=True,
+        )
+    infer_res = tokenizer.decode(generate_ids.sequences[0][len(inputs["input_ids"][0]):-1])
     return infer_res
 
 
+# def step_exacts_2_list(response):
+#     # Split response into lines and filter out empty lines
+#     lines = response.splitlines()
+#     lines = [line for line in lines if line.strip()]
+
+#     keywords_by_step = []
+#     contributions_by_step = []
+#     valid_response_text = []
+
+#     for line in lines:
+#         # Match lines starting with "Step X:"
+#         match = re.search(r"Step \d+: (.+)", line)
+#         if match:
+#             # Extract keywords with contributions
+#             keywords_w_contribution = match.group(1).split("; ")
+
+#             # Check for valid format and skip invalid lines
+#             if any("(/" not in key_w_c or "/)" not in key_w_c for key_w_c in keywords_w_contribution):
+#                 continue
+
+#             try:
+#                 # Extract keywords and contributions
+#                 keywords = [key_w_c.split("(/")[0].strip() for key_w_c in keywords_w_contribution]
+#                 contributions = [int(key_w_c.split("(/")[1].split("/)")[0].strip()) for key_w_c in keywords_w_contribution]
+#             except ValueError:
+#                 return False  # Return False if contributions cannot be converted to int
+
+#             for i in contributions:
+#                 if i > 10:
+#                     return False
+
+#             keywords_by_step.append(keywords)
+#             contributions_by_step.append(contributions)
+#             valid_response_text.append(line)  # Add valid lines from the original response
+
+#     # If no valid lines are found, return False
+#     if not valid_response_text:
+#         return False
+
+#     return "\n".join(valid_response_text), keywords_by_step, contributions_by_step
+
+
 def step_exacts_2_list(response):
     # Split response into lines and filter out empty lines
     lines = response.splitlines()
@@ -147,46 +221,24 @@ def step_exacts_2_list(response):
         # Match lines starting with "Step X:"
         match = re.search(r"Step \d+: (.+)", line)
         if match:
-            if "(/" not in line or "/)" not in line:
-                continue  # Skip invalid lines
-
-            # Extract keywords with contributions
-            keywords_w_contribution = match.group(1).split("; ")
-
-            # Check for valid format and skip invalid lines
-            if any("(/" not in key_w_c or "/)" not in key_w_c for key_w_c in keywords_w_contribution):
-                continue
+            # Extract keywords
+            keywords = match.group(1).split("; ")
 
-            try:
-                # Extract keywords and contributions
-                keywords = [key_w_c.split("(/")[0].strip() for key_w_c in keywords_w_contribution]
-                contributions = [int(key_w_c.split("(/")[1].split("/)")[0].strip()) for key_w_c in keywords_w_contribution]
-            except ValueError:
-                return False  # Return False if contributions cannot be converted to int
-
-            for i in contributions:
-                if i > 10:
-                    return False
+            contributions = [10]*len(keywords)
 
             keywords_by_step.append(keywords)
             contributions_by_step.append(contributions)
             valid_response_text.append(line)  # Add valid lines from the original response
 
-    # If no valid lines are found, return False
-    if not valid_response_text:
-        return False
-
     return "\n".join(valid_response_text), keywords_by_step, contributions_by_step
 
 
 def find_subsequence_position(sub_sequence, long_sequence):
-    len_long = long_sequence.size(0)
+    len_long = len(long_sequence)
     len_sub = len(sub_sequence)
 
-    sub_sequence_tensor = torch.tensor(sub_sequence, device=long_sequence.device)
-
     for i in range(len_long - len_sub + 1):
-        if torch.equal(long_sequence[i:i + len_sub], sub_sequence_tensor):
+        if long_sequence[i:i + len_sub] == sub_sequence:
             return i
     return -1
 
@@ -243,9 +295,9 @@ def meta_info() -> Tuple[List[str], List[str]]:
             "reasoning_keywords_contributions",
             "reasoning_keywords_token_ids",
             "reasoning_answer_token_ids",
-        ], ["input_texts"]
+        ], ["input_texts", "greedy_texts", "greedy_tokens", "greedy_log_probs"]
 
-    def __init__(self, max_retries=5, max_length_cot=128, temperature=1):
+    def __init__(self, max_retries=5, max_length_cot=256, temperature=1):
         super().__init__()
         self.max_retries = max_retries
         self.max_length_cot = max_length_cot
@@ -278,51 +330,39 @@ def __call__(
                 - 'reasoning_answer_token_ids' (List[Dict[str, List[int]]]): token indices for `reasoning_keywords`.
         """
         result_dict = defaultdict(list)
-        for question in texts:
-            cot_prompt = cot_instruction.replace("<QUESTION>", question)
-
-            inputs = model.tokenizer(cot_prompt, return_tensors="pt")
-            inputs = {key: value.to(model.model.device) for key, value in inputs.items()}
+        batch_input_texts = dependencies['input_texts']
+        batch_generated_texts = dependencies['greedy_texts']
+        batch_generated_tokens = dependencies['greedy_tokens']
+        batch_generated_log_probs = dependencies['greedy_log_probs']
+        for input_text, generated_text, generated_tokens, generated_log_probs in zip(batch_input_texts, batch_generated_texts, batch_generated_tokens, batch_generated_log_probs):
+            question = re.search(r'Question:\s*(.*?)\s*Response:', input_text, re.DOTALL).group(1).strip()
+            # log.info(f"Input texts: {question}")
+            # log.info(f"Generated text: {generated_text}")
             n_of_retries = 0
             while n_of_retries < self.max_retries:
-                outputs = model.generate(
-                    **inputs,
-                    max_new_tokens=self.max_length_cot,
-                    temperature=self.temperature,
-                    pad_token_id=model.tokenizer.eos_token_id,
-                    return_dict_in_generate=True,
-                    output_scores=True,
-                )
-
                 # generated token ids for the question enchanced with CoT.
-                generated_ids = outputs.sequences[0][len(inputs["input_ids"][0]) : -1]
+                generated_ids = generated_tokens
                 # generated text for the question enchaced with CoT
                 to_parse = model.tokenizer.decode(generated_ids, skip_special_tokens=True)
 
                 llm_answer, steps_dict, response = parse_response_to_dict(to_parse)
-                if generated_ids.size(0) >= self.max_length_cot:
-                    # log.debug(f'New Reasoning Tokens Are Too Much, Current try is {n_of_retries + 1}')
-                    n_of_retries += 1
-                    continue
-                elif generated_ids.size(0) == 0:
-                    # log.debug(f'New Reasoning Tokens Are Null, Current try is {n_of_retries + 1}')
+                
+                if len(generated_ids) == 0:
+                    log.info(f'New Reasoning Tokens Are Null, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
-                elif llm_answer is None or llm_answer in ["", " "]:
-                    # log.debug(f'New Reasoning Tokens Are None, Current try is {n_of_retries + 1}')
+                if llm_answer is None or llm_answer in ["", " "]:
+                    log.info(f'New Reasoning Tokens Are None, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
 
-                # reasoning tokens without final answer
+                # reasoning tokens
                 response_tokens = model.tokenizer.tokenize(response)
-                # reasoning token ids without final answer
-                # response_token_ids = model.tokenizer.convert_tokens_to_ids(response_tokens)
                 # full reasoning tokens
                 original_tokens = model.tokenizer.convert_ids_to_tokens(generated_ids)
-
                 probabilities = [
-                    {i: p for i, p in enumerate(prob[0]) if p > 0}
-                    for prob in [torch.softmax(score, dim=1).tolist() for score in outputs.scores]
+                    {i: p for i, p in enumerate(prob) if p > 0}
+                    for prob in [torch.softmax(torch.from_numpy(score), dim=0).tolist() for score in generated_log_probs]
                 ]
 
                 final_answer_probabilities = {}
@@ -334,7 +374,7 @@ def __call__(
                     generated_ids,
                 )
                 if answer_start_indice is None:
-                    # log.debug(f'Cannot locate the Final Answer, Current try is {n_of_retries + 1}')
+                    log.info(f'Cannot locate the Final Answer, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
                 answer_probs = []
@@ -350,29 +390,28 @@ def __call__(
                     n_of_retries += 1
                     continue
                 final_answer_probabilities[llm_answer] = answer_probs
-                final_answer_token_ids[llm_answer] = answer_token_ids.tolist()
+                final_answer_token_ids[llm_answer] = answer_token_ids
 
                 # exacts_prompt = get_step_exact_tokens(args, q, response)
                 keywords_extraction_prompt = keywords_extraction_instruction.replace('<QUESTION>', question).replace('<RESPONSE>', response)
+                chat = [{"role": "user", "content": keywords_extraction_prompt},]
+                keywords_extraction_prompt = model.tokenizer.apply_chat_template(chat, tokenize=False)
+                
                 keywords_extraction_prompt_output = predict(keywords_extraction_prompt, model, model.tokenizer, self.max_length_cot, self.temperature)
-
-                if "NO ANSWER" in keywords_extraction_prompt_output:
-                    # log.debug(f'Exact Tokens Have NO ANSWER, Current try is {n_of_retries + 1}')
-                    n_of_retries += 1
-                    continue
+                
                 parsed_keywords_output = step_exacts_2_list(keywords_extraction_prompt_output)
                 if not parsed_keywords_output:
-                    # log.debug(f'Exact Tokens Have no contribution scores, Current try is {n_of_retries + 1}')
+                    log.info(f'Exact Tokens Have no contribution scores, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
                 extracted_keywords, keywords_list, contributions_list = parsed_keywords_output
                 if len(keywords_list) == 0:
-                    # log.debug(f'Cannot Exract Effective Keywords, Current try is {n_of_retries + 1}')
+                    log.info(f'Cannot Exract Effective Keywords, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
 
                 if len(steps_dict) > len(keywords_list):
-                    # log.debug(f'Len of keywords list doesn\'t match the len of step dict, Current try is {n_of_retries + 1}')
+                    log.info(f'Len of keywords list doesn\'t match the len of step dict, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
 
@@ -384,9 +423,10 @@ def __call__(
                     keywords = keywords_list[step_idx]
                     contributions = contributions_list[step_idx]
                     if len(keywords) == 1 and keywords[0] == "NO ANSWER":
+                        log.info("NO answer")
                         continue
                     step_tokens = model.tokenizer.tokenize(step_text)
-                    space_token = model.tokenizer.tokenize(" ")
+                    space_token = model.tokenizer.tokenize(" ")[0]
                     processed_step_tokens = [
                         (token[1:] if token.startswith(space_token) else token)
                         for token in step_tokens
@@ -401,7 +441,7 @@ def __call__(
                         keyword_probs = []
                         keyword_token_ids = []
                         if is_word_in_sentence(step_text, keyword) is not True:
-                            # log.debug(f"\n{step_name}-Keyword-{keyword_idx} Does not appear in the Step Text")
+                            log.info(f"\n{step_name}-Keyword-{keyword_idx} Does not appear in the Step Text")
                             continue
                         keyword_token_start_idx, keyword_token_end_idx = find_token_indices(
                             processed_step_tokens, keyword
@@ -409,21 +449,20 @@ def __call__(
                         keyword_token_ids = generated_ids[
                             start_position + keyword_token_start_idx : start_position + keyword_token_end_idx + 1
                         ]
-                        keyword_token_ids = keyword_token_ids.data.cpu().numpy()
 
                         for j, token_id in enumerate(keyword_token_ids):
                             idxx = start_position + keyword_token_start_idx + j
                             keyword_probs.append(probabilities[idxx][token_id])
                         keywords_probabilities_dict[keyword] = keyword_probs
                         keywords_contributions_dict[keyword] = int(contributions[keyword_idx])
-                        keywords_token_ids_dict[keyword] = keyword_token_ids.tolist()
+                        keywords_token_ids_dict[keyword] = keyword_token_ids
 
                     keywords_probabilities[step_name] = keywords_probabilities_dict
                     keywords_contributions[step_name] = keywords_contributions_dict
                     keywords_token_ids[step_name] = keywords_token_ids_dict
 
                 if is_effectively_empty(keywords_probabilities):
-                    # log.debug(f'Token Probability from All Steps are All None, Current try is {n_of_retries + 1}')
+                    log.info(f'Token Probability from All Steps are All None, Current try is {n_of_retries + 1}')
                     n_of_retries += 1
                     continue
 
diff --git a/src/lm_polygraph/utils/manager.py b/src/lm_polygraph/utils/manager.py
index c6416b4bd..131c9cfd1 100644
--- a/src/lm_polygraph/utils/manager.py
+++ b/src/lm_polygraph/utils/manager.py
@@ -58,6 +58,24 @@ def _delete_nans(ue, metric):
     return clipped_ue, new_metric
 
 
+def _recombine_data(ue, gen_metric, inputs):
+    ue = np.array(ue)
+    gen_metric = np.array(gen_metric)
+
+    # np.unique() with return_counts=True?
+    recombined_inputs = defaultdict(list)
+    for i, input_text in enumerate(inputs):
+        recombined_inputs[input_text].append(i)
+
+    recombined_ue, recombined_gen_metric = [], []
+    for input_text, ids in recombined_inputs.items():
+        recombined_ue.append(ue[ids].mean())
+        # Assumes that metric is bigger for better generations!
+        recombined_gen_metric.append(gen_metric[ids].max())
+
+    return recombined_ue, recombined_gen_metric
+
+
 def order_calculators(
     stats: List[str],
     stat_calculators: Dict[str, StatCalculator],

From a26686c8245bfdc4266f27627b88fd2f4b741bcc Mon Sep 17 00:00:00 2001
From: ConstFr <denis@agent-bot-dev-vm.us-central1-c.c.agent-bot-dev.internal>
Date: Tue, 6 May 2025 07:27:31 +0000
Subject: [PATCH 5/6] fixed target/output postprocessing

---
 examples/configs/base_processing_hotpot.yaml  |   6 +
 .../output_processing_scripts/hotpot.py       |  14 +
 .../configs/polygraph_eval_cot_hotpot.yaml    |   4 +-
 notebooks/vizualization_tables.ipynb          | 536 +++++++++++++++++-
 4 files changed, 550 insertions(+), 10 deletions(-)
 create mode 100644 examples/configs/base_processing_hotpot.yaml
 create mode 100644 examples/configs/instruct/output_processing_scripts/hotpot.py

diff --git a/examples/configs/base_processing_hotpot.yaml b/examples/configs/base_processing_hotpot.yaml
new file mode 100644
index 000000000..489adc232
--- /dev/null
+++ b/examples/configs/base_processing_hotpot.yaml
@@ -0,0 +1,6 @@
+process_output_fn:
+  path: instruct/output_processing_scripts/hotpot.py
+  fn_name: process_output_cot_hotpot
+process_target_fn:
+  path: instruct/output_processing_scripts/hotpot.py
+  fn_name: process_target_cot_hotpot
\ No newline at end of file
diff --git a/examples/configs/instruct/output_processing_scripts/hotpot.py b/examples/configs/instruct/output_processing_scripts/hotpot.py
new file mode 100644
index 000000000..bd7cd480f
--- /dev/null
+++ b/examples/configs/instruct/output_processing_scripts/hotpot.py
@@ -0,0 +1,14 @@
+import re
+import string
+
+CoT_OUTPUT_IGNORE_REGEX = re.compile(r"(?s).*Final Answer:")
+
+def process_output_cot_hotpot(output: str) -> str:
+    output = CoT_OUTPUT_IGNORE_REGEX.sub("", output).lower().strip()
+    return output
+
+def process_target_cot_hotpot(target: str) -> str:
+    target = target.lower().strip()
+    target = target.translate(str.maketrans("", "", string.punctuation))
+    
+    return target
diff --git a/examples/configs/polygraph_eval_cot_hotpot.yaml b/examples/configs/polygraph_eval_cot_hotpot.yaml
index be3994c36..0952ec455 100644
--- a/examples/configs/polygraph_eval_cot_hotpot.yaml
+++ b/examples/configs/polygraph_eval_cot_hotpot.yaml
@@ -6,6 +6,7 @@ defaults:
   - model: bloomz-560m
   - estimators: cot_estimators
   - stat_calculators: default_calculators
+  - base_processing_hotpot
   - _self_
 
 cache_path: ./workdir/output
@@ -21,13 +22,10 @@ eval_split: validation
 few_shot_prompt: null
 max_new_tokens: 256
 load_from_disk: false
-normalize: true
 trust_remote_code: false
 size: 100
 
 
-output_ignore_regex: "(?s).*Final Answer:"
-
 subsample_eval_dataset: 10
 
 generation_metrics: null
diff --git a/notebooks/vizualization_tables.ipynb b/notebooks/vizualization_tables.ipynb
index 66b016072..bff6d1f00 100644
--- a/notebooks/vizualization_tables.ipynb
+++ b/notebooks/vizualization_tables.ipynb
@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "999822a8",
    "metadata": {},
    "outputs": [],
@@ -104,18 +104,540 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "31c03154",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Will measure variance using 1 seeds\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_5f6c1 td:hover {\n",
+       "  background-color: #ffffb3;\n",
+       "}\n",
+       "#T_5f6c1 .index_name {\n",
+       "  font-style: italic;\n",
+       "  color: darkgrey;\n",
+       "  font-weight: normal;\n",
+       "}\n",
+       "#T_5f6c1 th.col4 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 td.col4 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 th.col8 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 td.col8 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 th.col12 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 td.col12 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 th.col16 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1 td.col16 {\n",
+       "  border-left: 1px solid black;\n",
+       "}\n",
+       "#T_5f6c1_row0_col0, #T_5f6c1_row0_col1, #T_5f6c1_row0_col2, #T_5f6c1_row0_col3, #T_5f6c1_row1_col1, #T_5f6c1_row1_col2, #T_5f6c1_row2_col1, #T_5f6c1_row2_col2, #T_5f6c1_row5_col1, #T_5f6c1_row5_col2, #T_5f6c1_row6_col1, #T_5f6c1_row6_col2, #T_5f6c1_row7_col0, #T_5f6c1_row7_col1, #T_5f6c1_row7_col2, #T_5f6c1_row7_col3, #T_5f6c1_row7_col5, #T_5f6c1_row7_col6, #T_5f6c1_row7_col8, #T_5f6c1_row7_col9, #T_5f6c1_row7_col10, #T_5f6c1_row7_col11, #T_5f6c1_row7_col12, #T_5f6c1_row7_col13, #T_5f6c1_row7_col14, #T_5f6c1_row7_col15, #T_5f6c1_row7_col16, #T_5f6c1_row7_col17, #T_5f6c1_row7_col18, #T_5f6c1_row7_col19, #T_5f6c1_row8_col1, #T_5f6c1_row8_col2, #T_5f6c1_row8_col4, #T_5f6c1_row8_col5, #T_5f6c1_row8_col6, #T_5f6c1_row8_col7, #T_5f6c1_row8_col9, #T_5f6c1_row8_col10, #T_5f6c1_row8_col13, #T_5f6c1_row8_col14, #T_5f6c1_row8_col17, #T_5f6c1_row8_col18, #T_5f6c1_row9_col0, #T_5f6c1_row9_col1, #T_5f6c1_row9_col2, #T_5f6c1_row9_col3, #T_5f6c1_row9_col5, #T_5f6c1_row9_col6, #T_5f6c1_row9_col12, #T_5f6c1_row9_col13, #T_5f6c1_row9_col14, #T_5f6c1_row9_col15, #T_5f6c1_row10_col1, #T_5f6c1_row10_col2, #T_5f6c1_row10_col12, #T_5f6c1_row10_col13, #T_5f6c1_row10_col14, #T_5f6c1_row10_col15, #T_5f6c1_row11_col1, #T_5f6c1_row11_col2, #T_5f6c1_row11_col12, #T_5f6c1_row11_col13, #T_5f6c1_row11_col14, #T_5f6c1_row11_col15, #T_5f6c1_row12_col1, #T_5f6c1_row12_col2, #T_5f6c1_row12_col5, #T_5f6c1_row12_col6, #T_5f6c1_row12_col8, #T_5f6c1_row12_col11, #T_5f6c1_row12_col12, #T_5f6c1_row12_col13, #T_5f6c1_row12_col14, #T_5f6c1_row12_col15, #T_5f6c1_row12_col16, #T_5f6c1_row12_col19 {\n",
+       "  background-color: #fb7c5c;\n",
+       "}\n",
+       "#T_5f6c1_row0_col4, #T_5f6c1_row0_col7 {\n",
+       "  background-color: #f6583e;\n",
+       "}\n",
+       "#T_5f6c1_row0_col5, #T_5f6c1_row0_col6, #T_5f6c1_row1_col5, #T_5f6c1_row1_col6 {\n",
+       "  background-color: #fa6849;\n",
+       "}\n",
+       "#T_5f6c1_row0_col8, #T_5f6c1_row0_col11, #T_5f6c1_row0_col16, #T_5f6c1_row0_col19 {\n",
+       "  background-color: #f24734;\n",
+       "}\n",
+       "#T_5f6c1_row0_col9, #T_5f6c1_row0_col10, #T_5f6c1_row0_col17, #T_5f6c1_row0_col18, #T_5f6c1_row1_col9, #T_5f6c1_row1_col10, #T_5f6c1_row1_col17, #T_5f6c1_row1_col18, #T_5f6c1_row10_col9, #T_5f6c1_row10_col10, #T_5f6c1_row10_col17, #T_5f6c1_row10_col18, #T_5f6c1_row11_col9, #T_5f6c1_row11_col10, #T_5f6c1_row11_col17, #T_5f6c1_row11_col18 {\n",
+       "  background-color: #f34c37;\n",
+       "}\n",
+       "#T_5f6c1_row0_col12, #T_5f6c1_row0_col15, #T_5f6c1_row5_col12, #T_5f6c1_row5_col15 {\n",
+       "  background-color: #ef3c2c;\n",
+       "}\n",
+       "#T_5f6c1_row0_col13, #T_5f6c1_row0_col14, #T_5f6c1_row1_col13, #T_5f6c1_row1_col14, #T_5f6c1_row4_col13, #T_5f6c1_row4_col14, #T_5f6c1_row5_col13, #T_5f6c1_row5_col14 {\n",
+       "  background-color: #ea362a;\n",
+       "}\n",
+       "#T_5f6c1_row1_col0, #T_5f6c1_row1_col3, #T_5f6c1_row12_col0, #T_5f6c1_row12_col3 {\n",
+       "  background-color: #c1161b;\n",
+       "}\n",
+       "#T_5f6c1_row1_col4, #T_5f6c1_row1_col7 {\n",
+       "  background-color: #f24633;\n",
+       "}\n",
+       "#T_5f6c1_row1_col8, #T_5f6c1_row1_col11, #T_5f6c1_row1_col16, #T_5f6c1_row1_col19 {\n",
+       "  background-color: #ed392b;\n",
+       "}\n",
+       "#T_5f6c1_row1_col12, #T_5f6c1_row1_col15 {\n",
+       "  background-color: #a60f15;\n",
+       "}\n",
+       "#T_5f6c1_row2_col0, #T_5f6c1_row2_col3, #T_5f6c1_row10_col0, #T_5f6c1_row10_col3, #T_5f6c1_row11_col0, #T_5f6c1_row11_col3 {\n",
+       "  background-color: #a91016;\n",
+       "}\n",
+       "#T_5f6c1_row2_col4, #T_5f6c1_row2_col7 {\n",
+       "  background-color: #e32f27;\n",
+       "}\n",
+       "#T_5f6c1_row2_col5, #T_5f6c1_row2_col6 {\n",
+       "  background-color: #d42121;\n",
+       "}\n",
+       "#T_5f6c1_row2_col8, #T_5f6c1_row2_col11, #T_5f6c1_row2_col16, #T_5f6c1_row2_col19 {\n",
+       "  background-color: #dc2924;\n",
+       "}\n",
+       "#T_5f6c1_row2_col9, #T_5f6c1_row2_col10, #T_5f6c1_row2_col17, #T_5f6c1_row2_col18, #T_5f6c1_row4_col9, #T_5f6c1_row4_col10, #T_5f6c1_row4_col17, #T_5f6c1_row4_col18 {\n",
+       "  background-color: #d82422;\n",
+       "}\n",
+       "#T_5f6c1_row2_col12, #T_5f6c1_row2_col15, #T_5f6c1_row3_col12, #T_5f6c1_row3_col15 {\n",
+       "  background-color: #7a0510;\n",
+       "}\n",
+       "#T_5f6c1_row2_col13, #T_5f6c1_row2_col14, #T_5f6c1_row3_col13, #T_5f6c1_row3_col14 {\n",
+       "  background-color: #bb141a;\n",
+       "}\n",
+       "#T_5f6c1_row3_col0, #T_5f6c1_row3_col1, #T_5f6c1_row3_col2, #T_5f6c1_row3_col3, #T_5f6c1_row4_col0, #T_5f6c1_row4_col1, #T_5f6c1_row4_col2, #T_5f6c1_row4_col3, #T_5f6c1_row6_col4, #T_5f6c1_row6_col5, #T_5f6c1_row6_col6, #T_5f6c1_row6_col7, #T_5f6c1_row6_col8, #T_5f6c1_row6_col9, #T_5f6c1_row6_col10, #T_5f6c1_row6_col11, #T_5f6c1_row6_col12, #T_5f6c1_row6_col13, #T_5f6c1_row6_col14, #T_5f6c1_row6_col15, #T_5f6c1_row6_col16, #T_5f6c1_row6_col17, #T_5f6c1_row6_col18, #T_5f6c1_row6_col19 {\n",
+       "  background-color: #67000d;\n",
+       "}\n",
+       "#T_5f6c1_row3_col4, #T_5f6c1_row3_col7 {\n",
+       "  background-color: #cb181d;\n",
+       "}\n",
+       "#T_5f6c1_row3_col5, #T_5f6c1_row3_col6 {\n",
+       "  background-color: #9a0c14;\n",
+       "}\n",
+       "#T_5f6c1_row3_col8, #T_5f6c1_row3_col11, #T_5f6c1_row3_col16, #T_5f6c1_row3_col19 {\n",
+       "  background-color: #c5171c;\n",
+       "}\n",
+       "#T_5f6c1_row3_col9, #T_5f6c1_row3_col10, #T_5f6c1_row3_col17, #T_5f6c1_row3_col18 {\n",
+       "  background-color: #9f0e14;\n",
+       "}\n",
+       "#T_5f6c1_row4_col4, #T_5f6c1_row4_col7 {\n",
+       "  background-color: #b31218;\n",
+       "}\n",
+       "#T_5f6c1_row4_col5, #T_5f6c1_row4_col6 {\n",
+       "  background-color: #c3161b;\n",
+       "}\n",
+       "#T_5f6c1_row4_col8, #T_5f6c1_row4_col11, #T_5f6c1_row4_col16, #T_5f6c1_row4_col19 {\n",
+       "  background-color: #b61319;\n",
+       "}\n",
+       "#T_5f6c1_row4_col12, #T_5f6c1_row4_col15 {\n",
+       "  background-color: #8c0912;\n",
+       "}\n",
+       "#T_5f6c1_row5_col0, #T_5f6c1_row5_col3, #T_5f6c1_row8_col0, #T_5f6c1_row8_col3, #T_5f6c1_row10_col4, #T_5f6c1_row10_col7 {\n",
+       "  background-color: #e22e27;\n",
+       "}\n",
+       "#T_5f6c1_row5_col4, #T_5f6c1_row5_col7 {\n",
+       "  background-color: #e63328;\n",
+       "}\n",
+       "#T_5f6c1_row5_col5, #T_5f6c1_row5_col6 {\n",
+       "  background-color: #f85f43;\n",
+       "}\n",
+       "#T_5f6c1_row5_col8, #T_5f6c1_row5_col11, #T_5f6c1_row5_col16, #T_5f6c1_row5_col19 {\n",
+       "  background-color: #db2824;\n",
+       "}\n",
+       "#T_5f6c1_row5_col9, #T_5f6c1_row5_col10, #T_5f6c1_row5_col17, #T_5f6c1_row5_col18 {\n",
+       "  background-color: #ec382b;\n",
+       "}\n",
+       "#T_5f6c1_row6_col0, #T_5f6c1_row6_col3 {\n",
+       "  background-color: #79040f;\n",
+       "}\n",
+       "#T_5f6c1_row7_col4, #T_5f6c1_row7_col7 {\n",
+       "  background-color: #fb7757;\n",
+       "}\n",
+       "#T_5f6c1_row8_col8, #T_5f6c1_row8_col11, #T_5f6c1_row8_col16, #T_5f6c1_row8_col19 {\n",
+       "  background-color: #fb7555;\n",
+       "}\n",
+       "#T_5f6c1_row8_col12, #T_5f6c1_row8_col15 {\n",
+       "  background-color: #f96044;\n",
+       "}\n",
+       "#T_5f6c1_row9_col4, #T_5f6c1_row9_col7 {\n",
+       "  background-color: #f6563d;\n",
+       "}\n",
+       "#T_5f6c1_row9_col8, #T_5f6c1_row9_col11, #T_5f6c1_row9_col16, #T_5f6c1_row9_col19 {\n",
+       "  background-color: #fb6b4b;\n",
+       "}\n",
+       "#T_5f6c1_row9_col9, #T_5f6c1_row9_col10, #T_5f6c1_row9_col17, #T_5f6c1_row9_col18, #T_5f6c1_row12_col9, #T_5f6c1_row12_col10, #T_5f6c1_row12_col17, #T_5f6c1_row12_col18 {\n",
+       "  background-color: #fb7b5b;\n",
+       "}\n",
+       "#T_5f6c1_row10_col5, #T_5f6c1_row10_col6, #T_5f6c1_row11_col5, #T_5f6c1_row11_col6 {\n",
+       "  background-color: #f03f2e;\n",
+       "}\n",
+       "#T_5f6c1_row10_col8, #T_5f6c1_row10_col11, #T_5f6c1_row10_col16, #T_5f6c1_row10_col19, #T_5f6c1_row11_col8, #T_5f6c1_row11_col11, #T_5f6c1_row11_col16, #T_5f6c1_row11_col19 {\n",
+       "  background-color: #f7593f;\n",
+       "}\n",
+       "#T_5f6c1_row11_col4, #T_5f6c1_row11_col7 {\n",
+       "  background-color: #f14331;\n",
+       "}\n",
+       "#T_5f6c1_row12_col4, #T_5f6c1_row12_col7 {\n",
+       "  background-color: #fb6d4d;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_5f6c1\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th id=\"T_5f6c1_level0_col0\" class=\"col_heading level0 col0\" colspan=\"20\">HotpotQA, Llama3.2-3b</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level1\" >&nbsp;</th>\n",
+       "      <th id=\"T_5f6c1_level1_col0\" class=\"col_heading level1 col0\" colspan=\"4\">Accuracy</th>\n",
+       "      <th id=\"T_5f6c1_level1_col4\" class=\"col_heading level1 col4\" colspan=\"4\">BLEU</th>\n",
+       "      <th id=\"T_5f6c1_level1_col8\" class=\"col_heading level1 col8\" colspan=\"4\">Rouge_rouge1</th>\n",
+       "      <th id=\"T_5f6c1_level1_col12\" class=\"col_heading level1 col12\" colspan=\"4\">Rouge_rouge2</th>\n",
+       "      <th id=\"T_5f6c1_level1_col16\" class=\"col_heading level1 col16\" colspan=\"4\">Rouge_rougeL</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level2\" >&nbsp;</th>\n",
+       "      <th id=\"T_5f6c1_level2_col0\" class=\"col_heading level2 col0\" >prr</th>\n",
+       "      <th id=\"T_5f6c1_level2_col1\" class=\"col_heading level2 col1\" >prr_0.5</th>\n",
+       "      <th id=\"T_5f6c1_level2_col2\" class=\"col_heading level2 col2\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col3\" class=\"col_heading level2 col3\" >prr_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col4\" class=\"col_heading level2 col4\" >prr</th>\n",
+       "      <th id=\"T_5f6c1_level2_col5\" class=\"col_heading level2 col5\" >prr_0.5</th>\n",
+       "      <th id=\"T_5f6c1_level2_col6\" class=\"col_heading level2 col6\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col7\" class=\"col_heading level2 col7\" >prr_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col8\" class=\"col_heading level2 col8\" >prr</th>\n",
+       "      <th id=\"T_5f6c1_level2_col9\" class=\"col_heading level2 col9\" >prr_0.5</th>\n",
+       "      <th id=\"T_5f6c1_level2_col10\" class=\"col_heading level2 col10\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col11\" class=\"col_heading level2 col11\" >prr_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col12\" class=\"col_heading level2 col12\" >prr</th>\n",
+       "      <th id=\"T_5f6c1_level2_col13\" class=\"col_heading level2 col13\" >prr_0.5</th>\n",
+       "      <th id=\"T_5f6c1_level2_col14\" class=\"col_heading level2 col14\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col15\" class=\"col_heading level2 col15\" >prr_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col16\" class=\"col_heading level2 col16\" >prr</th>\n",
+       "      <th id=\"T_5f6c1_level2_col17\" class=\"col_heading level2 col17\" >prr_0.5</th>\n",
+       "      <th id=\"T_5f6c1_level2_col18\" class=\"col_heading level2 col18\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_5f6c1_level2_col19\" class=\"col_heading level2 col19\" >prr_normalized</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row0\" class=\"row_heading level0 row0\" >MaximumSequenceProbability</th>\n",
+       "      <td id=\"T_5f6c1_row0_col0\" class=\"data row0 col0\" >29.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col1\" class=\"data row0 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col2\" class=\"data row0 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col3\" class=\"data row0 col3\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col4\" class=\"data row0 col4\" >55.70 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col5\" class=\"data row0 col5\" >41.07 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col6\" class=\"data row0 col6\" >78.16 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col7\" class=\"data row0 col7\" >67.37 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col8\" class=\"data row0 col8\" >57.92 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col9\" class=\"data row0 col9\" >45.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col10\" class=\"data row0 col10\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col11\" class=\"data row0 col11\" >53.28 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col12\" class=\"data row0 col12\" >46.94 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col13\" class=\"data row0 col13\" >32.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col14\" class=\"data row0 col14\" >-10.35 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col15\" class=\"data row0 col15\" >42.17 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col16\" class=\"data row0 col16\" >57.92 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col17\" class=\"data row0 col17\" >45.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col18\" class=\"data row0 col18\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row0_col19\" class=\"data row0 col19\" >53.28 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row1\" class=\"row_heading level0 row1\" >Perplexity</th>\n",
+       "      <td id=\"T_5f6c1_row1_col0\" class=\"data row1 col0\" >14.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col1\" class=\"data row1 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col2\" class=\"data row1 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col3\" class=\"data row1 col3\" >21.71 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col4\" class=\"data row1 col4\" >51.20 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col5\" class=\"data row1 col5\" >41.07 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col6\" class=\"data row1 col6\" >78.16 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col7\" class=\"data row1 col7\" >53.99 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col8\" class=\"data row1 col8\" >53.42 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col9\" class=\"data row1 col9\" >45.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col10\" class=\"data row1 col10\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col11\" class=\"data row1 col11\" >40.13 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col12\" class=\"data row1 col12\" >21.94 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col13\" class=\"data row1 col13\" >32.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col14\" class=\"data row1 col14\" >-10.35 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col15\" class=\"data row1 col15\" >-37.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col16\" class=\"data row1 col16\" >53.42 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col17\" class=\"data row1 col17\" >45.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col18\" class=\"data row1 col18\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row1_col19\" class=\"data row1 col19\" >40.13 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row2\" class=\"row_heading level0 row2\" >MeanTokenEntropy</th>\n",
+       "      <td id=\"T_5f6c1_row2_col0\" class=\"data row2 col0\" >10.96 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col1\" class=\"data row2 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col2\" class=\"data row2 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col3\" class=\"data row2 col3\" >4.32 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col4\" class=\"data row2 col4\" >44.02 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col5\" class=\"data row2 col5\" >33.38 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col6\" class=\"data row2 col6\" >0.31 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col7\" class=\"data row2 col7\" >32.64 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col8\" class=\"data row2 col8\" >47.18 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col9\" class=\"data row2 col9\" >39.69 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col10\" class=\"data row2 col10\" >-2.72 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col11\" class=\"data row2 col11\" >21.90 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col12\" class=\"data row2 col12\" >13.06 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col13\" class=\"data row2 col13\" >26.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col14\" class=\"data row2 col14\" >-98.63 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col15\" class=\"data row2 col15\" >-66.38 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col16\" class=\"data row2 col16\" >47.18 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col17\" class=\"data row2 col17\" >39.69 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col18\" class=\"data row2 col18\" >-2.72 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row2_col19\" class=\"data row2 col19\" >21.90 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row3\" class=\"row_heading level0 row3\" >MeanPointwiseMutualInformation</th>\n",
+       "      <td id=\"T_5f6c1_row3_col0\" class=\"data row3 col0\" >4.79 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col1\" class=\"data row3 col1\" >9.58 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col2\" class=\"data row3 col2\" >-16.51 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col3\" class=\"data row3 col3\" >-27.87 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col4\" class=\"data row3 col4\" >36.26 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col5\" class=\"data row3 col5\" >26.84 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col6\" class=\"data row3 col6\" >-65.81 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col7\" class=\"data row3 col7\" >9.55 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col8\" class=\"data row3 col8\" >38.52 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col9\" class=\"data row3 col9\" >31.33 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col10\" class=\"data row3 col10\" >-73.75 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col11\" class=\"data row3 col11\" >-3.41 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col12\" class=\"data row3 col12\" >13.06 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col13\" class=\"data row3 col13\" >26.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col14\" class=\"data row3 col14\" >-98.63 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col15\" class=\"data row3 col15\" >-66.38 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col16\" class=\"data row3 col16\" >38.52 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col17\" class=\"data row3 col17\" >31.33 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col18\" class=\"data row3 col18\" >-73.75 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row3_col19\" class=\"data row3 col19\" >-3.41 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row4\" class=\"row_heading level0 row4\" >MeanConditionalPointwiseMutualInformation</th>\n",
+       "      <td id=\"T_5f6c1_row4_col0\" class=\"data row4 col0\" >4.79 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col1\" class=\"data row4 col1\" >9.58 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col2\" class=\"data row4 col2\" >-16.51 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col3\" class=\"data row4 col3\" >-27.87 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col4\" class=\"data row4 col4\" >28.45 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col5\" class=\"data row4 col5\" >31.21 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col6\" class=\"data row4 col6\" >-21.62 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col7\" class=\"data row4 col7\" >-13.66 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col8\" class=\"data row4 col8\" >32.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col9\" class=\"data row4 col9\" >39.75 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col10\" class=\"data row4 col10\" >-2.23 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col11\" class=\"data row4 col11\" >-20.18 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col12\" class=\"data row4 col12\" >16.39 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col13\" class=\"data row4 col13\" >32.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col14\" class=\"data row4 col14\" >-10.35 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col15\" class=\"data row4 col15\" >-55.70 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col16\" class=\"data row4 col16\" >32.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col17\" class=\"data row4 col17\" >39.75 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col18\" class=\"data row4 col18\" >-2.23 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row4_col19\" class=\"data row4 col19\" >-20.18 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row5\" class=\"row_heading level0 row5\" >PTrue</th>\n",
+       "      <td id=\"T_5f6c1_row5_col0\" class=\"data row5 col0\" >19.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col1\" class=\"data row5 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col2\" class=\"data row5 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col3\" class=\"data row5 col3\" >47.81 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col4\" class=\"data row5 col4\" >45.27 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col5\" class=\"data row5 col5\" >40.21 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col6\" class=\"data row5 col6\" >69.43 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col7\" class=\"data row5 col7\" >36.36 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col8\" class=\"data row5 col8\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col9\" class=\"data row5 col9\" >43.02 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col10\" class=\"data row5 col10\" >25.62 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col11\" class=\"data row5 col11\" >20.44 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col12\" class=\"data row5 col12\" >46.94 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col13\" class=\"data row5 col13\" >32.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col14\" class=\"data row5 col14\" >-10.35 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col15\" class=\"data row5 col15\" >42.17 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col16\" class=\"data row5 col16\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col17\" class=\"data row5 col17\" >43.02 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col18\" class=\"data row5 col18\" >25.62 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row5_col19\" class=\"data row5 col19\" >20.44 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row6\" class=\"row_heading level0 row6\" >PTrueSampling</th>\n",
+       "      <td id=\"T_5f6c1_row6_col0\" class=\"data row6 col0\" >6.46 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col1\" class=\"data row6 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col2\" class=\"data row6 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col3\" class=\"data row6 col3\" >-19.17 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col4\" class=\"data row6 col4\" >11.58 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col5\" class=\"data row6 col5\" >23.12 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col6\" class=\"data row6 col6\" >-103.47 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col7\" class=\"data row6 col7\" >-63.82 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col8\" class=\"data row6 col8\" >13.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col9\" class=\"data row6 col9\" >25.98 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col10\" class=\"data row6 col10\" >-119.30 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col11\" class=\"data row6 col11\" >-77.67 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col12\" class=\"data row6 col12\" >8.89 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col13\" class=\"data row6 col13\" >17.78 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col14\" class=\"data row6 col14\" >-208.98 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col15\" class=\"data row6 col15\" >-79.73 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col16\" class=\"data row6 col16\" >13.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col17\" class=\"data row6 col17\" >25.98 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col18\" class=\"data row6 col18\" >-119.30 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row6_col19\" class=\"data row6 col19\" >-77.67 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row7\" class=\"row_heading level0 row7\" >MonteCarloSequenceEntropy</th>\n",
+       "      <td id=\"T_5f6c1_row7_col0\" class=\"data row7 col0\" >29.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col1\" class=\"data row7 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col2\" class=\"data row7 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col3\" class=\"data row7 col3\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col4\" class=\"data row7 col4\" >64.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col5\" class=\"data row7 col5\" >43.23 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col6\" class=\"data row7 col6\" >99.95 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col7\" class=\"data row7 col7\" >93.51 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col8\" class=\"data row7 col8\" >73.87 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col9\" class=\"data row7 col9\" >51.74 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col10\" class=\"data row7 col10\" >99.74 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col11\" class=\"data row7 col11\" >99.90 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col12\" class=\"data row7 col12\" >65.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col13\" class=\"data row7 col13\" >41.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col14\" class=\"data row7 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col15\" class=\"data row7 col15\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col16\" class=\"data row7 col16\" >73.87 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col17\" class=\"data row7 col17\" >51.74 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col18\" class=\"data row7 col18\" >99.74 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row7_col19\" class=\"data row7 col19\" >99.90 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row8\" class=\"row_heading level0 row8\" >MonteCarloNormalizedSequenceEntropy</th>\n",
+       "      <td id=\"T_5f6c1_row8_col0\" class=\"data row8 col0\" >19.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col1\" class=\"data row8 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col2\" class=\"data row8 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col3\" class=\"data row8 col3\" >47.81 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col4\" class=\"data row8 col4\" >65.82 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col5\" class=\"data row8 col5\" >43.23 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col6\" class=\"data row8 col6\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col7\" class=\"data row8 col7\" >97.43 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col8\" class=\"data row8 col8\" >71.43 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col9\" class=\"data row8 col9\" >51.77 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col10\" class=\"data row8 col10\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col11\" class=\"data row8 col11\" >92.76 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col12\" class=\"data row8 col12\" >56.67 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col13\" class=\"data row8 col13\" >41.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col14\" class=\"data row8 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col15\" class=\"data row8 col15\" >73.31 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col16\" class=\"data row8 col16\" >71.43 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col17\" class=\"data row8 col17\" >51.77 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col18\" class=\"data row8 col18\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row8_col19\" class=\"data row8 col19\" >92.76 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row9\" class=\"row_heading level0 row9\" >EigenScore</th>\n",
+       "      <td id=\"T_5f6c1_row9_col0\" class=\"data row9 col0\" >29.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col1\" class=\"data row9 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col2\" class=\"data row9 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col3\" class=\"data row9 col3\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col4\" class=\"data row9 col4\" >55.39 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col5\" class=\"data row9 col5\" >43.22 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col6\" class=\"data row9 col6\" >99.83 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col7\" class=\"data row9 col7\" >66.44 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col8\" class=\"data row9 col8\" >68.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col9\" class=\"data row9 col9\" >51.67 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col10\" class=\"data row9 col10\" >99.14 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col11\" class=\"data row9 col11\" >82.75 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col12\" class=\"data row9 col12\" >65.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col13\" class=\"data row9 col13\" >41.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col14\" class=\"data row9 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col15\" class=\"data row9 col15\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col16\" class=\"data row9 col16\" >68.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col17\" class=\"data row9 col17\" >51.67 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col18\" class=\"data row9 col18\" >99.14 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row9_col19\" class=\"data row9 col19\" >82.75 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row10\" class=\"row_heading level0 row10\" >RenyiNeg</th>\n",
+       "      <td id=\"T_5f6c1_row10_col0\" class=\"data row10 col0\" >10.96 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col1\" class=\"data row10 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col2\" class=\"data row10 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col3\" class=\"data row10 col3\" >4.32 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col4\" class=\"data row10 col4\" >43.76 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col5\" class=\"data row10 col5\" >37.03 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col6\" class=\"data row10 col6\" >37.22 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col7\" class=\"data row10 col7\" >31.87 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col8\" class=\"data row10 col8\" >62.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col9\" class=\"data row10 col9\" >45.48 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col10\" class=\"data row10 col10\" >46.51 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col11\" class=\"data row10 col11\" >67.86 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col12\" class=\"data row10 col12\" >65.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col13\" class=\"data row10 col13\" >41.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col14\" class=\"data row10 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col15\" class=\"data row10 col15\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col16\" class=\"data row10 col16\" >62.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col17\" class=\"data row10 col17\" >45.48 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col18\" class=\"data row10 col18\" >46.51 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row10_col19\" class=\"data row10 col19\" >67.86 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row11\" class=\"row_heading level0 row11\" >FisherRao</th>\n",
+       "      <td id=\"T_5f6c1_row11_col0\" class=\"data row11 col0\" >10.96 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col1\" class=\"data row11 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col2\" class=\"data row11 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col3\" class=\"data row11 col3\" >4.32 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col4\" class=\"data row11 col4\" >50.30 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col5\" class=\"data row11 col5\" >37.03 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col6\" class=\"data row11 col6\" >37.25 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col7\" class=\"data row11 col7\" >51.30 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col8\" class=\"data row11 col8\" >62.92 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col9\" class=\"data row11 col9\" >45.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col10\" class=\"data row11 col10\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col11\" class=\"data row11 col11\" >67.89 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col12\" class=\"data row11 col12\" >65.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col13\" class=\"data row11 col13\" >41.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col14\" class=\"data row11 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col15\" class=\"data row11 col15\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col16\" class=\"data row11 col16\" >62.92 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col17\" class=\"data row11 col17\" >45.50 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col18\" class=\"data row11 col18\" >46.68 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row11_col19\" class=\"data row11 col19\" >67.89 ± 0.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5f6c1_level0_row12\" class=\"row_heading level0 row12\" >ProbasMeanWithCoT</th>\n",
+       "      <td id=\"T_5f6c1_row12_col0\" class=\"data row12 col0\" >14.29 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col1\" class=\"data row12 col1\" >12.91 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col2\" class=\"data row12 col2\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col3\" class=\"data row12 col3\" >21.71 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col4\" class=\"data row12 col4\" >61.23 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col5\" class=\"data row12 col5\" >43.22 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col6\" class=\"data row12 col6\" >99.87 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col7\" class=\"data row12 col7\" >83.79 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col8\" class=\"data row12 col8\" >73.85 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col9\" class=\"data row12 col9\" >51.69 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col10\" class=\"data row12 col10\" >99.31 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col11\" class=\"data row12 col11\" >99.83 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col12\" class=\"data row12 col12\" >65.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col13\" class=\"data row12 col13\" >41.11 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col14\" class=\"data row12 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col15\" class=\"data row12 col15\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col16\" class=\"data row12 col16\" >73.85 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col17\" class=\"data row12 col17\" >51.69 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col18\" class=\"data row12 col18\" >99.31 ± 0.00</td>\n",
+       "      <td id=\"T_5f6c1_row12_col19\" class=\"data row12 col19\" >99.83 ± 0.00</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<pandas.io.formats.style.Styler at 0x7f4c1ee9e7d0>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# visualize results in a table\n",
     "pretty_plot(\n",
-    "    'TriviaQA, Dolly3b',\n",
+    "    'HotpotQA, Llama3.2-3b',\n",
     "    # outputs generated by scripts/polygraph_eval benchmark\n",
     "    # provide several seeds to calculate variance\n",
-    "    ['./workdir/output_seed' + str(x)\n",
-    "     for x in range(1, 10)])"
+    "    [\"../workdir/output/qa/{'path': 'meta-llama/Llama-3.2-3B-Instruct', 'ensemble': False, 'mc': False, 'mc_seeds': None, 'dropout_rate': None, 'type': 'CausalLM', 'path_to_load_script': 'model/default_causal.py', 'load_model_args': {'device_map': 'auto'}, 'load_tokenizer_args': {}}/['denis1699/hotpot_cot']/2025-05-06/06-38-32/ue_manager_seed1\"])"
    ]
   },
   {
@@ -143,7 +665,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,

From 6e7c8d6ad31755a86fa0844e7790caed9f2af2eb Mon Sep 17 00:00:00 2001
From: ConstFr <denis@agent-bot-dev-vm.us-central1-c.c.agent-bot-dev.internal>
Date: Tue, 6 May 2025 10:57:03 +0000
Subject: [PATCH 6/6] fixed postprocessing v2

---
 .../output_processing_scripts/hotpot.py       |   1 +
 .../configs/polygraph_eval_cot_hotpot.yaml    |   4 +-
 notebooks/vizualization_tables.ipynb          | 858 ++++++++++--------
 3 files changed, 459 insertions(+), 404 deletions(-)

diff --git a/examples/configs/instruct/output_processing_scripts/hotpot.py b/examples/configs/instruct/output_processing_scripts/hotpot.py
index bd7cd480f..a1bcd9c9c 100644
--- a/examples/configs/instruct/output_processing_scripts/hotpot.py
+++ b/examples/configs/instruct/output_processing_scripts/hotpot.py
@@ -5,6 +5,7 @@
 
 def process_output_cot_hotpot(output: str) -> str:
     output = CoT_OUTPUT_IGNORE_REGEX.sub("", output).lower().strip()
+    output = output.translate(str.maketrans("", "", string.punctuation))
     return output
 
 def process_target_cot_hotpot(target: str) -> str:
diff --git a/examples/configs/polygraph_eval_cot_hotpot.yaml b/examples/configs/polygraph_eval_cot_hotpot.yaml
index 0952ec455..6d874f8a1 100644
--- a/examples/configs/polygraph_eval_cot_hotpot.yaml
+++ b/examples/configs/polygraph_eval_cot_hotpot.yaml
@@ -20,13 +20,13 @@ label_column: answer
 train_split: train
 eval_split: validation
 few_shot_prompt: null
-max_new_tokens: 256
+max_new_tokens: 384
 load_from_disk: false
 trust_remote_code: false
 size: 100
 
 
-subsample_eval_dataset: 10
+subsample_eval_dataset: 20
 
 generation_metrics: null
 
diff --git a/notebooks/vizualization_tables.ipynb b/notebooks/vizualization_tables.ipynb
index bff6d1f00..ea9303ed0 100644
--- a/notebooks/vizualization_tables.ipynb
+++ b/notebooks/vizualization_tables.ipynb
@@ -10,7 +10,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "id": "999822a8",
    "metadata": {},
    "outputs": [],
@@ -86,7 +86,7 @@
     "    mean_df = pd.DataFrame([[mean[row][col] for col in columns] for row in index],\n",
     "                           index=index, columns=pd.MultiIndex.from_tuples(columns))\n",
     "    \n",
-    "    s = total_df.style.apply(functools.partial(b_g, A=mean_df, cmap='Reds'), axis=0)\n",
+    "    s = total_df.style.apply(functools.partial(b_g, A=mean_df, cmap='Greens'), axis=0)\n",
     "    s.set_table_styles([{  # for row hover use <tr> instead of <td>\n",
     "        'selector': 'td:hover',\n",
     "        'props': [('background-color', '#ffffb3')]\n",
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "id": "31c03154",
    "metadata": {},
    "outputs": [
@@ -119,514 +119,568 @@
      "data": {
       "text/html": [
        "<style type=\"text/css\">\n",
-       "#T_5f6c1 td:hover {\n",
+       "#T_79db4 td:hover {\n",
        "  background-color: #ffffb3;\n",
        "}\n",
-       "#T_5f6c1 .index_name {\n",
+       "#T_79db4 .index_name {\n",
        "  font-style: italic;\n",
        "  color: darkgrey;\n",
        "  font-weight: normal;\n",
        "}\n",
-       "#T_5f6c1 th.col4 {\n",
+       "#T_79db4 th.col4 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 td.col4 {\n",
+       "#T_79db4 td.col4 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 th.col8 {\n",
+       "#T_79db4 th.col8 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 td.col8 {\n",
+       "#T_79db4 td.col8 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 th.col12 {\n",
+       "#T_79db4 th.col12 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 td.col12 {\n",
+       "#T_79db4 td.col12 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 th.col16 {\n",
+       "#T_79db4 th.col16 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1 td.col16 {\n",
+       "#T_79db4 td.col16 {\n",
        "  border-left: 1px solid black;\n",
        "}\n",
-       "#T_5f6c1_row0_col0, #T_5f6c1_row0_col1, #T_5f6c1_row0_col2, #T_5f6c1_row0_col3, #T_5f6c1_row1_col1, #T_5f6c1_row1_col2, #T_5f6c1_row2_col1, #T_5f6c1_row2_col2, #T_5f6c1_row5_col1, #T_5f6c1_row5_col2, #T_5f6c1_row6_col1, #T_5f6c1_row6_col2, #T_5f6c1_row7_col0, #T_5f6c1_row7_col1, #T_5f6c1_row7_col2, #T_5f6c1_row7_col3, #T_5f6c1_row7_col5, #T_5f6c1_row7_col6, #T_5f6c1_row7_col8, #T_5f6c1_row7_col9, #T_5f6c1_row7_col10, #T_5f6c1_row7_col11, #T_5f6c1_row7_col12, #T_5f6c1_row7_col13, #T_5f6c1_row7_col14, #T_5f6c1_row7_col15, #T_5f6c1_row7_col16, #T_5f6c1_row7_col17, #T_5f6c1_row7_col18, #T_5f6c1_row7_col19, #T_5f6c1_row8_col1, #T_5f6c1_row8_col2, #T_5f6c1_row8_col4, #T_5f6c1_row8_col5, #T_5f6c1_row8_col6, #T_5f6c1_row8_col7, #T_5f6c1_row8_col9, #T_5f6c1_row8_col10, #T_5f6c1_row8_col13, #T_5f6c1_row8_col14, #T_5f6c1_row8_col17, #T_5f6c1_row8_col18, #T_5f6c1_row9_col0, #T_5f6c1_row9_col1, #T_5f6c1_row9_col2, #T_5f6c1_row9_col3, #T_5f6c1_row9_col5, #T_5f6c1_row9_col6, #T_5f6c1_row9_col12, #T_5f6c1_row9_col13, #T_5f6c1_row9_col14, #T_5f6c1_row9_col15, #T_5f6c1_row10_col1, #T_5f6c1_row10_col2, #T_5f6c1_row10_col12, #T_5f6c1_row10_col13, #T_5f6c1_row10_col14, #T_5f6c1_row10_col15, #T_5f6c1_row11_col1, #T_5f6c1_row11_col2, #T_5f6c1_row11_col12, #T_5f6c1_row11_col13, #T_5f6c1_row11_col14, #T_5f6c1_row11_col15, #T_5f6c1_row12_col1, #T_5f6c1_row12_col2, #T_5f6c1_row12_col5, #T_5f6c1_row12_col6, #T_5f6c1_row12_col8, #T_5f6c1_row12_col11, #T_5f6c1_row12_col12, #T_5f6c1_row12_col13, #T_5f6c1_row12_col14, #T_5f6c1_row12_col15, #T_5f6c1_row12_col16, #T_5f6c1_row12_col19 {\n",
-       "  background-color: #fb7c5c;\n",
+       "#T_79db4_row0_col0, #T_79db4_row0_col3, #T_79db4_row1_col12, #T_79db4_row1_col15 {\n",
+       "  background-color: #004c1e;\n",
        "}\n",
-       "#T_5f6c1_row0_col4, #T_5f6c1_row0_col7 {\n",
-       "  background-color: #f6583e;\n",
+       "#T_79db4_row0_col1, #T_79db4_row0_col2, #T_79db4_row7_col8, #T_79db4_row7_col11, #T_79db4_row7_col16, #T_79db4_row7_col19 {\n",
+       "  background-color: #0d7836;\n",
        "}\n",
-       "#T_5f6c1_row0_col5, #T_5f6c1_row0_col6, #T_5f6c1_row1_col5, #T_5f6c1_row1_col6 {\n",
-       "  background-color: #fa6849;\n",
+       "#T_79db4_row0_col4, #T_79db4_row0_col7, #T_79db4_row0_col8, #T_79db4_row0_col11, #T_79db4_row0_col16, #T_79db4_row0_col19 {\n",
+       "  background-color: #004d1f;\n",
        "}\n",
-       "#T_5f6c1_row0_col8, #T_5f6c1_row0_col11, #T_5f6c1_row0_col16, #T_5f6c1_row0_col19 {\n",
-       "  background-color: #f24734;\n",
+       "#T_79db4_row0_col5, #T_79db4_row0_col6, #T_79db4_row3_col13, #T_79db4_row3_col14 {\n",
+       "  background-color: #117b38;\n",
        "}\n",
-       "#T_5f6c1_row0_col9, #T_5f6c1_row0_col10, #T_5f6c1_row0_col17, #T_5f6c1_row0_col18, #T_5f6c1_row1_col9, #T_5f6c1_row1_col10, #T_5f6c1_row1_col17, #T_5f6c1_row1_col18, #T_5f6c1_row10_col9, #T_5f6c1_row10_col10, #T_5f6c1_row10_col17, #T_5f6c1_row10_col18, #T_5f6c1_row11_col9, #T_5f6c1_row11_col10, #T_5f6c1_row11_col17, #T_5f6c1_row11_col18 {\n",
-       "  background-color: #f34c37;\n",
+       "#T_79db4_row0_col9, #T_79db4_row0_col10, #T_79db4_row0_col17, #T_79db4_row0_col18 {\n",
+       "  background-color: #127c39;\n",
        "}\n",
-       "#T_5f6c1_row0_col12, #T_5f6c1_row0_col15, #T_5f6c1_row5_col12, #T_5f6c1_row5_col15 {\n",
-       "  background-color: #ef3c2c;\n",
+       "#T_79db4_row0_col12, #T_79db4_row0_col15 {\n",
+       "  background-color: #005723;\n",
        "}\n",
-       "#T_5f6c1_row0_col13, #T_5f6c1_row0_col14, #T_5f6c1_row1_col13, #T_5f6c1_row1_col14, #T_5f6c1_row4_col13, #T_5f6c1_row4_col14, #T_5f6c1_row5_col13, #T_5f6c1_row5_col14 {\n",
-       "  background-color: #ea362a;\n",
+       "#T_79db4_row0_col13, #T_79db4_row0_col14, #T_79db4_row4_col4, #T_79db4_row4_col7 {\n",
+       "  background-color: #2f974e;\n",
        "}\n",
-       "#T_5f6c1_row1_col0, #T_5f6c1_row1_col3, #T_5f6c1_row12_col0, #T_5f6c1_row12_col3 {\n",
-       "  background-color: #c1161b;\n",
+       "#T_79db4_row1_col0, #T_79db4_row1_col3, #T_79db4_row1_col4, #T_79db4_row1_col7, #T_79db4_row1_col8, #T_79db4_row1_col11, #T_79db4_row1_col16, #T_79db4_row1_col19 {\n",
+       "  background-color: #005622;\n",
        "}\n",
-       "#T_5f6c1_row1_col4, #T_5f6c1_row1_col7 {\n",
-       "  background-color: #f24633;\n",
+       "#T_79db4_row1_col1, #T_79db4_row1_col2 {\n",
+       "  background-color: #005221;\n",
        "}\n",
-       "#T_5f6c1_row1_col8, #T_5f6c1_row1_col11, #T_5f6c1_row1_col16, #T_5f6c1_row1_col19 {\n",
-       "  background-color: #ed392b;\n",
+       "#T_79db4_row1_col5, #T_79db4_row1_col6, #T_79db4_row1_col9, #T_79db4_row1_col10, #T_79db4_row1_col17, #T_79db4_row1_col18 {\n",
+       "  background-color: #005120;\n",
        "}\n",
-       "#T_5f6c1_row1_col12, #T_5f6c1_row1_col15 {\n",
-       "  background-color: #a60f15;\n",
+       "#T_79db4_row1_col13, #T_79db4_row1_col14, #T_79db4_row3_col1, #T_79db4_row3_col2 {\n",
+       "  background-color: #016e2d;\n",
        "}\n",
-       "#T_5f6c1_row2_col0, #T_5f6c1_row2_col3, #T_5f6c1_row10_col0, #T_5f6c1_row10_col3, #T_5f6c1_row11_col0, #T_5f6c1_row11_col3 {\n",
-       "  background-color: #a91016;\n",
+       "#T_79db4_row2_col0, #T_79db4_row2_col1, #T_79db4_row2_col2, #T_79db4_row2_col3, #T_79db4_row2_col4, #T_79db4_row2_col5, #T_79db4_row2_col6, #T_79db4_row2_col7, #T_79db4_row2_col8, #T_79db4_row2_col9, #T_79db4_row2_col10, #T_79db4_row2_col11, #T_79db4_row2_col12, #T_79db4_row2_col13, #T_79db4_row2_col14, #T_79db4_row2_col15, #T_79db4_row2_col16, #T_79db4_row2_col17, #T_79db4_row2_col18, #T_79db4_row2_col19 {\n",
+       "  background-color: #00441b;\n",
        "}\n",
-       "#T_5f6c1_row2_col4, #T_5f6c1_row2_col7 {\n",
-       "  background-color: #e32f27;\n",
+       "#T_79db4_row3_col0, #T_79db4_row3_col3, #T_79db4_row3_col4, #T_79db4_row3_col7, #T_79db4_row3_col8, #T_79db4_row3_col11, #T_79db4_row3_col16, #T_79db4_row3_col19 {\n",
+       "  background-color: #2a924a;\n",
        "}\n",
-       "#T_5f6c1_row2_col5, #T_5f6c1_row2_col6 {\n",
-       "  background-color: #d42121;\n",
+       "#T_79db4_row3_col5, #T_79db4_row3_col6, #T_79db4_row7_col0, #T_79db4_row7_col3 {\n",
+       "  background-color: #05712f;\n",
        "}\n",
-       "#T_5f6c1_row2_col8, #T_5f6c1_row2_col11, #T_5f6c1_row2_col16, #T_5f6c1_row2_col19 {\n",
-       "  background-color: #dc2924;\n",
+       "#T_79db4_row3_col9, #T_79db4_row3_col10, #T_79db4_row3_col17, #T_79db4_row3_col18 {\n",
+       "  background-color: #077331;\n",
        "}\n",
-       "#T_5f6c1_row2_col9, #T_5f6c1_row2_col10, #T_5f6c1_row2_col17, #T_5f6c1_row2_col18, #T_5f6c1_row4_col9, #T_5f6c1_row4_col10, #T_5f6c1_row4_col17, #T_5f6c1_row4_col18 {\n",
-       "  background-color: #d82422;\n",
+       "#T_79db4_row3_col12, #T_79db4_row3_col15 {\n",
+       "  background-color: #17813d;\n",
        "}\n",
-       "#T_5f6c1_row2_col12, #T_5f6c1_row2_col15, #T_5f6c1_row3_col12, #T_5f6c1_row3_col15 {\n",
-       "  background-color: #7a0510;\n",
+       "#T_79db4_row4_col0, #T_79db4_row4_col3, #T_79db4_row11_col0, #T_79db4_row11_col3 {\n",
+       "  background-color: #2d954d;\n",
        "}\n",
-       "#T_5f6c1_row2_col13, #T_5f6c1_row2_col14, #T_5f6c1_row3_col13, #T_5f6c1_row3_col14 {\n",
-       "  background-color: #bb141a;\n",
+       "#T_79db4_row4_col1, #T_79db4_row4_col2, #T_79db4_row4_col8, #T_79db4_row4_col11, #T_79db4_row4_col16, #T_79db4_row4_col19 {\n",
+       "  background-color: #2f984f;\n",
        "}\n",
-       "#T_5f6c1_row3_col0, #T_5f6c1_row3_col1, #T_5f6c1_row3_col2, #T_5f6c1_row3_col3, #T_5f6c1_row4_col0, #T_5f6c1_row4_col1, #T_5f6c1_row4_col2, #T_5f6c1_row4_col3, #T_5f6c1_row6_col4, #T_5f6c1_row6_col5, #T_5f6c1_row6_col6, #T_5f6c1_row6_col7, #T_5f6c1_row6_col8, #T_5f6c1_row6_col9, #T_5f6c1_row6_col10, #T_5f6c1_row6_col11, #T_5f6c1_row6_col12, #T_5f6c1_row6_col13, #T_5f6c1_row6_col14, #T_5f6c1_row6_col15, #T_5f6c1_row6_col16, #T_5f6c1_row6_col17, #T_5f6c1_row6_col18, #T_5f6c1_row6_col19 {\n",
-       "  background-color: #67000d;\n",
+       "#T_79db4_row4_col5, #T_79db4_row4_col6 {\n",
+       "  background-color: #37a055;\n",
        "}\n",
-       "#T_5f6c1_row3_col4, #T_5f6c1_row3_col7 {\n",
-       "  background-color: #cb181d;\n",
+       "#T_79db4_row4_col9, #T_79db4_row4_col10, #T_79db4_row4_col17, #T_79db4_row4_col18, #T_79db4_row10_col8, #T_79db4_row10_col11, #T_79db4_row10_col16, #T_79db4_row10_col19, #T_79db4_row11_col4, #T_79db4_row11_col7 {\n",
+       "  background-color: #3ba458;\n",
        "}\n",
-       "#T_5f6c1_row3_col5, #T_5f6c1_row3_col6 {\n",
-       "  background-color: #9a0c14;\n",
+       "#T_79db4_row4_col12, #T_79db4_row4_col15 {\n",
+       "  background-color: #005a24;\n",
        "}\n",
-       "#T_5f6c1_row3_col8, #T_5f6c1_row3_col11, #T_5f6c1_row3_col16, #T_5f6c1_row3_col19 {\n",
-       "  background-color: #c5171c;\n",
+       "#T_79db4_row4_col13, #T_79db4_row4_col14 {\n",
+       "  background-color: #1f8742;\n",
        "}\n",
-       "#T_5f6c1_row3_col9, #T_5f6c1_row3_col10, #T_5f6c1_row3_col17, #T_5f6c1_row3_col18 {\n",
-       "  background-color: #9f0e14;\n",
+       "#T_79db4_row5_col0, #T_79db4_row5_col3, #T_79db4_row5_col4, #T_79db4_row5_col7, #T_79db4_row5_col8, #T_79db4_row5_col11, #T_79db4_row5_col13, #T_79db4_row5_col14, #T_79db4_row5_col16, #T_79db4_row5_col19, #T_79db4_row10_col12, #T_79db4_row10_col15 {\n",
+       "  background-color: #339c52;\n",
        "}\n",
-       "#T_5f6c1_row4_col4, #T_5f6c1_row4_col7 {\n",
-       "  background-color: #b31218;\n",
+       "#T_79db4_row5_col1, #T_79db4_row5_col2, #T_79db4_row5_col5, #T_79db4_row5_col6, #T_79db4_row5_col9, #T_79db4_row5_col10, #T_79db4_row5_col17, #T_79db4_row5_col18, #T_79db4_row9_col0, #T_79db4_row9_col3, #T_79db4_row9_col4, #T_79db4_row9_col7, #T_79db4_row9_col8, #T_79db4_row9_col11, #T_79db4_row9_col12, #T_79db4_row9_col15, #T_79db4_row9_col16, #T_79db4_row9_col19, #T_79db4_row10_col13, #T_79db4_row10_col14, #T_79db4_row11_col13, #T_79db4_row11_col14 {\n",
+       "  background-color: #88ce87;\n",
        "}\n",
-       "#T_5f6c1_row4_col5, #T_5f6c1_row4_col6 {\n",
-       "  background-color: #c3161b;\n",
+       "#T_79db4_row5_col12, #T_79db4_row5_col15 {\n",
+       "  background-color: #309950;\n",
        "}\n",
-       "#T_5f6c1_row4_col8, #T_5f6c1_row4_col11, #T_5f6c1_row4_col16, #T_5f6c1_row4_col19 {\n",
-       "  background-color: #b61319;\n",
+       "#T_79db4_row6_col0, #T_79db4_row6_col3 {\n",
+       "  background-color: #005c25;\n",
        "}\n",
-       "#T_5f6c1_row4_col12, #T_5f6c1_row4_col15 {\n",
-       "  background-color: #8c0912;\n",
+       "#T_79db4_row6_col1, #T_79db4_row6_col2 {\n",
+       "  background-color: #78c679;\n",
        "}\n",
-       "#T_5f6c1_row5_col0, #T_5f6c1_row5_col3, #T_5f6c1_row8_col0, #T_5f6c1_row8_col3, #T_5f6c1_row10_col4, #T_5f6c1_row10_col7 {\n",
-       "  background-color: #e22e27;\n",
+       "#T_79db4_row6_col4, #T_79db4_row6_col7, #T_79db4_row6_col8, #T_79db4_row6_col11, #T_79db4_row6_col16, #T_79db4_row6_col19 {\n",
+       "  background-color: #005b25;\n",
        "}\n",
-       "#T_5f6c1_row5_col4, #T_5f6c1_row5_col7 {\n",
-       "  background-color: #e63328;\n",
+       "#T_79db4_row6_col5, #T_79db4_row6_col6, #T_79db4_row10_col5, #T_79db4_row10_col6 {\n",
+       "  background-color: #6dc072;\n",
        "}\n",
-       "#T_5f6c1_row5_col5, #T_5f6c1_row5_col6 {\n",
-       "  background-color: #f85f43;\n",
+       "#T_79db4_row6_col9, #T_79db4_row6_col10, #T_79db4_row6_col17, #T_79db4_row6_col18 {\n",
+       "  background-color: #68be70;\n",
        "}\n",
-       "#T_5f6c1_row5_col8, #T_5f6c1_row5_col11, #T_5f6c1_row5_col16, #T_5f6c1_row5_col19 {\n",
-       "  background-color: #db2824;\n",
+       "#T_79db4_row6_col12, #T_79db4_row6_col15 {\n",
+       "  background-color: #0c7735;\n",
        "}\n",
-       "#T_5f6c1_row5_col9, #T_5f6c1_row5_col10, #T_5f6c1_row5_col17, #T_5f6c1_row5_col18 {\n",
-       "  background-color: #ec382b;\n",
+       "#T_79db4_row6_col13, #T_79db4_row6_col14, #T_79db4_row9_col13, #T_79db4_row9_col14 {\n",
+       "  background-color: #3fa95c;\n",
        "}\n",
-       "#T_5f6c1_row6_col0, #T_5f6c1_row6_col3 {\n",
-       "  background-color: #79040f;\n",
+       "#T_79db4_row7_col1, #T_79db4_row7_col2, #T_79db4_row8_col4, #T_79db4_row8_col7 {\n",
+       "  background-color: #3ca559;\n",
        "}\n",
-       "#T_5f6c1_row7_col4, #T_5f6c1_row7_col7 {\n",
-       "  background-color: #fb7757;\n",
+       "#T_79db4_row7_col4, #T_79db4_row7_col7 {\n",
+       "  background-color: #0a7633;\n",
        "}\n",
-       "#T_5f6c1_row8_col8, #T_5f6c1_row8_col11, #T_5f6c1_row8_col16, #T_5f6c1_row8_col19 {\n",
-       "  background-color: #fb7555;\n",
+       "#T_79db4_row7_col5, #T_79db4_row7_col6 {\n",
+       "  background-color: #45ad5f;\n",
        "}\n",
-       "#T_5f6c1_row8_col12, #T_5f6c1_row8_col15 {\n",
-       "  background-color: #f96044;\n",
+       "#T_79db4_row7_col9, #T_79db4_row7_col10, #T_79db4_row7_col17, #T_79db4_row7_col18 {\n",
+       "  background-color: #4aaf61;\n",
        "}\n",
-       "#T_5f6c1_row9_col4, #T_5f6c1_row9_col7 {\n",
-       "  background-color: #f6563d;\n",
+       "#T_79db4_row7_col12, #T_79db4_row7_col15 {\n",
+       "  background-color: #005f26;\n",
        "}\n",
-       "#T_5f6c1_row9_col8, #T_5f6c1_row9_col11, #T_5f6c1_row9_col16, #T_5f6c1_row9_col19 {\n",
-       "  background-color: #fb6b4b;\n",
+       "#T_79db4_row7_col13, #T_79db4_row7_col14, #T_79db4_row8_col13, #T_79db4_row8_col14 {\n",
+       "  background-color: #55b567;\n",
        "}\n",
-       "#T_5f6c1_row9_col9, #T_5f6c1_row9_col10, #T_5f6c1_row9_col17, #T_5f6c1_row9_col18, #T_5f6c1_row12_col9, #T_5f6c1_row12_col10, #T_5f6c1_row12_col17, #T_5f6c1_row12_col18 {\n",
-       "  background-color: #fb7b5b;\n",
+       "#T_79db4_row8_col0, #T_79db4_row8_col1, #T_79db4_row8_col2, #T_79db4_row8_col3, #T_79db4_row12_col4, #T_79db4_row12_col7 {\n",
+       "  background-color: #369f54;\n",
        "}\n",
-       "#T_5f6c1_row10_col5, #T_5f6c1_row10_col6, #T_5f6c1_row11_col5, #T_5f6c1_row11_col6 {\n",
-       "  background-color: #f03f2e;\n",
+       "#T_79db4_row8_col5, #T_79db4_row8_col6, #T_79db4_row8_col8, #T_79db4_row8_col11, #T_79db4_row8_col16, #T_79db4_row8_col19 {\n",
+       "  background-color: #3ea75a;\n",
        "}\n",
-       "#T_5f6c1_row10_col8, #T_5f6c1_row10_col11, #T_5f6c1_row10_col16, #T_5f6c1_row10_col19, #T_5f6c1_row11_col8, #T_5f6c1_row11_col11, #T_5f6c1_row11_col16, #T_5f6c1_row11_col19 {\n",
-       "  background-color: #f7593f;\n",
+       "#T_79db4_row8_col9, #T_79db4_row8_col10, #T_79db4_row8_col17, #T_79db4_row8_col18 {\n",
+       "  background-color: #40aa5d;\n",
        "}\n",
-       "#T_5f6c1_row11_col4, #T_5f6c1_row11_col7 {\n",
-       "  background-color: #f14331;\n",
+       "#T_79db4_row8_col12, #T_79db4_row8_col15 {\n",
+       "  background-color: #29914a;\n",
        "}\n",
-       "#T_5f6c1_row12_col4, #T_5f6c1_row12_col7 {\n",
-       "  background-color: #fb6d4d;\n",
+       "#T_79db4_row9_col1, #T_79db4_row9_col2 {\n",
+       "  background-color: #81ca81;\n",
+       "}\n",
+       "#T_79db4_row9_col5, #T_79db4_row9_col6 {\n",
+       "  background-color: #84cc83;\n",
+       "}\n",
+       "#T_79db4_row9_col9, #T_79db4_row9_col10, #T_79db4_row9_col17, #T_79db4_row9_col18 {\n",
+       "  background-color: #86cc85;\n",
+       "}\n",
+       "#T_79db4_row10_col0, #T_79db4_row10_col3 {\n",
+       "  background-color: #268e47;\n",
+       "}\n",
+       "#T_79db4_row10_col1, #T_79db4_row10_col2 {\n",
+       "  background-color: #63bc6e;\n",
+       "}\n",
+       "#T_79db4_row10_col4, #T_79db4_row10_col7 {\n",
+       "  background-color: #349d53;\n",
+       "}\n",
+       "#T_79db4_row10_col9, #T_79db4_row10_col10, #T_79db4_row10_col17, #T_79db4_row10_col18 {\n",
+       "  background-color: #72c375;\n",
+       "}\n",
+       "#T_79db4_row11_col1, #T_79db4_row11_col2 {\n",
+       "  background-color: #5bb86a;\n",
+       "}\n",
+       "#T_79db4_row11_col5, #T_79db4_row11_col6 {\n",
+       "  background-color: #66bd6f;\n",
+       "}\n",
+       "#T_79db4_row11_col8, #T_79db4_row11_col11, #T_79db4_row11_col16, #T_79db4_row11_col19 {\n",
+       "  background-color: #42ab5d;\n",
+       "}\n",
+       "#T_79db4_row11_col9, #T_79db4_row11_col10, #T_79db4_row11_col17, #T_79db4_row11_col18 {\n",
+       "  background-color: #6abf71;\n",
+       "}\n",
+       "#T_79db4_row11_col12, #T_79db4_row11_col15 {\n",
+       "  background-color: #4bb062;\n",
+       "}\n",
+       "#T_79db4_row12_col0, #T_79db4_row12_col3 {\n",
+       "  background-color: #319a50;\n",
+       "}\n",
+       "#T_79db4_row12_col1, #T_79db4_row12_col2 {\n",
+       "  background-color: #4eb264;\n",
+       "}\n",
+       "#T_79db4_row12_col5, #T_79db4_row12_col6 {\n",
+       "  background-color: #5ab769;\n",
+       "}\n",
+       "#T_79db4_row12_col8, #T_79db4_row12_col11, #T_79db4_row12_col16, #T_79db4_row12_col19 {\n",
+       "  background-color: #39a257;\n",
+       "}\n",
+       "#T_79db4_row12_col9, #T_79db4_row12_col10, #T_79db4_row12_col17, #T_79db4_row12_col18 {\n",
+       "  background-color: #5eb96b;\n",
+       "}\n",
+       "#T_79db4_row12_col12, #T_79db4_row12_col15 {\n",
+       "  background-color: #58b668;\n",
+       "}\n",
+       "#T_79db4_row12_col13, #T_79db4_row12_col14 {\n",
+       "  background-color: #6ec173;\n",
        "}\n",
        "</style>\n",
-       "<table id=\"T_5f6c1\">\n",
+       "<table id=\"T_79db4\">\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_5f6c1_level0_col0\" class=\"col_heading level0 col0\" colspan=\"20\">HotpotQA, Llama3.2-3b</th>\n",
+       "      <th id=\"T_79db4_level0_col0\" class=\"col_heading level0 col0\" colspan=\"20\">HotpotQA, Llama3.2-3b</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th class=\"blank level1\" >&nbsp;</th>\n",
-       "      <th id=\"T_5f6c1_level1_col0\" class=\"col_heading level1 col0\" colspan=\"4\">Accuracy</th>\n",
-       "      <th id=\"T_5f6c1_level1_col4\" class=\"col_heading level1 col4\" colspan=\"4\">BLEU</th>\n",
-       "      <th id=\"T_5f6c1_level1_col8\" class=\"col_heading level1 col8\" colspan=\"4\">Rouge_rouge1</th>\n",
-       "      <th id=\"T_5f6c1_level1_col12\" class=\"col_heading level1 col12\" colspan=\"4\">Rouge_rouge2</th>\n",
-       "      <th id=\"T_5f6c1_level1_col16\" class=\"col_heading level1 col16\" colspan=\"4\">Rouge_rougeL</th>\n",
+       "      <th id=\"T_79db4_level1_col0\" class=\"col_heading level1 col0\" colspan=\"4\">Accuracy</th>\n",
+       "      <th id=\"T_79db4_level1_col4\" class=\"col_heading level1 col4\" colspan=\"4\">BLEU</th>\n",
+       "      <th id=\"T_79db4_level1_col8\" class=\"col_heading level1 col8\" colspan=\"4\">Rouge_rouge1</th>\n",
+       "      <th id=\"T_79db4_level1_col12\" class=\"col_heading level1 col12\" colspan=\"4\">Rouge_rouge2</th>\n",
+       "      <th id=\"T_79db4_level1_col16\" class=\"col_heading level1 col16\" colspan=\"4\">Rouge_rougeL</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th class=\"blank level2\" >&nbsp;</th>\n",
-       "      <th id=\"T_5f6c1_level2_col0\" class=\"col_heading level2 col0\" >prr</th>\n",
-       "      <th id=\"T_5f6c1_level2_col1\" class=\"col_heading level2 col1\" >prr_0.5</th>\n",
-       "      <th id=\"T_5f6c1_level2_col2\" class=\"col_heading level2 col2\" >prr_0.5_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col3\" class=\"col_heading level2 col3\" >prr_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col4\" class=\"col_heading level2 col4\" >prr</th>\n",
-       "      <th id=\"T_5f6c1_level2_col5\" class=\"col_heading level2 col5\" >prr_0.5</th>\n",
-       "      <th id=\"T_5f6c1_level2_col6\" class=\"col_heading level2 col6\" >prr_0.5_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col7\" class=\"col_heading level2 col7\" >prr_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col8\" class=\"col_heading level2 col8\" >prr</th>\n",
-       "      <th id=\"T_5f6c1_level2_col9\" class=\"col_heading level2 col9\" >prr_0.5</th>\n",
-       "      <th id=\"T_5f6c1_level2_col10\" class=\"col_heading level2 col10\" >prr_0.5_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col11\" class=\"col_heading level2 col11\" >prr_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col12\" class=\"col_heading level2 col12\" >prr</th>\n",
-       "      <th id=\"T_5f6c1_level2_col13\" class=\"col_heading level2 col13\" >prr_0.5</th>\n",
-       "      <th id=\"T_5f6c1_level2_col14\" class=\"col_heading level2 col14\" >prr_0.5_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col15\" class=\"col_heading level2 col15\" >prr_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col16\" class=\"col_heading level2 col16\" >prr</th>\n",
-       "      <th id=\"T_5f6c1_level2_col17\" class=\"col_heading level2 col17\" >prr_0.5</th>\n",
-       "      <th id=\"T_5f6c1_level2_col18\" class=\"col_heading level2 col18\" >prr_0.5_normalized</th>\n",
-       "      <th id=\"T_5f6c1_level2_col19\" class=\"col_heading level2 col19\" >prr_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col0\" class=\"col_heading level2 col0\" >prr</th>\n",
+       "      <th id=\"T_79db4_level2_col1\" class=\"col_heading level2 col1\" >prr_0.5</th>\n",
+       "      <th id=\"T_79db4_level2_col2\" class=\"col_heading level2 col2\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col3\" class=\"col_heading level2 col3\" >prr_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col4\" class=\"col_heading level2 col4\" >prr</th>\n",
+       "      <th id=\"T_79db4_level2_col5\" class=\"col_heading level2 col5\" >prr_0.5</th>\n",
+       "      <th id=\"T_79db4_level2_col6\" class=\"col_heading level2 col6\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col7\" class=\"col_heading level2 col7\" >prr_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col8\" class=\"col_heading level2 col8\" >prr</th>\n",
+       "      <th id=\"T_79db4_level2_col9\" class=\"col_heading level2 col9\" >prr_0.5</th>\n",
+       "      <th id=\"T_79db4_level2_col10\" class=\"col_heading level2 col10\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col11\" class=\"col_heading level2 col11\" >prr_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col12\" class=\"col_heading level2 col12\" >prr</th>\n",
+       "      <th id=\"T_79db4_level2_col13\" class=\"col_heading level2 col13\" >prr_0.5</th>\n",
+       "      <th id=\"T_79db4_level2_col14\" class=\"col_heading level2 col14\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col15\" class=\"col_heading level2 col15\" >prr_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col16\" class=\"col_heading level2 col16\" >prr</th>\n",
+       "      <th id=\"T_79db4_level2_col17\" class=\"col_heading level2 col17\" >prr_0.5</th>\n",
+       "      <th id=\"T_79db4_level2_col18\" class=\"col_heading level2 col18\" >prr_0.5_normalized</th>\n",
+       "      <th id=\"T_79db4_level2_col19\" class=\"col_heading level2 col19\" >prr_normalized</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row0\" class=\"row_heading level0 row0\" >MaximumSequenceProbability</th>\n",
-       "      <td id=\"T_5f6c1_row0_col0\" class=\"data row0 col0\" >29.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col1\" class=\"data row0 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col2\" class=\"data row0 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col3\" class=\"data row0 col3\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col4\" class=\"data row0 col4\" >55.70 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col5\" class=\"data row0 col5\" >41.07 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col6\" class=\"data row0 col6\" >78.16 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col7\" class=\"data row0 col7\" >67.37 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col8\" class=\"data row0 col8\" >57.92 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col9\" class=\"data row0 col9\" >45.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col10\" class=\"data row0 col10\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col11\" class=\"data row0 col11\" >53.28 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col12\" class=\"data row0 col12\" >46.94 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col13\" class=\"data row0 col13\" >32.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col14\" class=\"data row0 col14\" >-10.35 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col15\" class=\"data row0 col15\" >42.17 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col16\" class=\"data row0 col16\" >57.92 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col17\" class=\"data row0 col17\" >45.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col18\" class=\"data row0 col18\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row0_col19\" class=\"data row0 col19\" >53.28 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row0\" class=\"row_heading level0 row0\" >MaximumSequenceProbability</th>\n",
+       "      <td id=\"T_79db4_row0_col0\" class=\"data row0 col0\" >29.89 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col1\" class=\"data row0 col1\" >36.33 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col2\" class=\"data row0 col2\" >-26.36 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col3\" class=\"data row0 col3\" >-27.21 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col4\" class=\"data row0 col4\" >30.36 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col5\" class=\"data row0 col5\" >37.28 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col6\" class=\"data row0 col6\" >-28.28 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col7\" class=\"data row0 col7\" >-29.70 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col8\" class=\"data row0 col8\" >30.58 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col9\" class=\"data row0 col9\" >37.71 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col10\" class=\"data row0 col10\" >-29.10 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col11\" class=\"data row0 col11\" >-30.83 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col12\" class=\"data row0 col12\" >22.75 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col13\" class=\"data row0 col13\" >31.47 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col14\" class=\"data row0 col14\" >9.68 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col15\" class=\"data row0 col15\" >-22.90 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col16\" class=\"data row0 col16\" >30.58 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col17\" class=\"data row0 col17\" >37.71 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col18\" class=\"data row0 col18\" >-29.10 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row0_col19\" class=\"data row0 col19\" >-30.83 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row1\" class=\"row_heading level0 row1\" >Perplexity</th>\n",
-       "      <td id=\"T_5f6c1_row1_col0\" class=\"data row1 col0\" >14.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col1\" class=\"data row1 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col2\" class=\"data row1 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col3\" class=\"data row1 col3\" >21.71 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col4\" class=\"data row1 col4\" >51.20 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col5\" class=\"data row1 col5\" >41.07 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col6\" class=\"data row1 col6\" >78.16 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col7\" class=\"data row1 col7\" >53.99 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col8\" class=\"data row1 col8\" >53.42 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col9\" class=\"data row1 col9\" >45.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col10\" class=\"data row1 col10\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col11\" class=\"data row1 col11\" >40.13 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col12\" class=\"data row1 col12\" >21.94 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col13\" class=\"data row1 col13\" >32.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col14\" class=\"data row1 col14\" >-10.35 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col15\" class=\"data row1 col15\" >-37.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col16\" class=\"data row1 col16\" >53.42 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col17\" class=\"data row1 col17\" >45.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col18\" class=\"data row1 col18\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row1_col19\" class=\"data row1 col19\" >40.13 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row1\" class=\"row_heading level0 row1\" >Perplexity</th>\n",
+       "      <td id=\"T_79db4_row1_col0\" class=\"data row1 col0\" >32.30 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col1\" class=\"data row1 col1\" >32.06 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col2\" class=\"data row1 col2\" >-57.82 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col3\" class=\"data row1 col3\" >-20.45 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col4\" class=\"data row1 col4\" >32.44 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col5\" class=\"data row1 col5\" >32.34 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col6\" class=\"data row1 col6\" >-63.40 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col7\" class=\"data row1 col7\" >-23.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col8\" class=\"data row1 col8\" >32.50 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col9\" class=\"data row1 col9\" >32.47 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col10\" class=\"data row1 col10\" >-65.81 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col11\" class=\"data row1 col11\" >-25.41 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col12\" class=\"data row1 col12\" >20.61 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col13\" class=\"data row1 col13\" >26.43 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col14\" class=\"data row1 col14\" >-49.48 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col15\" class=\"data row1 col15\" >-29.20 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col16\" class=\"data row1 col16\" >32.50 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col17\" class=\"data row1 col17\" >32.47 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col18\" class=\"data row1 col18\" >-65.81 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row1_col19\" class=\"data row1 col19\" >-25.41 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row2\" class=\"row_heading level0 row2\" >MeanTokenEntropy</th>\n",
-       "      <td id=\"T_5f6c1_row2_col0\" class=\"data row2 col0\" >10.96 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col1\" class=\"data row2 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col2\" class=\"data row2 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col3\" class=\"data row2 col3\" >4.32 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col4\" class=\"data row2 col4\" >44.02 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col5\" class=\"data row2 col5\" >33.38 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col6\" class=\"data row2 col6\" >0.31 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col7\" class=\"data row2 col7\" >32.64 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col8\" class=\"data row2 col8\" >47.18 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col9\" class=\"data row2 col9\" >39.69 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col10\" class=\"data row2 col10\" >-2.72 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col11\" class=\"data row2 col11\" >21.90 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col12\" class=\"data row2 col12\" >13.06 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col13\" class=\"data row2 col13\" >26.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col14\" class=\"data row2 col14\" >-98.63 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col15\" class=\"data row2 col15\" >-66.38 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col16\" class=\"data row2 col16\" >47.18 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col17\" class=\"data row2 col17\" >39.69 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col18\" class=\"data row2 col18\" >-2.72 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row2_col19\" class=\"data row2 col19\" >21.90 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row2\" class=\"row_heading level0 row2\" >MeanTokenEntropy</th>\n",
+       "      <td id=\"T_79db4_row2_col0\" class=\"data row2 col0\" >28.05 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col1\" class=\"data row2 col1\" >30.57 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col2\" class=\"data row2 col2\" >-68.74 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col3\" class=\"data row2 col3\" >-32.35 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col4\" class=\"data row2 col4\" >28.20 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col5\" class=\"data row2 col5\" >30.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col6\" class=\"data row2 col6\" >-73.94 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col7\" class=\"data row2 col7\" >-35.80 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col8\" class=\"data row2 col8\" >28.26 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col9\" class=\"data row2 col9\" >30.98 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col10\" class=\"data row2 col10\" >-76.18 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col11\" class=\"data row2 col11\" >-37.37 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col12\" class=\"data row2 col12\" >18.79 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col13\" class=\"data row2 col13\" >22.50 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col14\" class=\"data row2 col14\" >-95.73 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col15\" class=\"data row2 col15\" >-34.54 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col16\" class=\"data row2 col16\" >28.26 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col17\" class=\"data row2 col17\" >30.98 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col18\" class=\"data row2 col18\" >-76.18 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row2_col19\" class=\"data row2 col19\" >-37.37 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row3\" class=\"row_heading level0 row3\" >MeanPointwiseMutualInformation</th>\n",
-       "      <td id=\"T_5f6c1_row3_col0\" class=\"data row3 col0\" >4.79 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col1\" class=\"data row3 col1\" >9.58 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col2\" class=\"data row3 col2\" >-16.51 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col3\" class=\"data row3 col3\" >-27.87 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col4\" class=\"data row3 col4\" >36.26 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col5\" class=\"data row3 col5\" >26.84 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col6\" class=\"data row3 col6\" >-65.81 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col7\" class=\"data row3 col7\" >9.55 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col8\" class=\"data row3 col8\" >38.52 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col9\" class=\"data row3 col9\" >31.33 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col10\" class=\"data row3 col10\" >-73.75 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col11\" class=\"data row3 col11\" >-3.41 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col12\" class=\"data row3 col12\" >13.06 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col13\" class=\"data row3 col13\" >26.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col14\" class=\"data row3 col14\" >-98.63 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col15\" class=\"data row3 col15\" >-66.38 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col16\" class=\"data row3 col16\" >38.52 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col17\" class=\"data row3 col17\" >31.33 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col18\" class=\"data row3 col18\" >-73.75 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row3_col19\" class=\"data row3 col19\" >-3.41 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row3\" class=\"row_heading level0 row3\" >MeanPointwiseMutualInformation</th>\n",
+       "      <td id=\"T_79db4_row3_col0\" class=\"data row3 col0\" >48.88 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col1\" class=\"data row3 col1\" >34.89 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col2\" class=\"data row3 col2\" >-36.93 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col3\" class=\"data row3 col3\" >26.12 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col4\" class=\"data row3 col4\" >49.36 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col5\" class=\"data row3 col5\" >35.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col6\" class=\"data row3 col6\" >-38.47 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col7\" class=\"data row3 col7\" >23.81 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col8\" class=\"data row3 col8\" >49.57 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col9\" class=\"data row3 col9\" >36.28 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col10\" class=\"data row3 col10\" >-39.14 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col11\" class=\"data row3 col11\" >22.76 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col12\" class=\"data row3 col12\" >32.91 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col13\" class=\"data row3 col13\" >28.10 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col14\" class=\"data row3 col14\" >-29.89 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col15\" class=\"data row3 col15\" >7.02 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col16\" class=\"data row3 col16\" >49.57 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col17\" class=\"data row3 col17\" >36.28 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col18\" class=\"data row3 col18\" >-39.14 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row3_col19\" class=\"data row3 col19\" >22.76 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row4\" class=\"row_heading level0 row4\" >MeanConditionalPointwiseMutualInformation</th>\n",
-       "      <td id=\"T_5f6c1_row4_col0\" class=\"data row4 col0\" >4.79 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col1\" class=\"data row4 col1\" >9.58 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col2\" class=\"data row4 col2\" >-16.51 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col3\" class=\"data row4 col3\" >-27.87 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col4\" class=\"data row4 col4\" >28.45 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col5\" class=\"data row4 col5\" >31.21 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col6\" class=\"data row4 col6\" >-21.62 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col7\" class=\"data row4 col7\" >-13.66 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col8\" class=\"data row4 col8\" >32.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col9\" class=\"data row4 col9\" >39.75 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col10\" class=\"data row4 col10\" >-2.23 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col11\" class=\"data row4 col11\" >-20.18 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col12\" class=\"data row4 col12\" >16.39 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col13\" class=\"data row4 col13\" >32.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col14\" class=\"data row4 col14\" >-10.35 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col15\" class=\"data row4 col15\" >-55.70 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col16\" class=\"data row4 col16\" >32.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col17\" class=\"data row4 col17\" >39.75 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col18\" class=\"data row4 col18\" >-2.23 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row4_col19\" class=\"data row4 col19\" >-20.18 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row4\" class=\"row_heading level0 row4\" >MeanConditionalPointwiseMutualInformation</th>\n",
+       "      <td id=\"T_79db4_row4_col0\" class=\"data row4 col0\" >49.75 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col1\" class=\"data row4 col1\" >40.65 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col2\" class=\"data row4 col2\" >5.44 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col3\" class=\"data row4 col3\" >28.55 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col4\" class=\"data row4 col4\" >50.80 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col5\" class=\"data row4 col5\" >42.49 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col6\" class=\"data row4 col6\" >8.73 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col7\" class=\"data row4 col7\" >27.88 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col8\" class=\"data row4 col8\" >51.28 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col9\" class=\"data row4 col9\" >43.33 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col10\" class=\"data row4 col10\" >10.14 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col11\" class=\"data row4 col11\" >27.58 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col12\" class=\"data row4 col12\" >23.43 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col13\" class=\"data row4 col13\" >29.61 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col14\" class=\"data row4 col14\" >-12.08 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col15\" class=\"data row4 col15\" >-20.89 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col16\" class=\"data row4 col16\" >51.28 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col17\" class=\"data row4 col17\" >43.33 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col18\" class=\"data row4 col18\" >10.14 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row4_col19\" class=\"data row4 col19\" >27.58 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row5\" class=\"row_heading level0 row5\" >PTrue</th>\n",
-       "      <td id=\"T_5f6c1_row5_col0\" class=\"data row5 col0\" >19.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col1\" class=\"data row5 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col2\" class=\"data row5 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col3\" class=\"data row5 col3\" >47.81 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col4\" class=\"data row5 col4\" >45.27 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col5\" class=\"data row5 col5\" >40.21 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col6\" class=\"data row5 col6\" >69.43 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col7\" class=\"data row5 col7\" >36.36 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col8\" class=\"data row5 col8\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col9\" class=\"data row5 col9\" >43.02 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col10\" class=\"data row5 col10\" >25.62 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col11\" class=\"data row5 col11\" >20.44 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col12\" class=\"data row5 col12\" >46.94 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col13\" class=\"data row5 col13\" >32.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col14\" class=\"data row5 col14\" >-10.35 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col15\" class=\"data row5 col15\" >42.17 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col16\" class=\"data row5 col16\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col17\" class=\"data row5 col17\" >43.02 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col18\" class=\"data row5 col18\" >25.62 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row5_col19\" class=\"data row5 col19\" >20.44 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row5\" class=\"row_heading level0 row5\" >PTrue</th>\n",
+       "      <td id=\"T_79db4_row5_col0\" class=\"data row5 col0\" >51.65 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col1\" class=\"data row5 col1\" >48.98 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col2\" class=\"data row5 col2\" >66.76 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col3\" class=\"data row5 col3\" >33.89 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col4\" class=\"data row5 col4\" >52.22 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col5\" class=\"data row5 col5\" >50.13 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col6\" class=\"data row5 col6\" >63.00 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col7\" class=\"data row5 col7\" >31.88 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col8\" class=\"data row5 col8\" >52.48 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col9\" class=\"data row5 col9\" >50.65 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col10\" class=\"data row5 col10\" >61.38 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col11\" class=\"data row5 col11\" >30.96 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col12\" class=\"data row5 col12\" >39.36 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col13\" class=\"data row5 col13\" >32.03 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col14\" class=\"data row5 col14\" >16.36 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col15\" class=\"data row5 col15\" >25.99 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col16\" class=\"data row5 col16\" >52.48 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col17\" class=\"data row5 col17\" >50.65 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col18\" class=\"data row5 col18\" >61.38 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row5_col19\" class=\"data row5 col19\" >30.96 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row6\" class=\"row_heading level0 row6\" >PTrueSampling</th>\n",
-       "      <td id=\"T_5f6c1_row6_col0\" class=\"data row6 col0\" >6.46 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col1\" class=\"data row6 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col2\" class=\"data row6 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col3\" class=\"data row6 col3\" >-19.17 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col4\" class=\"data row6 col4\" >11.58 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col5\" class=\"data row6 col5\" >23.12 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col6\" class=\"data row6 col6\" >-103.47 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col7\" class=\"data row6 col7\" >-63.82 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col8\" class=\"data row6 col8\" >13.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col9\" class=\"data row6 col9\" >25.98 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col10\" class=\"data row6 col10\" >-119.30 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col11\" class=\"data row6 col11\" >-77.67 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col12\" class=\"data row6 col12\" >8.89 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col13\" class=\"data row6 col13\" >17.78 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col14\" class=\"data row6 col14\" >-208.98 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col15\" class=\"data row6 col15\" >-79.73 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col16\" class=\"data row6 col16\" >13.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col17\" class=\"data row6 col17\" >25.98 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col18\" class=\"data row6 col18\" >-119.30 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row6_col19\" class=\"data row6 col19\" >-77.67 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row6\" class=\"row_heading level0 row6\" >PTrueSampling</th>\n",
+       "      <td id=\"T_79db4_row6_col0\" class=\"data row6 col0\" >33.60 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col1\" class=\"data row6 col1\" >47.49 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col2\" class=\"data row6 col2\" >55.74 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col3\" class=\"data row6 col3\" >-16.78 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col4\" class=\"data row6 col4\" >33.67 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col5\" class=\"data row6 col5\" >47.62 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col6\" class=\"data row6 col6\" >45.18 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col7\" class=\"data row6 col7\" >-20.38 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col8\" class=\"data row6 col8\" >33.70 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col9\" class=\"data row6 col9\" >47.69 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col10\" class=\"data row6 col10\" >40.63 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col11\" class=\"data row6 col11\" >-22.02 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col12\" class=\"data row6 col12\" >30.19 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col13\" class=\"data row6 col13\" >33.55 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col14\" class=\"data row6 col14\" >34.16 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col15\" class=\"data row6 col15\" >-0.99 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col16\" class=\"data row6 col16\" >33.70 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col17\" class=\"data row6 col17\" >47.69 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col18\" class=\"data row6 col18\" >40.63 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row6_col19\" class=\"data row6 col19\" >-22.02 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row7\" class=\"row_heading level0 row7\" >MonteCarloSequenceEntropy</th>\n",
-       "      <td id=\"T_5f6c1_row7_col0\" class=\"data row7 col0\" >29.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col1\" class=\"data row7 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col2\" class=\"data row7 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col3\" class=\"data row7 col3\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col4\" class=\"data row7 col4\" >64.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col5\" class=\"data row7 col5\" >43.23 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col6\" class=\"data row7 col6\" >99.95 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col7\" class=\"data row7 col7\" >93.51 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col8\" class=\"data row7 col8\" >73.87 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col9\" class=\"data row7 col9\" >51.74 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col10\" class=\"data row7 col10\" >99.74 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col11\" class=\"data row7 col11\" >99.90 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col12\" class=\"data row7 col12\" >65.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col13\" class=\"data row7 col13\" >41.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col14\" class=\"data row7 col14\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col15\" class=\"data row7 col15\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col16\" class=\"data row7 col16\" >73.87 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col17\" class=\"data row7 col17\" >51.74 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col18\" class=\"data row7 col18\" >99.74 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row7_col19\" class=\"data row7 col19\" >99.90 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row7\" class=\"row_heading level0 row7\" >MonteCarloSequenceEntropy</th>\n",
+       "      <td id=\"T_79db4_row7_col0\" class=\"data row7 col0\" >38.71 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col1\" class=\"data row7 col1\" >42.34 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col2\" class=\"data row7 col2\" >17.84 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col3\" class=\"data row7 col3\" >-2.44 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col4\" class=\"data row7 col4\" >40.52 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col5\" class=\"data row7 col5\" >44.18 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col6\" class=\"data row7 col6\" >20.70 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col7\" class=\"data row7 col7\" >-1.09 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col8\" class=\"data row7 col8\" >41.34 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col9\" class=\"data row7 col9\" >45.01 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col10\" class=\"data row7 col10\" >21.93 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col11\" class=\"data row7 col11\" >-0.48 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col12\" class=\"data row7 col12\" >24.48 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col13\" class=\"data row7 col13\" >35.22 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col14\" class=\"data row7 col14\" >53.75 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col15\" class=\"data row7 col15\" >-17.81 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col16\" class=\"data row7 col16\" >41.34 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col17\" class=\"data row7 col17\" >45.01 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col18\" class=\"data row7 col18\" >21.93 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row7_col19\" class=\"data row7 col19\" >-0.48 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row8\" class=\"row_heading level0 row8\" >MonteCarloNormalizedSequenceEntropy</th>\n",
-       "      <td id=\"T_5f6c1_row8_col0\" class=\"data row8 col0\" >19.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col1\" class=\"data row8 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col2\" class=\"data row8 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col3\" class=\"data row8 col3\" >47.81 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col4\" class=\"data row8 col4\" >65.82 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col5\" class=\"data row8 col5\" >43.23 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col6\" class=\"data row8 col6\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col7\" class=\"data row8 col7\" >97.43 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col8\" class=\"data row8 col8\" >71.43 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col9\" class=\"data row8 col9\" >51.77 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col10\" class=\"data row8 col10\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col11\" class=\"data row8 col11\" >92.76 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col12\" class=\"data row8 col12\" >56.67 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col13\" class=\"data row8 col13\" >41.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col14\" class=\"data row8 col14\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col15\" class=\"data row8 col15\" >73.31 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col16\" class=\"data row8 col16\" >71.43 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col17\" class=\"data row8 col17\" >51.77 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col18\" class=\"data row8 col18\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row8_col19\" class=\"data row8 col19\" >92.76 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row8\" class=\"row_heading level0 row8\" >MonteCarloNormalizedSequenceEntropy</th>\n",
+       "      <td id=\"T_79db4_row8_col0\" class=\"data row8 col0\" >52.69 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col1\" class=\"data row8 col1\" >41.50 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col2\" class=\"data row8 col2\" >11.71 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col3\" class=\"data row8 col3\" >36.80 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col4\" class=\"data row8 col4\" >54.77 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col5\" class=\"data row8 col5\" >43.34 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col6\" class=\"data row8 col6\" >14.78 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col7\" class=\"data row8 col7\" >39.06 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col8\" class=\"data row8 col8\" >55.72 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col9\" class=\"data row8 col9\" >44.18 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col10\" class=\"data row8 col10\" >16.10 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col11\" class=\"data row8 col11\" >40.08 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col12\" class=\"data row8 col12\" >37.30 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col13\" class=\"data row8 col13\" >35.22 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col14\" class=\"data row8 col14\" >53.75 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col15\" class=\"data row8 col15\" >19.91 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col16\" class=\"data row8 col16\" >55.72 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col17\" class=\"data row8 col17\" >44.18 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col18\" class=\"data row8 col18\" >16.10 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row8_col19\" class=\"data row8 col19\" >40.08 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row9\" class=\"row_heading level0 row9\" >EigenScore</th>\n",
-       "      <td id=\"T_5f6c1_row9_col0\" class=\"data row9 col0\" >29.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col1\" class=\"data row9 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col2\" class=\"data row9 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col3\" class=\"data row9 col3\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col4\" class=\"data row9 col4\" >55.39 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col5\" class=\"data row9 col5\" >43.22 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col6\" class=\"data row9 col6\" >99.83 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col7\" class=\"data row9 col7\" >66.44 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col8\" class=\"data row9 col8\" >68.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col9\" class=\"data row9 col9\" >51.67 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col10\" class=\"data row9 col10\" >99.14 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col11\" class=\"data row9 col11\" >82.75 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col12\" class=\"data row9 col12\" >65.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col13\" class=\"data row9 col13\" >41.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col14\" class=\"data row9 col14\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col15\" class=\"data row9 col15\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col16\" class=\"data row9 col16\" >68.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col17\" class=\"data row9 col17\" >51.67 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col18\" class=\"data row9 col18\" >99.14 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row9_col19\" class=\"data row9 col19\" >82.75 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row9\" class=\"row_heading level0 row9\" >EigenScore</th>\n",
+       "      <td id=\"T_79db4_row9_col0\" class=\"data row9 col0\" >69.41 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col1\" class=\"data row9 col1\" >48.40 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col2\" class=\"data row9 col2\" >62.43 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col3\" class=\"data row9 col3\" >83.77 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col4\" class=\"data row9 col4\" >70.09 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col5\" class=\"data row9 col5\" >49.76 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col6\" class=\"data row9 col6\" >60.33 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col7\" class=\"data row9 col7\" >82.22 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col8\" class=\"data row9 col8\" >70.40 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col9\" class=\"data row9 col9\" >50.37 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col10\" class=\"data row9 col10\" >59.42 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col11\" class=\"data row9 col11\" >81.52 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col12\" class=\"data row9 col12\" >56.09 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col13\" class=\"data row9 col13\" >33.55 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col14\" class=\"data row9 col14\" >34.16 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col15\" class=\"data row9 col15\" >75.20 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col16\" class=\"data row9 col16\" >70.40 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col17\" class=\"data row9 col17\" >50.37 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col18\" class=\"data row9 col18\" >59.42 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row9_col19\" class=\"data row9 col19\" >81.52 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row10\" class=\"row_heading level0 row10\" >RenyiNeg</th>\n",
-       "      <td id=\"T_5f6c1_row10_col0\" class=\"data row10 col0\" >10.96 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col1\" class=\"data row10 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col2\" class=\"data row10 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col3\" class=\"data row10 col3\" >4.32 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col4\" class=\"data row10 col4\" >43.76 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col5\" class=\"data row10 col5\" >37.03 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col6\" class=\"data row10 col6\" >37.22 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col7\" class=\"data row10 col7\" >31.87 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col8\" class=\"data row10 col8\" >62.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col9\" class=\"data row10 col9\" >45.48 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col10\" class=\"data row10 col10\" >46.51 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col11\" class=\"data row10 col11\" >67.86 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col12\" class=\"data row10 col12\" >65.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col13\" class=\"data row10 col13\" >41.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col14\" class=\"data row10 col14\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col15\" class=\"data row10 col15\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col16\" class=\"data row10 col16\" >62.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col17\" class=\"data row10 col17\" >45.48 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col18\" class=\"data row10 col18\" >46.51 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row10_col19\" class=\"data row10 col19\" >67.86 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row10\" class=\"row_heading level0 row10\" >RenyiNeg</th>\n",
+       "      <td id=\"T_79db4_row10_col0\" class=\"data row10 col0\" >47.63 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col1\" class=\"data row10 col1\" >45.76 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col2\" class=\"data row10 col2\" >43.02 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col3\" class=\"data row10 col3\" >22.59 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col4\" class=\"data row10 col4\" >52.57 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col5\" class=\"data row10 col5\" >47.60 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col6\" class=\"data row10 col6\" >45.00 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col7\" class=\"data row10 col7\" >32.87 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col8\" class=\"data row10 col8\" >54.82 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col9\" class=\"data row10 col9\" >48.43 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col10\" class=\"data row10 col10\" >45.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col11\" class=\"data row10 col11\" >37.56 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col12\" class=\"data row10 col12\" >40.08 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col13\" class=\"data row10 col13\" >39.15 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col14\" class=\"data row10 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col15\" class=\"data row10 col15\" >28.11 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col16\" class=\"data row10 col16\" >54.82 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col17\" class=\"data row10 col17\" >48.43 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col18\" class=\"data row10 col18\" >45.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row10_col19\" class=\"data row10 col19\" >37.56 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row11\" class=\"row_heading level0 row11\" >FisherRao</th>\n",
-       "      <td id=\"T_5f6c1_row11_col0\" class=\"data row11 col0\" >10.96 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col1\" class=\"data row11 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col2\" class=\"data row11 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col3\" class=\"data row11 col3\" >4.32 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col4\" class=\"data row11 col4\" >50.30 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col5\" class=\"data row11 col5\" >37.03 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col6\" class=\"data row11 col6\" >37.25 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col7\" class=\"data row11 col7\" >51.30 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col8\" class=\"data row11 col8\" >62.92 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col9\" class=\"data row11 col9\" >45.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col10\" class=\"data row11 col10\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col11\" class=\"data row11 col11\" >67.89 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col12\" class=\"data row11 col12\" >65.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col13\" class=\"data row11 col13\" >41.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col14\" class=\"data row11 col14\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col15\" class=\"data row11 col15\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col16\" class=\"data row11 col16\" >62.92 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col17\" class=\"data row11 col17\" >45.50 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col18\" class=\"data row11 col18\" >46.68 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row11_col19\" class=\"data row11 col19\" >67.89 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row11\" class=\"row_heading level0 row11\" >FisherRao</th>\n",
+       "      <td id=\"T_79db4_row11_col0\" class=\"data row11 col0\" >49.58 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col1\" class=\"data row11 col1\" >45.17 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col2\" class=\"data row11 col2\" >38.69 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col3\" class=\"data row11 col3\" >28.09 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col4\" class=\"data row11 col4\" >54.53 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col5\" class=\"data row11 col5\" >47.01 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col6\" class=\"data row11 col6\" >40.82 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col7\" class=\"data row11 col7\" >38.38 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col8\" class=\"data row11 col8\" >56.78 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col9\" class=\"data row11 col9\" >47.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col10\" class=\"data row11 col10\" >41.74 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col11\" class=\"data row11 col11\" >43.08 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col12\" class=\"data row11 col12\" >45.67 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col13\" class=\"data row11 col13\" >39.15 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col14\" class=\"data row11 col14\" >100.00 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col15\" class=\"data row11 col15\" >44.55 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col16\" class=\"data row11 col16\" >56.78 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col17\" class=\"data row11 col17\" >47.85 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col18\" class=\"data row11 col18\" >41.74 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row11_col19\" class=\"data row11 col19\" >43.08 ± 0.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_5f6c1_level0_row12\" class=\"row_heading level0 row12\" >ProbasMeanWithCoT</th>\n",
-       "      <td id=\"T_5f6c1_row12_col0\" class=\"data row12 col0\" >14.29 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col1\" class=\"data row12 col1\" >12.91 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col2\" class=\"data row12 col2\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col3\" class=\"data row12 col3\" >21.71 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col4\" class=\"data row12 col4\" >61.23 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col5\" class=\"data row12 col5\" >43.22 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col6\" class=\"data row12 col6\" >99.87 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col7\" class=\"data row12 col7\" >83.79 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col8\" class=\"data row12 col8\" >73.85 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col9\" class=\"data row12 col9\" >51.69 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col10\" class=\"data row12 col10\" >99.31 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col11\" class=\"data row12 col11\" >99.83 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col12\" class=\"data row12 col12\" >65.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col13\" class=\"data row12 col13\" >41.11 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col14\" class=\"data row12 col14\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col15\" class=\"data row12 col15\" >100.00 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col16\" class=\"data row12 col16\" >73.85 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col17\" class=\"data row12 col17\" >51.69 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col18\" class=\"data row12 col18\" >99.31 ± 0.00</td>\n",
-       "      <td id=\"T_5f6c1_row12_col19\" class=\"data row12 col19\" >99.83 ± 0.00</td>\n",
+       "      <th id=\"T_79db4_level0_row12\" class=\"row_heading level0 row12\" >ProbasMeanWithCoT</th>\n",
+       "      <td id=\"T_79db4_row12_col0\" class=\"data row12 col0\" >51.14 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col1\" class=\"data row12 col1\" >44.09 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col2\" class=\"data row12 col2\" >30.73 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col3\" class=\"data row12 col3\" >32.45 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col4\" class=\"data row12 col4\" >53.22 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col5\" class=\"data row12 col5\" >45.93 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col6\" class=\"data row12 col6\" >33.13 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col7\" class=\"data row12 col7\" >34.69 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col8\" class=\"data row12 col8\" >54.17 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col9\" class=\"data row12 col9\" >46.76 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col10\" class=\"data row12 col10\" >34.17 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col11\" class=\"data row12 col11\" >35.71 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col12\" class=\"data row12 col12\" >47.84 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col13\" class=\"data row12 col13\" >37.07 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col14\" class=\"data row12 col14\" >75.52 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col15\" class=\"data row12 col15\" >50.94 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col16\" class=\"data row12 col16\" >54.17 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col17\" class=\"data row12 col17\" >46.76 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col18\" class=\"data row12 col18\" >34.17 ± 0.00</td>\n",
+       "      <td id=\"T_79db4_row12_col19\" class=\"data row12 col19\" >35.71 ± 0.00</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
       ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x7f4c1ee9e7d0>"
+       "<pandas.io.formats.style.Styler at 0x7f4c1efa6710>"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -637,7 +691,7 @@
     "    'HotpotQA, Llama3.2-3b',\n",
     "    # outputs generated by scripts/polygraph_eval benchmark\n",
     "    # provide several seeds to calculate variance\n",
-    "    [\"../workdir/output/qa/{'path': 'meta-llama/Llama-3.2-3B-Instruct', 'ensemble': False, 'mc': False, 'mc_seeds': None, 'dropout_rate': None, 'type': 'CausalLM', 'path_to_load_script': 'model/default_causal.py', 'load_model_args': {'device_map': 'auto'}, 'load_tokenizer_args': {}}/['denis1699/hotpot_cot']/2025-05-06/06-38-32/ue_manager_seed1\"])"
+    "    [\"../workdir/output/qa/{'path': 'meta-llama/Llama-3.2-3B-Instruct', 'ensemble': False, 'mc': False, 'mc_seeds': None, 'dropout_rate': None, 'type': 'CausalLM', 'path_to_load_script': 'model/default_causal.py', 'load_model_args': {'device_map': 'auto'}, 'load_tokenizer_args': {}}/['denis1699/hotpot_cot']/2025-05-06/09-26-59/ue_manager_seed1\"])"
    ]
   },
   {