The current decontamination implementation loads the HumanEval dataset from disk upon import:
https://github.com/huggingface/alignment-handbook/blob/a9b8a50/src/alignment/decontaminate.py#L53
def human_eval_docstrings() -> List[str]:
    """Collect one docstring per entry of the "openai_humaneval" test split.

    Each entry's ``"prompt"`` field is passed through ``extract_docstring``;
    the results are returned in dataset order.
    """
    test_split = load_dataset("openai_humaneval", split="test")
    return [extract_docstring(problem["prompt"]) for problem in test_split]
def load_dataset_column(dataset: str, column: str, split: str, name=None) -> List[str]:
    """Return the stripped, non-empty values of one column of a dataset split.

    Args:
        dataset: Dataset identifier handed to ``load_dataset``.
        column: Key read from every sample.
        split: Split handed to ``load_dataset``.
        name: Forwarded to ``load_dataset`` as its ``name`` argument.

    Returns:
        The column values with surrounding whitespace removed, in dataset
        order, omitting any value that is empty after stripping.
    """
    rows = load_dataset(dataset, split=split, name=name)
    values = []
    for row in rows:
        stripped = row[column].strip()
        # Entries that are blank after stripping are dropped.
        if len(stripped) > 0:
            values.append(stripped)
    return values
# Default ``filter_out`` mapping for ``decontaminate_humaneval``.
# NOTE: both values are computed when this module is imported, so the
# dataset loads below run at import time.
FILTER_OUT = {
    "human_eval_docstrings": human_eval_docstrings(),
    "human_eval_solutions": [
        solution
        for solution in load_dataset_column("openai_humaneval", "canonical_solution", "test")
        # Solutions listed in HUMAN_EVAL_STRINGS_OK are left out of the filter list.
        if solution not in HUMAN_EVAL_STRINGS_OK
    ],
}
def normalize_whitespace(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as a consequence of
    splitting on whitespace and re-joining the pieces.
    """
    words = text.split()
    return " ".join(words)
def decontaminate_humaneval(
samples: List[Dict[str, Any]], text_column: str = "text", filter_out: Dict[str, List[str]] = FILTER_OUT
) -> List[Dict[str, Any]]:
I suggest moving this loading into the `decontaminate_humaneval` function to avoid unnecessary overhead at import time.
The current decontamination implementation loads the HumanEval dataset from disk upon import:
https://github.com/huggingface/alignment-handbook/blob/a9b8a50/src/alignment/decontaminate.py#L53
I suggest moving this into the
`decontaminate_humaneval` function to avoid unnecessary overhead.