From 58e1083ce96cf6d650b516f189bef62e2b6e7acb Mon Sep 17 00:00:00 2001 From: Collin Leiber Date: Mon, 25 May 2026 13:50:46 +0300 Subject: [PATCH] Change path handling from os to pathlib #77 --- clustpy/alternative/nrkmeans.py | 12 +- clustpy/data/__init__.py | 6 +- clustpy/data/_utils.py | 93 ++--- clustpy/data/real_clustpy_data.py | 4 +- clustpy/data/real_medical_mnist_data.py | 82 ++--- clustpy/data/real_timeseries_data.py | 59 ++-- clustpy/data/real_torchvision_data.py | 42 +-- clustpy/data/real_uci_data.py | 320 +++++++++--------- clustpy/data/real_video_data.py | 54 +-- clustpy/data/real_world_data.py | 286 ++++++++++++---- clustpy/data/tests/test_real_world_data.py | 16 +- .../deep/_abstract_deep_clustering_algo.py | 5 +- clustpy/deep/_data_utils.py | 6 +- clustpy/deep/_train_utils.py | 13 +- clustpy/deep/aec.py | 9 +- clustpy/deep/dcn.py | 9 +- clustpy/deep/ddc_n2d.py | 13 +- clustpy/deep/dec.py | 13 +- clustpy/deep/deepect.py | 9 +- clustpy/deep/den.py | 5 +- clustpy/deep/dipdeck.py | 7 +- clustpy/deep/dipencoder.py | 5 +- clustpy/deep/dkm.py | 9 +- clustpy/deep/enrc.py | 13 +- .../neural_networks/_abstract_autoencoder.py | 16 +- clustpy/deep/vade.py | 9 +- clustpy/utils/_information_theory.py | 16 +- clustpy/utils/evaluation.py | 112 +++--- .../utils/tests/test_information_theory.py | 4 +- 29 files changed, 718 insertions(+), 529 deletions(-) diff --git a/clustpy/alternative/nrkmeans.py b/clustpy/alternative/nrkmeans.py index 83348c2..9705d81 100644 --- a/clustpy/alternative/nrkmeans.py +++ b/clustpy/alternative/nrkmeans.py @@ -1292,7 +1292,7 @@ def _mdl_m_dependant_subspace_costs(X: np.ndarray, V: np.ndarray, cluster_index: # ==== Costs of cluster space ==== cropped_V_cluster = V[:, P_cluster] # Costs for cluster dimensionality - cluster_costs = mdl.integer_costs(m_cluster) + cluster_costs = mdl.integer_costs(m_cluster, use_log2=True) # Costs for centers cluster_costs += n_clusters[cluster_index] * _mdl_reference_vector(m_cluster, max_distance, precision) 
# Costs for point encoding @@ -1306,7 +1306,7 @@ def _mdl_m_dependant_subspace_costs(X: np.ndarray, V: np.ndarray, cluster_index: # ==== Costs of noise space ==== cropped_V_noise = V[:, P_noise] # Costs for noise dimensionality - noise_costs = mdl.integer_costs(m_noise) + noise_costs = mdl.integer_costs(m_noise, use_log2=True) # Costs for centers noise_costs += n_clusters[noise_index] * _mdl_reference_vector(m_noise, max_distance, precision) # Costs for point encoding @@ -1443,7 +1443,7 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray, # Costs of matrix V # global_costs += mdl.mdl_costs_orthogonal_matrix(n_points, mdl.mdl_costs_float_value(n_points)) # Costs of number of subspaces - global_costs += mdl.integer_costs(subspaces) + global_costs += mdl.integer_costs(subspaces, use_log2=True) # Costs for each subspace all_subspace_costs = [] for subspace in range(subspaces): @@ -1451,9 +1451,9 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray, # Calculate costs model_costs = 0 # Costs for dimensionality - model_costs += mdl.integer_costs(m[subspace]) + model_costs += mdl.integer_costs(m[subspace], use_log2=True) # Number of clusters in this subspace - model_costs += mdl.integer_costs(n_clusters[subspace]) + model_costs += mdl.integer_costs(n_clusters[subspace], use_log2=True) # Costs for cluster centers model_costs += n_clusters[subspace] * \ _mdl_reference_vector(m[subspace], max_distance, precision) @@ -1462,7 +1462,7 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray, if outliers: # Encode number of outliers n_outliers = len(labels[:, subspace][labels[:, subspace] == -1]) - model_costs += mdl.integer_costs(n_outliers) + model_costs += mdl.integer_costs(n_outliers, use_log2=True) # Encode coding costs of outliers outlier_costs += n_outliers * np.log2(n_points) outlier_costs += n_outliers * _mdl_costs_uniform_pdf(m[subspace], max_distance) diff --git 
a/clustpy/data/__init__.py b/clustpy/data/__init__.py index 4939f40..1971038 100644 --- a/clustpy/data/__init__.py +++ b/clustpy/data/__init__.py @@ -1,6 +1,6 @@ from .synthetic_data_creator import create_subspace_data, create_nr_data from .real_world_data import load_newsgroups, load_iris, load_wine, load_breast_cancer, load_rcv1, load_imagenet_dog, \ - load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb + load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb, load_bbcnews, load_bbcsport from .real_uci_data import load_har, load_letterrecognition, load_optdigits, load_pendigits, load_banknotes, load_htru2, \ load_mice_protein, load_ecoli, load_spambase, load_seeds, load_statlog_shuttle, load_forest_types, \ load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, load_user_knowledge, load_dermatology, \ @@ -105,4 +105,6 @@ 'load_gene_expression_cancer_rna_seq', 'load_sport_articles', 'load_wholesale_customers', - 'load_reuters21578'] + 'load_reuters21578', + 'load_bbcsport', + 'load_bbcnews'] diff --git a/clustpy/data/_utils.py b/clustpy/data/_utils.py index 70c4466..33dddf3 100644 --- a/clustpy/data/_utils.py +++ b/clustpy/data/_utils.py @@ -14,23 +14,24 @@ "[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary") import numpy as np import os -from pathlib import Path +from pathlib import Path, PurePath from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer from sklearn.feature_selection import VarianceThreshold from sklearn.datasets import fetch_file +import subprocess -DEFAULT_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_datafiles") +DEFAULT_DOWNLOAD_PATH = Path.home() / "Downloads" / "clustpy_datafiles" -def _get_download_dir(downloads_path: str) -> str: +def _get_download_dir(downloads_path: str | Path) -> Path: """ Helper function to define the path where the data files should be stored. 
If downloads_path is None then default path '[USER]/Downloads/clustpy_datafiles' will be used. If the directory does not exists it will be created. Parameters ---------- - downloads_path : str + downloads_path : str | Path path to the directory where the data will be stored. Can be None Returns @@ -44,10 +45,13 @@ def _get_download_dir(downloads_path: str) -> str: if env_data_path is None: downloads_path = DEFAULT_DOWNLOAD_PATH else: - downloads_path = env_data_path - if not os.path.isdir(downloads_path): - os.makedirs(downloads_path) - with open(downloads_path + "/info.txt", "w") as f: + downloads_path = Path(env_data_path) + elif isinstance(downloads_path, str): + # Cast str to Path + downloads_path = Path(downloads_path) + if not downloads_path.is_dir(): + downloads_path.mkdir(parents=True, exist_ok=False) + with open(downloads_path / "info.txt", "w") as f: f.write("This directory was created by the ClustPy python package to store real world data sets.\n" "The default directory is '[USER]/Downloads/clustpy_datafiles' and can be changed with the " "'downloads_path' parameter when loading a data set.\n" @@ -55,7 +59,7 @@ def _get_download_dir(downloads_path: str) -> str: return downloads_path -def _download_file(file_url: str, filename_local: str) -> None: +def _download_file(file_url: str, filename_local: str | Path) -> None: """ Helper function to download a file into a specified location. 
@@ -63,17 +67,18 @@ def _download_file(file_url: str, filename_local: str) -> None: ---------- file_url : str URL of the file - filename_local : str + filename_local : str | Path local name of the file after it has been downloaded """ - local_path = Path(filename_local) - local_dir = local_path.parent - local_filename = local_path.name + if isinstance(filename_local, str): + filename_local = Path(filename_local) + local_dir = filename_local.parent + local_filename = filename_local.name print("Downloading data set from {0} to {1}".format(file_url, filename_local)) fetch_file(file_url, folder=local_dir, local_filename=local_filename) -def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_size: int = 32768) -> None: +def _download_file_from_google_drive(file_id: str, filename_local: str | Path, chunk_size: int = 32768) -> None: """ Download a file from google drive. Code taken from: @@ -83,7 +88,7 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si ---------- file_id : str ID of the file on google drive - filename_local : str + filename_local : str | Path local name of the file after it has been downloaded chunk_size : int chink size when downloading the file (default: 32768) @@ -107,8 +112,8 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si session.close() -def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> ( - np.ndarray, np.ndarray): +def _load_data_file(filename_local: Path, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> tuple[ + np.ndarray, np.ndarray]: """ Helper function to load a data file. Either the first or last column, depending on last_column_are_labels, of the data file is used as the label column. 
@@ -116,7 +121,7 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la Parameters ---------- - filename_local : str + filename_local : Path local name of the file after it has been downloaded file_url : str URL of the file @@ -127,10 +132,10 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la Returns ------- - data, labels : (np.ndarray, np.ndarray) + data, labels : tuple[np.ndarray, np.ndarray] the data numpy array, the labels numpy array """ - if not os.path.isfile(filename_local): + if not filename_local.is_file(): _download_file(file_url, filename_local) datafile = np.genfromtxt(filename_local, delimiter=delimiter) if last_column_are_labels: @@ -144,7 +149,7 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la return data, labels -def _decompress_z_file(filename: str, directory: str) -> bool: +def _decompress_z_file(filename: str | Path, directory: str | Path) -> bool: """ Helper function to decompress a 7z file. The function uses an installed version of 7zip to decompress the file. If 7zip is not installed on this machine, the function will return False and a warning is printed. 
@@ -161,22 +166,30 @@ def _decompress_z_file(filename: str, directory: str) -> bool: successful : bool True if decompression was successful, else False """ - os.system("7z x {0} -o{1}".format(filename.replace("\\", "/"), directory.replace("\\", "/"))) - successful = True - if not os.path.isfile(filename[:-2]): + if isinstance(filename, str): + filename = Path(filename) + if isinstance(directory, str): + directory = Path(directory) + cmd = ["7z", "x", filename.as_posix(), f"-o{directory.as_posix()}"] + try: + subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except (subprocess.CalledProcessError, FileNotFoundError): + print("[WARNING] 7Zip extraction failed or 7z executable is missing!") + return False + if not filename.with_suffix('').is_file(): # If no file without .z exists, decompression was not successful - successful = False - print("[WARNING] 7Zip is needed to uncompress *.Z files!") - return successful + print("[WARNING] Decompression check failed: expected file not found.") + return False + return True -def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.ndarray: +def _load_image_data(image: str | Path | np.ndarray, image_size: tuple, color_image: bool) -> np.ndarray: """ Load image and convert it into a coherent size. Returns a numpy array containing the image data. Parameters ---------- - image : str + image : str | Path | np.ndarray Path to the image. Can also be a numpy array containing the specific pixels image_size : tuple images of various sizes can be converted into a coherent size. 
@@ -190,7 +203,7 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda image_data : np.ndarray The numpy array containing the image data """ - if isinstance(image, str): + if isinstance(image, (str, PurePath)): pil_image = Image.open(image) else: pil_image = Image.fromarray(np.uint8(image)) @@ -231,7 +244,7 @@ def build_analyzer(self): def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool, use_stop_words: bool, max_df: float | int, min_df: float | int, max_features: int, min_variance : float, sublinear_tf: bool, - data_all: np.ndarray = None) -> np.ndarray: + data_all: np.ndarray | None = None) -> tuple[np.ndarray, list[str]]: """ Transform a set of texts into a data matrix. Result can be either a raw count matrix or the result of tf-idf. @@ -261,13 +274,14 @@ def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool, The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples sublinear_tf : bool Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) - data_all : np.ndarray + data_all : np.ndarray | None The complete data set, i.e., if no subset is used. 
If it is None, it will be equal to data (default: None) Returns ------- - data : np.ndarray - The resulting data array + tuple : tuple[np.ndarray, list[str]] + The resulting data array, + The vocabulary of the data output """ if data_all is None: data_all = data @@ -278,18 +292,21 @@ def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool, vectorizer = CountVectorizer(dtype=np.float64, stop_words="english" if use_stop_words else None, min_df=min_df, max_df=max_df, max_features=max_features) data_sparse_all = vectorizer.fit_transform(data_all) data_sparse = vectorizer.transform(data) + vocabulary = vectorizer.get_feature_names_out() # (Optional) Check for variance threshold if min_variance != 0: selector = VarianceThreshold(min_variance) data_sparse_all = selector.fit_transform(data_sparse_all) data_sparse = selector.transform(data_sparse) + vocabulary_mask = selector._get_support_mask() + vocabulary = vocabulary[vocabulary_mask] # (Optional) Apply tf-idf if use_tfidf: tfidf = TfidfTransformer(sublinear_tf=sublinear_tf) tfidf.fit(data_sparse_all) data_sparse = tfidf.transform(data_sparse) data = np.asarray(data_sparse.todense()) - return data + return data, vocabulary def flatten_images(data: np.ndarray, format: str) -> np.ndarray: @@ -313,11 +330,11 @@ def flatten_images(data: np.ndarray, format: str) -> np.ndarray: format_possibilities = ["HW", "HWD", "CHW", "CHWD", "HWC", "HWDC"] assert format in format_possibilities, "Format must be within {0}".format(format_possibilities) if format == "HW": - assert data.ndim == 3 + assert data.ndim == 3, f"ndim has to be 3 but is {data.ndim}" elif format in ["HWD", "CHW", "HWC"]: - assert data.ndim == 4 + assert data.ndim == 4, f"ndim has to be 4 but is {data.ndim}" elif format in ["CHWD", "HWDC"]: - assert data.ndim == 5 + assert data.ndim == 5, f"ndim has to be 5 but is {data.ndim}" # Flatten shape if format != "HW" and format != "HWD": if format == "CHW": diff --git a/clustpy/data/real_clustpy_data.py 
b/clustpy/data/real_clustpy_data.py index 6747ddb..abf96be 100644 --- a/clustpy/data/real_clustpy_data.py +++ b/clustpy/data/real_clustpy_data.py @@ -1,7 +1,7 @@ import numpy as np -import os from sklearn.datasets._base import Bunch from clustpy.data._utils import unflatten_images +from pathlib import Path def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray): @@ -21,7 +21,7 @@ def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray): data, labels : (np.ndarray, np.ndarray) the data numpy array, the labels numpy array """ - path = os.path.dirname(__file__) + "/datasets/" + file_name + path = Path(__file__).parent / "datasets" / file_name dataset = np.genfromtxt(path, delimiter=",") data = dataset[:, n_labels:] labels = dataset[:, :n_labels] diff --git a/clustpy/data/real_medical_mnist_data.py b/clustpy/data/real_medical_mnist_data.py index 9474164..8a5daf2 100644 --- a/clustpy/data/real_medical_mnist_data.py +++ b/clustpy/data/real_medical_mnist_data.py @@ -1,11 +1,11 @@ import numpy as np from clustpy.data._utils import _get_download_dir, _download_file, flatten_images -import os from sklearn.datasets._base import Bunch +from pathlib import Path def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, multiple_labelings: bool, - return_X_y: bool, downloads_path: str) -> Bunch: + return_X_y: bool, downloads_path: str | Path) -> Bunch: """ Helper function to load medical MNIST data from https://medmnist.com/. @@ -21,7 +21,7 @@ def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, mult specifies if the data set contains multiple labelings (for alternative clusterings) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored. 
If input was None this will be equal to '[USER]/Downloads/clustpy_datafiles' @@ -38,8 +38,8 @@ def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, mult assert subset in ["all", "train", "test", "val"], "subset must match 'all', 'train', 'test' or 'val'. Your input {0}".format(subset) # Check if data exists - filename = _get_download_dir(downloads_path) + "/" + dataset_name + ".npz" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / (dataset_name + ".npz") + if not filename.is_file(): _download_file("https://zenodo.org/record/6496656/files/" + dataset_name + ".npz?download=1", filename) # Load data dataset = np.load(filename) @@ -104,7 +104,7 @@ def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, mult """ -def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the PathMNIST data set. It consists of 107180 28x28 colored images belonging to one of 9 classes. The data set is composed of 89996 training, 10004 validation and 7180 test samples. @@ -116,7 +116,7 @@ def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_pat can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -138,7 +138,7 @@ def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_pat return _load_medical_mnist_data("pathmnist", subset, True, False, return_X_y, downloads_path) -def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the ChestMNIST data set. It consists of 112120 28x28 grayscale images. The ground truth labels consist of 14 labelings with 2 clusters each. @@ -151,7 +151,7 @@ def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -173,7 +173,7 @@ def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa return _load_medical_mnist_data("chestmnist", subset, False, True, return_X_y, downloads_path) -def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the DermaMNIST data set. It consists of 10015 28x28 colored images belonging to one of 7 classes. The data set is composed of 7007 training, 1003 validation and 2005 test samples. 
@@ -185,7 +185,7 @@ def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -210,7 +210,7 @@ def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa return _load_medical_mnist_data("dermamnist", subset, True, False, return_X_y, downloads_path) -def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the OCTMNIST data set. It consists of 109309 28x28 grayscale images belonging to one of 4 classes. The data set is composed of 97477 training, 10832 validation and 1000 test samples. @@ -222,7 +222,7 @@ def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -244,7 +244,7 @@ def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path return _load_medical_mnist_data("octmnist", subset, False, False, return_X_y, downloads_path) -def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the PneumoniaMNIST data set. It consists of 5856 28x28 grayscale images belonging to one of 2 classes. The data set is composed of 4708 training, 524 validation and 624 test samples. @@ -256,7 +256,7 @@ def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, download can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -278,7 +278,7 @@ def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, download return _load_medical_mnist_data("pneumoniamnist", subset, False, False, return_X_y, downloads_path) -def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the RetinaMNIST data set. It consists of 1600 28x28 colored images belonging to one of 5 classes. The data set is composed of 1080 training, 120 validation and 400 test samples. 
@@ -290,7 +290,7 @@ def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_p can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -312,7 +312,7 @@ def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_p return _load_medical_mnist_data("retinamnist", subset, True, False, return_X_y, downloads_path) -def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the BreastMNIST data set. It consists of 780 28x28 grayscale images belonging to one of 2 classes. The data set is composed of 546 training, 78 validation and 156 test samples. @@ -324,7 +324,7 @@ def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_p can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -346,7 +346,7 @@ def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_p return _load_medical_mnist_data("breastmnist", subset, False, False, return_X_y, downloads_path) -def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the BloodMNIST data set. It consists of 17092 28x28 colored images belonging to one of 8 classes. The data set is composed of 11959 training, 1712 validation and 3421 test samples. @@ -358,7 +358,7 @@ def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -380,7 +380,7 @@ def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa return _load_medical_mnist_data("bloodmnist", subset, True, False, return_X_y, downloads_path) -def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the TissueMNIST data set. It consists of 236386 28x28 grayscale images belonging to one of 8 classes. The data set is composed of 165466 training, 23640 validation and 47280 test samples. 
@@ -392,7 +392,7 @@ def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_p can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -414,7 +414,7 @@ def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_p return _load_medical_mnist_data("tissuemnist", subset, False, False, return_X_y, downloads_path) -def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the OrganAMNIST data set. It consists of 58850 28x28 grayscale images belonging to one of 11 classes. The data set is composed of 34581 training, 6491 validation and 17778 test samples. @@ -426,7 +426,7 @@ def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_ can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -451,7 +451,7 @@ def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_ return _load_medical_mnist_data("organamnist", subset, False, False, return_X_y, downloads_path) -def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the OrganCMNIST data set. It consists of 23660 28x28 grayscale images belonging to one of 11 classes. The data set is composed of 13000 training, 2392 validation and 8268 test samples. @@ -463,7 +463,7 @@ def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_ can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -488,7 +488,7 @@ def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_ return _load_medical_mnist_data("organcmnist", subset, False, False, return_X_y, downloads_path) -def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the OrganSMNIST data set. It consists of 25221 28x28 grayscale images belonging to one of 11 classes. 
The data set is composed of 13940 training, 2452 validation and 8829 test samples. @@ -500,7 +500,7 @@ def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_ can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -525,7 +525,7 @@ def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_ return _load_medical_mnist_data("organsmnist", subset, False, False, return_X_y, downloads_path) -def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the OrganMNIST3D data set. It consists of 1743 28x28x28 grayscale images belonging to one of 11 classes. The data set is composed of 972 training, 161 validation and 610 test samples. @@ -537,7 +537,7 @@ def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -562,7 +562,7 @@ def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads return _load_medical_mnist_data("organmnist3d", subset, False, False, return_X_y, downloads_path) -def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the NoduleMNIST3D data set. It consists of 1633 28x28x28 grayscale images belonging to one of 2 classes. The data set is composed of 1158 training, 165 validation and 310 test samples. @@ -574,7 +574,7 @@ def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, download can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -596,7 +596,7 @@ def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, download return _load_medical_mnist_data("nodulemnist3d", subset, False, False, return_X_y, downloads_path) -def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the AdrenalMNIST3D data set. It consists of 1584 28x28x28 grayscale images belonging to one of 2 classes. 
The data set is composed of 1188 training, 98 validation and 298 test samples. @@ -608,7 +608,7 @@ def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloa can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -627,7 +627,7 @@ def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloa return _load_medical_mnist_data("adrenalmnist3d", subset, False, False, return_X_y, downloads_path) -def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the FractureMNIST3D data set. It consists of 1370 28x28x28 grayscale images belonging to one of 3 classes. The data set is composed of 1027 training, 103 validation and 240 test samples. @@ -639,7 +639,7 @@ def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downlo can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -661,7 +661,7 @@ def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downlo return _load_medical_mnist_data("fracturemnist3d", subset, False, False, return_X_y, downloads_path) -def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the VesselMNIST3D data set. It consists of 1909 28x28x28 grayscale images belonging to one of 2 classes. The data set is composed of 1335 training, 192 validation and 382 test samples. @@ -673,7 +673,7 @@ def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, download can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -695,7 +695,7 @@ def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, download return _load_medical_mnist_data("vesselmnist3d", subset, False, False, return_X_y, downloads_path) -def load_synapse_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_synapse_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the SynapseMNIST3D data set. It consists of 1759 28x28x28 grayscale images belonging to one of 2 classes. 
The data set is composed of 1230 training, 177 validation and 352 test samples. @@ -707,7 +707,7 @@ def load_synapse_mnist_3d(subset: str = "all", return_X_y: bool = False, downloa can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns diff --git a/clustpy/data/real_timeseries_data.py b/clustpy/data/real_timeseries_data.py index cd8cff2..fc679cb 100644 --- a/clustpy/data/real_timeseries_data.py +++ b/clustpy/data/real_timeseries_data.py @@ -1,12 +1,12 @@ import numpy as np from clustpy.data._utils import _get_download_dir, _download_file from sklearn.datasets._base import Bunch -import os import zipfile +from pathlib import Path def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_minus_one: bool, file_type: str, - last_column_are_labels: bool, return_X_y: bool, downloads_path: str) -> Bunch: + last_column_are_labels: bool, return_X_y: bool, downloads_path: str | Path) -> Bunch: """ Helper function to load timeseries data from www.timeseriesclassification.com. @@ -24,7 +24,7 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_ specifies if the last column contains the labels. If false labels should be contained in the first column return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored. 
If input was None this will be equal to '[USER]/Downloads/clustpy_datafiles' @@ -38,11 +38,10 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_ subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) - directory = _get_download_dir(downloads_path) + "/" + dataset_name + "/" - filename = directory + dataset_name + ".zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / dataset_name + filename = directory / (dataset_name + ".zip") + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("http://www.timeseriesclassification.com/aeon-toolkit/" + dataset_name + ".zip", filename) # Unpack zipfile @@ -52,10 +51,10 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_ if subset == "all" or subset == "train": # Normally we have txt files if file_type == "txt": - dataset = np.genfromtxt(directory + dataset_name + "_TRAIN.txt") + dataset = np.genfromtxt(directory / (dataset_name + "_TRAIN.txt")) elif file_type == "ts": # Ts files must be changed first - with open(directory + dataset_name + "_TRAIN.ts", "rb") as f: + with open(directory / (dataset_name + "_TRAIN.ts"), "rb") as f: clean_lines = (line.replace(b":", b",").replace(b"@", b"#") for line in f) dataset = np.genfromtxt(clean_lines, delimiter=",", comments="#") # Are labels in first or last column? 
@@ -68,10 +67,10 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_ if subset == "all" or subset == "test": # Normally we have txt files if file_type == "txt": - test_dataset = np.genfromtxt(directory + dataset_name + "_TEST.txt") + test_dataset = np.genfromtxt(directory / (dataset_name + "_TEST.txt")) elif file_type == "ts": # Ts files must be changed first - with open(directory + dataset_name + "_TEST.ts", "rb") as f: + with open(directory / (dataset_name + "_TEST.ts"), "rb") as f: clean_lines = (line.replace(b":", b",").replace(b"@", b"#") for line in f) test_dataset = np.genfromtxt(clean_lines, delimiter=",", comments="#") # Are labels in first or last column? @@ -101,7 +100,7 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_ return Bunch(dataset_name=dataset_name, data=data, target=labels) -def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the motestrain data set. It consists of 1272 samples belonging to one of 2 classes. The data set is composed of 20 training and 1252 test samples. @@ -113,7 +112,7 @@ def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_pat can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -130,7 +129,7 @@ def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_pat return _load_timeseries_classification_data("MoteStrain", subset, True, "txt", False, return_X_y, downloads_path) -def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the proximal phalanx outline data set. It consists of 876 samples belonging to one of 2 classes. The data set is composed of 600 training and 276 test samples. @@ -142,7 +141,7 @@ def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -160,7 +159,7 @@ def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, return_X_y, downloads_path) -def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the diatom size reduction data set. It consists of 322 samples belonging to one of 4 classes. The data set is composed of 16 training and 306 test samples. 
@@ -172,7 +171,7 @@ def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, do can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -190,7 +189,7 @@ def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, do return_X_y, downloads_path) -def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the symbols data set. It consists of 1020 samples belonging to one of 6 classes. The data set is composed of 25 training and 995 test samples. @@ -202,7 +201,7 @@ def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -219,7 +218,7 @@ def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: return _load_timeseries_classification_data("Symbols", subset, True, "txt", False, return_X_y, downloads_path) -def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the OliveOil data set. 
It consists of 60 samples belonging to one of 4 classes. The data set is composed of 30 training and 30 test samples. @@ -231,7 +230,7 @@ def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -248,7 +247,7 @@ def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path return _load_timeseries_classification_data("OliveOil", subset, True, "txt", False, return_X_y, downloads_path) -def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the plane data set. It consists of 210 samples belonging to one of 7 classes. The data set is composed of 105 training and 105 test samples. @@ -260,7 +259,7 @@ def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: st can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -277,7 +276,7 @@ def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: st return _load_timeseries_classification_data("Plane", subset, True, "txt", False, return_X_y, downloads_path) -def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Sony AIBO Robot Surface 1 data set. It consists of 621 samples belonging to one of 2 classes. The data set is composed of 20 training and 601 test samples. @@ -289,7 +288,7 @@ def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -307,7 +306,7 @@ def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, return_X_y, downloads_path) -def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the two patterns data set. It consists of 5000 samples belonging to one of 4 classes. The data set is composed of 1000 training and 4000 test samples. 
@@ -319,7 +318,7 @@ def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_p can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -336,7 +335,7 @@ def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_p return _load_timeseries_classification_data("TwoPatterns", subset, True, "txt", False, return_X_y, downloads_path) -def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the LSST data set. It consists of 4925 samples belonging to one of 14 classes. The data set is composed of 2459 training and 2466 test samples. @@ -348,7 +347,7 @@ def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns diff --git a/clustpy/data/real_torchvision_data.py b/clustpy/data/real_torchvision_data.py index a3c0fb0..7e877e1 100644 --- a/clustpy/data/real_torchvision_data.py +++ b/clustpy/data/real_torchvision_data.py @@ -3,6 +3,8 @@ import numpy as np from clustpy.data._utils import _get_download_dir, _load_image_data, flatten_images from sklearn.datasets._base import Bunch +from pathlib import Path + """ Torchvision datasets helpers @@ -55,7 +57,7 @@ def _get_data_and_labels(dataset: torchvision.datasets.VisionDataset, image_size def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subset: str, uses_train_param: bool, - image_format: str, return_X_y: bool, downloads_path: str, image_size: tuple = None) -> Bunch: + image_format: str, return_X_y: bool, downloads_path: str | Path, image_size: tuple = None) -> Bunch: """ Helper function to load a data set from the torchvision package. All data sets will be returned as a two-dimensional tensor, created out of the HWC (height, width, color channels) image representation. @@ -73,7 +75,7 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs Abbreviations stand for: H: Height, W: Width, D: Depth, C: Color-channels return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored image_size : tuple for some datasets (e.g., GTSRB) the images of various sizes must be converted into a coherent size. 
@@ -144,7 +146,7 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs """ -def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the MNIST data set. It consists of 70000 28x28 grayscale images showing handwritten digits (0 to 9). The data set is composed of 60000 training and 10000 test images. @@ -156,7 +158,7 @@ def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: st can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : bool + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -180,7 +182,7 @@ def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: st return _load_torch_image_data(torchvision.datasets.MNIST, subset, True, "HW", return_X_y, downloads_path) -def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Kuzushiji-MNIST data set. It consists of 70000 28x28 grayscale images showing Kanji characters. It is composed of 10 different characters, each representing one column of hiragana. @@ -193,7 +195,7 @@ def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -217,7 +219,7 @@ def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s return _load_torch_image_data(torchvision.datasets.KMNIST, subset, True, "HW", return_X_y, downloads_path) -def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Fashion-MNIST data set. It consists of 70000 28x28 grayscale images showing articles from the Zalando online store. Each sample belongs to one of 10 product groups. @@ -230,7 +232,7 @@ def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -254,7 +256,7 @@ def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s return _load_torch_image_data(torchvision.datasets.FashionMNIST, subset, True, "HW", return_X_y, downloads_path) -def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the USPS data set. It consists of 9298 16x16 grayscale images showing handwritten digits (0 to 9). The data set is composed of 7291 training and 2007 test images. 
@@ -266,7 +268,7 @@ def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -290,7 +292,7 @@ def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str return _load_torch_image_data(torchvision.datasets.USPS, subset, True, "HW", return_X_y, downloads_path) -def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the CIFAR10 data set. It consists of 60000 32x32 color images showing different objects. The classes are airplane, automobile, bird, cat, deer, dog, frog, horse, ship and truck. @@ -303,7 +305,7 @@ def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path: can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -327,7 +329,7 @@ def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path: def load_cifar100(subset: str = "all", use_superclasses: bool = False, return_X_y: bool = False, - downloads_path: str = None) -> Bunch: + downloads_path: str | Path = None) -> Bunch: """ Load the CIFAR100 data set. It consists of 60000 32x32 color images showing different objects. 
A total of 100 classes are included, each depicting a specific of objects. Each class contains 600 objects. @@ -343,7 +345,7 @@ def load_cifar100(subset: str = "all", use_superclasses: bool = False, return_X_ If set to True, the 20 superclasses are used instead of the 100 regular classes (default: False) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -395,7 +397,7 @@ def load_cifar100(subset: str = "all", use_superclasses: bool = False, return_X_ return dataset -def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the SVHN data set. It consists of 99289 32x32 color images showing house numbers (0 to 9). The data set is composed of 73257 training and 26032 test images. @@ -407,7 +409,7 @@ def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -430,7 +432,7 @@ def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str return _load_torch_image_data(torchvision.datasets.SVHN, subset, False, "CHW", return_X_y, downloads_path) -def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the STL10 data set. It consists of 13000 96x96 color images showing different objects. The classes are airplane, bird, car, cat, deer, dog, horse, monkey, ship and truck. @@ -443,7 +445,7 @@ def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: st can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -468,7 +470,7 @@ def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: st def load_gtsrb(subset: str = "all", image_size: tuple = (32, 32), return_X_y: bool = False, - downloads_path: str = None) -> Bunch: + downloads_path: str | Path = None) -> Bunch: """ Load the GTSRB (German Traffic Sign Recognition Benchmark) data set. It consists of 39270 color images showing 43 different traffic signs. Example classes are: stop sign, speed limit 50 sign, speed limit 70 sign, construction site sign and many others. 
@@ -484,7 +486,7 @@ def load_gtsrb(subset: str = "all", image_size: tuple = (32, 32), return_X_y: bo The tuple equals (width, height) of the images (default: (32, 32)) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns diff --git a/clustpy/data/real_uci_data.py b/clustpy/data/real_uci_data.py index 219e7e5..3556194 100644 --- a/clustpy/data/real_uci_data.py +++ b/clustpy/data/real_uci_data.py @@ -1,14 +1,14 @@ from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data, _load_image_data -import os import numpy as np import zipfile import tarfile from sklearn.preprocessing import LabelEncoder import pandas as pd from sklearn.datasets._base import Bunch +from pathlib import Path -def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_banknotes(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the banknote authentication data set. It consists of 1372 genuine and forged banknote samples. N=1372, d=4, k=2. @@ -17,7 +17,7 @@ def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunc ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -31,7 +31,7 @@ def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunc ------- https://archive.ics.uci.edu/ml/datasets/banknote+authentication """ - filename = _get_download_dir(downloads_path) + "/data_banknote_authentication.txt" + filename = _get_download_dir(downloads_path) / "data_banknote_authentication.txt" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt") # Return values @@ -41,7 +41,7 @@ def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunc return Bunch(dataset_name="Banknotes", data=data, target=labels) -def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_spambase(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the spambase data set. It consists of 4601 spam and non-spam mails. N=4601, d=57, k=2. @@ -50,7 +50,7 @@ def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -64,7 +64,7 @@ def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch ------- https://archive.ics.uci.edu/ml/datasets/spambase """ - filename = _get_download_dir(downloads_path) + "/spambase.data" + filename = _get_download_dir(downloads_path) / "spambase.data" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data") # Return values @@ -74,7 +74,7 @@ def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch return Bunch(dataset_name="Spambase", data=data, target=labels) -def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_seeds(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the seeds data set. It consists of 210 samples belonging to one of three varieties of wheat. N=210, d=7, k=3. @@ -83,7 +83,7 @@ def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -97,7 +97,7 @@ def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ------- https://archive.ics.uci.edu/ml/datasets/seeds """ - filename = _get_download_dir(downloads_path) + "/seeds_dataset.txt" + filename = _get_download_dir(downloads_path) / "seeds_dataset.txt" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt", delimiter=None) @@ -110,7 +110,7 @@ def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch: return Bunch(dataset_name="Seeds", data=data, target=labels) -def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_skin(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Skin Segmentation data set. It consists of 245057 skin- and non-skin samples with their B, G, R color information. @@ -120,7 +120,7 @@ def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -134,7 +134,7 @@ def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ------- https://archive.ics.uci.edu/ml/datasets/skin+segmentation """ - filename = _get_download_dir(downloads_path) + "/Skin_NonSkin.txt" + filename = _get_download_dir(downloads_path) / "Skin_NonSkin.txt" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt", delimiter=None) @@ -147,7 +147,7 @@ def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch: return Bunch(dataset_name="SkinSegmentation", data=data, target=labels) -def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_soybean_small(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the small version of the soybean data set. It is a small subset of the original soybean data set. It consists of 47 samples belonging to one of 4 classes. @@ -157,7 +157,7 @@ def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) -> ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -171,8 +171,8 @@ def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) -> ------- https://archive.ics.uci.edu/ml/datasets/soybean+(small) """ - filename = _get_download_dir(downloads_path) + "/soybean-small.data" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "soybean-small.data" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data", filename) @@ -189,7 +189,7 @@ def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) -> return Bunch(dataset_name="SoybeanSmall", data=data, target=labels) -def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the large version of the soybean data set. It consists of 562 samples belonging to one of 15 classes. Originally, the data set would have samples and 19 classes but some samples have attributes showing '?' values. Those @@ -203,7 +203,7 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_ can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -221,8 +221,8 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_ assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) if subset == "all" or subset == "train": - filename = _get_download_dir(downloads_path) + "/soybean-large.data" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "soybean-large.data" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.data", filename) @@ -232,8 +232,8 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_ labels_raw = df_train.pop(0) data = df_train.values if subset == "all" or subset == "test": - filename = _get_download_dir(downloads_path) + "/soybean-large.test" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "soybean-large.test" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.test", filename) @@ -257,7 +257,7 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_ return Bunch(dataset_name="SoybeanLarge", data=data, target=labels) -def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the pendigits data set. It consists of 10992 vectors of length 16, representing 8 coordinates. The coordinates were taken from the task of writing digits (0 to 9) on a tablet. 
@@ -270,7 +270,7 @@ def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -288,11 +288,11 @@ def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) if subset == "all" or subset == "train": - filename = _get_download_dir(downloads_path) + "/pendigits.tra" + filename = _get_download_dir(downloads_path) / "pendigits.tra" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra") if subset == "all" or subset == "test": - filename = _get_download_dir(downloads_path) + "/pendigits.tes" + filename = _get_download_dir(downloads_path) / "pendigits.tes" test_data, test_labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes") if subset == "all": @@ -308,7 +308,7 @@ def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path return Bunch(dataset_name="Pendigits", data=data, target=labels) -def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the ecoli data set. It consists of 336 samples belonging to one of 8 classes. N=336, d=7, k=8. 
@@ -319,7 +319,7 @@ def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, do specify if the three small clusters with size 2, 2 and 5 should be ignored (default: False) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -333,8 +333,8 @@ def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, do ------- https://archive.ics.uci.edu/ml/datasets/ecoli """ - filename = _get_download_dir(downloads_path) + "/ecoli.data" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "ecoli.data" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data", filename) @@ -361,7 +361,7 @@ def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, do return Bunch(dataset_name="Ecoli", data=data, target=labels) -def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_htru2(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the HTRU2 data set. It consists of 17898 samples belonging to the pulsar or non-pulsar class. A special property is that more than 90% of the data belongs to class 0. @@ -371,7 +371,7 @@ def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -385,18 +385,17 @@ def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ------- https://archive.ics.uci.edu/ml/datasets/HTRU2 """ - directory = _get_download_dir(downloads_path) + "/htru2/" - filename = directory + "HTRU2.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "htru2" + filename = directory / "HTRU2.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip", filename) # Unpack zipfile with zipfile.ZipFile(filename, 'r') as zipf: zipf.extractall(directory) # Load data and labels - dataset = np.genfromtxt(directory + "HTRU_2.csv", delimiter=",") + dataset = np.genfromtxt(directory / "HTRU_2.csv", delimiter=",") data = dataset[:, :-1] labels = dataset[:, -1] # Convert labels to int32 format @@ -408,7 +407,7 @@ def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch: return Bunch(dataset_name="HTRU2", data=data, target=labels) -def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_letterrecognition(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Letter Recognition data set. It consists of 20000 samples where each sample represents one of the 26 capital letters in the English alphabet. All samples are composed of 16 numerical stimuli describing the respective letter. @@ -418,7 +417,7 @@ def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None) ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -432,8 +431,8 @@ def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None) ------- https://archive.ics.uci.edu/ml/datasets/letter+recognition """ - filename = _get_download_dir(downloads_path) + "/letter-recognition.data" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "letter-recognition.data" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data", filename) @@ -460,7 +459,7 @@ def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None) return Bunch(dataset_name="Letterrecognition", data=data, target=labels) -def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Human Activity Recognition data set. It consists of 10299 samples each representing sensor data of a person performing an activity. The six activities are walking, walking_upstairs, walking_downstairs, sitting, standing and @@ -474,7 +473,7 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -491,11 +490,10 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) - directory = _get_download_dir(downloads_path) + "/har/" - filename = directory + "UCI HAR Dataset.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "har" + filename = directory / "UCI HAR Dataset.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip", filename) # Unpack zipfile @@ -503,11 +501,11 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str zipf.extractall(directory) # Load data and labels if subset == "all" or subset == "train": - data = np.genfromtxt(directory + "UCI HAR Dataset/train/X_train.txt") - labels = np.genfromtxt(directory + "UCI HAR Dataset/train/y_train.txt") + data = np.genfromtxt(directory / "UCI HAR Dataset/train/X_train.txt") + labels = np.genfromtxt(directory / "UCI HAR Dataset/train/y_train.txt") if subset == "all" or subset == "test": - test_data = np.genfromtxt(directory + "UCI HAR Dataset/test/X_test.txt") - test_labels = np.genfromtxt(directory + "UCI HAR Dataset/test/y_test.txt") + test_data = np.genfromtxt(directory / "UCI HAR Dataset/test/X_test.txt") + test_labels = np.genfromtxt(directory / "UCI HAR Dataset/test/y_test.txt") if subset == "all": data = np.r_[data, test_data] labels = np.r_[labels, test_labels] @@ -525,7 +523,7 @@ def load_har(subset: str = "all", return_X_y: bool = False, 
downloads_path: str return Bunch(dataset_name="HAR", data=data, target=labels) -def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the statlog shuttle data set. It consists of 58000 samples belonging to one of 7 classes. A special property is that about 80% of the data belongs to class 0. @@ -538,7 +536,7 @@ def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, download can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -555,30 +553,29 @@ def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, download subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. 
Your input {0}".format(subset) - directory = _get_download_dir(downloads_path) + "/shuttle/" + directory = _get_download_dir(downloads_path) / "shuttle" if subset == "all" or subset == "train": - filename = directory + "shuttle.trn.Z" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + filename = directory / "shuttle.trn.Z" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z", filename) # Unpack z-file success = _decompress_z_file(filename, directory) if not success: - os.remove(filename) + filename.unlink() return (None, None) if return_X_y else None # Load data and labels - dataset = np.genfromtxt(directory + "shuttle.trn") + dataset = np.genfromtxt(directory / "shuttle.trn") data = dataset[:, :-1] labels = dataset[:, -1] if subset == "all" or subset == "test": - filename = directory + "shuttle.tst" - if not os.path.isfile(filename): + filename = directory / "shuttle.tst" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst", filename) - test_dataset = np.genfromtxt(directory + "shuttle.tst") + test_dataset = np.genfromtxt(directory / "shuttle.tst") test_data = test_dataset[:, :-1] test_labels = test_dataset[:, -1] if subset == "all": @@ -599,7 +596,7 @@ def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, download def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = False, - downloads_path: str = None) -> Bunch: + downloads_path: str | Path = None) -> Bunch: """ Load the Mice Protein Expression data set. It consists of 1077 samples belonging to one of 8 classes. Each feature represents the expression level of one of 77 proteins. 
@@ -615,7 +612,7 @@ def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = return additional labels (default: False) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -629,8 +626,8 @@ def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = ------- https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression """ - filename = _get_download_dir(downloads_path) + "/Data_Cortex_Nuclear.xls" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "Data_Cortex_Nuclear.xls" + if not filename.is_file(): _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls", filename) xls = pd.ExcelFile(filename) @@ -674,7 +671,7 @@ def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = return Bunch(dataset_name="MiceProtein", data=data, target=labels) -def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the user knowledge data set. It consists of 403 samples belonging to one of 4 classes. The 4 classes are the knowledge levels 'very low', 'low', 'middle' and 'high'. @@ -687,7 +684,7 @@ def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -704,8 +701,8 @@ def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) - filename = _get_download_dir(downloads_path) + "/Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls" + if not filename.is_file(): _download_file( "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls", filename) @@ -743,7 +740,7 @@ def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads return Bunch(dataset_name="UserKnowledge", data=data, target=labels) -def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_breast_tissue(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the breast tissue data set. It consists of 106 samples belonging to one of 6 classes. N=106, d=9, k=6. @@ -752,7 +749,7 @@ def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) -> ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -766,8 +763,8 @@ def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) -> ------- http://archive.ics.uci.edu/ml/datasets/breast+tissue """ - filename = _get_download_dir(downloads_path) + "/BreastTissue.xls" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "BreastTissue.xls" + if not filename.is_file(): _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/00192/BreastTissue.xls", filename) xls = pd.ExcelFile(filename) @@ -786,7 +783,7 @@ def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) -> return Bunch(dataset_name="BreastTissue", data=data, target=labels) -def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the forest type mapping data set. It consists of 523 samples belonging to one of 4 classes. The data set is composed of 198 training and 325 test samples. @@ -798,7 +795,7 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -815,11 +812,10 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) - directory = _get_download_dir(downloads_path) + "/ForestTypes/" - filename = directory + "ForestTypes.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "ForestTypes" + filename = directory / "ForestTypes.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00333/ForestTypes.zip", filename) # Unpack zipfile @@ -827,11 +823,11 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p zipf.extractall(directory) # Load data and labels if subset == "all" or subset == "train": - df_train = pd.read_csv(directory + "/training.csv", delimiter=",") + df_train = pd.read_csv(directory / "training.csv", delimiter=",") labels_raw = df_train.pop("class") data = df_train.values if subset == "all" or subset == "test": - df_test = pd.read_csv(directory + "/testing.csv", delimiter=",") + df_test = pd.read_csv(directory / "testing.csv", delimiter=",") labels_test = df_test.pop("class") if subset == "all": data = np.r_[data, df_test.values] @@ -849,7 +845,7 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p return Bunch(dataset_name="ForestTypes", data=data, target=labels) -def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_dermatology(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: 
""" Load the dermatology data set. It consists of 366 samples belonging to one of 6 classes. 8 samples contain '?' values and are therefore removed. @@ -859,7 +855,7 @@ def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bu ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -873,7 +869,7 @@ def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bu ------- https://archive.ics.uci.edu/ml/datasets/dermatology """ - filename = _get_download_dir(downloads_path) + "/dermatology.data" + filename = _get_download_dir(downloads_path) / "dermatology.data" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data", delimiter=",") @@ -890,7 +886,7 @@ def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bu return Bunch(dataset_name="Dermatology", data=data, target=labels) -def load_multiple_features(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_multiple_features(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the multiple features data set. It consists of 2000 samples belonging to one of 10 classes. Each class corresponds to handwritten numerals (0-9) extracted from a collection of Dutch utility maps. @@ -900,7 +896,7 @@ def load_multiple_features(return_X_y: bool = False, downloads_path: str = None) ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -914,14 +910,13 @@ def load_multiple_features(return_X_y: bool = False, downloads_path: str = None) ------- https://archive.ics.uci.edu/ml/datasets/Multiple+Features """ - directory = _get_download_dir(downloads_path) + "/MultipleFeatures/" - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "MultipleFeatures" + directory.mkdir(parents=False, exist_ok=True) data = np.zeros((2000, 0)) # Dataset consists of multiple .xls files for file in ["mfeat-fac", "mfeat-fou", "mfeat-kar", "mfeat-mor", "mfeat-pix", "mfeat-zer"]: - filename = directory + file + ".xls" - if not os.path.isfile(filename): + filename = directory / (file + ".xls") + if not filename.is_file(): _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/mfeat/" + file, filename) data_tmp = np.genfromtxt(filename, delimiter=None) @@ -935,7 +930,7 @@ def load_multiple_features(return_X_y: bool = False, downloads_path: str = None) return Bunch(dataset_name="MultipleFeatures", data=data, target=labels) -def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the statlog Australian Credit Approval data set. It consists of 690 samples belonging to one of 2 classes. N=690, d=14, k=2. @@ -944,7 +939,7 @@ def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_ ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -958,7 +953,7 @@ def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_ ------- https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval) """ - filename = _get_download_dir(downloads_path) + "/australian.dat" + filename = _get_download_dir(downloads_path) / "australian.dat" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat", delimiter=None) @@ -969,7 +964,7 @@ def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_ return Bunch(dataset_name="StatlogAustralianCreditApproval", data=data, target=labels) -def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the original breast cancer Wisconsin data set. It consists of 699 samples belonging to one of 2 classes. 16 samples contain '?' values and will be removed. @@ -979,7 +974,7 @@ def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_pa ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -993,7 +988,7 @@ def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_pa ------- https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29 """ - filename = _get_download_dir(downloads_path) + "/breast-cancer-wisconsin.data" + filename = _get_download_dir(downloads_path) / "breast-cancer-wisconsin.data" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", delimiter=",") @@ -1014,7 +1009,7 @@ def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_pa return Bunch(dataset_name="BreastCancerWisconsin", data=data, target=labels) -def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the optdigits data set. It consists of 5620 8x8 grayscale images, each representing a digit (0 to 9). Each pixel depicts the number of marked pixel within a 4x4 block of the original 32x32 bitmaps. @@ -1027,7 +1022,7 @@ def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all') return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1046,11 +1041,11 @@ def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) if subset == "all" or subset == "train": - filename = _get_download_dir(downloads_path) + "/optdigits.tra" + filename = _get_download_dir(downloads_path) / "optdigits.tra" data, labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra") if subset == "all" or subset == "test": - filename = _get_download_dir(downloads_path) + "/optdigits.tes" + filename = _get_download_dir(downloads_path) / "optdigits.tes" test_data, test_labels = _load_data_file(filename, "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes") if subset == "all": @@ -1067,7 +1062,7 @@ def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path return Bunch(dataset_name="Optdigits", data=data, target=labels, images=data_image, image_format="HW") -def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_semeion(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the semeion data set. It consists of 1593 samples belonging to one of 10 classes. Each sample corresponds to a grayscale 16x16 scan of handwritten digits originating from about 80 different persons. @@ -1078,7 +1073,7 @@ def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1093,8 +1088,8 @@ def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ------- https://archive.ics.uci.edu/ml/datasets/semeion+handwritten+digit """ - filename = _get_download_dir(downloads_path) + "/semeion.data" - if not os.path.isfile(filename): + filename = _get_download_dir(downloads_path) / "semeion.data" + if not filename.is_file(): _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/semeion/semeion.data", filename) datafile = np.genfromtxt(filename) @@ -1111,19 +1106,19 @@ def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch: return Bunch(dataset_name="Semeion", data=data, target=labels, images=data_image, image_format="HW") -def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_cmu_faces(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the CMU Face Images data set. It consists of 640 30x32 grayscale images showing 20 persons in different poses (up, straight, left, right) and with different expressions (neutral, happy, sad, angry). Additionally, the persons can wear sunglasses or not. 16 images show glitches which is why the final data set only contains 624 images. - N=624, d=400, k=[20,4,4,2]. + N=624, d=960, k=[20,4,4,2]. Parameters ------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1132,17 +1127,16 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute. Furthermore, the original images are contained in the 'images' attribute. Alternatively, if return_X_y is True two arrays will be returned: - the data numpy array (624 x 400), the labels numpy array (624 x 4) + the data numpy array (624 x 960), the labels numpy array (624 x 4) References ------- http://archive.ics.uci.edu/ml/datasets/cmu+face+images """ - directory = _get_download_dir(downloads_path) + "/cmufaces/" - filename = directory + "faces_4.tar.gz" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "cmufaces" + filename = directory / "faces_4.tar.gz" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/faces-mld/faces_4.tar.gz", filename) # Unpack zipfile @@ -1157,14 +1151,15 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc data_list = [] label_list = [] for name in names: - path_images = directory + "/faces_4/" + name - for image in os.listdir(path_images): - if not image.endswith("_4.pgm"): + path_images = directory / "faces_4" / name + for image in path_images.iterdir(): + image_str = image.name + if not image_str.endswith("_4.pgm"): continue # get image data - image_array = _load_image_data(path_images + "/" + image, None, False) + image_array = _load_image_data(image, None, False) # Get labels - name_parts = image.split("_") + name_parts = image_str.split("_") user_id = np.argwhere(names == 
name_parts[0])[0][0] position = np.argwhere(positions == name_parts[1])[0][0] expression = np.argwhere(expressions == name_parts[2])[0][0] @@ -1185,7 +1180,7 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc classes=(names, positions, expressions, eyes)) -def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str = None): +def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str | Path = None): """ Load the Gene Expression Cancer RNA-SEQ data set. It consists of 801 samples belonging to one of 5 classes. N=801, d=20531, k=5. @@ -1194,7 +1189,7 @@ def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1208,21 +1203,20 @@ def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path ------- https://archive.ics.uci.edu/dataset/401/gene+expression+cancer+rna+seq """ - directory = _get_download_dir(downloads_path) + "/GeneExpressionRNASEQ/" - filename = directory + "gene+expression+cancer+rna+seq.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "GeneExpressionRNASEQ" + filename = directory / "gene+expression+cancer+rna+seq.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/static/public/401/gene+expression+cancer+rna+seq.zip", filename) # Unpack zipfile with zipfile.ZipFile(filename, 'r') as zipf: zipf.extractall(directory) - with tarfile.open(directory + "TCGA-PANCAN-HiSeq-801x20531.tar.gz", "r:gz") as tar: + with tarfile.open(directory / 
"TCGA-PANCAN-HiSeq-801x20531.tar.gz", "r:gz") as tar: tar.extractall(directory) # Load data and labels - data = np.genfromtxt(directory + "TCGA-PANCAN-HiSeq-801x20531/data.csv", delimiter=",")[1:,1:] - labels_raw = np.genfromtxt(directory + "TCGA-PANCAN-HiSeq-801x20531/labels.csv", delimiter=",", dtype=str)[1:,1] + data = np.genfromtxt(directory / "TCGA-PANCAN-HiSeq-801x20531" / "data.csv", delimiter=",")[1:,1:] + labels_raw = np.genfromtxt(directory / "TCGA-PANCAN-HiSeq-801x20531" / "labels.csv", delimiter=",", dtype=str)[1:,1] LE = LabelEncoder() labels = LE.fit_transform(labels_raw) # Return values @@ -1232,7 +1226,7 @@ def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path return Bunch(dataset_name="GeneExpressionCancerRNA-SEQ", data=data, target=labels) -def load_sport_articles(return_X_y: bool = False, downloads_path: str = None): +def load_sport_articles(return_X_y: bool = False, downloads_path: str | Path = None): """ Load the Sport Articles data set. It consists of 1000 samples belonging to one of 2 classes (objective or subjective). We only consider features that correspond to specific frequencies and, therefore, ignore the attributes @@ -1243,7 +1237,7 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None): ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1257,11 +1251,10 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None): ------- https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis """ - directory = _get_download_dir(downloads_path) + "/SportArticles/" - filename = directory + "sports+articles+for+objectivity+analysis.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "SportArticles" + filename = directory / "sports+articles+for+objectivity+analysis.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/static/public/450/sports+articles+for+objectivity+analysis.zip", filename) # Unpack zipfile @@ -1272,7 +1265,7 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None): labels = np.zeros(1000, dtype=np.int32) row = -2 # first row is the header and should be skipped column = 0 - with open(directory + "features.xls", "r") as f: + with open(directory / "features.xls", "r") as f: for _, line in enumerate(f.readlines()): if "" in line: # Next table is not relevant for the data @@ -1295,7 +1288,7 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None): return Bunch(dataset_name="SportArticles", data=data, target=labels) -def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = None): +def load_wholesale_customers(return_X_y: bool = False, downloads_path: str | Path = None): """ Load the Wholesale Customers data set. 
It consists of 440 samples and can be grouped in two different ways: Either two classes based on the channel (Horeca or Retail) or three classes based on the region (Lisbon, Oporto or Other region). @@ -1305,7 +1298,7 @@ def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = Non ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1319,17 +1312,16 @@ def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = Non ------- https://archive.ics.uci.edu/dataset/292/wholesale+customers """ - directory = _get_download_dir(downloads_path) + "/WholeCustomers/" - filename = directory + "wholesale+customers.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "WholeCustomers" + filename = directory / "wholesale+customers.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/static/public/292/wholesale+customers.zip", filename) # Unpack zipfile with zipfile.ZipFile(filename, 'r') as zipf: zipf.extractall(directory) - wholesale = np.genfromtxt(directory + "Wholesale customers data.csv", delimiter=",", skip_header=True) + wholesale = np.genfromtxt(directory / "Wholesale customers data.csv", delimiter=",", skip_header=True) labels = wholesale[:,:2] - 1 data = wholesale[:,2:] # Convert labels to int32 format @@ -1340,10 +1332,11 @@ def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = Non else: return Bunch(dataset_name="WholesaleCustomers", data=data, target=labels) + def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-fx", "earn", "acq", "crude"), 
use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., - sublinear_tf: bool = False, return_X_y: bool = False, downloads_path: str = None) -> Bunch: + sublinear_tf: bool = False, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Reuters21578 data set. It consists of 21578 Reuters newswire artices divided into different categories. When loading the artices, the title will be included in the text. @@ -1382,7 +1375,7 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money- Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -1400,27 +1393,26 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money- assert subset in ["all", "train", "test", "test-cgi", "train-cgi"], "subset must match 'all', 'train', 'test', 'train-cgi' or 'test-cgi'. 
Your input {0}".format(subset) # Check if data is already downloaded - directory = _get_download_dir(downloads_path) + "/Reuters21578/" - filename = directory + "reuters+21578+text+categorization+collection.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "Reuters21578" + filename = directory / "reuters+21578+text+categorization+collection.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://archive.ics.uci.edu/static/public/137/reuters+21578+text+categorization+collection.zip", filename) # Unpack zipfile with zipfile.ZipFile(filename, 'r') as zipf: zipf.extractall(directory) - with tarfile.open(directory + "reuters21578.tar.gz", "r:gz") as tar: + with tarfile.open(directory / "reuters21578.tar.gz", "r:gz") as tar: tar.extractall(directory) # Load actual articles into arrays all_topics = [] all_bodies = [] all_lewis_splits = [] all_cgi_splits = [] - for file in os.listdir(directory): - if file.endswith(".sgm"): + for file in directory.iterdir(): + if file.suffix == ".sgm": in_body = False - with open(directory + file, "rb") as f: + with open(file, "rb") as f: for line in f.readlines(): # Needed so that reut2-017.sgm is not crashing due to encoding line = line.decode('utf-8','ignore') @@ -1482,7 +1474,7 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money- else: all_topics[i] = new_topic # Transform raw data - data = _transform_text_data(all_bodies, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, + data, vocabulary = _transform_text_data(all_bodies, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, sublinear_tf) # Get labels LE = LabelEncoder() @@ -1503,4 +1495,4 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money- if return_X_y: return data, labels else: - return 
Bunch(dataset_name="Reuters21578", data=data, target=labels, classes=categories) + return Bunch(dataset_name="Reuters21578", data=data, target=labels, classes=categories, columns=vocabulary) diff --git a/clustpy/data/real_video_data.py b/clustpy/data/real_video_data.py index 8a3211b..40b2217 100644 --- a/clustpy/data/real_video_data.py +++ b/clustpy/data/real_video_data.py @@ -4,22 +4,23 @@ print("[WARNING] Could not import cv2 in clustpy.data.real_video_data. Please install cv2 by 'pip install opencv-python' if necessary") from clustpy.data._utils import _download_file, _get_download_dir, flatten_images import numpy as np -import os import zipfile from sklearn.datasets._base import Bunch +from pathlib import Path + """ Helpers """ -def _load_video(path: str, image_size: tuple) -> np.ndarray: +def _load_video(path: str | Path, image_size: tuple) -> np.ndarray: """ Load a video by saving each frame within a numpy array. Parameters ---------- - path : str + path : str | Path Path to the video image_size : tuple The single frames can be downsized. This is necessary for large datasets. @@ -99,7 +100,7 @@ def _downsample_frames(data: np.ndarray, labels: np.ndarray, frame_sampling_rati def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, image_size: tuple = None, frame_sampling_ratio: float = 1, return_X_y: bool = False, - downloads_path: str = None) -> Bunch: + downloads_path: str | Path = None) -> Bunch: """ Load the Weizmann video data set. It consists of 93 videos showing 9 different persons performing 10 different activities. @@ -123,7 +124,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, Can take values within (0, 1] (default: 1) return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -139,7 +140,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, ------- https://www.wisdom.weizmann.ac.il/~vision/SpaceTimeActions.html """ - directory = _get_download_dir(downloads_path) + "/Video_Weizmann/" + directory = _get_download_dir(downloads_path) / "Video_Weizmann" all_actions = ["walk", "run", "jump", "side", "bend", "wave1", "wave2", "pjump", "jack", "skip"] if use_actions is None: use_actions = all_actions.copy() @@ -153,10 +154,9 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, # Download data for action in use_actions: my_zip_file = action + ".zip" - filename = directory + my_zip_file - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + filename = directory / my_zip_file + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file( "https://www.wisdom.weizmann.ac.il/~vision/VideoAnalysis/Demos/SpaceTimeActions/DB/" + my_zip_file, filename) @@ -164,11 +164,11 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, with zipfile.ZipFile(filename, 'r') as zipf: zipf.extractall(directory) # Load data, iterate over all video files - for v_file in os.listdir(directory): + for v_file in directory.iterdir(): # Ignore zip files - if v_file.endswith(".avi"): + if v_file.suffix == ".avi": # Get name of person and type of activity - relevant_parts = v_file.split(".")[0] + relevant_parts = v_file.name.split(".")[0] person = relevant_parts.split("_")[0] action = relevant_parts.split("_")[1] # Sometimes a person performs an action twice. 
In that case a 1/2 is appended to the action @@ -179,7 +179,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, if person not in use_persons or action not in use_actions: continue # Load video - data_local = _load_video(directory + "/" + v_file, image_size) + data_local = _load_video(directory / v_file, image_size) # Transform string to label label_person = use_persons.index(person) label_action = use_actions.index(action) @@ -207,7 +207,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, def load_video_keck_gesture(subset: str = "all", image_size: tuple = (200, 200), frame_sampling_ratio: float = 1, - return_X_y: bool = False, downloads_path: str = None) -> Bunch: + return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the Keck Gesture video data set. It consists of 42 training and 56 testing videos showing 4 different persons performing 14 different gestures. @@ -234,7 +234,7 @@ def load_video_keck_gesture(subset: str = "all", image_size: tuple = (200, 200), Can take values within (0, 1] (default: 1) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -293,12 +293,11 @@ def parse_frames_file(frames_file: str) -> (dict, dict): subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. 
Your input {0}".format(subset) - directory = _get_download_dir(downloads_path) + "/Video_Keck_Gesture/" - filename = directory + "Keck_Dataset.zip" - frames_file = directory + "sequences.txt" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "Video_Keck_Gesture" + filename = directory / "Keck_Dataset.zip" + frames_file = directory / "sequences.txt" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("http://www.zhuolin.umiacs.io/PrototypeTree/Keck_Dataset.zip", filename) # Unpack zipfile with zipfile.ZipFile(filename, 'r') as zipf: @@ -318,13 +317,14 @@ def parse_frames_file(frames_file: str) -> (dict, dict): file_directories.append((False, "testingfiles/")) # load videos for train_data, file_directory in file_directories: - directory_files = directory + "Keck Dataset/" + file_directory + directory_files = directory / "Keck Dataset" / file_directory # Iterate over all video files - for v_file in os.listdir(directory_files): - data_local = _load_video(directory_files + v_file, image_size) + for v_file in directory_files.iterdir(): + v_file_str = v_file.name + data_local = _load_video(v_file, image_size) # Transform string to label - label_gesture = int(v_file.split("_")[1].replace("gesture", "")) - label_person = int(v_file.split("_")[0].replace("person", "")) - 1 + label_gesture = int(v_file_str.split("_")[1].replace("gesture", "")) + label_person = int(v_file_str.split("_")[0].replace("person", "")) - 1 labels_local = np.array([[0, label_person]] * data_local.shape[0], dtype="int32") # Use frames_dicts to set gestures correctly if train_data: diff --git a/clustpy/data/real_world_data.py b/clustpy/data/real_world_data.py index f9637fd..78ad801 100644 --- a/clustpy/data/real_world_data.py +++ b/clustpy/data/real_world_data.py @@ -1,6 +1,5 @@ from clustpy.data._utils import _download_file, _get_download_dir, 
_download_file_from_google_drive, _load_image_data, \ flatten_images, _transform_text_data -import os import numpy as np import zipfile import tarfile @@ -10,6 +9,8 @@ from scipy.io import loadmat import re from sklearn.datasets._base import Bunch +from pathlib import Path + # More datasets https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#usps @@ -189,12 +190,12 @@ def load_newsgroups(subset: str = "all", use_tfidf: bool = True, use_stemming: b data_raw = newsgroups.data # Get all data so that transformations can be applied to all possible subsets data_all = fetch_20newsgroups(subset="all", remove=('headers', 'footers', 'quotes')).data if subset != "all" else data_raw - data = _transform_text_data(data_raw, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, + data, vocabulary = _transform_text_data(data_raw, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, sublinear_tf, data_all) if return_X_y: return data, newsgroups.target else: - return Bunch(dataset_name="20Newsgroups", data=data, target=newsgroups.target) + return Bunch(dataset_name="20Newsgroups", data=data, target=newsgroups.target, columns=vocabulary) def load_rcv1(subset: str = "all", n_features: int = 2000, categories: tuple = ("CCAT", "GCAT", "MCAT", "ECAT"), @@ -286,7 +287,7 @@ def load_imagenet_dog(subset: str = "all", "n02102177-Welsh_springer_spaniel", "n02105056-groenendael", "n02105412-kelpie", "n02105855-Shetland_sheepdog", "n02107142-Doberman", "n02110958-pug", "n02112137-chow"], - return_X_y: bool = False, downloads_path: str = None) -> Bunch: + return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the ImageNet Dog data set. It consists of 20580 color images of different sizes showing 120 breeds of dogs. The data set is composed of 12000 training and 8580 test images. 
@@ -308,7 +309,7 @@ def load_imagenet_dog(subset: str = "all", Usually, a subset consisting of 15 breeds is extracted (default: list with 15 dog breeds) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : bool + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -333,19 +334,18 @@ def load_imagenet_dog(subset: str = "all", subset = subset.lower() assert subset in ["all", "train", "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset) - directory = _get_download_dir(downloads_path) + "/ImageNetDog/" - filename = directory + "images.tar" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "ImageNetDog" + filename = directory / "images.tar" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar", filename) # Unpack zipfile with tarfile.open(filename, "r") as tar: tar.extractall(directory) # Get files for test/train split - train_test_filename = directory + "lists.tar" - if not os.path.isfile(train_test_filename): + train_test_filename = directory / "lists.tar" + if not train_test_filename.is_file(): _download_file("http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar", train_test_filename) # Unpack zipfile @@ -353,15 +353,15 @@ def load_imagenet_dog(subset: str = "all", tar.extractall(directory) # Check breeds list if breeds is None: - breeds = os.listdir(directory + "/Images") + breeds = [br.name for br in (directory / "Images").iterdir()] # Load data lists data_list = [] if subset == "train": - object_list = loadmat(directory + "/train_list.mat") + object_list = loadmat(directory / "train_list.mat") elif subset == "test": - object_list 
= loadmat(directory + "/test_list.mat") + object_list = loadmat(directory / "test_list.mat") else: - object_list = loadmat(directory + "/file_list.mat") + object_list = loadmat(directory / "file_list.mat") labels = object_list["labels"] file_list = object_list["file_list"] # get image data @@ -369,7 +369,7 @@ def load_imagenet_dog(subset: str = "all", for i, file in enumerate(file_list): file = file[0][0] if file.split("/")[0] in breeds: - image_data = _load_image_data(directory + "/Images/" + file, image_size, True) + image_data = _load_image_data(directory / "Images" / file, image_size, True) data_list.append(image_data) else: use_image[i] = False @@ -392,7 +392,7 @@ def load_imagenet_dog(subset: str = "all", images=data_image, image_format=image_format, classes=breeds) -def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the ImageNet-10 data set. This is a subset of the well-known ImageNet data set with only 10 classes. It consists of 13000 224x224 (or 96x96) color images showing different objects. @@ -404,7 +404,7 @@ def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloa defines wheter the images should be loaded in the size (224 x 224) or (96 x 96) (default: True) return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -425,23 +425,22 @@ def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloa Russakovsky, Olga, et al. "Imagenet large scale visual recognition challenge." International journal of computer vision 115 (2015): 211-252. 
""" - directory = _get_download_dir(downloads_path) + "/ImageNet10" - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "ImageNet10" + directory.mkdir(parents=False, exist_ok=True) # Source: https://drive.google.com/drive/folders/1XL0Nohi4vO2f1I4znf388n2pMP8PiKFd if use_224_size: - filename_data = directory + "/data_224.npy" - if not os.path.isfile(filename_data): + filename_data = directory / "data_224.npy" + if not filename_data.is_file(): _download_file_from_google_drive("1sLfA0U9s9Q5Cf8o32GxYoyiyrzZN1K_6", filename_data) - filename_labels = directory + "/labels_224.npy" - if not os.path.isfile(filename_labels): + filename_labels = directory / "labels_224.npy" + if not filename_labels.is_file(): _download_file_from_google_drive("1OjAQwaGnAfJBW66HFkR7yODLFxnTZWWI", filename_labels) else: - filename_data = directory + "/data_96.npy" - if not os.path.isfile(filename_data): + filename_data = directory / "data_96.npy" + if not filename_data.is_file(): _download_file_from_google_drive("13VbP1qYz6bSeibnoR-w0J_jL9bQf6tGX", filename_data) - filename_labels = directory + "/labels_96.npy" - if not os.path.isfile(filename_labels): + filename_labels = directory / "labels_96.npy" + if not filename_labels.is_file(): _download_file_from_google_drive("1uiuYUdjyCITLURc5eo8ByP9b51MK_Uk6", filename_labels) # Load data and labels data_image = np.load(filename_data) @@ -460,7 +459,7 @@ def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloa images=data_image, image_format=image_format) -def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_coil20(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the COIL-20 data set. It consists of 1440 128x128 gray-scale images of 20 objects photographed from 72 different angles. 
@@ -470,7 +469,7 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -485,11 +484,10 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ------- https://www.cs.columbia.edu/CAVE/software/softlib/coil-20.php """ - directory = _get_download_dir(downloads_path) + "/COIL20/" - filename = directory + "coil-20-proc.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "COIL20" + filename = directory / "coil-20-proc.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("https://cave.cs.columbia.edu/old/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip", filename) # Unpack zipfile @@ -500,7 +498,7 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch: labels = np.zeros(1440, dtype=np.int32) for i in range(20): for j in range(72): - image_data = _load_image_data(directory + "coil-20-proc/obj{0}__{1}.png".format(i + 1, j), None, False) + image_data = _load_image_data(directory / "coil-20-proc" / "obj{0}__{1}.png".format(i + 1, j), None, False) assert image_data.shape == ( 128, 128), "Shape of image obj{0}__{1}.png is not correct. 
Mest be (128, 128) but is {2}".format(i + 1, j, @@ -518,7 +516,7 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch: return Bunch(dataset_name="COIL20", data=data_flatten, target=labels, images=data_image, image_format="HW") -def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch: +def load_coil100(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: """ Load the COIL-100 data set. It consists of 7200 128x128 color images of 100 objects photographed from 72 different angles. @@ -528,7 +526,7 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ---------- return_X_y : bool If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -544,11 +542,10 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch: ------- https://www.cs.columbia.edu/CAVE/software/softlib/coil-100.php """ - directory = _get_download_dir(downloads_path) + "/COIL100/" - filename = directory + "coil-100.zip" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "COIL100" + filename = directory / "coil-100.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("http://cave.cs.columbia.edu/old/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip", filename) # Unpack zipfile @@ -559,7 +556,7 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch: labels = np.zeros(7200, dtype=np.int32) for i in range(100): for j in range(72): - image_data = _load_image_data(directory + "coil-100/obj{0}__{1}.png".format(i + 1, j * 5), None, True) + image_data = _load_image_data(directory / 
"coil-100" / "obj{0}__{1}.png".format(i + 1, j * 5), None, True) assert image_data.shape == ( 128, 128, 3), "Shape of image obj{0}__{1}.png is not correct. Mest be (128, 128, 3) but is {2}".format( i + 1, j, image_data.shape) @@ -587,16 +584,16 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch: def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wisconsin"), use_categories: tuple = ("course", "faculty", "project", "student"), use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., - min_df: float | int = 0.01, max_features: int = None, min_variance : float = 0.25, + min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., sublinear_tf: bool = False, remove_headers: bool = True, return_X_y: bool = False, - downloads_path: str = None) -> Bunch: + downloads_path: str | Path = None) -> Bunch: """ Load the WebKB data set. It consists of 8282 Html documents from different universities ("wisconsin", "washington", "texas", "cornell", "misc"). These web pages have a specified category ("student", "staff", "project", "faculty", "department", "course", "other"). The first column of the labels contains the category information and the second the university information. For more information see the references website. The data is usually preprocessed by using stemming and removing stop words. Furthermore, words with a document frequency - smaller than min_df or with a variance smaller than min_variance are usually removed and tf-idf is applied. + smaller than min_df or with a variance smaller than min_variance are often removed and tf-idf is applied. N=1041, d=323, k=[4,4] using the default settings. 
Parameters @@ -616,20 +613,20 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0) min_df : float | int Ignore words that have a document frequency strictly lower than min_df. - If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 0.01) + If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1) max_features : int If not None, the resulting count matric will ony contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer). - Note that this value could be further reduced if min_variance is smaller than one (default: None) + Note that this value could be further reduced if min_variance is smaller than one (default: 2000) min_variance : float Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold). - The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.25) + The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.) sublinear_tf : bool Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False) remove_headers : bool Specifies if the headers of the Html files should be removed (default: True) return_X_y : bool If True, returns (data, target) instead of a Bunch object. 
See below for more information about the data and target object (default: False) - downloads_path : str + downloads_path : str | Path path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) Returns @@ -652,11 +649,10 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis use_categories = possible_categories.copy() assert all([cat in possible_categories for cat in use_categories]) # Check if data is already downloaded - directory = _get_download_dir(downloads_path) + "/WebKB/" - filename = directory + "webkb-data.gtar.gz" - if not os.path.isfile(filename): - if not os.path.isdir(directory): - os.mkdir(directory) + directory = _get_download_dir(downloads_path) / "WebKB" + filename = directory / "webkb-data.gtar.gz" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) _download_file("http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/webkb-data.gtar.gz", filename) # Unpack zipfile @@ -673,7 +669,7 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis f = tar.extractfile(obj) lines = f.readlines() # Write file - with open(directory + new_name, "wb") as output: + with open(directory / new_name, "wb") as output: for line in lines: output.write(line) texts = [] @@ -684,10 +680,9 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis # Read files for i, category in enumerate(use_categories): for j, univerity in enumerate(use_universities): - inner_directory = "{0}webkb/{1}/{2}/".format(directory, category, univerity) - files = os.listdir(inner_directory) - for file in files: - with open(inner_directory + file, "r", encoding='latin-1') as f: + inner_directory = directory / "webkb" / category / univerity + for file in inner_directory.iterdir(): + with open(file, "r", encoding='latin-1') as f: lines = f.read() if remove_headers: # Remove header @@ -698,10 +693,171 @@ def load_webkb(use_universities: tuple = 
("cornell", "texas", "washington", "wis texts.append(lines) labels = np.r_[labels, [[i, j]]] # Transform raw data - data = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, + data, vocabulary = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, + sublinear_tf) + # Return values + if return_X_y: + return data, labels + else: + return Bunch(dataset_name="WebKB", data=data, target=labels, classes=(use_categories, use_universities), columns=vocabulary) + + +""" +BBC Data +""" + + +def load_bbcsport(use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., + min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., sublinear_tf: bool = False, + return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: + """ + Load the BBC Sport data set. It consists of a collection of 737 BBC sport documents, partitioned + into the topics "athletics", "cricket", "football", "rugby", and "tennis". + The documents are usually converted into feature vectors using tf-idf. + N=737, d=2000, k=5 using the default settings. + + Parameters + ---------- + use_tfidf : bool + If true, tf-idf will be applied as the last step of the pipeline (default: True) + use_stemming : bool + If true, the SnowballStemmer from nltk will be used when creating the count matrix (default: True) + use_stop_words : bool + If true, the list of English stopwords from sklearn CountVectorizer will be used (default: True) + max_df : float | int + Ignore words that have a document frequency strictly higher than max_df. + If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0) + min_df : float | int + Ignore words that have a document frequency strictly lower than min_df. 
+ If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1) + max_features : int + If not None, the resulting count matrix will only contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer). + Note that this value could be further reduced if min_variance is smaller than one (default: 2000) + min_variance : float + Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold). + The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.) + sublinear_tf : bool + Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False) + return_X_y : bool + If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) + downloads_path : str | Path + path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) + + Returns + ------- + bunch : Bunch + A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute. 
+ Alternatively, if return_X_y is True two arrays will be returned: + the data numpy array (737 x 2000 - using the default settings), the labels numpy array (737) + + References + ------- + http://mlg.ucd.ie/datasets/bbc.html + """ + # Check if data is already downloaded + directory = _get_download_dir(downloads_path) / "bbcsport" + filename = directory / "bbcsport-fulltext.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) + _download_file("http://mlg.ucd.ie/files/datasets/bbcsport-fulltext.zip", filename) + # Unpack zipfile + with zipfile.ZipFile(filename, 'r') as zipf: + zipf.extractall(directory) + directory = directory / "bbcsport" + labels = [] + texts = [] + topics = ["athletics", "cricket", "football", "rugby", "tennis"] + for i, topic in enumerate(topics): + inner_directory = directory / topic + for file in inner_directory.iterdir(): + with open(file, "r") as f: + lines = f.read() + texts.append(lines) + labels.append(i) + # Transform raw data + data, vocabulary = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, + sublinear_tf) + labels = np.array(labels) + # Return values + if return_X_y: + return data, labels + else: + return Bunch(dataset_name="BBCSport", data=data, target=labels, classes=topics, columns=vocabulary) + + +def load_bbcnews(use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., + min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., sublinear_tf: bool = False, + return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch: + """ + Load the BBC News data set. It consists of a collection of 2225 BBC news documents, partitioned + into the topics "business", "entertainment", "politics", "sport", and "tech". + The documents are usually converted into feature vectors using tf-idf. + N=2225, d=2000, k=5 using the default settings. 
+ + Parameters + ---------- + use_tfidf : bool + If true, tf-idf will be applied as the last step of the pipeline (default: True) + use_stemming : bool + If true, the SnowballStemmer from nltk will be used when creating the count matrix (default: True) + use_stop_words : bool + If true, the list of English stopwords from sklearn CountVectorizer will be used (default: True) + max_df : float | int + Ignore words that have a document frequency strictly higher than max_df. + If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0) + min_df : float | int + Ignore words that have a document frequency strictly lower than min_df. + If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1) + max_features : int + If not None, the resulting count matrix will only contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer). + Note that this value could be further reduced if min_variance is smaller than one (default: 2000) + min_variance : float + Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold). + The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.) + sublinear_tf : bool + Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False) + return_X_y : bool + If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False) + downloads_path : str | Path + path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles) + + Returns + ------- + bunch : Bunch + A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute. 
+ Alternatively, if return_X_y is True two arrays will be returned: + the data numpy array (2225 x 2000 - using the default settings), the labels numpy array (2225) + + References + ------- + http://mlg.ucd.ie/datasets/bbc.html + """ + # Check if data is already downloaded + directory = _get_download_dir(downloads_path) / "bbcnews" + filename = directory / "bbc-fulltext.zip" + if not filename.is_file(): + directory.mkdir(parents=False, exist_ok=True) + _download_file("http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip", filename) + # Unpack zipfile + with zipfile.ZipFile(filename, 'r') as zipf: + zipf.extractall(directory) + directory = directory / "bbc" + labels = [] + texts = [] + topics = ["business", "entertainment", "politics", "sport", "tech"] + for i, topic in enumerate(topics): + inner_directory = directory / topic + for file in inner_directory.iterdir(): + with open(file, "r") as f: + lines = f.read() + texts.append(lines) + labels.append(i) + # Transform raw data + data, vocabulary = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, sublinear_tf) + labels = np.array(labels) # Return values if return_X_y: return data, labels else: - return Bunch(dataset_name="WebKB", data=data, target=labels, classes=(use_categories, use_universities)) + return Bunch(dataset_name="BBCNews", data=data, target=labels, classes=topics, columns=vocabulary) diff --git a/clustpy/data/tests/test_real_world_data.py b/clustpy/data/tests/test_real_world_data.py index c1dba3c..ed7cc90 100644 --- a/clustpy/data/tests/test_real_world_data.py +++ b/clustpy/data/tests/test_real_world_data.py @@ -1,6 +1,6 @@ from clustpy.data.tests._helpers_for_tests import _helper_test_data_loader from clustpy.data import load_iris, load_wine, load_breast_cancer, load_olivetti_faces, load_newsgroups, load_rcv1, \ - load_imagenet_dog, load_imagenet10, load_coil20, load_coil100, load_webkb + load_imagenet_dog, load_imagenet10, load_coil20, 
load_coil100, load_webkb, load_bbcnews, load_bbcsport import pytest import shutil @@ -129,5 +129,15 @@ def test_load_coil100(my_tmp_dir): @pytest.mark.data def test_load_webkb(my_tmp_dir): - _helper_test_data_loader(load_webkb, 1041, 323, [4, 4], dataloader_params={"downloads_path": my_tmp_dir}) - _helper_test_data_loader(load_webkb, 8282, 761, [7, 5], dataloader_params={"downloads_path": my_tmp_dir, "use_categories": None, "use_universities": None}) + _helper_test_data_loader(load_webkb, 1041, 2000, [4, 4], dataloader_params={"downloads_path": my_tmp_dir}) + _helper_test_data_loader(load_webkb, 1041, 323, [4, 4], dataloader_params={"downloads_path": my_tmp_dir, "min_df": 0.01, "max_features": None, "min_variance": 0.25}) + _helper_test_data_loader(load_webkb, 8282, 2000, [7, 5], dataloader_params={"downloads_path": my_tmp_dir, "use_categories": None, "use_universities": None}) + + +@pytest.mark.data +def test_load_bbcsport(my_tmp_dir): + _helper_test_data_loader(load_bbcsport, 737, 2000, 5, dataloader_params={"downloads_path": my_tmp_dir}) + +@pytest.mark.data +def test_load_bbcnews(my_tmp_dir): + _helper_test_data_loader(load_bbcnews, 2225, 2000, 5, dataloader_params={"downloads_path": my_tmp_dir}) diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py index c776ff6..68ba772 100644 --- a/clustpy/deep/_abstract_deep_clustering_algo.py +++ b/clustpy/deep/_abstract_deep_clustering_algo.py @@ -6,6 +6,7 @@ from clustpy.utils.checks import check_parameters from sklearn.utils.validation import check_is_fitted from sklearn.metrics.pairwise import pairwise_distances_argmin_min +from pathlib import Path class _AbstractDeepClusteringAlgo(TransformerMixin, ClusterMixin, BaseEstimator): @@ -19,7 +20,7 @@ class _AbstractDeepClusteringAlgo(TransformerMixin, ClusterMixin, BaseEstimator) neural_network : torch.nn.Module | tuple the neural network used for the computations. 
Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict). - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the autoencoder @@ -29,7 +30,7 @@ class _AbstractDeepClusteringAlgo(TransformerMixin, ClusterMixin, BaseEstimator) use a fixed random state to get a repeatable solution. Can also be of type int """ - def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neural_network_weights: str, + def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int, device: torch.device, random_state: np.random.RandomState | int): self.batch_size = batch_size self.neural_network = neural_network diff --git a/clustpy/deep/_data_utils.py b/clustpy/deep/_data_utils.py index fc787fa..257e08a 100644 --- a/clustpy/deep/_data_utils.py +++ b/clustpy/deep/_data_utils.py @@ -2,6 +2,7 @@ import torchvision import numpy as np from typing import Callable, List +from pathlib import PurePath class _ClustpyDataset(torch.utils.data.Dataset): @@ -287,9 +288,9 @@ def get_train_and_test_dataloader(X: np.ndarray | torch.Tensor, batch_size: int else: trainloader, testloader = custom_dataloaders # If train-/testloader is string, it can be loaded from a file - if type(trainloader) is str: + if isinstance(trainloader, (str, PurePath)): trainloader = torch.load(trainloader, weights_only=False) - if type(testloader) is str: + if isinstance(testloader, (str, PurePath)): testloader = torch.load(testloader, weights_only=False) if trainloader.batch_size != testloader.batch_size: print( @@ -313,7 +314,6 @@ def get_default_augmented_dataloaders(X: np.ndarray | torch.Tensor, batch_size: a channel-wise z-transformation. Optionally, the images can be flatten afterward. 
- Parameters ---------- X : np.ndarray | torch.Tensor diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index e776d96..4618a5c 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -6,6 +6,7 @@ from clustpy.deep._data_utils import get_dataloader, get_train_and_test_dataloader, get_data_dim_from_dataloader from clustpy.deep._utils import run_initial_clustering, detect_device, encode_batchwise, mean_squared_error from collections.abc import Callable +from pathlib import Path def _get_default_layers(input_dim: int, embedding_size: int) -> list: @@ -31,7 +32,7 @@ def _get_default_layers(input_dim: int, embedding_size: int) -> list: def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, - neural_network_params: dict = None, neural_network_weights: str = None, device : torch.device = None, + neural_network_params: dict = None, neural_network_weights: str | Path = None, device : torch.device = None, random_state: np.random.RandomState | int = None) -> torch.nn.Module: """This function returns a new neural_network. - If neural_network is already a torch.nn.module, nothing will happen. @@ -52,7 +53,7 @@ def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) device : torch.device The device on which to perform the computations. 
@@ -111,7 +112,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, - neural_network_params: dict = None, neural_network_weights: str = None, + neural_network_params: dict = None, neural_network_weights: str | Path = None, random_state: np.random.RandomState | int = None) -> torch.nn.Module: """This function returns a trained neural network. The following cases are considered - If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again. @@ -147,7 +148,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) random_state : np.random.RandomState | int use a fixed random state to get a repeatable solution. 
Can also be of type int (default: None) @@ -185,7 +186,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c random_state: np.random.RandomState, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, neural_network_params: dict = None, - neural_network_weights: str = None) -> ( + neural_network_weights: str | Path = None) -> ( torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int, np.ndarray, np.ndarray, ClusterMixin): """ @@ -231,7 +232,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) Returns diff --git a/clustpy/deep/aec.py b/clustpy/deep/aec.py index fcf6e61..920a083 100644 --- a/clustpy/deep/aec.py +++ b/clustpy/deep/aec.py @@ -12,12 +12,13 @@ from clustpy.deep.dcn import _DCN_Module import tqdm from collections.abc import Callable +from pathlib import Path def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, - neural_network: torch.nn.Module | tuple, neural_network_weights: str, + neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, @@ -48,7 +49,7 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, 
pretrain_optimizer_par neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network @@ -242,7 +243,7 @@ class AEC(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -296,7 +297,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, clustering_loss_weight: float = 0.1, ssl_loss_weight: float = 1.0, neural_network: torch.nn.Module | tuple = None, - neural_network_weights: str = None, embedding_size: int = 10, custom_dataloaders: tuple = None, + neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None, augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = None, initial_clustering_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): diff --git a/clustpy/deep/dcn.py b/clustpy/deep/dcn.py index 7fcf906..2ed0c53 100644 --- a/clustpy/deep/dcn.py +++ b/clustpy/deep/dcn.py @@ -13,12 +13,13 @@ from sklearn.base import ClusterMixin import tqdm from collections.abc import Callable +from 
pathlib import Path def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, - neural_network: torch.nn.Module | tuple, neural_network_weights: str, + neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, @@ -49,7 +50,7 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network @@ -397,7 +398,7 @@ class DCN(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. 
Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -454,7 +455,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, clustering_loss_weight: float = 0.1, ssl_loss_weight: float = 1.0, neural_network: torch.nn.Module | tuple = None, - neural_network_weights: str = None, embedding_size: int = 10, custom_dataloaders: tuple = None, + neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None, augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py index 6cde45b..070a221 100644 --- a/clustpy/deep/ddc_n2d.py +++ b/clustpy/deep/ddc_n2d.py @@ -16,12 +16,13 @@ import inspect from collections.abc import Callable from clustpy.utils.checks import check_parameters +from pathlib import Path def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, - neural_network_weights: str, embedding_size: int, custom_dataloaders: tuple, + neural_network_weights: str | Path, embedding_size: int, custom_dataloaders: tuple, manifold_class: TransformerMixin, manifold_params: dict, clustering_class: ClusterMixin, 
clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> ( @@ -48,7 +49,7 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network @@ -261,7 +262,7 @@ class DDC(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -311,7 +312,7 @@ class DDC(_AbstractDeepClusteringAlgo): def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None, pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None, tsne_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state) @@ -409,7 +410,7 @@ class N2D(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple 
the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -455,7 +456,7 @@ class N2D(_AbstractDeepClusteringAlgo): def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimizer_params: dict = None, pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None, manifold_class: TransformerMixin = TSNE, manifold_params: dict = None, initial_clustering_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): diff --git a/clustpy/deep/dec.py b/clustpy/deep/dec.py index a2501fb..7b6a428 100644 --- a/clustpy/deep/dec.py +++ b/clustpy/deep/dec.py @@ -14,12 +14,13 @@ from sklearn.base import ClusterMixin import tqdm from collections.abc import Callable# +from pathlib import Path def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, - neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, + neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, 
custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> ( @@ -52,7 +53,7 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network @@ -433,7 +434,7 @@ class DEC(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -491,7 +492,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25 pretrain_epochs: int = 100, clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, clustering_loss_weight: float = 1., custom_dataloaders: tuple = None, augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None, device: torch.device = None, @@ -586,7 +587,7 @@ class IDEC(DEC): 
neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -645,7 +646,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25 clustering_optimizer_params: dict = None, pretrain_epochs: int = 100, clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, clustering_loss_weight: float = 0.1, ssl_loss_weight: float = 1.0, custom_dataloaders: tuple = None, augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None, diff --git a/clustpy/deep/deepect.py b/clustpy/deep/deepect.py index 028afbb..2042f69 100644 --- a/clustpy/deep/deepect.py +++ b/clustpy/deep/deepect.py @@ -15,6 +15,7 @@ import copy from collections.abc import Callable from sklearn.utils.validation import check_is_fitted +from pathlib import Path class _DeepECT_ClusterTreeNode(_ClusterTreeNode): @@ -447,7 +448,7 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, grow_interval: int, pruning_threshold: float, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, - neural_network_weights: str, 
embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, + neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, device: torch.device, random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): """ @@ -480,7 +481,7 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network @@ -564,7 +565,7 @@ class DeepECT(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. 
Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int Size of the embedding within the neural network (default: 10) @@ -607,7 +608,7 @@ def __init__(self, max_n_leaf_nodes: int = 20, batch_size: int = 256, pretrain_o grow_interval: int = 2, pruning_threshold: float = 0.1, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, clustering_loss_weight: float = 1., ssl_loss_weight: float = 1., custom_dataloaders: tuple = None, augmentation_invariance: bool = False, device: torch.device = None, random_state: np.random.RandomState | int = None): diff --git a/clustpy/deep/den.py b/clustpy/deep/den.py index 7306175..2562e76 100644 --- a/clustpy/deep/den.py +++ b/clustpy/deep/den.py @@ -13,6 +13,7 @@ from sklearn.neighbors import NearestNeighbors from sklearn.cluster import KMeans from collections.abc import Callable +from pathlib import Path class DEN(_AbstractDeepClusteringAlgo): @@ -53,7 +54,7 @@ class DEN(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. 
Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: None) @@ -98,7 +99,7 @@ def __init__(self, n_clusters: int = 8, group_size : int | list | None = 2, n_ne batch_size: int = 256, pretrain_optimizer_params: dict = None, pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int | None = None, custom_dataloaders: tuple = None, device: torch.device = None, random_state: np.random.RandomState | int = None): super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state) diff --git a/clustpy/deep/dipdeck.py b/clustpy/deep/dipdeck.py index 8433330..117cf6d 100644 --- a/clustpy/deep/dipdeck.py +++ b/clustpy/deep/dipdeck.py @@ -15,6 +15,7 @@ import tqdm from collections.abc import Callable from sklearn.utils.validation import check_is_fitted +from pathlib import Path def _merge_by_dip_value(X: np.ndarray, embedded_data: np.ndarray, cluster_labels_cpu: np.ndarray, @@ -102,7 +103,6 @@ def _force_merge(X: np.ndarray, embedded_data: np.ndarray, cluster_labels_cpu: n First strategy is to delete the smallest cluster if it is smaller than 0.2 * the average cluster size. In this case, the samples in this cluster will be assigned to the clostest remaining cluster. If there is no cluster that is sufficiently small, the cluster-combination with the largest dip-value will be merged. 
- Parameters ---------- @@ -617,7 +617,7 @@ class DipDECK(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 5) @@ -648,7 +648,6 @@ class DipDECK(_AbstractDeepClusteringAlgo): debug : bool If true, additional information will be printed to the console (default: False) - Attributes ---------- labels_ : np.ndarray @@ -682,7 +681,7 @@ def __init__(self, n_clusters_init: int = 35, dip_merge_threshold: float = 0.9, clustering_optimizer_params: dict = None, pretrain_epochs: int = 100, clustering_epochs: int = 50, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 5, max_cluster_size_diff_factor: float = 2, pval_strategy: str = "table", n_boots: int = 1000, custom_dataloaders: tuple = None, augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None, diff --git a/clustpy/deep/dipencoder.py b/clustpy/deep/dipencoder.py index bf6774e..2248a7b 100644 --- a/clustpy/deep/dipencoder.py +++ b/clustpy/deep/dipencoder.py @@ -17,6 +17,7 @@ from clustpy.utils import plot_scatter_matrix import tqdm from collections.abc import Callable +from pathlib import Path """ Dip module - holds backward functions @@ -699,7 +700,7 @@ class DipEncoder(_AbstractDeepClusteringAlgo): neural_network : 
torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -763,7 +764,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = None, pretrain_optimiz clustering_optimizer_params: dict = None, pretrain_epochs: int = 100, clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, max_cluster_size_diff_factor: float = 3, clustering_loss_weight: float = 1., ssl_loss_weight: float = None, custom_dataloaders: tuple = None, augmentation_invariance: bool = False, diff --git a/clustpy/deep/dkm.py b/clustpy/deep/dkm.py index 896a6f3..185e801 100644 --- a/clustpy/deep/dkm.py +++ b/clustpy/deep/dkm.py @@ -12,12 +12,13 @@ from sklearn.base import ClusterMixin import tqdm from collections.abc import Callable +from pathlib import Path def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, - neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, + neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, 
custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> ( @@ -52,7 +53,7 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network @@ -406,7 +407,7 @@ class DKM(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (default: 10) @@ -466,7 +467,7 @@ def __init__(self, n_clusters: int = 8, alphas: tuple = (1000), batch_size: int pretrain_epochs: int = 100, clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, clustering_loss_weight: float = 0.1, ssl_loss_weight: float = 1., custom_dataloaders: tuple = None, augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None, diff --git a/clustpy/deep/enrc.py 
b/clustpy/deep/enrc.py index b6b364a..032e91c 100644 --- a/clustpy/deep/enrc.py +++ b/clustpy/deep/enrc.py @@ -18,6 +18,7 @@ from clustpy.alternative.nrkmeans import _get_total_cost_function import tqdm from collections.abc import Callable +from pathlib import Path class _ENRC_Module(torch.nn.Module): @@ -1722,7 +1723,7 @@ def _enrc(X: np.ndarray, n_clusters: list, V: np.ndarray, P: list, input_centers pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, clustering_loss_weight: float, ssl_loss_weight: float, neural_network: torch.nn.Module | tuple, - neural_network_weights: str, embedding_size: int, init: str, random_state: np.random.RandomState, + neural_network_weights: str | Path, embedding_size: int, init: str, random_state: np.random.RandomState, device: torch.device, scheduler: torch.optim.lr_scheduler, scheduler_params: dict, tolerance_threshold: float, init_kwargs: dict, init_subsample_size: int, custom_dataloaders: tuple, augmentation_invariance: bool, final_reclustering: bool, debug: bool) -> ( @@ -1763,7 +1764,7 @@ def _enrc(X: np.ndarray, n_clusters: list, V: np.ndarray, P: list, input_centers neural_network : torch.nn.Module | tuple the input neural network. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network. Only used if neural_network is None @@ -1948,7 +1949,7 @@ class ENRC(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. 
Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network. Only used if neural_network is None (default: 20) @@ -2007,7 +2008,7 @@ def __init__(self, n_clusters: list, V: np.ndarray = None, P: list = None, input tolerance_threshold: float = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 20, init: str = "nrkmeans", device: torch.device = None, scheduler: torch.optim.lr_scheduler = None, scheduler_params: dict = None, init_kwargs: dict = None, init_subsample_size: int = 10000, @@ -2289,7 +2290,7 @@ class ACeDeC(ENRC): neural_network : torch.nn.Module | tuple the input neural network. If None, a new FeedforwardAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network. 
Only used if neural_network is None (default: 20) @@ -2348,7 +2349,7 @@ def __init__(self, n_clusters: int, V: np.ndarray = None, P: list = None, input_ tolerance_threshold: float = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 20, init: str = "acedec", device: torch.device = None, scheduler: torch.optim.lr_scheduler = None, scheduler_params: dict = None, init_kwargs: dict = None, init_subsample_size: int = 10000, diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py index f582157..33c9d36 100644 --- a/clustpy/deep/neural_networks/_abstract_autoencoder.py +++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py @@ -8,13 +8,13 @@ from clustpy.deep._early_stopping import EarlyStopping from clustpy.deep._data_utils import get_dataloader from clustpy.deep._utils import get_device_from_module, mean_squared_error -import os import tqdm from collections.abc import Callable from sklearn.utils import check_random_state from clustpy.deep._utils import set_torch_seed from collections.abc import Callable from clustpy.utils.checks import check_parameters +from pathlib import Path class FullyConnectedBlock(torch.nn.Module): @@ -394,28 +394,28 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in self.fitted = True return self - def save_parameters(self, path: str) -> None: + def save_parameters(self, path: str | Path) -> None: """ Save the current state_dict of the model. 
Parameters ---------- - path : str + path : str | Path Path where the state_dict should be stored """ # Check if directory exists - parent_directory = os.path.dirname(path) - if parent_directory != "" and not os.path.isdir(parent_directory): - os.makedirs(parent_directory) + if isinstance(path, str): + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) torch.save(self.state_dict(), path) - def load_parameters(self, path: str) -> '_AbstractAutoencoder': + def load_parameters(self, path: str | Path) -> '_AbstractAutoencoder': """ Load a state_dict into the current model to set its parameters. Parameters ---------- - path : str + path : str | Path Path from where the state_dict should be loaded Returns diff --git a/clustpy/deep/vade.py b/clustpy/deep/vade.py index b5d6e79..ee045b5 100644 --- a/clustpy/deep/vade.py +++ b/clustpy/deep/vade.py @@ -15,12 +15,13 @@ from sklearn.base import ClusterMixin import tqdm from collections.abc import Callable +from pathlib import Path def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, - neural_network: torch.nn.Module | tuple, neural_network_weights: str, + neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> ( @@ -51,7 +52,7 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa neural_network : torch.nn.Module | tuple the input neural network. 
Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network. embedding_size : int size of the embedding within the neural network (central layer with mean and variance) @@ -513,7 +514,7 @@ class VaDE(_AbstractDeepClusteringAlgo): neural_network : torch.nn.Module | tuple the input neural network. If None, a new VariationalAutoencoder will be created. Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) - neural_network_weights : str + neural_network_weights : str | Path Path to a file containing the state_dict of the neural_network (default: None) embedding_size : int size of the embedding within the neural network (central layer with mean and variance) (default: 10) @@ -571,7 +572,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = torch.nn.BCELoss(reduction='sum'), clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0, - neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, + neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None, initial_clustering_class: ClusterMixin = GaussianMixture, initial_clustering_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): diff --git a/clustpy/utils/_information_theory.py b/clustpy/utils/_information_theory.py index a9052e6..f96270c 100644 --- a/clustpy/utils/_information_theory.py +++ b/clustpy/utils/_information_theory.py @@ -33,7 +33,7 @@ def bic_costs(n_points: int, use_log2: bool = False) -> float: 
return bic_costs -def integer_costs(integer: int) -> float: +def integer_costs(integer: int, use_log2: bool = False) -> float: """ Calculate the costs to encode an integer value. Uses following formula: log(integer) + log(log(integer)) + log(log(log(integer))) + ... + log(const), where const = 2.865064. @@ -42,22 +42,24 @@ def integer_costs(integer: int) -> float: ---------- integer : int The integer value to encode + use_log2 : bool + Defines whether log2 should be used instead of ln (default: False) Returns ------- costs : float The encoding costs of the integer """ - assert type(integer) is int or type(integer) is np.int32 or type( - integer) is np.int64, "The input to calculate the mdl costs of must be an integer. Your input:\n{0} (type: {1})".format( + assert isinstance(integer, (int, np.integer)), "The input to calculate the mdl costs of must be an integer. Your input:\n{0} (type: {1})".format( integer, type(integer)) costs = 0 - if integer != 0: - last_interim_result = np.log2(integer) + if integer > 0: + last_interim_result = np.log2(integer) if use_log2 else np.log(integer) while last_interim_result > 0: costs += last_interim_result - last_interim_result = np.log2(last_interim_result) - costs = costs + np.log2(2.865064) + last_interim_result = np.log2(last_interim_result) if use_log2 else np.log(last_interim_result) + const = np.log2(2.865064) if use_log2 else np.log(2.865064) + costs = costs + const return costs diff --git a/clustpy/utils/evaluation.py b/clustpy/utils/evaluation.py index d95d205..7ea51cb 100644 --- a/clustpy/utils/evaluation.py +++ b/clustpy/utils/evaluation.py @@ -4,10 +4,10 @@ from sklearn.utils import check_random_state from sklearn.base import ClusterMixin from collections.abc import Callable -import os import inspect from sklearn.datasets._base import Bunch import sys +from pathlib import Path, PurePath def _preprocess_dataset(X: np.ndarray, preprocess_methods: list, preprocess_params: list) -> np.ndarray: @@ -113,7 +113,7 @@ def 
evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr labels_true: np.ndarray = None, n_repetitions: int = 10, X_test: np.ndarray = None, labels_true_test: np.ndarray = None, aggregation_functions: tuple = (np.mean, np.std), add_runtime: bool = True, - add_n_clusters: bool = False, save_path: str = None, save_labels_path: str = None, + add_n_clusters: bool = False, save_path: str | Path = None, save_labels_path: str | Path = None, ignore_algorithms: tuple = (), dataset_name: str = None, random_state: np.random.RandomState | int | list = None, quiet: bool = False) -> pd.DataFrame: """ @@ -143,10 +143,10 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr Add runtime of each execution to the final table (default: True) add_n_clusters : bool Add the resulting number of clusters to the final table (default: False) - save_path : str - The path where the final DataFrame should be saved as csv. If None, the DataFrame will not be saved (default: None) - save_labels_path : str - The path where the clustering labels should be saved as csv. If None, the labels will not be saved (default: None) + save_path : str | Path + The path where the final DataFrame should be saved. If None, the DataFrame will not be saved (default: None) + save_labels_path : str | Path + The path where the clustering labels should be saved. If None, the labels will not be saved (default: None) ignore_algorithms : tuple List of algorithm names (as specified in the EvaluationAlgorithm object) that should be ignored for this specific data set (default: []) dataset_name : str @@ -195,14 +195,16 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr evaluation_algorithms = [evaluation_algorithms] if type(evaluation_metrics) is not list and evaluation_metrics is not None: evaluation_metrics = [evaluation_metrics] - if save_labels_path is not None and "." 
not in save_labels_path: - save_labels_path = save_labels_path + ".csv" - assert save_labels_path is None or len( - save_labels_path.split(".")) == 2, "save_labels_path must only contain a single dot. E.g., NAME.csv" - if save_path is not None and "." not in save_path: - save_path = save_path + ".csv" - assert save_path is None or len( - save_path.split(".")) == 2, "save_path must only contain a single dot. E.g., NAME.csv" + if save_labels_path is not None: + if isinstance(save_labels_path, str): + save_labels_path = Path(save_labels_path) + if save_labels_path.suffix == "": + save_labels_path = save_labels_path.with_suffix(".csv") + if save_path is not None: + if isinstance(save_path, str): + save_path = Path(save_path) + if save_path.suffix == "": + save_path = save_path.with_suffix(".csv") seeds = _get_fixed_seed_for_each_run(n_repetitions, random_state) algo_names = [a.name for a in evaluation_algorithms] assert max( @@ -300,17 +302,14 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr runtime = time.time() - start_time # Optional: Save labels if save_labels_path is not None: - save_labels_path_algo = None if save_labels_path is None else "{0}_{1}_{2}.{3}".format( - save_labels_path.split(".")[0], eval_algo.name, rep, save_labels_path.split(".")[1]) + save_labels_path_algo = save_labels_path.with_name("{0}_{1}_{2}{3}".format(save_labels_path.stem, + eval_algo.name, rep, save_labels_path.suffix)) # Check if directory exists - parent_directory = os.path.dirname(save_labels_path_algo) - if parent_directory != "" and not os.path.isdir(parent_directory): - os.makedirs(parent_directory) + save_labels_path_algo.parent.mkdir(parents=True, exist_ok=True) np.savetxt(save_labels_path_algo, algo_obj.labels_) # Also save predict labels if X_test is not None and labels_predicted_test is not None: - save_labels_path_algo_test = "{0}_TEST.{1}".format(save_labels_path_algo.split(".")[0], - save_labels_path_algo.split(".")[1]) + save_labels_path_algo_test = 
save_labels_path_algo.with_name("{0}_TEST{1}".format(save_labels_path_algo.stem, save_labels_path_algo.suffix)) np.savetxt(save_labels_path_algo_test, labels_predicted_test) # Get result of all metrics if evaluation_metrics is not None: @@ -386,17 +385,15 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr print("-> Aggregation {0}: {1}".format(agg.__name__, aggregated_results)) if save_path is not None: # Check if directory exists - parent_directory = os.path.dirname(save_path) - if parent_directory != "" and not os.path.isdir(parent_directory): - os.makedirs(parent_directory) + save_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(save_path) return df def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms: list, evaluation_metrics: list = None, n_repetitions: int = 10, aggregation_functions: tuple = (np.mean, np.std), - add_runtime: bool = True, add_n_clusters: bool = False, save_path: str = None, - save_intermediate_results: bool = False, save_labels_path: str = None, + add_runtime: bool = True, add_n_clusters: bool = False, save_path: str | Path = None, + save_intermediate_results: bool = False, save_labels_path: str | Path = None, random_state: np.random.RandomState | int | list = None, quiet: bool = False) -> pd.DataFrame: """ Evaluate the clustering result of different clustering algorithms (as specified by evaluation_algorithms) on a set of data sets (as specified by evaluation_datasets) using different metrics (as specified by evaluation_metrics). @@ -419,15 +416,15 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms: Add runtime of each execution to the final table (default: True) add_n_clusters : bool Add the resulting number of clusters to the final table (default: False) - save_path : str - The path where the final DataFrame should be saved as csv. If None, the DataFrame will not be saved (default: None) + save_path : str | Path + The path where the final DataFrame should be saved. 
If None, the DataFrame will not be saved (default: None) save_intermediate_results : bool Defines whether the result of each data set should be separately saved. Useful if the evaluation takes a lot of time. The files will be saved as [save_path]_[DATASET_NAME]. This implies that save_path has to be defined if save_intermediate_results is set to True (default: False) - save_labels_path : str - The path where the clustering labels should be saved as csv. If None, the labels will not be saved (default: None) + save_labels_path : str | Path + The path where the clustering labels should be saved. If None, the labels will not be saved (default: None) random_state : np.random.RandomState | int | list use a fixed random state to get a repeatable solution. Can also be of type int. Furthermore, if can be a list containing an int for each repetition (default: None) @@ -477,14 +474,16 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms: "save_intermediate_results is True" if type(evaluation_datasets) is not list: evaluation_datasets = [evaluation_datasets] - if save_labels_path is not None and "." not in save_labels_path: - save_labels_path = save_labels_path + ".csv" - assert save_labels_path is None or len( - save_labels_path.split(".")) == 2, "save_labels_path must only contain a single dot. E.g., NAME.csv" - if save_path is not None and "." not in save_path: - save_path = save_path + ".csv" - assert save_path is None or len( - save_path.split(".")) == 2, "save_path must only contain a single dot. 
E.g., NAME.csv" + if save_labels_path is not None: + if isinstance(save_labels_path, str): + save_labels_path = Path(save_labels_path) + if save_labels_path.suffix == "": + save_labels_path = save_labels_path.with_suffix(".csv") + if save_path is not None: + if isinstance(save_path, str): + save_path = Path(save_path) + if save_path.suffix == "": + save_path = save_path.with_suffix(".csv") data_names = [d.name for d in evaluation_datasets] assert max( np.unique(data_names, return_counts=True)[1]) == 1, "Some names of your datasets do not seem to be unique!" @@ -506,11 +505,10 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms: X = _preprocess_dataset(X, eval_data.preprocess_methods, eval_data.preprocess_params) if X_test is not None: X_test = _preprocess_dataset(X_test, eval_data.preprocess_methods, eval_data.preprocess_params) - inner_save_path = None if not save_intermediate_results else "{0}_{1}.{2}".format(save_path.split(".")[0], - eval_data.name, - save_path.split(".")[1]) - inner_save_labels_path = None if save_labels_path is None else "{0}_{1}.{2}".format( - save_labels_path.split(".")[0], eval_data.name, save_labels_path.split(".")[1]) + inner_save_path = None if not save_intermediate_results else save_path.with_name("{0}_{1}".format(save_path.name, + eval_data.name)) + inner_save_labels_path = None if save_labels_path is None else save_labels_path.with_name("{0}_{1}".format( + save_labels_path.name, eval_data.name)) df = evaluate_dataset(X, evaluation_algorithms, evaluation_metrics=evaluation_metrics, labels_true=labels_true, n_repetitions=n_repetitions, X_test=X_test, labels_true_test=labels_true_test, @@ -527,9 +525,7 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms: all_dfs = pd.concat(df_list, keys=data_names) if save_path is not None: # Check if directory exists - parent_directory = os.path.dirname(save_path) - if parent_directory != "" and not os.path.isdir(parent_directory): -
os.makedirs(parent_directory) + save_path.parent.mkdir(parents=True, exist_ok=True) all_dfs.to_csv(save_path) return all_dfs @@ -543,7 +539,7 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo Parameters ---------- - data_input : np.ndarray + data_input : np.ndarray | str | Path | Callable The actual data set. Can be a np.ndarray, a path to a data file (of type str) or a callable (e.g. a method from clustpy.data) data_loader_params_input : dict Dictionary containing the information necessary to load data from a function or file. Only relevant if data is of type callable or str @@ -567,7 +563,7 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo labels_true = None X_test = None labels_true_test = None - if type(data_input) is str: + if isinstance(data_input, (str, PurePath)): X = np.genfromtxt(data_input, **data_loader_params_input) elif type(data_input) is np.ndarray: X = data_input @@ -607,7 +603,7 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo return X, labels_true, X_test, labels_true_test -def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "mean", output_path: str = None, pm_row: str | None = "std", +def evaluation_df_to_latex_table(df: pd.DataFrame | str | Path, relevant_row : str = "mean", output_path: str | Path = None, pm_row: str | None = "std", bracket_row: str | None = None, best_in_bold: bool = True, second_best_underlined: bool = True, third_best_dashed_underlined: bool = False, color_by_value: str = None, higher_is_better: list = None, multiplier: int | float | list | None = 100, decimal_places: int = 1, color_min_max: tuple = (5, 70)) -> str: @@ -620,11 +616,11 @@ def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "m Parameters ---------- - df : pd.DataFrame | str - The pandas dataframe. 
Can also be a string that contains the path to the saved dataframe + df : pd.DataFrame | str | Path + The pandas dataframe. Can also be a string/path that contains the path to the saved dataframe relevant_row : str The name of the row in the df that is used to create the latex table (default: "mean") - output_path : str + output_path : str | Path The path were the resulting latex table text file will be stored (default: None) pm_row : str The name of the row in the df that should be added to the latex table after the value from relevant_row separated by plus-minus (default: "std") @@ -662,8 +658,8 @@ def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "m The created latex string """ # Load dataframe - assert type(df) == pd.DataFrame or type(df) == str, "Type of df must be pandas DataFrame or string (path to file)" - if type(df) == str: + assert isinstance(df, (pd.DataFrame, str, PurePath)), "Type of df must be pandas DataFrame, Path or string (path to file)" + if isinstance(df, (str, PurePath)): df_file = open(df, "r").readlines() multiple_datasets = df_file[2].split(",")[0] != "0" df = pd.read_csv(df, index_col=[0, 1] if multiple_datasets else [0], header=[0, 1]) @@ -814,8 +810,8 @@ class EvaluationDataset(): ---------- name : str Name of the data set. Can be chosen freely - data : np.ndarray - The actual data set. Can be a np.ndarray, a path to a data file (of type str) or a callable (e.g. a method from clustpy.data) + data : np.ndarray | str | Path | Callable + The actual data set. Can be a np.ndarray, a path to a data file or a callable (e.g. a method from clustpy.data) labels_true : np.ndarray The ground truth labels. Can be a np.ndarray, an int or list specifying which columns of the data contain the labels or None if no ground truth labels are present.
If data is a callable, the ground truth labels can also be obtained by that function and labels_true can be None (default: None) @@ -846,13 +842,13 @@ class EvaluationDataset(): >>> ed2 = EvaluationDataset(name="wine", data=X, labels_true=L) """ - def __init__(self, name: str, data: np.ndarray, labels_true: np.ndarray = None, data_loader_params: dict = None, + def __init__(self, name: str, data: np.ndarray | str | Path | Callable, labels_true: np.ndarray = None, data_loader_params: dict = None, train_test_split: bool = None, preprocess_methods: list = None, preprocess_params: list = None, ignore_algorithms: tuple = ()): assert type(name) is str, "name must be a string" self.name = name assert "." not in name, "name must not contain a dot" - assert type(data) is np.ndarray or type(data) is str or callable(data), "data must be a numpy array, a string " \ + assert isinstance(data, (np.ndarray, str, PurePath)) or callable(data), "data must be a numpy array, a string " \ "containing the path to a data file or a " \ "function returning a data and a labels array" self.data = data diff --git a/clustpy/utils/tests/test_information_theory.py b/clustpy/utils/tests/test_information_theory.py index 5132259..ff862fe 100644 --- a/clustpy/utils/tests/test_information_theory.py +++ b/clustpy/utils/tests/test_information_theory.py @@ -89,8 +89,10 @@ def test_bic_costs(): def test_integer_costs(): - costs = integer_costs(77) + costs = integer_costs(77, True) assert abs(costs - 12.328150766) < 1e-9 + costs = integer_costs(77, False) + assert abs(costs - 7.24955913584) < 1e-9 def test_mdl_costs_probability():