From 58e1083ce96cf6d650b516f189bef62e2b6e7acb Mon Sep 17 00:00:00 2001
From: Collin Leiber <collin.leiber@aalto.fi>
Date: Mon, 25 May 2026 13:50:46 +0300
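Subject: [PATCH] Change path handling from os to pathlib #77

Replace os.path string handling with pathlib.Path; path parameters of the
data helpers now accept str | Path.

Also included in this patch:
- nrkmeans: pass use_log2=True to every mdl.integer_costs call
- data: export load_bbcnews and load_bbcsport
- data/_utils: _transform_text_data additionally returns the vocabulary
- data/_utils: _decompress_z_file calls 7z via subprocess.run instead of
  os.system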
---
 clustpy/alternative/nrkmeans.py               |  12 +-
 clustpy/data/__init__.py                      |   6 +-
 clustpy/data/_utils.py                        |  93 ++---
 clustpy/data/real_clustpy_data.py             |   4 +-
 clustpy/data/real_medical_mnist_data.py       |  82 ++---
 clustpy/data/real_timeseries_data.py          |  59 ++--
 clustpy/data/real_torchvision_data.py         |  42 +--
 clustpy/data/real_uci_data.py                 | 320 +++++++++---------
 clustpy/data/real_video_data.py               |  54 +--
 clustpy/data/real_world_data.py               | 286 ++++++++++++----
 clustpy/data/tests/test_real_world_data.py    |  16 +-
 .../deep/_abstract_deep_clustering_algo.py    |   5 +-
 clustpy/deep/_data_utils.py                   |   6 +-
 clustpy/deep/_train_utils.py                  |  13 +-
 clustpy/deep/aec.py                           |   9 +-
 clustpy/deep/dcn.py                           |   9 +-
 clustpy/deep/ddc_n2d.py                       |  13 +-
 clustpy/deep/dec.py                           |  13 +-
 clustpy/deep/deepect.py                       |   9 +-
 clustpy/deep/den.py                           |   5 +-
 clustpy/deep/dipdeck.py                       |   7 +-
 clustpy/deep/dipencoder.py                    |   5 +-
 clustpy/deep/dkm.py                           |   9 +-
 clustpy/deep/enrc.py                          |  13 +-
 .../neural_networks/_abstract_autoencoder.py  |  16 +-
 clustpy/deep/vade.py                          |   9 +-
 clustpy/utils/_information_theory.py          |  16 +-
 clustpy/utils/evaluation.py                   | 112 +++---
 .../utils/tests/test_information_theory.py    |   4 +-
 29 files changed, 718 insertions(+), 529 deletions(-)

diff --git a/clustpy/alternative/nrkmeans.py b/clustpy/alternative/nrkmeans.py
index 83348c2..9705d81 100644
--- a/clustpy/alternative/nrkmeans.py
+++ b/clustpy/alternative/nrkmeans.py
@@ -1292,7 +1292,7 @@ def _mdl_m_dependant_subspace_costs(X: np.ndarray, V: np.ndarray, cluster_index:
     # ==== Costs of cluster space ====
     cropped_V_cluster = V[:, P_cluster]
     # Costs for cluster dimensionality
-    cluster_costs = mdl.integer_costs(m_cluster)
+    cluster_costs = mdl.integer_costs(m_cluster, use_log2=True)
     # Costs for centers
     cluster_costs += n_clusters[cluster_index] * _mdl_reference_vector(m_cluster, max_distance, precision)
     # Costs for point encoding
@@ -1306,7 +1306,7 @@ def _mdl_m_dependant_subspace_costs(X: np.ndarray, V: np.ndarray, cluster_index:
     # ==== Costs of noise space ====
     cropped_V_noise = V[:, P_noise]
     # Costs for noise dimensionality
-    noise_costs = mdl.integer_costs(m_noise)
+    noise_costs = mdl.integer_costs(m_noise, use_log2=True)
     # Costs for centers
     noise_costs += n_clusters[noise_index] * _mdl_reference_vector(m_noise, max_distance, precision)
     # Costs for point encoding
@@ -1443,7 +1443,7 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray,
     # Costs of matrix V
     # global_costs += mdl.mdl_costs_orthogonal_matrix(n_points, mdl.mdl_costs_float_value(n_points))
     # Costs of number of subspaces
-    global_costs += mdl.integer_costs(subspaces)
+    global_costs += mdl.integer_costs(subspaces, use_log2=True)
     # Costs for each subspace
     all_subspace_costs = []
     for subspace in range(subspaces):
@@ -1451,9 +1451,9 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray,
         # Calculate costs
         model_costs = 0
         # Costs for dimensionality
-        model_costs += mdl.integer_costs(m[subspace])
+        model_costs += mdl.integer_costs(m[subspace], use_log2=True)
         # Number of clusters in this subspace
-        model_costs += mdl.integer_costs(n_clusters[subspace])
+        model_costs += mdl.integer_costs(n_clusters[subspace], use_log2=True)
         # Costs for cluster centers
         model_costs += n_clusters[subspace] * \
                        _mdl_reference_vector(m[subspace], max_distance, precision)
@@ -1462,7 +1462,7 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray,
         if outliers:
             # Encode number of outliers
             n_outliers = len(labels[:, subspace][labels[:, subspace] == -1])
-            model_costs += mdl.integer_costs(n_outliers)
+            model_costs += mdl.integer_costs(n_outliers, use_log2=True)
             # Encode coding costs of outliers
             outlier_costs += n_outliers * np.log2(n_points)
             outlier_costs += n_outliers * _mdl_costs_uniform_pdf(m[subspace], max_distance)
diff --git a/clustpy/data/__init__.py b/clustpy/data/__init__.py
index 4939f40..1971038 100644
--- a/clustpy/data/__init__.py
+++ b/clustpy/data/__init__.py
@@ -1,6 +1,6 @@
 from .synthetic_data_creator import create_subspace_data, create_nr_data
 from .real_world_data import load_newsgroups, load_iris, load_wine, load_breast_cancer, load_rcv1, load_imagenet_dog, \
-    load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb
+    load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb, load_bbcnews, load_bbcsport
 from .real_uci_data import load_har, load_letterrecognition, load_optdigits, load_pendigits, load_banknotes, load_htru2, \
     load_mice_protein, load_ecoli, load_spambase, load_seeds, load_statlog_shuttle, load_forest_types, \
     load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, load_user_knowledge, load_dermatology, \
@@ -105,4 +105,6 @@
            'load_gene_expression_cancer_rna_seq',
            'load_sport_articles',
            'load_wholesale_customers',
-           'load_reuters21578']
+           'load_reuters21578',
+           'load_bbcsport',
+           'load_bbcnews']
diff --git a/clustpy/data/_utils.py b/clustpy/data/_utils.py
index 70c4466..33dddf3 100644
--- a/clustpy/data/_utils.py
+++ b/clustpy/data/_utils.py
@@ -14,23 +14,24 @@
         "[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary")
 import numpy as np
 import os
-from pathlib import Path
+from pathlib import Path, PurePath
 from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
 from sklearn.feature_selection import VarianceThreshold
 from sklearn.datasets import fetch_file
+import subprocess
 
 
-DEFAULT_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_datafiles")
+DEFAULT_DOWNLOAD_PATH = Path.home() / "Downloads" / "clustpy_datafiles"
 
 
-def _get_download_dir(downloads_path: str) -> str:
+def _get_download_dir(downloads_path: str | Path) -> Path:
     """
     Helper function to define the path where the data files should be stored. If downloads_path is None then default path
     '[USER]/Downloads/clustpy_datafiles' will be used. If the directory does not exists it will be created.
 
     Parameters
     ----------
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data will be stored. Can be None
 
     Returns
@@ -44,10 +45,13 @@ def _get_download_dir(downloads_path: str) -> str:
         if env_data_path is None:
             downloads_path = DEFAULT_DOWNLOAD_PATH
         else:
-            downloads_path = env_data_path
-    if not os.path.isdir(downloads_path):
-        os.makedirs(downloads_path)
-        with open(downloads_path + "/info.txt", "w") as f:
+            downloads_path = Path(env_data_path)
+    elif isinstance(downloads_path, str):
+        # Cast str to Path
+        downloads_path = Path(downloads_path)
+    if not downloads_path.is_dir():
+        downloads_path.mkdir(parents=True, exist_ok=False)
+        with open(downloads_path / "info.txt", "w") as f:
             f.write("This directory was created by the ClustPy python package to store real world data sets.\n"
                     "The default directory is '[USER]/Downloads/clustpy_datafiles' and can be changed with the "
                     "'downloads_path' parameter when loading a data set.\n"
@@ -55,7 +59,7 @@ def _get_download_dir(downloads_path: str) -> str:
     return downloads_path
 
 
-def _download_file(file_url: str, filename_local: str) -> None:
+def _download_file(file_url: str, filename_local: str | Path) -> None:
     """
     Helper function to download a file into a specified location.
 
@@ -63,17 +67,18 @@ def _download_file(file_url: str, filename_local: str) -> None:
     ----------
     file_url : str
         URL of the file
-    filename_local : str
+    filename_local : str | Path
         local name of the file after it has been downloaded
     """
-    local_path = Path(filename_local)
-    local_dir = local_path.parent
-    local_filename = local_path.name
+    if isinstance(filename_local, str):
+        filename_local = Path(filename_local)
+    local_dir = filename_local.parent
+    local_filename = filename_local.name
     print("Downloading data set from {0} to {1}".format(file_url, filename_local))
     fetch_file(file_url, folder=local_dir, local_filename=local_filename)
 
 
-def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_size: int = 32768) -> None:
+def _download_file_from_google_drive(file_id: str, filename_local: str | Path, chunk_size: int = 32768) -> None:
     """
     Download a file from google drive.
     Code taken from:
@@ -83,7 +88,7 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si
     ----------
     file_id : str
         ID of the file on google drive
-    filename_local : str
+    filename_local : str | Path
         local name of the file after it has been downloaded
     chunk_size : int
         chink size when downloading the file (default: 32768)
@@ -107,8 +112,8 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si
     session.close()
 
 
-def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> (
-        np.ndarray, np.ndarray):
+def _load_data_file(filename_local: Path, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> tuple[
+        np.ndarray, np.ndarray]:
     """
     Helper function to load a data file. Either the first or last column, depending on last_column_are_labels, of the
     data file is used as the label column.
@@ -116,7 +121,7 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la
 
     Parameters
     ----------
-    filename_local : str
+    filename_local : Path
         local name of the file after it has been downloaded
     file_url : str
         URL of the file
@@ -127,10 +132,10 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la
 
     Returns
     -------
-    data, labels : (np.ndarray, np.ndarray)
+    data, labels : tuple[np.ndarray, np.ndarray]
         the data numpy array, the labels numpy array
     """
-    if not os.path.isfile(filename_local):
+    if not Path(filename_local).is_file():  # coerce so plain str paths keep working, as in the sibling helpers
         _download_file(file_url, filename_local)
     datafile = np.genfromtxt(filename_local, delimiter=delimiter)
     if last_column_are_labels:
@@ -144,7 +149,7 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la
     return data, labels
 
 
-def _decompress_z_file(filename: str, directory: str) -> bool:
+def _decompress_z_file(filename: str | Path, directory: str | Path) -> bool:
     """
     Helper function to decompress a 7z file. The function uses an installed version of 7zip to decompress the file.
     If 7zip is not installed on this machine, the function will return False and a warning is printed.
@@ -161,22 +166,30 @@ def _decompress_z_file(filename: str, directory: str) -> bool:
     successful : bool
         True if decompression was successful, else False
     """
-    os.system("7z x {0} -o{1}".format(filename.replace("\\", "/"), directory.replace("\\", "/")))
-    successful = True
-    if not os.path.isfile(filename[:-2]):
+    if isinstance(filename, str):
+        filename = Path(filename)
+    if isinstance(directory, str):
+        directory = Path(directory)
+    cmd = ["7z", "x", filename.as_posix(), f"-o{directory.as_posix()}"]
+    try:
+        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        print("[WARNING] 7Zip extraction failed or 7z executable is missing!")
+        return False
+    if not filename.with_suffix('').is_file():
         # If no file without .z exists, decompression was not successful
-        successful = False
-        print("[WARNING] 7Zip is needed to uncompress *.Z files!")
-    return successful
+        print("[WARNING] Decompression check failed: expected file not found.")
+        return False
+    return True
 
 
-def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.ndarray:
+def _load_image_data(image: str | Path | np.ndarray, image_size: tuple, color_image: bool) -> np.ndarray:
     """
     Load image and convert it into a coherent size. Returns a numpy array containing the image data.
 
     Parameters
     ----------
-    image : str
+    image : str | Path | np.ndarray
         Path to the image. Can also be a numpy array containing the specific pixels
     image_size : tuple
         images of various sizes can be converted into a coherent size.
@@ -190,7 +203,7 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda
     image_data : np.ndarray
         The numpy array containing the image data
     """
-    if isinstance(image, str):
+    if isinstance(image, (str, PurePath)):
         pil_image = Image.open(image)
     else:
         pil_image = Image.fromarray(np.uint8(image))
@@ -231,7 +244,7 @@ def build_analyzer(self):
 
 def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool, use_stop_words: bool, max_df: float | int, 
                          min_df: float | int, max_features: int, min_variance : float, sublinear_tf: bool, 
-                         data_all: np.ndarray = None) -> np.ndarray:
+                         data_all: np.ndarray | None = None) -> tuple[np.ndarray, np.ndarray]:
     """
     Transform a set of texts into a data matrix.
     Result can be either a raw count matrix or the result of tf-idf.
@@ -261,13 +274,14 @@ def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool,
         The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples 
     sublinear_tf : bool
         Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer)
-    data_all : np.ndarray
+    data_all : np.ndarray | None
         The complete data set, i.e., if no subset is used. If it is None, it will be equal to data (default: None)
 
     Returns
     -------
-    data : np.ndarray
-        The resulting data array
+    tuple : tuple[np.ndarray, np.ndarray]
+        The resulting data array,
+        The vocabulary (feature names) matching the columns of the data output
     """
     if data_all is None:
         data_all = data
@@ -278,18 +292,21 @@ def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool,
         vectorizer = CountVectorizer(dtype=np.float64, stop_words="english" if use_stop_words else None, min_df=min_df, max_df=max_df, max_features=max_features)
     data_sparse_all = vectorizer.fit_transform(data_all)
     data_sparse = vectorizer.transform(data)
+    vocabulary = vectorizer.get_feature_names_out()
     # (Optional) Check for variance threshold
     if min_variance != 0:
         selector = VarianceThreshold(min_variance)
         data_sparse_all = selector.fit_transform(data_sparse_all)
         data_sparse = selector.transform(data_sparse)
+        vocabulary_mask = selector.get_support()  # public API: boolean mask of the features that were kept
+        vocabulary = vocabulary[vocabulary_mask]
     # (Optional) Apply tf-idf
     if use_tfidf:
         tfidf = TfidfTransformer(sublinear_tf=sublinear_tf)
         tfidf.fit(data_sparse_all)
         data_sparse = tfidf.transform(data_sparse)
     data = np.asarray(data_sparse.todense())
-    return data
+    return data, vocabulary
 
 
 def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
@@ -313,11 +330,11 @@ def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
     format_possibilities = ["HW", "HWD", "CHW", "CHWD", "HWC", "HWDC"]
     assert format in format_possibilities, "Format must be within {0}".format(format_possibilities)
     if format == "HW":
-        assert data.ndim == 3
+        assert data.ndim == 3, f"ndim has to be 3 but is {data.ndim}"
     elif format in ["HWD", "CHW", "HWC"]:
-        assert data.ndim == 4
+        assert data.ndim == 4, f"ndim has to be 4 but is {data.ndim}"
     elif format in ["CHWD", "HWDC"]:
-        assert data.ndim == 5
+        assert data.ndim == 5, f"ndim has to be 5 but is {data.ndim}"
     # Flatten shape
     if format != "HW" and format != "HWD":
         if format == "CHW":
diff --git a/clustpy/data/real_clustpy_data.py b/clustpy/data/real_clustpy_data.py
index 6747ddb..abf96be 100644
--- a/clustpy/data/real_clustpy_data.py
+++ b/clustpy/data/real_clustpy_data.py
@@ -1,7 +1,7 @@
 import numpy as np
-import os
 from sklearn.datasets._base import Bunch
 from clustpy.data._utils import unflatten_images
+from pathlib import Path
 
 
 def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray):
@@ -21,7 +21,7 @@ def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray):
     data, labels : (np.ndarray, np.ndarray)
         the data numpy array, the labels numpy array
     """
-    path = os.path.dirname(__file__) + "/datasets/" + file_name
+    path = Path(__file__).parent / "datasets" / file_name
     dataset = np.genfromtxt(path, delimiter=",")
     data = dataset[:, n_labels:]
     labels = dataset[:, :n_labels]
diff --git a/clustpy/data/real_medical_mnist_data.py b/clustpy/data/real_medical_mnist_data.py
index 9474164..8a5daf2 100644
--- a/clustpy/data/real_medical_mnist_data.py
+++ b/clustpy/data/real_medical_mnist_data.py
@@ -1,11 +1,11 @@
 import numpy as np
 from clustpy.data._utils import _get_download_dir, _download_file, flatten_images
-import os
 from sklearn.datasets._base import Bunch
+from pathlib import Path
 
 
 def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, multiple_labelings: bool,
-                             return_X_y: bool, downloads_path: str) -> Bunch:
+                             return_X_y: bool, downloads_path: str | Path) -> Bunch:
     """
     Helper function to load medical MNIST data from https://medmnist.com/.
 
@@ -21,7 +21,7 @@ def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, mult
         specifies if the data set contains multiple labelings (for alternative clusterings)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored. If input was None this will be equal to
         '[USER]/Downloads/clustpy_datafiles'
 
@@ -38,8 +38,8 @@ def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, mult
     assert subset in ["all", "train",
                       "test", "val"], "subset must match 'all', 'train', 'test' or 'val'. Your input {0}".format(subset)
     # Check if data exists
-    filename = _get_download_dir(downloads_path) + "/" + dataset_name + ".npz"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / (dataset_name + ".npz")
+    if not filename.is_file():
         _download_file("https://zenodo.org/record/6496656/files/" + dataset_name + ".npz?download=1", filename)
     # Load data
     dataset = np.load(filename)
@@ -104,7 +104,7 @@ def _load_medical_mnist_data(dataset_name: str, subset: str, colored: bool, mult
 """
 
 
-def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the PathMNIST data set. It consists of 107180 28x28 colored images belonging to one of 9 classes.
     The data set is composed of 89996 training, 10004 validation and 7180 test samples.
@@ -116,7 +116,7 @@ def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_pat
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -138,7 +138,7 @@ def load_path_mnist(subset: str = "all", return_X_y: bool = False, downloads_pat
     return _load_medical_mnist_data("pathmnist", subset, True, False, return_X_y, downloads_path)
 
 
-def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the ChestMNIST data set. It consists of 112120 28x28 grayscale images.
     The ground truth labels consist of 14 labelings with 2 clusters each.
@@ -151,7 +151,7 @@ def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -173,7 +173,7 @@ def load_chest_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa
     return _load_medical_mnist_data("chestmnist", subset, False, True, return_X_y, downloads_path)
 
 
-def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the DermaMNIST data set. It consists of 10015 28x28 colored images belonging to one of 7 classes.
     The data set is composed of 7007 training, 1003 validation and 2005 test samples.
@@ -185,7 +185,7 @@ def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -210,7 +210,7 @@ def load_derma_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa
     return _load_medical_mnist_data("dermamnist", subset, True, False, return_X_y, downloads_path)
 
 
-def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the OCTMNIST data set. It consists of 109309 28x28 grayscale images belonging to one of 4 classes.
     The data set is composed of 97477 training, 10832 validation and 1000 test samples.
@@ -222,7 +222,7 @@ def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -244,7 +244,7 @@ def load_oct_mnist(subset: str = "all", return_X_y: bool = False, downloads_path
     return _load_medical_mnist_data("octmnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the PneumoniaMNIST data set. It consists of 5856 28x28 grayscale images belonging to one of 2 classes.
     The data set is composed of 4708 training, 524 validation and 624 test samples.
@@ -256,7 +256,7 @@ def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, download
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -278,7 +278,7 @@ def load_pneumonia_mnist(subset: str = "all", return_X_y: bool = False, download
     return _load_medical_mnist_data("pneumoniamnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the RetinaMNIST data set. It consists of 1600 28x28 colored images belonging to one of 5 classes.
     The data set is composed of 1080 training, 120 validation and 400 test samples.
@@ -290,7 +290,7 @@ def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_p
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -312,7 +312,7 @@ def load_retina_mnist(subset: str = "all", return_X_y: bool = False, downloads_p
     return _load_medical_mnist_data("retinamnist", subset, True, False, return_X_y, downloads_path)
 
 
-def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the BreastMNIST data set. It consists of 780 28x28 grayscale images belonging to one of 2 classes.
     The data set is composed of 546 training, 78 validation and 156 test samples.
@@ -324,7 +324,7 @@ def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_p
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -346,7 +346,7 @@ def load_breast_mnist(subset: str = "all", return_X_y: bool = False, downloads_p
     return _load_medical_mnist_data("breastmnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the BloodMNIST data set. It consists of 17092 28x28 colored images belonging to one of 8 classes.
     The data set is composed of 11959 training, 1712 validation and 3421 test samples.
@@ -358,7 +358,7 @@ def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -380,7 +380,7 @@ def load_blood_mnist(subset: str = "all", return_X_y: bool = False, downloads_pa
     return _load_medical_mnist_data("bloodmnist", subset, True, False, return_X_y, downloads_path)
 
 
-def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the TissueMNIST data set. It consists of 236386 28x28 grayscale images belonging to one of 8 classes.
     The data set is composed of 165466 training, 23640 validation and 47280 test samples.
@@ -392,7 +392,7 @@ def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_p
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -414,7 +414,7 @@ def load_tissue_mnist(subset: str = "all", return_X_y: bool = False, downloads_p
     return _load_medical_mnist_data("tissuemnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the OrganAMNIST data set. It consists of 58850 28x28 grayscale images belonging to one of 11 classes.
     The data set is composed of 34581 training, 6491 validation and 17778 test samples.
@@ -426,7 +426,7 @@ def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -451,7 +451,7 @@ def load_organ_a_mnist(subset: str = "all", return_X_y: bool = False, downloads_
     return _load_medical_mnist_data("organamnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the OrganCMNIST data set. It consists of 23660 28x28 grayscale images belonging to one of 11 classes.
     The data set is composed of 13000 training, 2392 validation and 8268 test samples.
@@ -463,7 +463,7 @@ def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -488,7 +488,7 @@ def load_organ_c_mnist(subset: str = "all", return_X_y: bool = False, downloads_
     return _load_medical_mnist_data("organcmnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the OrganSMNIST data set. It consists of 25221 28x28 grayscale images belonging to one of 11 classes.
     The data set is composed of 13940 training, 2452 validation and 8829 test samples.
@@ -500,7 +500,7 @@ def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -525,7 +525,7 @@ def load_organ_s_mnist(subset: str = "all", return_X_y: bool = False, downloads_
     return _load_medical_mnist_data("organsmnist", subset, False, False, return_X_y, downloads_path)
 
 
-def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the OrganMNIST3D data set. It consists of 1743 28x28x28 grayscale images belonging to one of 11 classes.
     The data set is composed of 972 training, 161 validation and 610 test samples.
@@ -537,7 +537,7 @@ def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -562,7 +562,7 @@ def load_organ_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads
     return _load_medical_mnist_data("organmnist3d", subset, False, False, return_X_y, downloads_path)
 
 
-def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the NoduleMNIST3D data set. It consists of 1633 28x28x28 grayscale images belonging to one of 2 classes.
     The data set is composed of 1158 training, 165 validation and 310 test samples.
@@ -574,7 +574,7 @@ def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, download
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -596,7 +596,7 @@ def load_nodule_mnist_3d(subset: str = "all", return_X_y: bool = False, download
     return _load_medical_mnist_data("nodulemnist3d", subset, False, False, return_X_y, downloads_path)
 
 
-def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the AdrenalMNIST3D data set. It consists of 1584 28x28x28 grayscale images belonging to one of 2 classes.
     The data set is composed of 1188 training, 98 validation and 298 test samples.
@@ -608,7 +608,7 @@ def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloa
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -627,7 +627,7 @@ def load_adrenal_mnist_3d(subset: str = "all", return_X_y: bool = False, downloa
     return _load_medical_mnist_data("adrenalmnist3d", subset, False, False, return_X_y, downloads_path)
 
 
-def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the FractureMNIST3D data set. It consists of 1370 28x28x28 grayscale images belonging to one of 3 classes.
     The data set is composed of 1027 training, 103 validation and 240 test samples.
@@ -639,7 +639,7 @@ def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downlo
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -661,7 +661,7 @@ def load_fracture_mnist_3d(subset: str = "all", return_X_y: bool = False, downlo
     return _load_medical_mnist_data("fracturemnist3d", subset, False, False, return_X_y, downloads_path)
 
 
-def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the VesselMNIST3D data set. It consists of 1909 28x28x28 grayscale images belonging to one of 2 classes.
     The data set is composed of 1335 training, 192 validation and 382 test samples.
@@ -673,7 +673,7 @@ def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, download
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -695,7 +695,7 @@ def load_vessel_mnist_3d(subset: str = "all", return_X_y: bool = False, download
     return _load_medical_mnist_data("vesselmnist3d", subset, False, False, return_X_y, downloads_path)
 
 
-def load_synapse_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_synapse_mnist_3d(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the SynapseMNIST3D data set. It consists of 1759 28x28x28 grayscale images belonging to one of 2 classes.
     The data set is composed of 1230 training, 177 validation and 352 test samples.
@@ -707,7 +707,7 @@ def load_synapse_mnist_3d(subset: str = "all", return_X_y: bool = False, downloa
         can be 'all', 'test', 'train' or 'val'. 'all' combines test, train and validation data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
diff --git a/clustpy/data/real_timeseries_data.py b/clustpy/data/real_timeseries_data.py
index cd8cff2..fc679cb 100644
--- a/clustpy/data/real_timeseries_data.py
+++ b/clustpy/data/real_timeseries_data.py
@@ -1,12 +1,12 @@
 import numpy as np
 from clustpy.data._utils import _get_download_dir, _download_file
 from sklearn.datasets._base import Bunch
-import os
 import zipfile
+from pathlib import Path
 
 
 def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_minus_one: bool, file_type: str,
-                                         last_column_are_labels: bool, return_X_y: bool, downloads_path: str) -> Bunch:
+                                         last_column_are_labels: bool, return_X_y: bool, downloads_path: str | Path) -> Bunch:
     """
     Helper function to load timeseries data from www.timeseriesclassification.com.
 
@@ -24,7 +24,7 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_
         specifies if the last column contains the labels. If false labels should be contained in the first column
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored. If input was None this will be equal to
         '[USER]/Downloads/clustpy_datafiles'
 
@@ -38,11 +38,10 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    directory = _get_download_dir(downloads_path) + "/" + dataset_name + "/"
-    filename = directory + dataset_name + ".zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / dataset_name
+    filename = directory / (dataset_name + ".zip")
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("http://www.timeseriesclassification.com/aeon-toolkit/" + dataset_name + ".zip",
                        filename)
         # Unpack zipfile
@@ -52,10 +51,10 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_
     if subset == "all" or subset == "train":
         # Normally we have txt files
         if file_type == "txt":
-            dataset = np.genfromtxt(directory + dataset_name + "_TRAIN.txt")
+            dataset = np.genfromtxt(directory / (dataset_name + "_TRAIN.txt"))
         elif file_type == "ts":
             # Ts files must be changed first
-            with open(directory + dataset_name + "_TRAIN.ts", "rb") as f:
+            with open(directory / (dataset_name + "_TRAIN.ts"), "rb") as f:
                 clean_lines = (line.replace(b":", b",").replace(b"@", b"#") for line in f)
                 dataset = np.genfromtxt(clean_lines, delimiter=",", comments="#")
         # Are labels in first or last column?
@@ -68,10 +67,10 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_
     if subset == "all" or subset == "test":
         # Normally we have txt files
         if file_type == "txt":
-            test_dataset = np.genfromtxt(directory + dataset_name + "_TEST.txt")
+            test_dataset = np.genfromtxt(directory / (dataset_name + "_TEST.txt"))
         elif file_type == "ts":
             # Ts files must be changed first
-            with open(directory + dataset_name + "_TEST.ts", "rb") as f:
+            with open(directory / (dataset_name + "_TEST.ts"), "rb") as f:
                 clean_lines = (line.replace(b":", b",").replace(b"@", b"#") for line in f)
                 test_dataset = np.genfromtxt(clean_lines, delimiter=",", comments="#")
         # Are labels in first or last column?
@@ -101,7 +100,7 @@ def _load_timeseries_classification_data(dataset_name: str, subset: str, labels_
         return Bunch(dataset_name=dataset_name, data=data, target=labels)
 
 
-def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the motestrain data set. It consists of 1272 samples belonging to one of 2 classes.
     The data set is composed of 20 training and 1252 test samples.
@@ -113,7 +112,7 @@ def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_pat
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -130,7 +129,7 @@ def load_motestrain(subset: str = "all", return_X_y: bool = False, downloads_pat
     return _load_timeseries_classification_data("MoteStrain", subset, True, "txt", False, return_X_y, downloads_path)
 
 
-def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the proximal phalanx outline data set. It consists of 876 samples belonging to one of 2 classes.
     The data set is composed of 600 training and 276 test samples.
@@ -142,7 +141,7 @@ def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False,
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -160,7 +159,7 @@ def load_proximal_phalanx_outline(subset: str = "all", return_X_y: bool = False,
                                                 return_X_y, downloads_path)
 
 
-def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the diatom size reduction data set. It consists of 322 samples belonging to one of 4 classes.
     The data set is composed of 16 training and 306 test samples.
@@ -172,7 +171,7 @@ def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, do
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -190,7 +189,7 @@ def load_diatom_size_reduction(subset: str = "all", return_X_y: bool = False, do
                                                 return_X_y, downloads_path)
 
 
-def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the symbols data set. It consists of 1020 samples belonging to one of 6 classes.
     The data set is composed of 25 training and 995 test samples.
@@ -202,7 +201,7 @@ def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path:
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -219,7 +218,7 @@ def load_symbols(subset: str = "all", return_X_y: bool = False, downloads_path:
     return _load_timeseries_classification_data("Symbols", subset, True, "txt", False, return_X_y, downloads_path)
 
 
-def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the OliveOil data set. It consists of 60 samples belonging to one of 4 classes.
     The data set is composed of 30 training and 30 test samples.
@@ -231,7 +230,7 @@ def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -248,7 +247,7 @@ def load_olive_oil(subset: str = "all", return_X_y: bool = False, downloads_path
     return _load_timeseries_classification_data("OliveOil", subset, True, "txt", False, return_X_y, downloads_path)
 
 
-def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the plane data set. It consists of 210 samples belonging to one of 7 classes.
     The data set is composed of 105 training and 105 test samples.
@@ -260,7 +259,7 @@ def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: st
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -277,7 +276,7 @@ def load_plane(subset: str = "all", return_X_y: bool = False, downloads_path: st
     return _load_timeseries_classification_data("Plane", subset, True, "txt", False, return_X_y, downloads_path)
 
 
-def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Sony AIBO Robot Surface 1 data set. It consists of 621 samples belonging to one of 2 classes.
     The data set is composed of 20 training and 601 test samples.
@@ -289,7 +288,7 @@ def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False,
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -307,7 +306,7 @@ def load_sony_aibo_robot_surface(subset: str = "all", return_X_y: bool = False,
                                                 return_X_y, downloads_path)
 
 
-def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the two patterns data set. It consists of 5000 samples belonging to one of 4 classes.
     The data set is composed of 1000 training and 4000 test samples.
@@ -319,7 +318,7 @@ def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_p
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -336,7 +335,7 @@ def load_two_patterns(subset: str = "all", return_X_y: bool = False, downloads_p
     return _load_timeseries_classification_data("TwoPatterns", subset, True, "txt", False, return_X_y, downloads_path)
 
 
-def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the LSST data set. It consists of 4925 samples belonging to one of 14 classes.
     The data set is composed of 2459 training and 2466 test samples.
@@ -348,7 +347,7 @@ def load_lsst(subset: str = "all", return_X_y: bool = False, downloads_path: str
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
diff --git a/clustpy/data/real_torchvision_data.py b/clustpy/data/real_torchvision_data.py
index a3c0fb0..7e877e1 100644
--- a/clustpy/data/real_torchvision_data.py
+++ b/clustpy/data/real_torchvision_data.py
@@ -3,6 +3,8 @@
 import numpy as np
 from clustpy.data._utils import _get_download_dir, _load_image_data, flatten_images
 from sklearn.datasets._base import Bunch
+from pathlib import Path
+
 
 """
 Torchvision datasets helpers
@@ -55,7 +57,7 @@ def _get_data_and_labels(dataset: torchvision.datasets.VisionDataset, image_size
 
 
 def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subset: str, uses_train_param: bool,
-                           image_format: str, return_X_y: bool, downloads_path: str, image_size: tuple = None) -> Bunch:
+                           image_format: str, return_X_y: bool, downloads_path: str | Path, image_size: tuple = None) -> Bunch:
     """
     Helper function to load a data set from the torchvision package.
     All data sets will be returned as a two-dimensional tensor, created out of the HWC (height, width, color channels) image representation.
@@ -73,7 +75,7 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs
         Abbreviations stand for: H: Height, W: Width, D: Depth, C: Color-channels
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored
     image_size : tuple
         for some datasets (e.g., GTSRB) the images of various sizes must be converted into a coherent size.
@@ -144,7 +146,7 @@ def _load_torch_image_data(data_source: torchvision.datasets.VisionDataset, subs
 """
 
 
-def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the MNIST data set. It consists of 70000 28x28 grayscale images showing handwritten digits (0 to 9).
     The data set is composed of 60000 training and 10000 test images.
@@ -156,7 +158,7 @@ def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: st
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : bool
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -180,7 +182,7 @@ def load_mnist(subset: str = "all", return_X_y: bool = False, downloads_path: st
     return _load_torch_image_data(torchvision.datasets.MNIST, subset, True, "HW", return_X_y, downloads_path)
 
 
-def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Kuzushiji-MNIST data set. It consists of 70000 28x28 grayscale images showing Kanji characters.
     It is composed of 10 different characters, each representing one column of hiragana.
@@ -193,7 +195,7 @@ def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -217,7 +219,7 @@ def load_kmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s
     return _load_torch_image_data(torchvision.datasets.KMNIST, subset, True, "HW", return_X_y, downloads_path)
 
 
-def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Fashion-MNIST data set. It consists of 70000 28x28 grayscale images showing articles from the Zalando online store.
     Each sample belongs to one of 10 product groups.
@@ -230,7 +232,7 @@ def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -254,7 +256,7 @@ def load_fmnist(subset: str = "all", return_X_y: bool = False, downloads_path: s
     return _load_torch_image_data(torchvision.datasets.FashionMNIST, subset, True, "HW", return_X_y, downloads_path)
 
 
-def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the USPS data set. It consists of 9298 16x16 grayscale images showing handwritten digits (0 to 9).
     The data set is composed of 7291 training and 2007 test images.
@@ -266,7 +268,7 @@ def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -290,7 +292,7 @@ def load_usps(subset: str = "all", return_X_y: bool = False, downloads_path: str
     return _load_torch_image_data(torchvision.datasets.USPS, subset, True, "HW", return_X_y, downloads_path)
 
 
-def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the CIFAR10 data set. It consists of 60000 32x32 color images showing different objects.
     The classes are airplane, automobile, bird, cat, deer, dog, frog, horse, ship and truck.
@@ -303,7 +305,7 @@ def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path:
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -327,7 +329,7 @@ def load_cifar10(subset: str = "all", return_X_y: bool = False, downloads_path:
 
 
 def load_cifar100(subset: str = "all", use_superclasses: bool = False, return_X_y: bool = False,
-                  downloads_path: str = None) -> Bunch:
+                  downloads_path: str | Path = None) -> Bunch:
     """
     Load the CIFAR100 data set. It consists of 60000 32x32 color images showing different objects.
     A total of 100 classes are included, each depicting a specific of objects. Each class contains 600 objects.
@@ -343,7 +345,7 @@ def load_cifar100(subset: str = "all", use_superclasses: bool = False, return_X_
         If set to True, the 20 superclasses are used instead of the 100 regular classes (default: False)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -395,7 +397,7 @@ def load_cifar100(subset: str = "all", use_superclasses: bool = False, return_X_
         return dataset
 
 
-def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the SVHN data set. It consists of 99289 32x32 color images showing house numbers (0 to 9).
     The data set is composed of 73257 training and 26032 test images.
@@ -407,7 +409,7 @@ def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -430,7 +432,7 @@ def load_svhn(subset: str = "all", return_X_y: bool = False, downloads_path: str
     return _load_torch_image_data(torchvision.datasets.SVHN, subset, False, "CHW", return_X_y, downloads_path)
 
 
-def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the STL10 data set. It consists of 13000 96x96 color images showing different objects.
     The classes are airplane, bird, car, cat, deer, dog, horse, monkey, ship and truck.
@@ -443,7 +445,7 @@ def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: st
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -468,7 +470,7 @@ def load_stl10(subset: str = "all", return_X_y: bool = False, downloads_path: st
 
 
 def load_gtsrb(subset: str = "all", image_size: tuple = (32, 32), return_X_y: bool = False,
-               downloads_path: str = None) -> Bunch:
+               downloads_path: str | Path = None) -> Bunch:
     """
     Load the GTSRB (German Traffic Sign Recognition Benchmark) data set. It consists of 39270 color images showing 43 different traffic signs.
     Example classes are: stop sign, speed limit 50 sign, speed limit 70 sign, construction site sign and many others.
@@ -484,7 +486,7 @@ def load_gtsrb(subset: str = "all", image_size: tuple = (32, 32), return_X_y: bo
         The tuple equals (width, height) of the images (default: (32, 32))
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
diff --git a/clustpy/data/real_uci_data.py b/clustpy/data/real_uci_data.py
index 219e7e5..3556194 100644
--- a/clustpy/data/real_uci_data.py
+++ b/clustpy/data/real_uci_data.py
@@ -1,14 +1,14 @@
 from clustpy.data._utils import _download_file, _get_download_dir, _decompress_z_file, _load_data_file, flatten_images, _transform_text_data, _load_image_data
-import os
 import numpy as np
 import zipfile
 import tarfile
 from sklearn.preprocessing import LabelEncoder
 import pandas as pd
 from sklearn.datasets._base import Bunch
+from pathlib import Path
 
 
-def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_banknotes(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the banknote authentication data set. It consists of 1372 genuine and forged banknote samples.
     N=1372, d=4, k=2.
@@ -17,7 +17,7 @@ def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunc
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -31,7 +31,7 @@ def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunc
     -------
     https://archive.ics.uci.edu/ml/datasets/banknote+authentication
     """
-    filename = _get_download_dir(downloads_path) + "/data_banknote_authentication.txt"
+    filename = _get_download_dir(downloads_path) / "data_banknote_authentication.txt"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt")
     # Return values
@@ -41,7 +41,7 @@ def load_banknotes(return_X_y: bool = False, downloads_path: str = None) -> Bunc
         return Bunch(dataset_name="Banknotes", data=data, target=labels)
 
 
-def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_spambase(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the spambase data set. It consists of 4601 spam and non-spam mails.
     N=4601, d=57, k=2.
@@ -50,7 +50,7 @@ def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -64,7 +64,7 @@ def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch
     -------
     https://archive.ics.uci.edu/ml/datasets/spambase
     """
-    filename = _get_download_dir(downloads_path) + "/spambase.data"
+    filename = _get_download_dir(downloads_path) / "spambase.data"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data")
     # Return values
@@ -74,7 +74,7 @@ def load_spambase(return_X_y: bool = False, downloads_path: str = None) -> Bunch
         return Bunch(dataset_name="Spambase", data=data, target=labels)
 
 
-def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_seeds(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the seeds data set. It consists of 210 samples belonging to one of three varieties of wheat.
     N=210, d=7, k=3.
@@ -83,7 +83,7 @@ def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -97,7 +97,7 @@ def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     -------
     https://archive.ics.uci.edu/ml/datasets/seeds
     """
-    filename = _get_download_dir(downloads_path) + "/seeds_dataset.txt"
+    filename = _get_download_dir(downloads_path) / "seeds_dataset.txt"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt",
                                    delimiter=None)
@@ -110,7 +110,7 @@ def load_seeds(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
         return Bunch(dataset_name="Seeds", data=data, target=labels)
 
 
-def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_skin(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Skin Segmentation data set. It consists of 245057 skin- and non-skin samples with their B, G, R color
     information.
@@ -120,7 +120,7 @@ def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -134,7 +134,7 @@ def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     -------
     https://archive.ics.uci.edu/ml/datasets/skin+segmentation
     """
-    filename = _get_download_dir(downloads_path) + "/Skin_NonSkin.txt"
+    filename = _get_download_dir(downloads_path) / "Skin_NonSkin.txt"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/00229/Skin_NonSkin.txt",
                                    delimiter=None)
@@ -147,7 +147,7 @@ def load_skin(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
         return Bunch(dataset_name="SkinSegmentation", data=data, target=labels)
 
 
-def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_soybean_small(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the small version of the soybean data set. It is a small subset of the original soybean data set.
     It consists of 47 samples belonging to one of 4 classes.
@@ -157,7 +157,7 @@ def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) ->
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -171,8 +171,8 @@ def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) ->
     -------
     https://archive.ics.uci.edu/ml/datasets/soybean+(small)
     """
-    filename = _get_download_dir(downloads_path) + "/soybean-small.data"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "soybean-small.data"
+    if not filename.is_file():
         _download_file(
             "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-small.data",
             filename)
@@ -189,7 +189,7 @@ def load_soybean_small(return_X_y: bool = False, downloads_path: str = None) ->
         return Bunch(dataset_name="SoybeanSmall", data=data, target=labels)
 
 
-def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the large version of the soybean data set. It consists of 562 samples belonging to one of 15 classes.
     Originally, the data set would have samples and 19 classes but some samples have attributes showing '?' values. Those
@@ -203,7 +203,7 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -221,8 +221,8 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
     if subset == "all" or subset == "train":
-        filename = _get_download_dir(downloads_path) + "/soybean-large.data"
-        if not os.path.isfile(filename):
+        filename = _get_download_dir(downloads_path) / "soybean-large.data"
+        if not filename.is_file():
             _download_file(
                 "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.data",
                 filename)
@@ -232,8 +232,8 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_
         labels_raw = df_train.pop(0)
         data = df_train.values
     if subset == "all" or subset == "test":
-        filename = _get_download_dir(downloads_path) + "/soybean-large.test"
-        if not os.path.isfile(filename):
+        filename = _get_download_dir(downloads_path) / "soybean-large.test"
+        if not filename.is_file():
             _download_file(
                 "https://archive.ics.uci.edu/ml/machine-learning-databases/soybean/soybean-large.test",
                 filename)
@@ -257,7 +257,7 @@ def load_soybean_large(subset: str = "all", return_X_y: bool = False, downloads_
         return Bunch(dataset_name="SoybeanLarge", data=data, target=labels)
 
 
-def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the pendigits data set. It consists of 10992 vectors of length 16, representing 8 coordinates. The coordinates
     were taken from the task of writing digits (0 to 9) on a tablet.
@@ -270,7 +270,7 @@ def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -288,11 +288,11 @@ def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
     if subset == "all" or subset == "train":
-        filename = _get_download_dir(downloads_path) + "/pendigits.tra"
+        filename = _get_download_dir(downloads_path) / "pendigits.tra"
         data, labels = _load_data_file(filename,
                                        "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
     if subset == "all" or subset == "test":
-        filename = _get_download_dir(downloads_path) + "/pendigits.tes"
+        filename = _get_download_dir(downloads_path) / "pendigits.tes"
         test_data, test_labels = _load_data_file(filename,
                                                  "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes")
         if subset == "all":
@@ -308,7 +308,7 @@ def load_pendigits(subset: str = "all", return_X_y: bool = False, downloads_path
         return Bunch(dataset_name="Pendigits", data=data, target=labels)
 
 
-def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the ecoli data set. It consists of 336 samples belonging to one of 8 classes.
     N=336, d=7, k=8.
@@ -319,7 +319,7 @@ def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, do
         specify if the three small clusters with size 2, 2 and 5 should be ignored (default: False)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -333,8 +333,8 @@ def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, do
     -------
     https://archive.ics.uci.edu/ml/datasets/ecoli
     """
-    filename = _get_download_dir(downloads_path) + "/ecoli.data"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "ecoli.data"
+    if not filename.is_file():
         _download_file(
             "https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data",
             filename)
@@ -361,7 +361,7 @@ def load_ecoli(ignore_small_clusters: bool = False, return_X_y: bool = False, do
         return Bunch(dataset_name="Ecoli", data=data, target=labels)
 
 
-def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_htru2(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the HTRU2 data set. It consists of 17898 samples belonging to the pulsar or non-pulsar class.
     A special property is that more than 90% of the data belongs to class 0.
@@ -371,7 +371,7 @@ def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -385,18 +385,17 @@ def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     -------
     https://archive.ics.uci.edu/ml/datasets/HTRU2
     """
-    directory = _get_download_dir(downloads_path) + "/htru2/"
-    filename = directory + "HTRU2.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "htru2"
+    filename = directory / "HTRU2.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip",
                        filename)
         # Unpack zipfile
         with zipfile.ZipFile(filename, 'r') as zipf:
             zipf.extractall(directory)
     # Load data and labels
-    dataset = np.genfromtxt(directory + "HTRU_2.csv", delimiter=",")
+    dataset = np.genfromtxt(directory / "HTRU_2.csv", delimiter=",")
     data = dataset[:, :-1]
     labels = dataset[:, -1]
     # Convert labels to int32 format
@@ -408,7 +407,7 @@ def load_htru2(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
         return Bunch(dataset_name="HTRU2", data=data, target=labels)
 
 
-def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_letterrecognition(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Letter Recognition data set. It consists of 20000 samples where each sample represents one of the 26 capital
     letters in the English alphabet. All samples are composed of 16 numerical stimuli describing the respective letter.
@@ -418,7 +417,7 @@ def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None)
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -432,8 +431,8 @@ def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None)
     -------
     https://archive.ics.uci.edu/ml/datasets/letter+recognition
     """
-    filename = _get_download_dir(downloads_path) + "/letter-recognition.data"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "letter-recognition.data"
+    if not filename.is_file():
         _download_file(
             "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data",
             filename)
@@ -460,7 +459,7 @@ def load_letterrecognition(return_X_y: bool = False, downloads_path: str = None)
         return Bunch(dataset_name="Letterrecognition", data=data, target=labels)
 
 
-def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Human Activity Recognition data set. It consists of 10299 samples each representing sensor data of a person
     performing an activity. The six activities are walking, walking_upstairs, walking_downstairs, sitting, standing and
@@ -474,7 +473,7 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -491,11 +490,10 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    directory = _get_download_dir(downloads_path) + "/har/"
-    filename = directory + "UCI HAR Dataset.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "har"
+    filename = directory / "UCI HAR Dataset.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip",
                        filename)
         # Unpack zipfile
@@ -503,11 +501,11 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str
             zipf.extractall(directory)
     # Load data and labels
     if subset == "all" or subset == "train":
-        data = np.genfromtxt(directory + "UCI HAR Dataset/train/X_train.txt")
-        labels = np.genfromtxt(directory + "UCI HAR Dataset/train/y_train.txt")
+        data = np.genfromtxt(directory / "UCI HAR Dataset/train/X_train.txt")
+        labels = np.genfromtxt(directory / "UCI HAR Dataset/train/y_train.txt")
     if subset == "all" or subset == "test":
-        test_data = np.genfromtxt(directory + "UCI HAR Dataset/test/X_test.txt")
-        test_labels = np.genfromtxt(directory + "UCI HAR Dataset/test/y_test.txt")
+        test_data = np.genfromtxt(directory / "UCI HAR Dataset/test/X_test.txt")
+        test_labels = np.genfromtxt(directory / "UCI HAR Dataset/test/y_test.txt")
         if subset == "all":
             data = np.r_[data, test_data]
             labels = np.r_[labels, test_labels]
@@ -525,7 +523,7 @@ def load_har(subset: str = "all", return_X_y: bool = False, downloads_path: str
         return Bunch(dataset_name="HAR", data=data, target=labels)
 
 
-def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the statlog shuttle data set. It consists of 58000 samples belonging to one of 7 classes. A special property is
     that about 80% of the data belongs to class 0.
@@ -538,7 +536,7 @@ def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, download
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -555,30 +553,29 @@ def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, download
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    directory = _get_download_dir(downloads_path) + "/shuttle/"
+    directory = _get_download_dir(downloads_path) / "shuttle"
     if subset == "all" or subset == "train":
-        filename = directory + "shuttle.trn.Z"
-        if not os.path.isfile(filename):
-            if not os.path.isdir(directory):
-                os.mkdir(directory)
+        filename = directory / "shuttle.trn.Z"
+        if not filename.is_file():
+            directory.mkdir(parents=False, exist_ok=True)
             _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z",
                            filename)
             # Unpack z-file
             success = _decompress_z_file(filename, directory)
             if not success:
-                os.remove(filename)
+                filename.unlink()
                 return (None, None) if return_X_y else None
         # Load data and labels
-        dataset = np.genfromtxt(directory + "shuttle.trn")
+        dataset = np.genfromtxt(directory / "shuttle.trn")
         data = dataset[:, :-1]
         labels = dataset[:, -1]
     if subset == "all" or subset == "test":
-        filename = directory + "shuttle.tst"
-        if not os.path.isfile(filename):
+        filename = directory / "shuttle.tst"
+        if not filename.is_file():
             _download_file(
                 "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst",
                 filename)
-        test_dataset = np.genfromtxt(directory + "shuttle.tst")
+        test_dataset = np.genfromtxt(directory / "shuttle.tst")
         test_data = test_dataset[:, :-1]
         test_labels = test_dataset[:, -1]
         if subset == "all":
@@ -599,7 +596,7 @@ def load_statlog_shuttle(subset: str = "all", return_X_y: bool = False, download
 
 
 def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool = False,
-                      downloads_path: str = None) -> Bunch:
+                      downloads_path: str | Path = None) -> Bunch:
     """
     Load the Mice Protein Expression data set. It consists of 1077 samples belonging to one of 8 classes.
     Each feature represents the expression level of one of 77 proteins.
@@ -615,7 +612,7 @@ def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool =
         return additional labels (default: False)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -629,8 +626,8 @@ def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool =
     -------
     https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression
     """
-    filename = _get_download_dir(downloads_path) + "/Data_Cortex_Nuclear.xls"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "Data_Cortex_Nuclear.xls"
+    if not filename.is_file():
         _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00342/Data_Cortex_Nuclear.xls",
                        filename)
     xls = pd.ExcelFile(filename)
@@ -674,7 +671,7 @@ def load_mice_protein(return_additional_labels: bool = False, return_X_y: bool =
         return Bunch(dataset_name="MiceProtein", data=data, target=labels)
 
 
-def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the user knowledge data set. It consists of 403 samples belonging to one of 4 classes.
     The 4 classes are the knowledge levels 'very low', 'low', 'middle' and 'high'.
@@ -687,7 +684,7 @@ def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -704,8 +701,8 @@ def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    filename = _get_download_dir(downloads_path) + "/Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"
+    if not filename.is_file():
         _download_file(
             "https://archive.ics.uci.edu/ml/machine-learning-databases/00257/Data_User_Modeling_Dataset_Hamdi%20Tolga%20KAHRAMAN.xls",
             filename)
@@ -743,7 +740,7 @@ def load_user_knowledge(subset: str = "all", return_X_y: bool = False, downloads
         return Bunch(dataset_name="UserKnowledge", data=data, target=labels)
 
 
-def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_breast_tissue(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the breast tissue data set. It consists of 106 samples belonging to one of 6 classes.
     N=106, d=9, k=6.
@@ -752,7 +749,7 @@ def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) ->
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -766,8 +763,8 @@ def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) ->
     -------
     http://archive.ics.uci.edu/ml/datasets/breast+tissue
     """
-    filename = _get_download_dir(downloads_path) + "/BreastTissue.xls"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "BreastTissue.xls"
+    if not filename.is_file():
         _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/00192/BreastTissue.xls",
                        filename)
     xls = pd.ExcelFile(filename)
@@ -786,7 +783,7 @@ def load_breast_tissue(return_X_y: bool = False, downloads_path: str = None) ->
         return Bunch(dataset_name="BreastTissue", data=data, target=labels)
 
 
-def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the forest type mapping data set. It consists of 523 samples belonging to one of 4 classes.
     The data set is composed of 198 training and 325 test samples.
@@ -798,7 +795,7 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -815,11 +812,10 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    directory = _get_download_dir(downloads_path) + "/ForestTypes/"
-    filename = directory + "ForestTypes.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "ForestTypes"
+    filename = directory / "ForestTypes.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/00333/ForestTypes.zip",
                        filename)
         # Unpack zipfile
@@ -827,11 +823,11 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p
             zipf.extractall(directory)
     # Load data and labels
     if subset == "all" or subset == "train":
-        df_train = pd.read_csv(directory + "/training.csv", delimiter=",")
+        df_train = pd.read_csv(directory / "training.csv", delimiter=",")
         labels_raw = df_train.pop("class")
         data = df_train.values
     if subset == "all" or subset == "test":
-        df_test = pd.read_csv(directory + "/testing.csv", delimiter=",")
+        df_test = pd.read_csv(directory / "testing.csv", delimiter=",")
         labels_test = df_test.pop("class")
         if subset == "all":
             data = np.r_[data, df_test.values]
@@ -849,7 +845,7 @@ def load_forest_types(subset: str = "all", return_X_y: bool = False, downloads_p
         return Bunch(dataset_name="ForestTypes", data=data, target=labels)
 
 
-def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_dermatology(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the dermatology data set. It consists of 366 samples belonging to one of 6 classes.
     8 samples contain '?' values and are therefore removed.
@@ -859,7 +855,7 @@ def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bu
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -873,7 +869,7 @@ def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bu
     -------
     https://archive.ics.uci.edu/ml/datasets/dermatology
     """
-    filename = _get_download_dir(downloads_path) + "/dermatology.data"
+    filename = _get_download_dir(downloads_path) / "dermatology.data"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/dermatology/dermatology.data",
                                    delimiter=",")
@@ -890,7 +886,7 @@ def load_dermatology(return_X_y: bool = False, downloads_path: str = None) -> Bu
         return Bunch(dataset_name="Dermatology", data=data, target=labels)
 
 
-def load_multiple_features(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_multiple_features(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the multiple features data set. It consists of 2000 samples belonging to one of 10 classes.
     Each class corresponds to handwritten numerals (0-9) extracted from a collection of Dutch utility maps.
@@ -900,7 +896,7 @@ def load_multiple_features(return_X_y: bool = False, downloads_path: str = None)
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -914,14 +910,13 @@ def load_multiple_features(return_X_y: bool = False, downloads_path: str = None)
     -------
     https://archive.ics.uci.edu/ml/datasets/Multiple+Features
     """
-    directory = _get_download_dir(downloads_path) + "/MultipleFeatures/"
-    if not os.path.isdir(directory):
-        os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "MultipleFeatures"
+    directory.mkdir(parents=False, exist_ok=True)
     data = np.zeros((2000, 0))
     # Dataset consists of multiple .xls files
     for file in ["mfeat-fac", "mfeat-fou", "mfeat-kar", "mfeat-mor", "mfeat-pix", "mfeat-zer"]:
-        filename = directory + file + ".xls"
-        if not os.path.isfile(filename):
+        filename = directory / (file + ".xls")
+        if not filename.is_file():
             _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/mfeat/" + file,
                            filename)
         data_tmp = np.genfromtxt(filename, delimiter=None)
@@ -935,7 +930,7 @@ def load_multiple_features(return_X_y: bool = False, downloads_path: str = None)
         return Bunch(dataset_name="MultipleFeatures", data=data, target=labels)
 
 
-def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the statlog Australian Credit Approval data set. It consists of 690 samples belonging to one of 2 classes.
     N=690, d=14, k=2.
@@ -944,7 +939,7 @@ def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -958,7 +953,7 @@ def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_
     -------
     https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)
     """
-    filename = _get_download_dir(downloads_path) + "/australian.dat"
+    filename = _get_download_dir(downloads_path) / "australian.dat"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat",
                                    delimiter=None)
@@ -969,7 +964,7 @@ def load_statlog_australian_credit_approval(return_X_y: bool = False, downloads_
         return Bunch(dataset_name="StatlogAustralianCreditApproval", data=data, target=labels)
 
 
-def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the original breast cancer Wisconsin data set. It consists of 699 samples belonging to one of 2 classes.
     16 samples contain '?' values and will be removed.
@@ -979,7 +974,7 @@ def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_pa
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -993,7 +988,7 @@ def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_pa
     -------
     https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28original%29
     """
-    filename = _get_download_dir(downloads_path) + "/breast-cancer-wisconsin.data"
+    filename = _get_download_dir(downloads_path) / "breast-cancer-wisconsin.data"
     data, labels = _load_data_file(filename,
                                    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                                    delimiter=",")
@@ -1014,7 +1009,7 @@ def load_breast_cancer_wisconsin_original(return_X_y: bool = False, downloads_pa
         return Bunch(dataset_name="BreastCancerWisconsin", data=data, target=labels)
 
 
-def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the optdigits data set. It consists of 5620 8x8 grayscale images, each representing a digit (0 to 9).
     Each pixel depicts the number of marked pixel within a 4x4 block of the original 32x32 bitmaps.
@@ -1027,7 +1022,7 @@ def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path
         can be 'all', 'test' or 'train'. 'all' combines test and train data (default: 'all')
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1046,11 +1041,11 @@ def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
     if subset == "all" or subset == "train":
-        filename = _get_download_dir(downloads_path) + "/optdigits.tra"
+        filename = _get_download_dir(downloads_path) / "optdigits.tra"
         data, labels = _load_data_file(filename,
                                        "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra")
     if subset == "all" or subset == "test":
-        filename = _get_download_dir(downloads_path) + "/optdigits.tes"
+        filename = _get_download_dir(downloads_path) / "optdigits.tes"
         test_data, test_labels = _load_data_file(filename,
                                                  "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tes")
         if subset == "all":
@@ -1067,7 +1062,7 @@ def load_optdigits(subset: str = "all", return_X_y: bool = False, downloads_path
         return Bunch(dataset_name="Optdigits", data=data, target=labels, images=data_image, image_format="HW")
 
 
-def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_semeion(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the semeion data set. It consists of 1593 samples belonging to one of 10 classes.
     Each sample corresponds to a grayscale 16x16 scan of handwritten digits originating from about 80 different persons.
@@ -1078,7 +1073,7 @@ def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1093,8 +1088,8 @@ def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     -------
     https://archive.ics.uci.edu/ml/datasets/semeion+handwritten+digit
     """
-    filename = _get_download_dir(downloads_path) + "/semeion.data"
-    if not os.path.isfile(filename):
+    filename = _get_download_dir(downloads_path) / "semeion.data"
+    if not filename.is_file():
         _download_file("https://archive.ics.uci.edu/ml/machine-learning-databases/semeion/semeion.data",
                        filename)
     datafile = np.genfromtxt(filename)
@@ -1111,19 +1106,19 @@ def load_semeion(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
         return Bunch(dataset_name="Semeion", data=data, target=labels, images=data_image, image_format="HW")
 
 
-def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_cmu_faces(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the CMU Face Images data set. It consists of 640 30x32 grayscale images showing 20 persons in different poses
     (up, straight, left, right) and with different expressions (neutral, happy, sad, angry). Additionally, the persons
     can wear sunglasses or not.
     16 images show glitches which is why the final data set only contains 624 images.
-    N=624, d=400, k=[20,4,4,2].
+    N=624, d=960, k=[20,4,4,2].
 
     Parameters
     -------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1132,17 +1127,16 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc
         A Bunch object containing the data in the 'data' attribute and the labels in the 'target' attribute.
         Furthermore, the original images are contained in the 'images' attribute.
         Alternatively, if return_X_y is True two arrays will be returned:
-        the data numpy array (624 x 400), the labels numpy array (624 x 4)
+        the data numpy array (624 x 960), the labels numpy array (624 x 4)
 
     References
     -------
     http://archive.ics.uci.edu/ml/datasets/cmu+face+images
     """
-    directory = _get_download_dir(downloads_path) + "/cmufaces/"
-    filename = directory + "faces_4.tar.gz"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "cmufaces"
+    filename = directory / "faces_4.tar.gz"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("http://archive.ics.uci.edu/ml/machine-learning-databases/faces-mld/faces_4.tar.gz",
                        filename)
         # Unpack zipfile
@@ -1157,14 +1151,15 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc
     data_list = []
     label_list = []
     for name in names:
-        path_images = directory + "/faces_4/" + name
-        for image in os.listdir(path_images):
-            if not image.endswith("_4.pgm"):
+        path_images = directory / "faces_4" / name
+        for image in path_images.iterdir():
+            image_str = image.name
+            if not image_str.endswith("_4.pgm"):
                 continue
             # get image data
-            image_array = _load_image_data(path_images + "/" + image, None, False)
+            image_array = _load_image_data(image, None, False)
             # Get labels
-            name_parts = image.split("_")
+            name_parts = image_str.split("_")
             user_id = np.argwhere(names == name_parts[0])[0][0]
             position = np.argwhere(positions == name_parts[1])[0][0]
             expression = np.argwhere(expressions == name_parts[2])[0][0]
@@ -1185,7 +1180,7 @@ def load_cmu_faces(return_X_y: bool = False, downloads_path: str = None) -> Bunc
                      classes=(names, positions, expressions, eyes))
 
 
-def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str = None):
+def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path: str | Path = None):
     """
     Load the Gene Expression Cancer RNA-SEQ data set. It consists of 801 samples belonging to one of 5 classes.
     N=801, d=20531, k=5.
@@ -1194,7 +1189,7 @@ def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1208,21 +1203,20 @@ def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path
     -------
     https://archive.ics.uci.edu/dataset/401/gene+expression+cancer+rna+seq
     """
-    directory = _get_download_dir(downloads_path) + "/GeneExpressionRNASEQ/"
-    filename = directory + "gene+expression+cancer+rna+seq.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "GeneExpressionRNASEQ"
+    filename = directory / "gene+expression+cancer+rna+seq.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/static/public/401/gene+expression+cancer+rna+seq.zip",
                        filename)
         # Unpack zipfile
         with zipfile.ZipFile(filename, 'r') as zipf:
             zipf.extractall(directory)
-        with tarfile.open(directory + "TCGA-PANCAN-HiSeq-801x20531.tar.gz", "r:gz") as tar:
+        with tarfile.open(directory / "TCGA-PANCAN-HiSeq-801x20531.tar.gz", "r:gz") as tar:
             tar.extractall(directory)
     # Load data and labels
-    data = np.genfromtxt(directory + "TCGA-PANCAN-HiSeq-801x20531/data.csv", delimiter=",")[1:,1:]
-    labels_raw = np.genfromtxt(directory + "TCGA-PANCAN-HiSeq-801x20531/labels.csv", delimiter=",", dtype=str)[1:,1]
+    data = np.genfromtxt(directory / "TCGA-PANCAN-HiSeq-801x20531" / "data.csv", delimiter=",")[1:,1:]
+    labels_raw = np.genfromtxt(directory / "TCGA-PANCAN-HiSeq-801x20531" / "labels.csv", delimiter=",", dtype=str)[1:,1]
     LE = LabelEncoder()
     labels = LE.fit_transform(labels_raw)
     # Return values
@@ -1232,7 +1226,7 @@ def load_gene_expression_cancer_rna_seq(return_X_y: bool = False, downloads_path
         return Bunch(dataset_name="GeneExpressionCancerRNA-SEQ", data=data, target=labels)
 
 
-def load_sport_articles(return_X_y: bool = False, downloads_path: str = None):
+def load_sport_articles(return_X_y: bool = False, downloads_path: str | Path = None):
     """
     Load the Sport Articles data set. It consists of 1000 samples belonging to one of 2 classes (objective or subjective).
     We only consider features that correspond to specific frequencies and, therefore, ignore the attributes 
@@ -1243,7 +1237,7 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None):
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1257,11 +1251,10 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None):
     -------
     https://archive.ics.uci.edu/dataset/450/sports+articles+for+objectivity+analysis
     """
-    directory = _get_download_dir(downloads_path) + "/SportArticles/"
-    filename = directory + "sports+articles+for+objectivity+analysis.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "SportArticles"
+    filename = directory / "sports+articles+for+objectivity+analysis.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/static/public/450/sports+articles+for+objectivity+analysis.zip",
                        filename)
         # Unpack zipfile
@@ -1272,7 +1265,7 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None):
     labels = np.zeros(1000, dtype=np.int32)
     row = -2 # first row is the header and should be skipped
     column = 0
-    with open(directory + "features.xls", "r") as f:
+    with open(directory / "features.xls", "r") as f:
         for _, line in enumerate(f.readlines()):
             if "</Table>" in line:
                 # Next table is not relevant for the data
@@ -1295,7 +1288,7 @@ def load_sport_articles(return_X_y: bool = False, downloads_path: str = None):
         return Bunch(dataset_name="SportArticles", data=data, target=labels)
 
 
-def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = None):
+def load_wholesale_customers(return_X_y: bool = False, downloads_path: str | Path = None):
     """
     Load the Wholesale Customers data set. It consists of 440 samples and can be grouped in two different ways:
     Either two classes based on the channel (Horeca or Retail) or three classes based on the region (Lisbon, Oporto or Other region).
@@ -1305,7 +1298,7 @@ def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = Non
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1319,17 +1312,16 @@ def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = Non
     -------
     https://archive.ics.uci.edu/dataset/292/wholesale+customers
     """
-    directory = _get_download_dir(downloads_path) + "/WholeCustomers/"
-    filename = directory + "wholesale+customers.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "WholeCustomers"
+    filename = directory / "wholesale+customers.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/static/public/292/wholesale+customers.zip",
                        filename)
         # Unpack zipfile
         with zipfile.ZipFile(filename, 'r') as zipf:
             zipf.extractall(directory)
-    wholesale = np.genfromtxt(directory + "Wholesale customers data.csv", delimiter=",", skip_header=True)
+    wholesale = np.genfromtxt(directory / "Wholesale customers data.csv", delimiter=",", skip_header=True)
     labels = wholesale[:,:2] - 1
     data = wholesale[:,2:]
     # Convert labels to int32 format
@@ -1340,10 +1332,11 @@ def load_wholesale_customers(return_X_y: bool = False, downloads_path: str = Non
     else:
         return Bunch(dataset_name="WholesaleCustomers", data=data, target=labels)
 
+
 def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-fx", "earn", "acq", "crude"),
                use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., 
                min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., 
-               sublinear_tf: bool = False, return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+               sublinear_tf: bool = False, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Reuters21578 data set. It consists of 21578 Reuters newswire artices divided into different categories.
     When loading the artices, the title will be included in the text.
@@ -1382,7 +1375,7 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-
         Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -1400,27 +1393,26 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-
     assert subset in ["all", "train",
                       "test", "test-cgi", "train-cgi"], "subset must match 'all', 'train', 'test', 'train-cgi' or 'test-cgi'. Your input {0}".format(subset)
     # Check if data is already downloaded
-    directory = _get_download_dir(downloads_path) + "/Reuters21578/"
-    filename = directory + "reuters+21578+text+categorization+collection.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "Reuters21578"
+    filename = directory / "reuters+21578+text+categorization+collection.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://archive.ics.uci.edu/static/public/137/reuters+21578+text+categorization+collection.zip",
                        filename)
         # Unpack zipfile
         with zipfile.ZipFile(filename, 'r') as zipf:
             zipf.extractall(directory)
-        with tarfile.open(directory + "reuters21578.tar.gz", "r:gz") as tar:
+        with tarfile.open(directory / "reuters21578.tar.gz", "r:gz") as tar:
             tar.extractall(directory)
     # Load actual articles into arrays
     all_topics = []
     all_bodies = []
     all_lewis_splits = []
     all_cgi_splits = []
-    for file in os.listdir(directory):
-        if file.endswith(".sgm"):
+    for file in directory.iterdir():
+        if file.suffix == ".sgm":
             in_body = False
-            with open(directory + file, "rb") as f:
+            with open(file, "rb") as f:
                 for line in f.readlines():
                     # Needed so that reut2-017.sgm is not crashing due to encoding
                     line = line.decode('utf-8','ignore')
@@ -1482,7 +1474,7 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-
         else:
             all_topics[i] = new_topic
     # Transform raw data
-    data = _transform_text_data(all_bodies, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
+    data, vocabulary = _transform_text_data(all_bodies, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
                                 sublinear_tf)
     # Get labels
     LE = LabelEncoder()
@@ -1503,4 +1495,4 @@ def load_reuters21578(subset: str = "all", categories: tuple = ("grain", "money-
     if return_X_y:
         return data, labels
     else:
-        return Bunch(dataset_name="Reuters21578", data=data, target=labels, classes=categories)
+        return Bunch(dataset_name="Reuters21578", data=data, target=labels, classes=categories, columns=vocabulary)
diff --git a/clustpy/data/real_video_data.py b/clustpy/data/real_video_data.py
index 8a3211b..40b2217 100644
--- a/clustpy/data/real_video_data.py
+++ b/clustpy/data/real_video_data.py
@@ -4,22 +4,23 @@
     print("[WARNING] Could not import cv2 in clustpy.data.real_video_data. Please install cv2 by 'pip install opencv-python' if necessary")
 from clustpy.data._utils import _download_file, _get_download_dir, flatten_images
 import numpy as np
-import os
 import zipfile
 from sklearn.datasets._base import Bunch
+from pathlib import Path
+
 
 """
 Helpers
 """
 
 
-def _load_video(path: str, image_size: tuple) -> np.ndarray:
+def _load_video(path: str | Path, image_size: tuple) -> np.ndarray:
     """
     Load a video by saving each frame within a numpy array.
 
     Parameters
     ----------
-    path : str
+    path : str | Path
         Path to the video
     image_size : tuple
         The single frames can be downsized. This is necessary for large datasets.
@@ -99,7 +100,7 @@ def _downsample_frames(data: np.ndarray, labels: np.ndarray, frame_sampling_rati
 
 def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None, 
                         image_size: tuple = None, frame_sampling_ratio: float = 1, return_X_y: bool = False,
-                        downloads_path: str = None) -> Bunch:
+                        downloads_path: str | Path = None) -> Bunch:
     """
     Load the Weizmann video data set.
     It consists of 93 videos showing 9 different persons performing 10 different activities.
@@ -123,7 +124,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None,
         Can take values within (0, 1] (default: 1)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -139,7 +140,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None,
     -------
     https://www.wisdom.weizmann.ac.il/~vision/SpaceTimeActions.html
     """
-    directory = _get_download_dir(downloads_path) + "/Video_Weizmann/"
+    directory = _get_download_dir(downloads_path) / "Video_Weizmann"
     all_actions = ["walk", "run", "jump", "side", "bend", "wave1", "wave2", "pjump", "jack", "skip"]
     if use_actions is None:
         use_actions = all_actions.copy()
@@ -153,10 +154,9 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None,
     # Download data
     for action in use_actions:
         my_zip_file = action + ".zip"
-        filename = directory + my_zip_file
-        if not os.path.isfile(filename):
-            if not os.path.isdir(directory):
-                os.mkdir(directory)
+        filename = directory / my_zip_file
+        if not filename.is_file():
+            directory.mkdir(parents=False, exist_ok=True)
             _download_file(
                 "https://www.wisdom.weizmann.ac.il/~vision/VideoAnalysis/Demos/SpaceTimeActions/DB/" + my_zip_file,
                 filename)
@@ -164,11 +164,11 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None,
             with zipfile.ZipFile(filename, 'r') as zipf:
                 zipf.extractall(directory)
     # Load data, iterate over all video files
-    for v_file in os.listdir(directory):
+    for v_file in directory.iterdir():
         # Ignore zip files
-        if v_file.endswith(".avi"):
+        if v_file.suffix == ".avi":
             # Get name of person and type of activity
-            relevant_parts = v_file.split(".")[0]
+            relevant_parts = v_file.name.split(".")[0]
             person = relevant_parts.split("_")[0]
             action = relevant_parts.split("_")[1]
             # Sometimes a person performs an action twice. In that case a 1/2 is appended to the action
@@ -179,7 +179,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None,
             if person not in use_persons or action not in use_actions:
                 continue
             # Load video
-            data_local = _load_video(directory + "/" + v_file, image_size)
+            data_local = _load_video(v_file, image_size)  # v_file comes from directory.iterdir() and already contains the directory part
             # Transform string to label
             label_person = use_persons.index(person)
             label_action = use_actions.index(action)
@@ -207,7 +207,7 @@ def load_video_weizmann(use_actions : tuple = None, use_persons : tuple = None,
 
 
 def load_video_keck_gesture(subset: str = "all", image_size: tuple = (200, 200), frame_sampling_ratio: float = 1,
-                            return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+                            return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the Keck Gesture video data set.
     It consists of 42 training and 56 testing videos showing 4 different persons performing 14 different gestures.
@@ -234,7 +234,7 @@ def load_video_keck_gesture(subset: str = "all", image_size: tuple = (200, 200),
         Can take values within (0, 1] (default: 1)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -293,12 +293,11 @@ def parse_frames_file(frames_file: str) -> (dict, dict):
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    directory = _get_download_dir(downloads_path) + "/Video_Keck_Gesture/"
-    filename = directory + "Keck_Dataset.zip"
-    frames_file = directory + "sequences.txt"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "Video_Keck_Gesture"
+    filename = directory / "Keck_Dataset.zip"
+    frames_file = directory / "sequences.txt"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("http://www.zhuolin.umiacs.io/PrototypeTree/Keck_Dataset.zip", filename)
         # Unpack zipfile
         with zipfile.ZipFile(filename, 'r') as zipf:
@@ -318,13 +317,14 @@ def parse_frames_file(frames_file: str) -> (dict, dict):
         file_directories.append((False, "testingfiles/"))
     # load videos
     for train_data, file_directory in file_directories:
-        directory_files = directory + "Keck Dataset/" + file_directory
+        directory_files = directory / "Keck Dataset" / file_directory
         # Iterate over all video files
-        for v_file in os.listdir(directory_files):
-            data_local = _load_video(directory_files + v_file, image_size)
+        for v_file in directory_files.iterdir():
+            v_file_str = v_file.name
+            data_local = _load_video(v_file, image_size)
             # Transform string to label
-            label_gesture = int(v_file.split("_")[1].replace("gesture", ""))
-            label_person = int(v_file.split("_")[0].replace("person", "")) - 1
+            label_gesture = int(v_file_str.split("_")[1].replace("gesture", ""))
+            label_person = int(v_file_str.split("_")[0].replace("person", "")) - 1
             labels_local = np.array([[0, label_person]] * data_local.shape[0], dtype="int32")
             # Use frames_dicts to set gestures correctly
             if train_data:
diff --git a/clustpy/data/real_world_data.py b/clustpy/data/real_world_data.py
index f9637fd..78ad801 100644
--- a/clustpy/data/real_world_data.py
+++ b/clustpy/data/real_world_data.py
@@ -1,6 +1,5 @@
 from clustpy.data._utils import _download_file, _get_download_dir, _download_file_from_google_drive, _load_image_data, \
     flatten_images, _transform_text_data
-import os
 import numpy as np
 import zipfile
 import tarfile
@@ -10,6 +9,8 @@
 from scipy.io import loadmat
 import re
 from sklearn.datasets._base import Bunch
+from pathlib import Path
+
 
 # More datasets https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#usps
 
@@ -189,12 +190,12 @@ def load_newsgroups(subset: str = "all", use_tfidf: bool = True, use_stemming: b
     data_raw = newsgroups.data
     # Get all data so that transformations can be applied to all possible subsets
     data_all = fetch_20newsgroups(subset="all", remove=('headers', 'footers', 'quotes')).data if subset != "all" else data_raw
-    data = _transform_text_data(data_raw, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
+    data, vocabulary = _transform_text_data(data_raw, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
                                 sublinear_tf, data_all)
     if return_X_y:
         return data, newsgroups.target
     else:
-        return Bunch(dataset_name="20Newsgroups", data=data, target=newsgroups.target)
+        return Bunch(dataset_name="20Newsgroups", data=data, target=newsgroups.target, columns=vocabulary)
 
 
 def load_rcv1(subset: str = "all", n_features: int = 2000, categories: tuple = ("CCAT", "GCAT", "MCAT", "ECAT"),
@@ -286,7 +287,7 @@ def load_imagenet_dog(subset: str = "all",
                                       "n02102177-Welsh_springer_spaniel", "n02105056-groenendael", "n02105412-kelpie",
                                       "n02105855-Shetland_sheepdog", "n02107142-Doberman", "n02110958-pug",
                                       "n02112137-chow"],
-                      return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+                      return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the ImageNet Dog data set. It consists of 20580 color images of different sizes showing 120 breeds of dogs.
     The data set is composed of 12000 training and 8580 test images.
@@ -308,7 +309,7 @@ def load_imagenet_dog(subset: str = "all",
         Usually, a subset consisting of 15 breeds is extracted (default: list with 15 dog breeds)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : bool
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -333,19 +334,18 @@ def load_imagenet_dog(subset: str = "all",
     subset = subset.lower()
     assert subset in ["all", "train",
                       "test"], "subset must match 'all', 'train' or 'test'. Your input {0}".format(subset)
-    directory = _get_download_dir(downloads_path) + "/ImageNetDog/"
-    filename = directory + "images.tar"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "ImageNetDog"
+    filename = directory / "images.tar"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar",
                        filename)
         # Unpack zipfile
         with tarfile.open(filename, "r") as tar:
             tar.extractall(directory)
     # Get files for test/train split
-    train_test_filename = directory + "lists.tar"
-    if not os.path.isfile(train_test_filename):
+    train_test_filename = directory / "lists.tar"
+    if not train_test_filename.is_file():
         _download_file("http://vision.stanford.edu/aditya86/ImageNetDogs/lists.tar",
                        train_test_filename)
         # Unpack zipfile
@@ -353,15 +353,15 @@ def load_imagenet_dog(subset: str = "all",
             tar.extractall(directory)
     # Check breeds list
     if breeds is None:
-        breeds = os.listdir(directory + "/Images")
+        breeds = [br.name for br in (directory / "Images").iterdir()]
     # Load data lists
     data_list = []
     if subset == "train":
-        object_list = loadmat(directory + "/train_list.mat")
+        object_list = loadmat(directory / "train_list.mat")
     elif subset == "test":
-        object_list = loadmat(directory + "/test_list.mat")
+        object_list = loadmat(directory / "test_list.mat")
     else:
-        object_list = loadmat(directory + "/file_list.mat")
+        object_list = loadmat(directory / "file_list.mat")
     labels = object_list["labels"]
     file_list = object_list["file_list"]
     # get image data
@@ -369,7 +369,7 @@ def load_imagenet_dog(subset: str = "all",
     for i, file in enumerate(file_list):
         file = file[0][0]
         if file.split("/")[0] in breeds:
-            image_data = _load_image_data(directory + "/Images/" + file, image_size, True)
+            image_data = _load_image_data(directory / "Images" / file, image_size, True)
             data_list.append(image_data)
         else:
             use_image[i] = False
@@ -392,7 +392,7 @@ def load_imagenet_dog(subset: str = "all",
                      images=data_image, image_format=image_format, classes=breeds)
 
 
-def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the ImageNet-10 data set. This is a subset of the well-known ImageNet data set with only 10 classes.
     It consists of 13000 224x224 (or 96x96) color images showing different objects.
@@ -404,7 +404,7 @@ def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloa
         defines wheter the images should be loaded in the size (224 x 224) or (96 x 96) (default: True)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -425,23 +425,22 @@ def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloa
     Russakovsky, Olga, et al. "Imagenet large scale visual recognition challenge."
     International journal of computer vision 115 (2015): 211-252.
     """
-    directory = _get_download_dir(downloads_path) + "/ImageNet10"
-    if not os.path.isdir(directory):
-        os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "ImageNet10"
+    directory.mkdir(parents=False, exist_ok=True)
     # Source: https://drive.google.com/drive/folders/1XL0Nohi4vO2f1I4znf388n2pMP8PiKFd
     if use_224_size:
-        filename_data = directory + "/data_224.npy"
-        if not os.path.isfile(filename_data):
+        filename_data = directory / "data_224.npy"
+        if not filename_data.is_file():
             _download_file_from_google_drive("1sLfA0U9s9Q5Cf8o32GxYoyiyrzZN1K_6", filename_data)
-        filename_labels = directory + "/labels_224.npy"
-        if not os.path.isfile(filename_labels):
+        filename_labels = directory / "labels_224.npy"
+        if not filename_labels.is_file():
             _download_file_from_google_drive("1OjAQwaGnAfJBW66HFkR7yODLFxnTZWWI", filename_labels)
     else:
-        filename_data = directory + "/data_96.npy"
-        if not os.path.isfile(filename_data):
+        filename_data = directory / "data_96.npy"
+        if not filename_data.is_file():
             _download_file_from_google_drive("13VbP1qYz6bSeibnoR-w0J_jL9bQf6tGX", filename_data)
-        filename_labels = directory + "/labels_96.npy"
-        if not os.path.isfile(filename_labels):
+        filename_labels = directory / "labels_96.npy"
+        if not filename_labels.is_file():
             _download_file_from_google_drive("1uiuYUdjyCITLURc5eo8ByP9b51MK_Uk6", filename_labels)
     # Load data and labels
     data_image = np.load(filename_data)
@@ -460,7 +459,7 @@ def load_imagenet10(use_224_size: bool = True, return_X_y: bool = False, downloa
                      images=data_image, image_format=image_format)
 
 
-def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_coil20(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the COIL-20 data set.
     It consists of 1440 128x128 gray-scale images of 20 objects photographed from 72 different angles.
@@ -470,7 +469,7 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -485,11 +484,10 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     -------
     https://www.cs.columbia.edu/CAVE/software/softlib/coil-20.php
     """
-    directory = _get_download_dir(downloads_path) + "/COIL20/"
-    filename = directory + "coil-20-proc.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "COIL20"
+    filename = directory / "coil-20-proc.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("https://cave.cs.columbia.edu/old/databases/SLAM_coil-20_coil-100/coil-20/coil-20-proc.zip",
                        filename)
         # Unpack zipfile
@@ -500,7 +498,7 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     labels = np.zeros(1440, dtype=np.int32)
     for i in range(20):
         for j in range(72):
-            image_data = _load_image_data(directory + "coil-20-proc/obj{0}__{1}.png".format(i + 1, j), None, False)
+            image_data = _load_image_data(directory / "coil-20-proc" / "obj{0}__{1}.png".format(i + 1, j), None, False)
             assert image_data.shape == (
                 128, 128), "Shape of image obj{0}__{1}.png is not correct. Mest be (128, 128) but is {2}".format(i + 1,
                                                                                                                  j,
@@ -518,7 +516,7 @@ def load_coil20(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
         return Bunch(dataset_name="COIL20", data=data_flatten, target=labels, images=data_image, image_format="HW")
 
 
-def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
+def load_coil100(return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
     """
     Load the COIL-100 data set.
     It consists of 7200 128x128 color images of 100 objects photographed from 72 different angles.
@@ -528,7 +526,7 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     ----------
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -544,11 +542,10 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     -------
     https://www.cs.columbia.edu/CAVE/software/softlib/coil-100.php
     """
-    directory = _get_download_dir(downloads_path) + "/COIL100/"
-    filename = directory + "coil-100.zip"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "COIL100"
+    filename = directory / "coil-100.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("http://cave.cs.columbia.edu/old/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip",
                        filename)
         # Unpack zipfile
@@ -559,7 +556,7 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
     labels = np.zeros(7200, dtype=np.int32)
     for i in range(100):
         for j in range(72):
-            image_data = _load_image_data(directory + "coil-100/obj{0}__{1}.png".format(i + 1, j * 5), None, True)
+            image_data = _load_image_data(directory / "coil-100" / "obj{0}__{1}.png".format(i + 1, j * 5), None, True)
             assert image_data.shape == (
                 128, 128, 3), "Shape of image obj{0}__{1}.png is not correct. Mest be (128, 128, 3) but is {2}".format(
                 i + 1, j, image_data.shape)
@@ -587,16 +584,16 @@ def load_coil100(return_X_y: bool = False, downloads_path: str = None) -> Bunch:
 def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wisconsin"),
                use_categories: tuple = ("course", "faculty", "project", "student"), use_tfidf: bool = True, 
                use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1., 
-               min_df: float | int = 0.01, max_features: int = None, min_variance : float = 0.25, 
+               min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., 
                sublinear_tf: bool = False, remove_headers: bool = True, return_X_y: bool = False, 
-               downloads_path: str = None) -> Bunch:
+               downloads_path: str | Path = None) -> Bunch:
     """
     Load the WebKB data set. It consists of 8282 Html documents from different universities ("wisconsin", "washington", "texas", "cornell", "misc").
     These web pages have a specified category ("student", "staff", "project", "faculty", "department", "course", "other").
     The first column of the labels contains the category information and the second the university information.
     For more information see the references website.
     The data is usually preprocessed by using stemming and removing stop words. Furthermore, words with a document frequency
-    smaller than min_df or with a variance smaller than min_variance are usually removed and tf-idf is applied.
+    smaller than min_df or with a variance smaller than min_variance are often removed and tf-idf is applied.
-    N=1041, d=323, k=[4,4] using the default settings.
+    N=1041, d=2000, k=[4,4] using the default settings.
 
     Parameters
@@ -616,20 +613,20 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis
         If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0)
     min_df : float | int
         Ignore words that have a document frequency strictly lower than min_df.
-        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 0.01)
+        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1)
     max_features : int
         If not None, the resulting count matric will ony contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer).
-        Note that this value could be further reduced if min_variance is smaller than one (default: None)
+        Note that this value could be further reduced if min_variance is smaller than one (default: 2000)
     min_variance : float
         Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold). 
-        The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.25)
+        The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.)
     sublinear_tf : bool
         Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False)
     remove_headers : bool
         Specifies if the headers of the Html files should be removed (default: True)
     return_X_y : bool
         If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
-    downloads_path : str
+    downloads_path : str | Path
         path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
 
     Returns
@@ -652,11 +649,10 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis
         use_categories = possible_categories.copy()
     assert all([cat in possible_categories for cat in use_categories])
     # Check if data is already downloaded
-    directory = _get_download_dir(downloads_path) + "/WebKB/"
-    filename = directory + "webkb-data.gtar.gz"
-    if not os.path.isfile(filename):
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
+    directory = _get_download_dir(downloads_path) / "WebKB"
+    filename = directory / "webkb-data.gtar.gz"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
         _download_file("http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/webkb-data.gtar.gz",
                        filename)
         # Unpack zipfile
@@ -673,7 +669,7 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis
                     f = tar.extractfile(obj)
                     lines = f.readlines()
                     # Write file
-                    with open(directory + new_name, "wb") as output:
+                    with open(directory / new_name, "wb") as output:
                         for line in lines:
                             output.write(line)
     texts = []
@@ -684,10 +680,9 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis
     # Read files
     for i, category in enumerate(use_categories):
         for j, univerity in enumerate(use_universities):
-            inner_directory = "{0}webkb/{1}/{2}/".format(directory, category, univerity)
-            files = os.listdir(inner_directory)
-            for file in files:
-                with open(inner_directory + file, "r", encoding='latin-1') as f:
+            inner_directory = directory / "webkb" / category / univerity
+            for file in inner_directory.iterdir():
+                with open(file, "r", encoding='latin-1') as f:
                     lines = f.read()
                     if remove_headers:
                         # Remove header
@@ -698,10 +693,171 @@ def load_webkb(use_universities: tuple = ("cornell", "texas", "washington", "wis
                     texts.append(lines)
                     labels = np.r_[labels, [[i, j]]]
     # Transform raw data
-    data = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
+    data, vocabulary = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance, 
+                                sublinear_tf)
+    # Return values
+    if return_X_y:
+        return data, labels
+    else:
+        return Bunch(dataset_name="WebKB", data=data, target=labels, classes=(use_categories, use_universities), columns=vocabulary)
+
+
+"""
+BBC Data
+"""
+
+
+def load_bbcsport(use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1.,
+               min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., sublinear_tf: bool = False,
+               return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
+    """
+    Load the BBC Sport data set. It consists of a collection of 737 BBC sport documents, partitioned
+    into the topics "athletics", "cricket", "football", "rugby", and "tennis".
+    The documents are usually converted into feature vectors using tf-idf.
+    N=737, d=2000, k=5 using the default settings.
+
+    Parameters
+    ----------
+    use_tfidf : bool
+        If true, tf-idf will be applied as the last step of the pipeline (default: True)
+    use_stemming : bool
+        If true, the SnowballStemmer from nltk will be used when creating the count matrix (default: True)
+    use_stop_words : bool
+        If true, the list of English stopwords from sklearn CountVectorizer will be used (default: True)
+    max_df : float | int
+        Ignore words that have a document frequency strictly higher than max_df.
+        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0)
+    min_df : float | int
+        Ignore words that have a document frequency strictly lower than min_df.
+        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1)
+    max_features : int
+        If not None, the resulting count matrix will only contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer).
+        Note that this value could be further reduced if min_variance is smaller than one (default: 2000)
+    min_variance : float
+        Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold).
+        The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.)
+    sublinear_tf : bool
+        Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False)
+    return_X_y : bool
+        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
+    downloads_path : str | Path
+        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
+
+    Returns
+    -------
+    bunch : Bunch
+        A Bunch object containing the data in the 'data' attribute, the labels in the 'target' attribute, the topic names in 'classes' and the vocabulary in 'columns'.
+        Alternatively, if return_X_y is True two arrays will be returned:
+        the data numpy array (737 x 2000 - using the default settings), the labels numpy array (737)
+
+    References
+    -------
+    http://mlg.ucd.ie/datasets/bbc.html
+    """
+    # Check if data is already downloaded
+    directory = _get_download_dir(downloads_path) / "bbcsport"
+    filename = directory / "bbcsport-fulltext.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
+        _download_file("http://mlg.ucd.ie/files/datasets/bbcsport-fulltext.zip", filename)
+        # Unpack zipfile
+        with zipfile.ZipFile(filename, 'r') as zipf:
+            zipf.extractall(directory)
+    directory = directory / "bbcsport"  # the documents are read from the "bbcsport" subfolder of the extracted archive
+    labels = []
+    texts = []
+    topics = ["athletics", "cricket", "football", "rugby", "tennis"]
+    for i, topic in enumerate(topics):
+        inner_directory = directory / topic
+        for file in sorted(inner_directory.iterdir()):  # iterdir() yields entries in arbitrary order; sort for a reproducible sample order
+            with open(file, "r", encoding="latin-1") as f:  # explicit encoding (same as load_webkb); the platform default may fail on non-ASCII bytes
+                lines = f.read()
+                texts.append(lines)
+                labels.append(i)
+    # Transform raw data
+    data, vocabulary = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance,
+                                sublinear_tf)
+    labels = np.array(labels)
+    # Return values
+    if return_X_y:
+        return data, labels
+    else:
+        return Bunch(dataset_name="BBCSport", data=data, target=labels, classes=topics, columns=vocabulary)
+    
+
+def load_bbcnews(use_tfidf: bool = True, use_stemming: bool = True, use_stop_words: bool = True, max_df: float | int = 1.,
+               min_df: float | int = 1, max_features: int = 2000, min_variance : float = 0., sublinear_tf: bool = False,
+               return_X_y: bool = False, downloads_path: str | Path = None) -> Bunch:
+    """
+    Load the BBC News data set. It consists of a collection of 2225 BBC news documents, partitioned
+    into the topics "business", "entertainment", "politics", "sport", and "tech".
+    The documents are usually converted into feature vectors using tf-idf.
+    N=2225, d=2000, k=5 using the default settings.
+
+    Parameters
+    ----------
+    use_tfidf : bool
+        If true, tf-idf will be applied as the last step of the pipeline (default: True)
+    use_stemming : bool
+        If true, the SnowballStemmer from nltk will be used when creating the count matrix (default: True)
+    use_stop_words : bool
+        If true, the list of English stopwords from sklearn CountVectorizer will be used (default: True)
+    max_df : float | int
+        Ignore words that have a document frequency strictly higher than max_df.
+        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1.0)
+    min_df : float | int
+        Ignore words that have a document frequency strictly lower than min_df.
+        If float, the parameter represents a proportion of documents, integer corresponds to absolute counts (see sklearn CountVectorizer) (default: 1)
+    max_features : int
+        If not None, the resulting count matrix will only contain the top max_features ordered by term frequency across the corpus (see sklearn CountVectorizer).
+        Note that this value could be further reduced if min_variance is smaller than one (default: 2000)
+    min_variance : float
+        Features with a variance lower than min_variance will be removed (see sklearn VarianceThreshold).
+        The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples (default: 0.)
+    sublinear_tf : bool
+        Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer) (default: False)
+    return_X_y : bool
+        If True, returns (data, target) instead of a Bunch object. See below for more information about the data and target object (default: False)
+    downloads_path : str | Path
+        path to the directory where the data is stored (default: None -> [USER]/Downloads/clustpy_datafiles)
+
+    Returns
+    -------
+    bunch : Bunch
+        A Bunch object containing the data in the 'data' attribute, the labels in the 'target' attribute, the topic names in 'classes' and the vocabulary in 'columns'.
+        Alternatively, if return_X_y is True two arrays will be returned:
+        the data numpy array (2225 x 2000 - using the default settings), the labels numpy array (2225)
+
+    References
+    -------
+    http://mlg.ucd.ie/datasets/bbc.html
+    """
+    # Check if data is already downloaded
+    directory = _get_download_dir(downloads_path) / "bbcnews"
+    filename = directory / "bbc-fulltext.zip"
+    if not filename.is_file():
+        directory.mkdir(parents=False, exist_ok=True)
+        _download_file("http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip", filename)
+        # Unpack zipfile
+        with zipfile.ZipFile(filename, 'r') as zipf:
+            zipf.extractall(directory)
+    directory = directory / "bbc"  # the documents are read from the "bbc" subfolder of the extracted archive
+    labels = []
+    texts = []
+    topics = ["business", "entertainment", "politics", "sport", "tech"]
+    for i, topic in enumerate(topics):
+        inner_directory = directory / topic
+        for file in sorted(inner_directory.iterdir()):  # iterdir() yields entries in arbitrary order; sort for a reproducible sample order
+            with open(file, "r", encoding="latin-1") as f:  # explicit encoding (same as load_webkb); the platform default may fail on non-ASCII bytes
+                lines = f.read()
+                texts.append(lines)
+                labels.append(i)
+    # Transform raw data
+    data, vocabulary = _transform_text_data(texts, use_tfidf, use_stemming, use_stop_words, max_df, min_df, max_features, min_variance,
                                 sublinear_tf)
+    labels = np.array(labels)
     # Return values
     if return_X_y:
         return data, labels
     else:
-        return Bunch(dataset_name="WebKB", data=data, target=labels, classes=(use_categories, use_universities))
+        return Bunch(dataset_name="BBCNews", data=data, target=labels, classes=topics, columns=vocabulary)
diff --git a/clustpy/data/tests/test_real_world_data.py b/clustpy/data/tests/test_real_world_data.py
index c1dba3c..ed7cc90 100644
--- a/clustpy/data/tests/test_real_world_data.py
+++ b/clustpy/data/tests/test_real_world_data.py
@@ -1,6 +1,6 @@
 from clustpy.data.tests._helpers_for_tests import _helper_test_data_loader
 from clustpy.data import load_iris, load_wine, load_breast_cancer, load_olivetti_faces, load_newsgroups, load_rcv1, \
-    load_imagenet_dog, load_imagenet10, load_coil20, load_coil100, load_webkb
+    load_imagenet_dog, load_imagenet10, load_coil20, load_coil100, load_webkb, load_bbcnews, load_bbcsport
 import pytest
 import shutil
 
@@ -129,5 +129,15 @@ def test_load_coil100(my_tmp_dir):
 
 @pytest.mark.data
 def test_load_webkb(my_tmp_dir):
-    _helper_test_data_loader(load_webkb, 1041, 323, [4, 4], dataloader_params={"downloads_path": my_tmp_dir})
-    _helper_test_data_loader(load_webkb, 8282, 761, [7, 5], dataloader_params={"downloads_path": my_tmp_dir, "use_categories": None, "use_universities": None})
+    _helper_test_data_loader(load_webkb, 1041, 2000, [4, 4], dataloader_params={"downloads_path": my_tmp_dir})
+    _helper_test_data_loader(load_webkb, 1041, 323, [4, 4], dataloader_params={"downloads_path": my_tmp_dir, "min_df": 0.01, "max_features": None, "min_variance": 0.25})
+    _helper_test_data_loader(load_webkb, 8282, 2000, [7, 5], dataloader_params={"downloads_path": my_tmp_dir, "use_categories": None, "use_universities": None})
+
+
+@pytest.mark.data
+def test_load_bbcsport(my_tmp_dir):
+    _helper_test_data_loader(load_bbcsport, 737, 2000, 5, dataloader_params={"downloads_path": my_tmp_dir})
+
+@pytest.mark.data
+def test_load_bbcnews(my_tmp_dir):
+    _helper_test_data_loader(load_bbcnews, 2225, 2000, 5, dataloader_params={"downloads_path": my_tmp_dir})
diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py
index c776ff6..68ba772 100644
--- a/clustpy/deep/_abstract_deep_clustering_algo.py
+++ b/clustpy/deep/_abstract_deep_clustering_algo.py
@@ -6,6 +6,7 @@
 from clustpy.utils.checks import check_parameters
 from sklearn.utils.validation import check_is_fitted
 from sklearn.metrics.pairwise import pairwise_distances_argmin_min
+from pathlib import Path
 
 
 class _AbstractDeepClusteringAlgo(TransformerMixin, ClusterMixin, BaseEstimator):
@@ -19,7 +20,7 @@ class _AbstractDeepClusteringAlgo(TransformerMixin, ClusterMixin, BaseEstimator)
     neural_network : torch.nn.Module | tuple
         the neural network used for the computations.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict).
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the autoencoder
@@ -29,7 +30,7 @@ class _AbstractDeepClusteringAlgo(TransformerMixin, ClusterMixin, BaseEstimator)
         use a fixed random state to get a repeatable solution. Can also be of type int
     """
 
-    def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neural_network_weights: str,
+    def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path,
                  embedding_size: int, device: torch.device, random_state: np.random.RandomState | int):
         self.batch_size = batch_size
         self.neural_network = neural_network
diff --git a/clustpy/deep/_data_utils.py b/clustpy/deep/_data_utils.py
index fc787fa..257e08a 100644
--- a/clustpy/deep/_data_utils.py
+++ b/clustpy/deep/_data_utils.py
@@ -2,6 +2,7 @@
 import torchvision
 import numpy as np
 from typing import Callable, List
+from pathlib import PurePath
 
 
 class _ClustpyDataset(torch.utils.data.Dataset):
@@ -287,9 +288,9 @@ def get_train_and_test_dataloader(X: np.ndarray | torch.Tensor, batch_size: int
     else:
         trainloader, testloader = custom_dataloaders
         # If train-/testloader is string, it can be loaded from a file
-        if type(trainloader) is str:
+        if isinstance(trainloader, (str, PurePath)):
             trainloader = torch.load(trainloader, weights_only=False)
-        if type(testloader) is str:
+        if isinstance(testloader, (str, PurePath)):
             testloader = torch.load(testloader, weights_only=False)
         if trainloader.batch_size != testloader.batch_size:
             print(
@@ -313,7 +314,6 @@ def get_default_augmented_dataloaders(X: np.ndarray | torch.Tensor, batch_size:
     a channel-wise z-transformation.
     Optionally, the images can be flatten afterward.
 
-
     Parameters
     ----------
     X : np.ndarray | torch.Tensor
diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py
index e776d96..4618a5c 100644
--- a/clustpy/deep/_train_utils.py
+++ b/clustpy/deep/_train_utils.py
@@ -6,6 +6,7 @@
 from clustpy.deep._data_utils import get_dataloader, get_train_and_test_dataloader, get_data_dim_from_dataloader
 from clustpy.deep._utils import run_initial_clustering, detect_device, encode_batchwise, mean_squared_error
 from collections.abc import Callable
+from pathlib import Path
 
 
 def _get_default_layers(input_dim: int, embedding_size: int) -> list:
@@ -31,7 +32,7 @@ def _get_default_layers(input_dim: int, embedding_size: int) -> list:
 
 def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None,
                         neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
-                        neural_network_params: dict = None, neural_network_weights: str = None, device : torch.device = None,
+                        neural_network_params: dict = None, neural_network_weights: str | Path = None, device : torch.device = None,
                         random_state: np.random.RandomState | int = None) -> torch.nn.Module:
     """This function returns a new neural_network.
     - If neural_network is already a torch.nn.module, nothing will happen.
@@ -52,7 +53,7 @@ def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network:
         The neural network class that should be used (default: FeedforwardAutoencoder)
     neural_network_params : dict
         Parameters to be used when creating a new neural network using the neural_network_class (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     device : torch.device
         The device on which to perform the computations.
@@ -111,7 +112,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n
                         ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10,
                         neural_network: torch.nn.Module | tuple = None,
                         neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
-                        neural_network_params: dict = None, neural_network_weights: str = None,
+                        neural_network_params: dict = None, neural_network_weights: str | Path = None,
                         random_state: np.random.RandomState | int = None) -> torch.nn.Module:
     """This function returns a trained neural network. The following cases are considered
        - If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again.
@@ -147,7 +148,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n
         The neural network class that should be used (default: FeedforwardAutoencoder)
     neural_network_params : dict
         Parameters to be used when creating a new neural network using the neural_network_class (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     random_state : np.random.RandomState | int
         use a fixed random state to get a repeatable solution. Can also be of type int (default: None)
@@ -185,7 +186,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c
                                                random_state: np.random.RandomState,
                                                neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
                                                neural_network_params: dict = None,
-                                               neural_network_weights: str = None) -> (
+                                               neural_network_weights: str | Path = None) -> (
         torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int,
         np.ndarray, np.ndarray, ClusterMixin):
     """
@@ -231,7 +232,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c
         The neural network class that should be used (default: FeedforwardAutoencoder)
     neural_network_params : dict
         Parameters to be used when creating a new neural network using the neural_network_class (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
 
     Returns
diff --git a/clustpy/deep/aec.py b/clustpy/deep/aec.py
index fcf6e61..920a083 100644
--- a/clustpy/deep/aec.py
+++ b/clustpy/deep/aec.py
@@ -12,12 +12,13 @@
 from clustpy.deep.dcn import _DCN_Module
 import tqdm
 from collections.abc import Callable
+from pathlib import Path
 
 
 def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
          clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
          optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
-         neural_network: torch.nn.Module | tuple, neural_network_weights: str,
+         neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path,
          embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
          custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin,
          initial_clustering_params: dict, device: torch.device,
@@ -48,7 +49,7 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network
@@ -242,7 +243,7 @@ class AEC(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -296,7 +297,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize
                  clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, clustering_loss_weight: float = 0.1,
                  ssl_loss_weight: float = 1.0, neural_network: torch.nn.Module | tuple = None,
-                 neural_network_weights: str = None, embedding_size: int = 10, custom_dataloaders: tuple = None,
+                 neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None,
                  augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = None,
                  initial_clustering_params: dict = None, device: torch.device = None,
                  random_state: np.random.RandomState | int = None):
diff --git a/clustpy/deep/dcn.py b/clustpy/deep/dcn.py
index 7fcf906..2ed0c53 100644
--- a/clustpy/deep/dcn.py
+++ b/clustpy/deep/dcn.py
@@ -13,12 +13,13 @@
 from sklearn.base import ClusterMixin
 import tqdm
 from collections.abc import Callable
+from pathlib import Path
 
 
 def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
          clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
          optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
-         neural_network: torch.nn.Module | tuple, neural_network_weights: str,
+         neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path,
          embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
          custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin,
          initial_clustering_params: dict, device: torch.device,
@@ -49,7 +50,7 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network
@@ -397,7 +398,7 @@ class DCN(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -454,7 +455,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize
                  clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, clustering_loss_weight: float = 0.1,
                  ssl_loss_weight: float = 1.0, neural_network: torch.nn.Module | tuple = None,
-                 neural_network_weights: str = None, embedding_size: int = 10, custom_dataloaders: tuple = None,
+                 neural_network_weights: str | Path = None, embedding_size: int = 10, custom_dataloaders: tuple = None,
                  augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans,
                  initial_clustering_params: dict = None, device: torch.device = None,
                  random_state: np.random.RandomState | int = None):
diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py
index 6cde45b..070a221 100644
--- a/clustpy/deep/ddc_n2d.py
+++ b/clustpy/deep/ddc_n2d.py
@@ -16,12 +16,13 @@
 import inspect
 from collections.abc import Callable
 from clustpy.utils.checks import check_parameters
+from pathlib import Path
 
 
 def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
                                   pretrain_epochs: int, optimizer_class: torch.optim.Optimizer,
                                   ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple,
-                                  neural_network_weights: str, embedding_size: int, custom_dataloaders: tuple,
+                                  neural_network_weights: str | Path, embedding_size: int, custom_dataloaders: tuple,
                                   manifold_class: TransformerMixin, manifold_params: dict,
                                   clustering_class: ClusterMixin, clustering_params: dict, device: torch.device,
                                   random_state: np.random.RandomState) -> (
@@ -48,7 +49,7 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network
@@ -261,7 +262,7 @@ class DDC(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -311,7 +312,7 @@ class DDC(_AbstractDeepClusteringAlgo):
     def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None,
                  pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, custom_dataloaders: tuple = None, tsne_params: dict = None,
                  device: torch.device = None, random_state: np.random.RandomState | int = None):
         super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state)
@@ -409,7 +410,7 @@ class N2D(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -455,7 +456,7 @@ class N2D(_AbstractDeepClusteringAlgo):
     def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimizer_params: dict = None,
                  pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, custom_dataloaders: tuple = None, manifold_class: TransformerMixin = TSNE,
                  manifold_params: dict = None, initial_clustering_params: dict = None, device: torch.device = None,
                  random_state: np.random.RandomState | int = None):
diff --git a/clustpy/deep/dec.py b/clustpy/deep/dec.py
index a2501fb..7b6a428 100644
--- a/clustpy/deep/dec.py
+++ b/clustpy/deep/dec.py
@@ -14,12 +14,13 @@
 from sklearn.base import ClusterMixin
 import tqdm
 from collections.abc import Callable#
+from pathlib import Path
 
 
 def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict,
          clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
          optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
-         neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int,
+         neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int,
          clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple,
          augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict,
          device: torch.device, random_state: np.random.RandomState) -> (
@@ -52,7 +53,7 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network
@@ -433,7 +434,7 @@ class DEC(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -491,7 +492,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25
                  pretrain_epochs: int = 100, clustering_epochs: int = 150,
                  optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, clustering_loss_weight: float = 1., custom_dataloaders: tuple = None,
                  augmentation_invariance: bool = False, initial_clustering_class: ClusterMixin = KMeans,
                  initial_clustering_params: dict = None, device: torch.device = None,
@@ -586,7 +587,7 @@ class IDEC(DEC):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -645,7 +646,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25
                  clustering_optimizer_params: dict = None, pretrain_epochs: int = 100,
                  clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, clustering_loss_weight: float = 0.1, ssl_loss_weight: float = 1.0,
                  custom_dataloaders: tuple = None, augmentation_invariance: bool = False,
                  initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None,
diff --git a/clustpy/deep/deepect.py b/clustpy/deep/deepect.py
index 028afbb..2042f69 100644
--- a/clustpy/deep/deepect.py
+++ b/clustpy/deep/deepect.py
@@ -15,6 +15,7 @@
 import copy
 from collections.abc import Callable
 from sklearn.utils.validation import check_is_fitted
+from pathlib import Path
 
 
 class _DeepECT_ClusterTreeNode(_ClusterTreeNode):
@@ -447,7 +448,7 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op
               clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, grow_interval: int,
               pruning_threshold: float, optimizer_class: torch.optim.Optimizer,
               ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple,
-              neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
+              neural_network_weights: str | Path, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
               custom_dataloaders: tuple, augmentation_invariance: bool, device: torch.device,
               random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module):
     """
@@ -480,7 +481,7 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network
@@ -564,7 +565,7 @@ class DeepECT(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         Size of the embedding within the neural network (default: 10)
@@ -607,7 +608,7 @@ def __init__(self, max_n_leaf_nodes: int = 20, batch_size: int = 256, pretrain_o
                  grow_interval: int = 2, pruning_threshold: float = 0.1,
                  optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, clustering_loss_weight: float = 1., ssl_loss_weight: float = 1.,
                  custom_dataloaders: tuple = None, augmentation_invariance: bool = False,
                  device: torch.device = None, random_state: np.random.RandomState | int = None):
diff --git a/clustpy/deep/den.py b/clustpy/deep/den.py
index 7306175..2562e76 100644
--- a/clustpy/deep/den.py
+++ b/clustpy/deep/den.py
@@ -13,6 +13,7 @@
 from sklearn.neighbors import NearestNeighbors
 from sklearn.cluster import KMeans
 from collections.abc import Callable
+from pathlib import Path
 
 
 class DEN(_AbstractDeepClusteringAlgo):
@@ -53,7 +54,7 @@ class DEN(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: None)
@@ -98,7 +99,7 @@ def __init__(self, n_clusters: int = 8, group_size : int | list | None = 2, n_ne
                  batch_size: int = 256, pretrain_optimizer_params: dict = None,
                  pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int | None = None, custom_dataloaders: tuple = None,
                  device: torch.device = None, random_state: np.random.RandomState | int = None):
         super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state)
diff --git a/clustpy/deep/dipdeck.py b/clustpy/deep/dipdeck.py
index 8433330..117cf6d 100644
--- a/clustpy/deep/dipdeck.py
+++ b/clustpy/deep/dipdeck.py
@@ -15,6 +15,7 @@
 import tqdm
 from collections.abc import Callable
 from sklearn.utils.validation import check_is_fitted
+from pathlib import Path
 
 
 def _merge_by_dip_value(X: np.ndarray, embedded_data: np.ndarray, cluster_labels_cpu: np.ndarray,
@@ -102,7 +103,6 @@ def _force_merge(X: np.ndarray, embedded_data: np.ndarray, cluster_labels_cpu: n
     First strategy is to delete the smallest cluster if it is smaller than 0.2 * the average cluster size.
     In this case, the samples in this cluster will be assigned to the clostest remaining cluster.
     If there is no cluster that is sufficiently small, the cluster-combination with the largest dip-value will be merged.
-
     
     Parameters
     ----------
@@ -617,7 +617,7 @@ class DipDECK(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 5)
@@ -648,7 +648,6 @@ class DipDECK(_AbstractDeepClusteringAlgo):
     debug : bool
         If true, additional information will be printed to the console (default: False)
 
-
     Attributes
     ----------
     labels_ : np.ndarray
@@ -682,7 +681,7 @@ def __init__(self, n_clusters_init: int = 35, dip_merge_threshold: float = 0.9,
                  clustering_optimizer_params: dict = None, pretrain_epochs: int = 100, clustering_epochs: int = 50,
                  optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 5, max_cluster_size_diff_factor: float = 2, pval_strategy: str = "table",
                  n_boots: int = 1000, custom_dataloaders: tuple = None, augmentation_invariance: bool = False,
                  initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None,
diff --git a/clustpy/deep/dipencoder.py b/clustpy/deep/dipencoder.py
index bf6774e..2248a7b 100644
--- a/clustpy/deep/dipencoder.py
+++ b/clustpy/deep/dipencoder.py
@@ -17,6 +17,7 @@
 from clustpy.utils import plot_scatter_matrix
 import tqdm
 from collections.abc import Callable
+from pathlib import Path
 
 """
 Dip module - holds backward functions
@@ -699,7 +700,7 @@ class DipEncoder(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -763,7 +764,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = None, pretrain_optimiz
                  clustering_optimizer_params: dict = None, pretrain_epochs: int = 100,
                  clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, max_cluster_size_diff_factor: float = 3,
                  clustering_loss_weight: float = 1., ssl_loss_weight: float = None,
                  custom_dataloaders: tuple = None, augmentation_invariance: bool = False,
diff --git a/clustpy/deep/dkm.py b/clustpy/deep/dkm.py
index 896a6f3..185e801 100644
--- a/clustpy/deep/dkm.py
+++ b/clustpy/deep/dkm.py
@@ -12,12 +12,13 @@
 from sklearn.base import ClusterMixin
 import tqdm
 from collections.abc import Callable
+from pathlib import Path
 
 
 def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict,
          clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
          optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
-         neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int,
+         neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path, embedding_size: int,
          clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple,
          augmentation_invariance: bool, initial_clustering_class: ClusterMixin,
          initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> (
@@ -52,7 +53,7 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int,
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network
@@ -406,7 +407,7 @@ class DKM(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (default: 10)
@@ -466,7 +467,7 @@ def __init__(self, n_clusters: int = 8, alphas: tuple = (1000), batch_size: int
                  pretrain_epochs: int = 100, clustering_epochs: int = 150,
                  optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, clustering_loss_weight: float = 0.1, ssl_loss_weight: float = 1.,
                  custom_dataloaders: tuple = None, augmentation_invariance: bool = False,
                  initial_clustering_class: ClusterMixin = KMeans, initial_clustering_params: dict = None,
diff --git a/clustpy/deep/enrc.py b/clustpy/deep/enrc.py
index b6b364a..032e91c 100644
--- a/clustpy/deep/enrc.py
+++ b/clustpy/deep/enrc.py
@@ -18,6 +18,7 @@
 from clustpy.alternative.nrkmeans import _get_total_cost_function
 import tqdm
 from collections.abc import Callable
+from pathlib import Path
 
 
 class _ENRC_Module(torch.nn.Module):
@@ -1722,7 +1723,7 @@ def _enrc(X: np.ndarray, n_clusters: list, V: np.ndarray, P: list, input_centers
           pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int,
           clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
           clustering_loss_weight: float, ssl_loss_weight: float, neural_network: torch.nn.Module | tuple,
-          neural_network_weights: str, embedding_size: int, init: str, random_state: np.random.RandomState,
+          neural_network_weights: str | Path, embedding_size: int, init: str, random_state: np.random.RandomState,
           device: torch.device, scheduler: torch.optim.lr_scheduler, scheduler_params: dict, tolerance_threshold: float,
           init_kwargs: dict, init_subsample_size: int, custom_dataloaders: tuple, augmentation_invariance: bool,
           final_reclustering: bool, debug: bool) -> (
@@ -1763,7 +1764,7 @@ def _enrc(X: np.ndarray, n_clusters: list, V: np.ndarray, P: list, input_centers
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network. Only used if neural_network is None
@@ -1948,7 +1949,7 @@ class ENRC(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network. Only used if neural_network is None (default: 20)
@@ -2007,7 +2008,7 @@ def __init__(self, n_clusters: list, V: np.ndarray = None, P: list = None, input
                  tolerance_threshold: float = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
                  clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 20, init: str = "nrkmeans",
                  device: torch.device = None, scheduler: torch.optim.lr_scheduler = None,
                  scheduler_params: dict = None, init_kwargs: dict = None, init_subsample_size: int = 10000,
@@ -2289,7 +2290,7 @@ class ACeDeC(ENRC):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new FeedforwardAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network. Only used if neural_network is None (default: 20)
@@ -2348,7 +2349,7 @@ def __init__(self, n_clusters: int, V: np.ndarray = None, P: list = None, input_
                  tolerance_threshold: float = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
                  clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 20, init: str = "acedec",
                  device: torch.device = None, scheduler: torch.optim.lr_scheduler = None,
                  scheduler_params: dict = None, init_kwargs: dict = None, init_subsample_size: int = 10000,
diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py
index f582157..33c9d36 100644
--- a/clustpy/deep/neural_networks/_abstract_autoencoder.py
+++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py
@@ -8,13 +8,13 @@
 from clustpy.deep._early_stopping import EarlyStopping
 from clustpy.deep._data_utils import get_dataloader
 from clustpy.deep._utils import get_device_from_module, mean_squared_error
-import os
 import tqdm
 from collections.abc import Callable
 from sklearn.utils import check_random_state
 from clustpy.deep._utils import set_torch_seed
 from collections.abc import Callable
 from clustpy.utils.checks import check_parameters
+from pathlib import Path
 
 
 class FullyConnectedBlock(torch.nn.Module):
@@ -394,28 +394,28 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in
         self.fitted = True
         return self
 
-    def save_parameters(self, path: str) -> None:
+    def save_parameters(self, path: str | Path) -> None:
         """
         Save the current state_dict of the model.
 
         Parameters
         ----------
-        path : str
+        path : str | Path
             Path where the state_dict should be stored
         """
         # Check if directory exists
-        parent_directory = os.path.dirname(path)
-        if parent_directory != "" and not os.path.isdir(parent_directory):
-            os.makedirs(parent_directory)
+        if isinstance(path, str):
+            path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
         torch.save(self.state_dict(), path)
 
-    def load_parameters(self, path: str) -> '_AbstractAutoencoder':
+    def load_parameters(self, path: str | Path) -> '_AbstractAutoencoder':
         """
         Load a state_dict into the current model to set its parameters.
 
         Parameters
         ----------
-        path : str
+        path : str | Path
             Path from where the state_dict should be loaded
 
         Returns
diff --git a/clustpy/deep/vade.py b/clustpy/deep/vade.py
index b5d6e79..ee045b5 100644
--- a/clustpy/deep/vade.py
+++ b/clustpy/deep/vade.py
@@ -15,12 +15,13 @@
 from sklearn.base import ClusterMixin
 import tqdm
 from collections.abc import Callable
+from pathlib import Path
 
 
 def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
           clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
           optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
-          neural_network: torch.nn.Module | tuple, neural_network_weights: str,
+          neural_network: torch.nn.Module | tuple, neural_network_weights: str | Path,
           embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
           custom_dataloaders: tuple, initial_clustering_class: ClusterMixin, initial_clustering_params: dict,
           device: torch.device, random_state: np.random.RandomState) -> (
@@ -51,7 +52,7 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa
     neural_network : torch.nn.Module | tuple
         the input neural network.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network.
     embedding_size : int
         size of the embedding within the neural network (central layer with mean and variance)
@@ -513,7 +514,7 @@ class VaDE(_AbstractDeepClusteringAlgo):
     neural_network : torch.nn.Module | tuple
         the input neural network. If None, a new VariationalAutoencoder will be created.
         Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None)
-    neural_network_weights : str
+    neural_network_weights : str | Path
         Path to a file containing the state_dict of the neural_network (default: None)
     embedding_size : int
         size of the embedding within the neural network (central layer with mean and variance) (default: 10)
@@ -571,7 +572,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize
                  clustering_epochs: int = 150, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = torch.nn.BCELoss(reduction='sum'),
                  clustering_loss_weight: float = 1.0, ssl_loss_weight: float = 1.0,
-                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
+                 neural_network: torch.nn.Module | tuple = None, neural_network_weights: str | Path = None,
                  embedding_size: int = 10, custom_dataloaders: tuple = None,
                  initial_clustering_class: ClusterMixin = GaussianMixture, initial_clustering_params: dict = None,
                  device: torch.device = None, random_state: np.random.RandomState | int = None):
diff --git a/clustpy/utils/_information_theory.py b/clustpy/utils/_information_theory.py
index a9052e6..f96270c 100644
--- a/clustpy/utils/_information_theory.py
+++ b/clustpy/utils/_information_theory.py
@@ -33,7 +33,7 @@ def bic_costs(n_points: int, use_log2: bool = False) -> float:
     return bic_costs
 
 
-def integer_costs(integer: int) -> float:
+def integer_costs(integer: int, use_log2: bool = False) -> float:
     """
     Calculate the costs to encode an integer value. Uses following formula:
     log(integer) + log(log(integer)) + log(log(log(integer))) + ... + log(const), where const = 2.865064.
@@ -42,22 +42,24 @@ def integer_costs(integer: int) -> float:
     ----------
     integer : int
         The integer value to encode
+    use_log2 : bool
+        Defines whether log2 should be used instead of ln (default: False)
 
     Returns
     -------
     costs : float
         The encoding costs of the integer
     """
-    assert type(integer) is int or type(integer) is np.int32 or type(
-        integer) is np.int64, "The input to calculate the mdl costs of must be an integer. Your input:\n{0} (type: {1})".format(
+    assert isinstance(integer, (int, np.integer)), "The input to calculate the mdl costs of must be an integer. Your input:\n{0} (type: {1})".format(
         integer, type(integer))
     costs = 0
-    if integer != 0:
-        last_interim_result = np.log2(integer)
+    if integer > 0:
+        last_interim_result = np.log2(integer) if use_log2 else np.log(integer)
         while last_interim_result > 0:
             costs += last_interim_result
-            last_interim_result = np.log2(last_interim_result)
-    costs = costs + np.log2(2.865064)
+            last_interim_result = np.log2(last_interim_result) if use_log2 else np.log(last_interim_result)
+    const = np.log2(2.865064) if use_log2 else np.log(2.865064)
+    costs = costs + const
     return costs
 
 
diff --git a/clustpy/utils/evaluation.py b/clustpy/utils/evaluation.py
index d95d205..7ea51cb 100644
--- a/clustpy/utils/evaluation.py
+++ b/clustpy/utils/evaluation.py
@@ -4,10 +4,10 @@
 from sklearn.utils import check_random_state
 from sklearn.base import ClusterMixin
 from collections.abc import Callable
-import os
 import inspect
 from sklearn.datasets._base import Bunch
 import sys
+from pathlib import Path, PurePath
 
 
 def _preprocess_dataset(X: np.ndarray, preprocess_methods: list, preprocess_params: list) -> np.ndarray:
@@ -113,7 +113,7 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr
                      labels_true: np.ndarray = None, n_repetitions: int = 10,
                      X_test: np.ndarray = None, labels_true_test: np.ndarray = None,
                      aggregation_functions: tuple = (np.mean, np.std), add_runtime: bool = True,
-                     add_n_clusters: bool = False, save_path: str = None, save_labels_path: str = None,
+                     add_n_clusters: bool = False, save_path: str | Path = None, save_labels_path: str | Path = None,
                      ignore_algorithms: tuple = (), dataset_name: str = None,
                      random_state: np.random.RandomState | int | list = None, quiet: bool = False) -> pd.DataFrame:
     """
@@ -143,10 +143,10 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr
         Add runtime of each execution to the final table (default: True)
     add_n_clusters : bool
         Add the resulting number of clusters to the final table (default: False)
-    save_path : str
-        The path where the final DataFrame should be saved as csv. If None, the DataFrame will not be saved (default: None)
-    save_labels_path : str
-        The path where the clustering labels should be saved as csv. If None, the labels will not be saved (default: None)
+    save_path : str | Path
+        The path where the final DataFrame should be saved. If None, the DataFrame will not be saved (default: None)
+    save_labels_path : str | Path
+        The path where the clustering labels should be saved. If None, the labels will not be saved (default: None)
     ignore_algorithms : tuple
         List of algorithm names (as specified in the EvaluationAlgorithm object) that should be ignored for this specific data set (default: [])
     dataset_name : str
@@ -195,14 +195,16 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr
         evaluation_algorithms = [evaluation_algorithms]
     if type(evaluation_metrics) is not list and evaluation_metrics is not None:
         evaluation_metrics = [evaluation_metrics]
-    if save_labels_path is not None and "." not in save_labels_path:
-        save_labels_path = save_labels_path + ".csv"
-    assert save_labels_path is None or len(
-        save_labels_path.split(".")) == 2, "save_labels_path must only contain a single dot. E.g., NAME.csv"
-    if save_path is not None and "." not in save_path:
-        save_path = save_path + ".csv"
-    assert save_path is None or len(
-        save_path.split(".")) == 2, "save_path must only contain a single dot. E.g., NAME.csv"
+    if save_labels_path is not None:
+        if isinstance(save_labels_path, str):
+            save_labels_path = Path(save_labels_path)
+        if save_labels_path.suffix == "":
+            save_labels_path = save_labels_path.with_suffix(".csv")
+    if save_path is not None:
+        if isinstance(save_path, str):
+            save_path = Path(save_path)
+        if save_path.suffix == "":
+            save_path = save_path.with_suffix(".csv")
     seeds = _get_fixed_seed_for_each_run(n_repetitions, random_state)
     algo_names = [a.name for a in evaluation_algorithms]
     assert max(
@@ -300,17 +302,14 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr
                 runtime = time.time() - start_time
                 # Optional: Save labels
                 if save_labels_path is not None:
-                    save_labels_path_algo = None if save_labels_path is None else "{0}_{1}_{2}.{3}".format(
-                        save_labels_path.split(".")[0], eval_algo.name, rep, save_labels_path.split(".")[1])
+                    save_labels_path_algo = save_labels_path.with_name("{0}_{1}_{2}{3}".format(
+                        save_labels_path.stem, eval_algo.name, rep, save_labels_path.suffix))
                     # Check if directory exists
-                    parent_directory = os.path.dirname(save_labels_path_algo)
-                    if parent_directory != "" and not os.path.isdir(parent_directory):
-                        os.makedirs(parent_directory)
+                    save_labels_path_algo.parent.mkdir(parents=True, exist_ok=True)
                     np.savetxt(save_labels_path_algo, algo_obj.labels_)
                     # Also save predict labels
                     if X_test is not None and labels_predicted_test is not None:
-                        save_labels_path_algo_test = "{0}_TEST.{1}".format(save_labels_path_algo.split(".")[0],
-                                                                           save_labels_path_algo.split(".")[1])
+                        save_labels_path_algo_test = save_labels_path.with_name("{0}_{1}_{2}_TEST{3}".format(save_labels_path.stem, eval_algo.name, rep, save_labels_path.suffix))
                         np.savetxt(save_labels_path_algo_test, labels_predicted_test)
                 # Get result of all metrics
                 if evaluation_metrics is not None:
@@ -386,17 +385,15 @@ def evaluate_dataset(X: np.ndarray, evaluation_algorithms: list, evaluation_metr
             print("-> Aggregation {0}: {1}".format(agg.__name__, aggregated_results))
     if save_path is not None:
         # Check if directory exists
-        parent_directory = os.path.dirname(save_path)
-        if parent_directory != "" and not os.path.isdir(parent_directory):
-            os.makedirs(parent_directory)
+        save_path.parent.mkdir(parents=True, exist_ok=True)
         df.to_csv(save_path)
     return df
 
 
 def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms: list, evaluation_metrics: list = None,
                                n_repetitions: int = 10, aggregation_functions: tuple = (np.mean, np.std),
-                               add_runtime: bool = True, add_n_clusters: bool = False, save_path: str = None,
-                               save_intermediate_results: bool = False, save_labels_path: str = None,
+                               add_runtime: bool = True, add_n_clusters: bool = False, save_path: str | Path = None,
+                               save_intermediate_results: bool = False, save_labels_path: str | Path = None,
                                random_state: np.random.RandomState | int | list = None, quiet: bool = False) -> pd.DataFrame:
     """
     Evaluate the clustering result of different clustering algorithms (as specified by evaluation_algorithms) on a set of data sets (as specified by evaluation_datasets) using different metrics (as specified by evaluation_metrics).
@@ -419,15 +416,15 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms:
         Add runtime of each execution to the final table (default: True)
     add_n_clusters : bool
         Add the resulting number of clusters to the final table (default: False)
-    save_path : str
-        The path where the final DataFrame should be saved as csv. If None, the DataFrame will not be saved (default: None)
+    save_path : str | Path
+        The path where the final DataFrame should be saved. If None, the DataFrame will not be saved (default: None)
     save_intermediate_results : bool
         Defines whether the result of each data set should be separately saved. 
         Useful if the evaluation takes a lot of time.
         The files will be saved as [save_path]_[DATASET_NAME]. 
         This implies that save_path has to be defined if save_intermediate_results is set to True (default: False)
-    save_labels_path : str
-        The path where the clustering labels should be saved as csv. If None, the labels will not be saved (default: None)
+    save_labels_path : str | Path
+        The path where the clustering labels should be saved. If None, the labels will not be saved (default: None)
     random_state : np.random.RandomState | int | list
         use a fixed random state to get a repeatable solution. Can also be of type int.
         Furthermore, if can be a list containing an int for each repetition (default: None)
@@ -477,14 +474,16 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms:
                                                                    "save_intermediate_results is True"
     if type(evaluation_datasets) is not list:
         evaluation_datasets = [evaluation_datasets]
-    if save_labels_path is not None and "." not in save_labels_path:
-        save_labels_path = save_labels_path + ".csv"
-    assert save_labels_path is None or len(
-        save_labels_path.split(".")) == 2, "save_labels_path must only contain a single dot. E.g., NAME.csv"
-    if save_path is not None and "." not in save_path:
-        save_path = save_path + ".csv"
-    assert save_path is None or len(
-        save_path.split(".")) == 2, "save_path must only contain a single dot. E.g., NAME.csv"
+    if save_labels_path is not None:
+        if isinstance(save_labels_path, str):
+            save_labels_path = Path(save_labels_path)
+        if save_labels_path.suffix == "":
+            save_labels_path = save_labels_path.with_suffix(".csv")
+    if save_path is not None:
+        if isinstance(save_path, str):
+            save_path = Path(save_path)
+        if save_path.suffix == "":
+            save_path = save_path.with_suffix(".csv")
     data_names = [d.name for d in evaluation_datasets]
     assert max(
         np.unique(data_names, return_counts=True)[1]) == 1, "Some names of your datasets do not seem to be unique!"
@@ -506,11 +505,10 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms:
                 X = _preprocess_dataset(X, eval_data.preprocess_methods, eval_data.preprocess_params)
                 if X_test is not None:
                     X_test = _preprocess_dataset(X_test, eval_data.preprocess_methods, eval_data.preprocess_params)
-            inner_save_path = None if not save_intermediate_results else "{0}_{1}.{2}".format(save_path.split(".")[0],
-                                                                                              eval_data.name,
-                                                                                              save_path.split(".")[1])
-            inner_save_labels_path = None if save_labels_path is None else "{0}_{1}.{2}".format(
-                save_labels_path.split(".")[0], eval_data.name, save_labels_path.split(".")[1])
+            inner_save_path = None if not save_intermediate_results else save_path.with_name("{0}_{1}{2}".format(
+                save_path.stem, eval_data.name, save_path.suffix))
+            inner_save_labels_path = None if save_labels_path is None else save_labels_path.with_name("{0}_{1}{2}".format(
+                save_labels_path.stem, eval_data.name, save_labels_path.suffix))
             df = evaluate_dataset(X, evaluation_algorithms, evaluation_metrics=evaluation_metrics,
                                   labels_true=labels_true,
                                   n_repetitions=n_repetitions, X_test=X_test, labels_true_test=labels_true_test,
@@ -527,9 +525,7 @@ def evaluate_multiple_datasets(evaluation_datasets: list, evaluation_algorithms:
     all_dfs = pd.concat(df_list, keys=data_names)
     if save_path is not None:
         # Check if directory exists
-        parent_directory = os.path.dirname(save_path)
-        if parent_directory != "" and not os.path.isdir(parent_directory):
-            os.makedirs(parent_directory)
+        save_path.parent.mkdir(parents=True, exist_ok=True)
         all_dfs.to_csv(save_path)
     return all_dfs
 
@@ -543,7 +539,7 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo
 
     Parameters
     ----------
-    data_input : np.ndarray
+    data_input : np.ndarray | str | Path | Callable
         The actual data set. Can be a np.ndarray, a path to a data file (of type str) or a callable (e.g. a method from clustpy.data)
     data_loader_params_input : dict
         Dictionary containing the information necessary to load data from a function or file. Only relevant if data is of type callable or str
@@ -567,7 +563,7 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo
     labels_true = None
     X_test = None
     labels_true_test = None
-    if type(data_input) is str:
+    if isinstance(data_input, (str, PurePath)):
         X = np.genfromtxt(data_input, **data_loader_params_input)
     elif type(data_input) is np.ndarray:
         X = data_input
@@ -607,7 +603,7 @@ def _get_data_and_labels_from_evaluation_dataset(data_input: np.ndarray, data_lo
     return X, labels_true, X_test, labels_true_test
 
 
-def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "mean", output_path: str = None, pm_row: str | None = "std", 
+def evaluation_df_to_latex_table(df: pd.DataFrame | str | Path, relevant_row : str = "mean", output_path: str | Path = None, pm_row: str | None = "std", 
                                  bracket_row: str | None = None, best_in_bold: bool = True, second_best_underlined: bool = True, 
                                  third_best_dashed_underlined: bool = False, color_by_value: str = None, higher_is_better: list = None, 
                                  multiplier: int | float | list | None = 100, decimal_places: int = 1, color_min_max: tuple = (5, 70)) -> str:
@@ -620,11 +616,11 @@ def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "m
 
     Parameters
     ----------
-    df : pd.DataFrame | str
-        The pandas dataframe. Can also be a string that contains the path to the saved dataframe
+    df : pd.DataFrame | str | Path
+        The pandas dataframe. Can also be a string/path that contains the path to the saved dataframe
     relevant_row : str
         The name of the row in the df that is used to create the latex table (default: "mean")
-    output_path : str
+    output_path : str | Path
         The path were the resulting latex table text file will be stored (default: None)
     pm_row : str
         The name of the row in the df that should be added to the latex table after the value from relevant_row separated by plus-minus (default: "std")
@@ -662,8 +658,8 @@ def evaluation_df_to_latex_table(df: pd.DataFrame | str, relevant_row : str = "m
         The created latex string
     """
     # Load dataframe
-    assert type(df) == pd.DataFrame or type(df) == str, "Type of df must be pandas DataFrame or string (path to file)"
-    if type(df) == str:
+    assert isinstance(df, (pd.DataFrame, str, PurePath)), "Type of df must be pandas DataFrame, Path or string (path to file)"
+    if isinstance(df, (str, PurePath)):
         df_file = open(df, "r").readlines()
         multiple_datasets = df_file[2].split(",")[0] != "0"
         df = pd.read_csv(df, index_col=[0, 1] if multiple_datasets else [0], header=[0, 1])
@@ -814,8 +810,8 @@ class EvaluationDataset():
     ----------
     name : str
         Name of the data set. Can be chosen freely
-    data : np.ndarray
-        The actual data set. Can be a np.ndarray, a path to a data file (of type str) or a callable (e.g. a method from clustpy.data)
+    data : np.ndarray | str | Path | Callable
+        The actual data set. Can be a np.ndarray, a path to a data file (of type str or Path) or a callable (e.g. a method from clustpy.data)
     labels_true : np.ndarray
         The ground truth labels. Can be a np.ndarray, an int or list specifying which columns of the data contain the labels or None if no ground truth labels are present.
         If data is a callable, the ground truth labels can also be obtained by that function and labels_true can be None (default: None)
@@ -846,13 +842,13 @@ class EvaluationDataset():
     >>> ed2 = EvaluationDataset(name="wine", data=X, labels_true=L)
     """
 
-    def __init__(self, name: str, data: np.ndarray, labels_true: np.ndarray = None, data_loader_params: dict = None,
+    def __init__(self, name: str, data: np.ndarray | str | Path | Callable, labels_true: np.ndarray = None, data_loader_params: dict = None,
                  train_test_split: bool = None, preprocess_methods: list = None, preprocess_params: list = None,
                  ignore_algorithms: tuple = ()):
         assert type(name) is str, "name must be a string"
         self.name = name
         assert "." not in name, "name must not contain a dot"
-        assert type(data) is np.ndarray or type(data) is str or callable(data), "data must be a numpy array, a string " \
+        assert isinstance(data, (np.ndarray, str, PurePath)) or callable(data), "data must be a numpy array, a string " \
                                                                                 "containing the path to a data file or a " \
                                                                                 "function returning a data and a labels array"
         self.data = data
diff --git a/clustpy/utils/tests/test_information_theory.py b/clustpy/utils/tests/test_information_theory.py
index 5132259..ff862fe 100644
--- a/clustpy/utils/tests/test_information_theory.py
+++ b/clustpy/utils/tests/test_information_theory.py
@@ -89,8 +89,10 @@ def test_bic_costs():
 
 
 def test_integer_costs():
-    costs = integer_costs(77)
+    costs = integer_costs(77, True)
     assert abs(costs - 12.328150766) < 1e-9
+    costs = integer_costs(77, False)
+    assert abs(costs - 7.24955913584) < 1e-9
 
 
 def test_mdl_costs_probability():