Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions clustpy/alternative/nrkmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -1292,7 +1292,7 @@ def _mdl_m_dependant_subspace_costs(X: np.ndarray, V: np.ndarray, cluster_index:
# ==== Costs of cluster space ====
cropped_V_cluster = V[:, P_cluster]
# Costs for cluster dimensionality
cluster_costs = mdl.integer_costs(m_cluster)
cluster_costs = mdl.integer_costs(m_cluster, use_log2=True)
# Costs for centers
cluster_costs += n_clusters[cluster_index] * _mdl_reference_vector(m_cluster, max_distance, precision)
# Costs for point encoding
Expand All @@ -1306,7 +1306,7 @@ def _mdl_m_dependant_subspace_costs(X: np.ndarray, V: np.ndarray, cluster_index:
# ==== Costs of noise space ====
cropped_V_noise = V[:, P_noise]
# Costs for noise dimensionality
noise_costs = mdl.integer_costs(m_noise)
noise_costs = mdl.integer_costs(m_noise, use_log2=True)
# Costs for centers
noise_costs += n_clusters[noise_index] * _mdl_reference_vector(m_noise, max_distance, precision)
# Costs for point encoding
Expand Down Expand Up @@ -1443,17 +1443,17 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray,
# Costs of matrix V
# global_costs += mdl.mdl_costs_orthogonal_matrix(n_points, mdl.mdl_costs_float_value(n_points))
# Costs of number of subspaces
global_costs += mdl.integer_costs(subspaces)
global_costs += mdl.integer_costs(subspaces, use_log2=True)
# Costs for each subspace
all_subspace_costs = []
for subspace in range(subspaces):
cropped_V = V[:, P[subspace]]
# Calculate costs
model_costs = 0
# Costs for dimensionality
model_costs += mdl.integer_costs(m[subspace])
model_costs += mdl.integer_costs(m[subspace], use_log2=True)
# Number of clusters in this subspace
model_costs += mdl.integer_costs(n_clusters[subspace])
model_costs += mdl.integer_costs(n_clusters[subspace], use_log2=True)
# Costs for cluster centers
model_costs += n_clusters[subspace] * \
_mdl_reference_vector(m[subspace], max_distance, precision)
Expand All @@ -1462,7 +1462,7 @@ def _mdl_costs(X: np.ndarray, n_clusters: list, m: list, P: list, V: np.ndarray,
if outliers:
# Encode number of outliers
n_outliers = len(labels[:, subspace][labels[:, subspace] == -1])
model_costs += mdl.integer_costs(n_outliers)
model_costs += mdl.integer_costs(n_outliers, use_log2=True)
# Encode coding costs of outliers
outlier_costs += n_outliers * np.log2(n_points)
outlier_costs += n_outliers * _mdl_costs_uniform_pdf(m[subspace], max_distance)
Expand Down
6 changes: 4 additions & 2 deletions clustpy/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .synthetic_data_creator import create_subspace_data, create_nr_data
from .real_world_data import load_newsgroups, load_iris, load_wine, load_breast_cancer, load_rcv1, load_imagenet_dog, \
load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb
load_imagenet10, load_coil20, load_coil100, load_olivetti_faces, load_webkb, load_bbcnews, load_bbcsport
from .real_uci_data import load_har, load_letterrecognition, load_optdigits, load_pendigits, load_banknotes, load_htru2, \
load_mice_protein, load_ecoli, load_spambase, load_seeds, load_statlog_shuttle, load_forest_types, \
load_breast_tissue, load_soybean_large, load_soybean_small, load_skin, load_user_knowledge, load_dermatology, \
Expand Down Expand Up @@ -105,4 +105,6 @@
'load_gene_expression_cancer_rna_seq',
'load_sport_articles',
'load_wholesale_customers',
'load_reuters21578']
'load_reuters21578',
'load_bbcsport',
'load_bbcnews']
93 changes: 55 additions & 38 deletions clustpy/data/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,24 @@
"[WARNING] Could not import PIL in clustpy.data.real_world_data. Please install PIL by 'pip install Pillow' if necessary")
import numpy as np
import os
from pathlib import Path
from pathlib import Path, PurePath
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.datasets import fetch_file
import subprocess


DEFAULT_DOWNLOAD_PATH = str(Path.home() / "Downloads/clustpy_datafiles")
DEFAULT_DOWNLOAD_PATH = Path.home() / "Downloads" / "clustpy_datafiles"


def _get_download_dir(downloads_path: str) -> str:
def _get_download_dir(downloads_path: str | Path) -> Path:
"""
Helper function to define the path where the data files should be stored. If downloads_path is None then default path
'[USER]/Downloads/clustpy_datafiles' will be used. If the directory does not exist it will be created.

Parameters
----------
downloads_path : str
downloads_path : str | Path
path to the directory where the data will be stored. Can be None

Returns
Expand All @@ -44,36 +45,40 @@ def _get_download_dir(downloads_path: str) -> str:
if env_data_path is None:
downloads_path = DEFAULT_DOWNLOAD_PATH
else:
downloads_path = env_data_path
if not os.path.isdir(downloads_path):
os.makedirs(downloads_path)
with open(downloads_path + "/info.txt", "w") as f:
downloads_path = Path(env_data_path)
elif isinstance(downloads_path, str):
# Cast str to Path
downloads_path = Path(downloads_path)
if not downloads_path.is_dir():
downloads_path.mkdir(parents=True, exist_ok=False)
with open(downloads_path / "info.txt", "w") as f:
f.write("This directory was created by the ClustPy python package to store real world data sets.\n"
"The default directory is '[USER]/Downloads/clustpy_datafiles' and can be changed with the "
"'downloads_path' parameter when loading a data set.\n"
"Alternatively, a global python environment variable for the path can be defined with os.environ['CLUSTPY_DATA'] = 'PATH'.")
return downloads_path


def _download_file(file_url: str, filename_local: str) -> None:
def _download_file(file_url: str, filename_local: str | Path) -> None:
"""
Helper function to download a file into a specified location.

Parameters
----------
file_url : str
URL of the file
filename_local : str
filename_local : str | Path
local name of the file after it has been downloaded
"""
local_path = Path(filename_local)
local_dir = local_path.parent
local_filename = local_path.name
if isinstance(filename_local, str):
filename_local = Path(filename_local)
local_dir = filename_local.parent
local_filename = filename_local.name
print("Downloading data set from {0} to {1}".format(file_url, filename_local))
fetch_file(file_url, folder=local_dir, local_filename=local_filename)


def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_size: int = 32768) -> None:
def _download_file_from_google_drive(file_id: str, filename_local: str | Path, chunk_size: int = 32768) -> None:
"""
Download a file from google drive.
Code taken from:
Expand All @@ -83,7 +88,7 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si
----------
file_id : str
ID of the file on google drive
filename_local : str
filename_local : str | Path
local name of the file after it has been downloaded
chunk_size : int
chunk size when downloading the file (default: 32768)
Expand All @@ -107,16 +112,16 @@ def _download_file_from_google_drive(file_id: str, filename_local: str, chunk_si
session.close()


def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> (
np.ndarray, np.ndarray):
def _load_data_file(filename_local: Path, file_url: str, delimiter: str = ",", last_column_are_labels: bool = True) -> tuple[
np.ndarray, np.ndarray]:
"""
Helper function to load a data file. Either the first or last column, depending on last_column_are_labels, of the
data file is used as the label column.
If file does not exist on the local machine it will be downloaded.

Parameters
----------
filename_local : str
filename_local : Path
local name of the file after it has been downloaded
file_url : str
URL of the file
Expand All @@ -127,10 +132,10 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la

Returns
-------
data, labels : (np.ndarray, np.ndarray)
data, labels : tuple[np.ndarray, np.ndarray]
the data numpy array, the labels numpy array
"""
if not os.path.isfile(filename_local):
if not filename_local.is_file():
_download_file(file_url, filename_local)
datafile = np.genfromtxt(filename_local, delimiter=delimiter)
if last_column_are_labels:
Expand All @@ -144,7 +149,7 @@ def _load_data_file(filename_local: str, file_url: str, delimiter: str = ",", la
return data, labels


def _decompress_z_file(filename: str, directory: str) -> bool:
def _decompress_z_file(filename: str | Path, directory: str | Path) -> bool:
"""
Helper function to decompress a 7z file. The function uses an installed version of 7zip to decompress the file.
If 7zip is not installed on this machine, the function will return False and a warning is printed.
Expand All @@ -161,22 +166,30 @@ def _decompress_z_file(filename: str, directory: str) -> bool:
successful : bool
True if decompression was successful, else False
"""
os.system("7z x {0} -o{1}".format(filename.replace("\\", "/"), directory.replace("\\", "/")))
successful = True
if not os.path.isfile(filename[:-2]):
if isinstance(filename, str):
filename = Path(filename)
if isinstance(directory, str):
directory = Path(directory)
cmd = ["7z", "x", filename.as_posix(), f"-o{directory.as_posix()}"]
try:
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except (subprocess.CalledProcessError, FileNotFoundError):
print("[WARNING] 7Zip extraction failed or 7z executable is missing!")
return False
if not filename.with_suffix('').is_file():
# If no file without .z exists, decompression was not successful
successful = False
print("[WARNING] 7Zip is needed to uncompress *.Z files!")
return successful
print("[WARNING] Decompression check failed: expected file not found.")
return False
return True


def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.ndarray:
def _load_image_data(image: str | Path | np.ndarray, image_size: tuple, color_image: bool) -> np.ndarray:
"""
Load image and convert it into a coherent size. Returns a numpy array containing the image data.

Parameters
----------
image : str
image : str | Path | np.ndarray
Path to the image. Can also be a numpy array containing the specific pixels
image_size : tuple
images of various sizes can be converted into a coherent size.
Expand All @@ -190,7 +203,7 @@ def _load_image_data(image: str, image_size: tuple, color_image: bool) -> np.nda
image_data : np.ndarray
The numpy array containing the image data
"""
if isinstance(image, str):
if isinstance(image, (str, PurePath)):
pil_image = Image.open(image)
else:
pil_image = Image.fromarray(np.uint8(image))
Expand Down Expand Up @@ -231,7 +244,7 @@ def build_analyzer(self):

def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool, use_stop_words: bool, max_df: float | int,
min_df: float | int, max_features: int, min_variance : float, sublinear_tf: bool,
data_all: np.ndarray = None) -> np.ndarray:
data_all: np.ndarray | None = None) -> tuple[np.ndarray, list[str]]:
"""
Transform a set of texts into a data matrix.
Result can be either a raw count matrix or the result of tf-idf.
Expand Down Expand Up @@ -261,13 +274,14 @@ def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool,
The default is to keep all features with non-zero variance, i.e. remove only the features that have the same value in all samples
sublinear_tf : bool
Apply sublinear term frequency scaling, i.e. replace tf with 1 + log(tf) (see sklearn TfidfTransformer)
data_all : np.ndarray
data_all : np.ndarray | None
The complete data set, i.e., if no subset is used. If it is None, it will be equal to data (default: None)

Returns
-------
data : np.ndarray
The resulting data array
tuple : tuple[np.ndarray, list[str]]
The resulting data array,
The vocabulary of the data output
"""
if data_all is None:
data_all = data
Expand All @@ -278,18 +292,21 @@ def _transform_text_data(data: np.ndarray, use_tfidf: bool, use_stemming: bool,
vectorizer = CountVectorizer(dtype=np.float64, stop_words="english" if use_stop_words else None, min_df=min_df, max_df=max_df, max_features=max_features)
data_sparse_all = vectorizer.fit_transform(data_all)
data_sparse = vectorizer.transform(data)
vocabulary = vectorizer.get_feature_names_out()
# (Optional) Check for variance threshold
if min_variance != 0:
selector = VarianceThreshold(min_variance)
data_sparse_all = selector.fit_transform(data_sparse_all)
data_sparse = selector.transform(data_sparse)
vocabulary_mask = selector._get_support_mask()
vocabulary = vocabulary[vocabulary_mask]
# (Optional) Apply tf-idf
if use_tfidf:
tfidf = TfidfTransformer(sublinear_tf=sublinear_tf)
tfidf.fit(data_sparse_all)
data_sparse = tfidf.transform(data_sparse)
data = np.asarray(data_sparse.todense())
return data
return data, vocabulary


def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
Expand All @@ -313,11 +330,11 @@ def flatten_images(data: np.ndarray, format: str) -> np.ndarray:
format_possibilities = ["HW", "HWD", "CHW", "CHWD", "HWC", "HWDC"]
assert format in format_possibilities, "Format must be within {0}".format(format_possibilities)
if format == "HW":
assert data.ndim == 3
assert data.ndim == 3, f"ndim has to be 3 but is {data.ndim}"
elif format in ["HWD", "CHW", "HWC"]:
assert data.ndim == 4
assert data.ndim == 4, f"ndim has to be 4 but is {data.ndim}"
elif format in ["CHWD", "HWDC"]:
assert data.ndim == 5
assert data.ndim == 5, f"ndim has to be 5 but is {data.ndim}"
# Flatten shape
if format != "HW" and format != "HWD":
if format == "CHW":
Expand Down
4 changes: 2 additions & 2 deletions clustpy/data/real_clustpy_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import os
from sklearn.datasets._base import Bunch
from clustpy.data._utils import unflatten_images
from pathlib import Path


def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray):
Expand All @@ -21,7 +21,7 @@ def _load_nr_data(file_name: str, n_labels: int) -> (np.ndarray, np.ndarray):
data, labels : (np.ndarray, np.ndarray)
the data numpy array, the labels numpy array
"""
path = os.path.dirname(__file__) + "/datasets/" + file_name
path = Path(__file__).parent / "datasets" / file_name
dataset = np.genfromtxt(path, delimiter=",")
data = dataset[:, n_labels:]
labels = dataset[:, :n_labels]
Expand Down
Loading
Loading