diff --git a/clustering/tabnet-classifier-tools/.bumpversion.cfg b/clustering/tabnet-classifier-tools/.bumpversion.cfg new file mode 100644 index 0000000..986af44 --- /dev/null +++ b/clustering/tabnet-classifier-tools/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] +[bumpversion:file:PytorchTabnet.cwl] +[bumpversion:file:ict.yaml] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/tabular/clustering/pytorch_tabnet/__init__.py] \ No newline at end of file diff --git a/clustering/tabnet-classifier-tools/CHANGELOG.md b/clustering/tabnet-classifier-tools/CHANGELOG.md new file mode 100644 index 0000000..425b725 --- /dev/null +++ b/clustering/tabnet-classifier-tools/CHANGELOG.md @@ -0,0 +1 @@ +# PyTorch TabNet tool(0.1.0-dev0) diff --git a/clustering/tabnet-classifier-tools/Dockerfile b/clustering/tabnet-classifier-tools/Dockerfile new file mode 100644 index 0000000..48dd58e --- /dev/null +++ b/clustering/tabnet-classifier-tools/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".arrow" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.clustering.pytorch_tabnet"] +CMD ["--help"] diff --git 
a/clustering/tabnet-classifier-tools/PytorchTabnet.cwl b/clustering/tabnet-classifier-tools/PytorchTabnet.cwl new file mode 100644 index 0000000..96f5fa5 --- /dev/null +++ b/clustering/tabnet-classifier-tools/PytorchTabnet.cwl @@ -0,0 +1,172 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + batchSize: + inputBinding: + prefix: --batchSize + type: string? + catEmbDim: + inputBinding: + prefix: --catEmbDim + type: string? + classifier: + inputBinding: + prefix: --classifier + type: string + clipValue: + inputBinding: + prefix: --clipValue + type: double? + computeImportance: + inputBinding: + prefix: --computeImportance + type: boolean? + deviceName: + inputBinding: + prefix: --deviceName + type: string + dropLast: + inputBinding: + prefix: --dropLast + type: boolean? + epsilon: + inputBinding: + prefix: --epsilon + type: double? + evalMetric: + inputBinding: + prefix: --evalMetric + type: string + filePattern: + inputBinding: + prefix: --filePattern + type: string? + gamma: + inputBinding: + prefix: --gamma + type: double? + groupedFeatures: + inputBinding: + prefix: --groupedFeatures + type: string? + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + lambdaSparse: + inputBinding: + prefix: --lambdaSparse + type: double? + lossFn: + inputBinding: + prefix: --lossFn + type: string + lr: + inputBinding: + prefix: --lr + type: double? + maskType: + inputBinding: + prefix: --maskType + type: string + maxEpochs: + inputBinding: + prefix: --maxEpochs + type: string? + momentum: + inputBinding: + prefix: --momentum + type: double? + nA: + inputBinding: + prefix: --nA + type: string? + nD: + inputBinding: + prefix: --nD + type: string? + nIndepDecoder: + inputBinding: + prefix: --nIndepDecoder + type: string? + nIndependent: + inputBinding: + prefix: --nIndependent + type: string? + nShared: + inputBinding: + prefix: --nShared + type: string? + nSharedDecoder: + inputBinding: + prefix: --nSharedDecoder + type: string? 
+ nSteps: + inputBinding: + prefix: --nSteps + type: string? + numWorkers: + inputBinding: + prefix: --numWorkers + type: string? + optimizerFn: + inputBinding: + prefix: --optimizerFn + type: string + outDir: + inputBinding: + prefix: --outDir + type: Directory + patience: + inputBinding: + prefix: --patience + type: string? + preview: + inputBinding: + prefix: --preview + type: boolean? + schedulerFn: + inputBinding: + prefix: --schedulerFn + type: string + seed: + inputBinding: + prefix: --seed + type: string? + stepSize: + inputBinding: + prefix: --stepSize + type: string? + targetVar: + inputBinding: + prefix: --targetVar + type: string + testSize: + inputBinding: + prefix: --testSize + type: double? + virtualBatchSize: + inputBinding: + prefix: --virtualBatchSize + type: string? + warmStart: + inputBinding: + prefix: --warmStart + type: boolean? + weights: + inputBinding: + prefix: --weights + type: string? +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/pytorch-tabnet-tool:0.1.0-dev0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/clustering/tabnet-classifier-tools/README.md b/clustering/tabnet-classifier-tools/README.md new file mode 100644 index 0000000..642cb8c --- /dev/null +++ b/clustering/tabnet-classifier-tools/README.md @@ -0,0 +1,73 @@ +# PyTorch TabNet tool(v0.1.0-dev0) + +This tool uses [tabnet](https://github.com/dreamquark-ai/tabnet/tree/develop), a deep learning model designed for tabular data structured in rows and columns. TabNet is suitable for classification, regression, and multi-task learning. + +## Inputs: + +### Input data: +The input tabular data that need to be trained. This plugin supports `.csv`, `.feather`and `.arrow` file formats + +### Details: + +PyTorch-TabNet can be employed for:. + +1. 
TabNetClassifier: For binary and multi-class classification problems +2. TabNetRegressor: For simple and multi-task regression problems +3. TabNetMultiTaskClassifier: For multi-task multi-class classification problems + + +## Building + +To build the Docker image for this plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. +For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). + +## Options + +This plugin takes 38 input arguments and one output argument: + +| Name | Description | I/O | Type | +| ---------------- | --------------------------------------------------------------------------- | ------ | ------------- | +| `--inpDir` | Input tabular data | Input | genericData | +| `--filePattern` | Pattern to parse tabular files | Input | string | +| `--testSize` | Proportion of the dataset to include in the test set | Input | number | +| `--nD` | Width of the decision prediction layer | Input | integer | +| `--nA` | Width of the attention embedding for each mask | Input | integer | +| `--nSteps` | Number of steps in the architecture | Input | integer | +| `--gamma` | Coefficient for feature reuse in the masks | Input | number | +| `--catEmbDim` | Embedding size for each categorical feature | Input | integer | +| `--nIndependent` | Number of independent Gated Linear Unit layers at each step | Input | integer | +| `--nShared` | Number of shared Gated Linear Unit layers at each step | Input | integer | +| `--epsilon` | Constant value | Input | number | +| `--seed` | Random seed for reproducibility | Input | integer | +| `--momentum` | Momentum for batch normalization | Input | number | +| `--clipValue` | Clipping of the gradient value | Input | number | +| `--lambdaSparse` | Extra sparsity loss coefficient | Input | number | +| `--optimizerFn` | Pytorch 
optimizer function | Input | enum | +| `--lr` | Learning rate for the optimizer | Input | number | +| `--schedulerFn` | Pytorch learning rate scheduler function | Input | enum | +| `--stepSize` | Parameter to apply to the scheduler_fn | Input | integer | +| `--deviceName` | Platform used for training | Input | enum | +| `--maskType` | A masking function for feature selection | Input | enum | +| `--groupedFeatures` | Allow the model to share attention across features within the same group | Input | integer | +| `--nSharedDecoder` | Number of shared GLU blocks in decoder | Input | integer | +| `--nIndepDecoder` | Number of independent GLU blocks in decoder | Input | integer | +| `--evalMetric` | Metrics utilized for early stopping evaluation | Input | enum | +| `--maxEpochs` | Maximum number of epochs for training | Input | integer | +| `--patience` | Consecutive epochs without improvement before early stopping | Input | integer | +| `--weights` | Sampling parameter only for TabNetClassifier | Input | integer | +| `--lossFn` | Loss function | Input | enum | +| `--batchSize` | Batch size | Input | integer | +| `--virtualBatchSize` | Size of mini-batches for Ghost Batch Normalization | Input | integer | +| `--numWorkers` | Number of workers used in torch.utils.data.DataLoader | Input | integer | +| `--dropLast` | Option to drop incomplete last batch during training | Input | boolean | +| `--warmStart` | For scikit-learn compatibility, enabling fitting the same model twice | Input | boolean | +| `--targetVar` | Target feature containing classification labels | Input | string | +| `--computeImportance` | Compute feature importance | Input | boolean | +| `--classifier` | Pytorch tabnet Classifier for training | Input | enum | +| `--preview` | Generate JSON file of sample outputs | Input | boolean | +| `--outDir` | Output collection | Output | genericData | diff --git a/clustering/tabnet-classifier-tools/VERSION b/clustering/tabnet-classifier-tools/VERSION new file mode 
100644 index 0000000..206c085 --- /dev/null +++ b/clustering/tabnet-classifier-tools/VERSION @@ -0,0 +1 @@ +0.1.0-dev0 diff --git a/clustering/tabnet-classifier-tools/build-docker.sh b/clustering/tabnet-classifier-tools/build-docker.sh new file mode 100644 index 0000000..eff76ca --- /dev/null +++ b/clustering/tabnet-classifier-tools/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(" +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +filepattern = "^2.0.4" +typer = "^0.12.3" +vaex = "^4.7.0" +torch = "2.2.2" +pytorch-tabnet = "^4.1.0" +scikit-learn = "^1.5.0" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.0.4" +black = "^23.1.0" +flake8 = "^6.0.0" +mypy = "^1.0.0" +pytest = "^7.2.1" +ipykernel = "^6.29.4" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/clustering/tabnet-classifier-tools/run-plugin.sh b/clustering/tabnet-classifier-tools/run-plugin.sh new file mode 100644 index 0000000..3f9e094 --- /dev/null +++ b/clustering/tabnet-classifier-tools/run-plugin.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +version=$( None: + """Tool for training tabular data using PyTorch TabNet.""" + logger.info(f"--inpDir = {inp_dir}") + logger.info(f"--filePattern = {file_pattern}") + logger.info(f"--testSize = {test_size}") + logger.info(f"--nD = {n_d}") + logger.info(f"--nA = {n_a}") + logger.info(f"--nSteps = {n_steps}") + logger.info(f"--gamma = {gamma}") + logger.info(f"--catEmbDim = {cat_emb_dim}") + logger.info(f"--nIndependent = {n_independent}") + logger.info(f"--nShared = {n_shared}") + logger.info(f"--epsilon = {epsilon}") + logger.info(f"--seed = {seed}") + logger.info(f"--momentum = {momentum}") + logger.info(f"--clipValue = {clip_value}") + logger.info(f"--lambdaSparse = {lambda_sparse}") + logger.info(f"--optimizerFn = {optimizer_fn}") + logger.info(f"--lr = {lr}") + logger.info(f"--schedulerFn = 
{scheduler_fn}") + logger.info(f"--stepSize = {step_size}") + logger.info(f"--deviceName = {device_name}") + logger.info(f"--maskType = {mask_type}") + logger.info(f"--groupedFeatures = {grouped_features}") + logger.info(f"--nSharedDecoder = {n_shared_decoder}") + logger.info(f"--nIndepDecode = {n_indep_decoder}") + logger.info(f"--evalMetric = {eval_metric}") + logger.info(f"--maxEpochs = {max_epochs}") + logger.info(f"--patience = {patience}") + logger.info(f"--weights = {weights}") + logger.info(f"--lossFn = {loss_fn}") + logger.info(f"--batch_size = {batch_size}") + logger.info(f"--virtualBatchSize = {virtual_batch_size}") + logger.info(f"--numWorkers = {num_workers}") + logger.info(f"--dropLast = {drop_last}") + logger.info(f"--warm_start = {warm_start}") + logger.info(f"--computeImportance = {compute_importance}") + logger.info(f"--targetVar = {target_var}") + logger.info(f"--classifier = {classifier}") + logger.info(f"--outDir = {out_dir}") + + if not Path(inp_dir).exists(): + msg = f"The input directory {Path(inp_dir).stem} does not exist." + raise FileNotFoundError(msg) + + if not Path(out_dir).exists(): + Path(out_dir).mkdir(exist_ok=False, parents=True) + msg = f"The output directory {out_dir} created." 
+ logger.info(msg) + + params = { + "test_size": test_size, + "n_d": n_d, + "n_a": n_a, + "n_steps": n_steps, + "gamma": gamma, + "cat_emb_dim": cat_emb_dim, + "n_independent": n_independent, + "n_shared": n_shared, + "epsilon": epsilon, + "seed": seed, + "momentum": momentum, + "clip_value": clip_value, + "lambda_sparse": lambda_sparse, + "optimizer_fn": ut.Map_OptimizersFn[optimizer_fn], + "optimizer_params": {"lr": lr}, + "scheduler_fn": ut.Map_SchedulerFn[scheduler_fn], + "scheduler_params": {"step_size": step_size, "gamma": 0.95}, + "device_name": device_name.value, + "mask_type": mask_type.value, + "grouped_features": grouped_features, + "n_shared_decoder": n_shared_decoder, + "n_indep_decoder": n_indep_decoder, + "eval_metric": eval_metric.value, + "max_epochs": max_epochs, + "patience": patience, + "weights": weights, + "loss_fn": loss_fn.value, + "batch_size": batch_size, + "virtual_batch_size": virtual_batch_size, + "num_workers": num_workers, + "drop_last": drop_last, + "warm_start": warm_start, + "compute_importance": compute_importance, + } + + fps = fp.FilePattern(inp_dir, file_pattern) + + flist = [f[1][0] for f in fps()] + + if len(flist) == 0: + msg = f"No files found with pattern: {file_pattern}." 
class PytorchTabnet(ut.TabnetParameters):
    """Train a PyTorch TabNet model on one tabular file and save the results.

    The TabNet hyper-parameters are inherited from ``ut.TabnetParameters``.

    Args:
        file_path: Path to the tabular data used for training.
        target_var: Dataset feature with classification labels.
        classifier: Select TabNetClassifier,TabNetMultiTaskClassifier,TabNetRegressor.
        out_dir: Path to the output directory.
    """

    file_path: Path
    target_var: str
    classifier: ut.Classifier
    out_dir: Path

    @property
    def convert_vaex_dataframe(self) -> vaex.dataframe.DataFrame:
        """Load ``file_path`` into a vaex dataframe.

        ``.csv`` files are read with ``vaex.read_csv``; ``.arrow``, ``.feather``
        and ``.parquet`` files are opened with ``vaex.open``.

        Raises:
            ValueError: If the file extension is not one of the supported ones.
        """
        file_path = Path(self.file_path)
        if file_path.name.endswith(".csv"):
            return vaex.read_csv(
                file_path,
                convert=True,
                chunk_size=5_000_000,
            )
        if file_path.name.endswith((".arrow", ".feather", ".parquet")):
            return vaex.open(file_path)
        # Fail here with a clear message instead of returning None and
        # crashing later on the first attribute access.
        msg = f"Unsupported file format: {file_path.name}"
        raise ValueError(msg)

    @property
    def get_data(self) -> ut.MyTupleType:
        """Encode the data and split it into train, test and validation sets.

        Columns that are strings or have fewer than 200 distinct values are
        label-encoded and treated as categorical; all other columns have their
        missing values replaced by the column mean.

        Returns:
            ``(x_train, y_train, x_test, y_test, x_val, y_val, cat_idxs,
            cat_dims, features)`` where ``cat_idxs`` are positions of the
            categorical columns inside ``features`` (and therefore inside the
            ``x_*`` matrices) and ``cat_dims`` their number of classes.

        Raises:
            ValueError: If the dataframe is empty or the target is missing.
        """
        data = self.convert_vaex_dataframe

        # shape is (rows, columns); either being zero means nothing to train on.
        if any(dim == 0 for dim in data.shape):
            msg = "Vaex dataframe is empty"
            raise ValueError(msg)

        if self.target_var not in list(data.columns):
            msg = f"{self.target_var} does not exist!!"
            raise ValueError(msg)

        max_categories = 200
        cat_dims_by_column: dict[str, int] = {}
        for col in list(data.columns):
            is_categorical = (
                data[col].dtype == "string"
                or len(data[col].unique()) < max_categories
            )
            if is_categorical:
                l_enc = LabelEncoder()
                data[col] = data[col].fillna("fillna")
                data[col] = l_enc.fit_transform(data[col].values)
                if col != self.target_var:
                    cat_dims_by_column[col] = len(l_enc.classes_)
            else:
                # Replace NA values with the column mean.
                column_mean = data[col].mean()
                data[col] = data[col].fillna(column_mean)

        features = [
            feature for feature in data.get_column_names() if feature != self.target_var
        ]

        # Indices must refer to positions in ``features`` because the target
        # column is not part of the feature matrix handed to TabNet.
        cat_idxs = [
            idx for idx, feature in enumerate(features) if feature in cat_dims_by_column
        ]
        cat_dims = [cat_dims_by_column[features[idx]] for idx in cat_idxs]

        if not cat_idxs:
            logger.info("Categorical features are not dectected")

        x = np.array(data[features])
        if self.classifier.value in ["TabNetRegressor", "TabNetMultiTaskClassifier"]:
            y = data[self.target_var].to_numpy().reshape(-1, 1)
        else:
            y = data[self.target_var].to_numpy()

        # Stratified splitting only makes sense for class labels; a continuous
        # regression target would make scikit-learn raise.
        is_regression = self.classifier.value == "TabNetRegressor"

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=self.test_size,
            random_state=42,
            stratify=None if is_regression else y,
        )
        x_train, x_val, y_train, y_val = train_test_split(
            x_train,
            y_train,
            test_size=self.test_size,
            random_state=42,
            stratify=None if is_regression else y_train,
        )

        return (
            x_train,
            y_train,
            x_test,
            y_test,
            x_val,
            y_val,
            cat_idxs,
            cat_dims,
            features,
        )

    @staticmethod
    def parameters(params: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
        """Split the input parameters into model and fit/evaluation parameters.

        Entries whose value is ``None`` and the bookkeeping keys (paths,
        classifier, test size, target) are dropped from both dictionaries.

        Returns:
            ``(model_params, eval_params)``.
        """
        exclude_params = [
            "out_dir",
            "file_path",
            "classifier",
            "test_size",
            "target_var",
        ]

        evalparams = [
            "eval_metric",
            "max_epochs",
            "patience",
            "weights",
            "loss_fn",
            "batch_size",
            "virtual_batch_size",
            "num_workers",
            "drop_last",
            "warm_start",
            "compute_importance",
        ]

        params = {
            k: v
            for k, v in params.items()
            if k not in exclude_params and v is not None
        }

        model_params = {k: v for k, v in params.items() if k not in evalparams}
        eval_params = {k: v for k, v in params.items() if k in evalparams}

        return model_params, eval_params

    def fit_model(self, params: dict[str, Any]) -> None:
        """Train a PyTorch TabNet model and write it to ``out_dir``.

        The trained model is saved as ``tabnet_<input file stem>``; when feature
        importances were computed they are written, sorted in descending
        order, to ``feature_importances.json``.

        Args:
            params: All model parameters, typically ``dict(self)``.

        Raises:
            ValueError: If ``eval_metric`` is not valid for the chosen classifier.
        """
        (
            x_train,
            y_train,
            _,
            _,
            x_val,
            y_val,
            cat_idxs,
            cat_dims,
            features,
        ) = self.get_data

        model_params, eval_params = self.parameters(params)

        model_params["cat_idxs"] = cat_idxs
        model_params["cat_dims"] = cat_dims

        eval_metric = eval_params["eval_metric"]

        estimators = {
            "TabNetClassifier": (TabNetClassifier, ut.BINARY_EVAL_METRIC),
            "TabNetMultiTaskClassifier": (
                TabNetMultiTaskClassifier,
                ut.MULTICLASS_EVAL_METRIC,
            ),
            "TabNetRegressor": (TabNetRegressor, ut.REGRESSION_EVAL_METRIC),
        }
        estimator_cls, valid_metrics = estimators[self.classifier.value]

        # Validate before building the network so a bad metric fails fast.
        if eval_metric not in valid_metrics:
            msg = f"Invalid eval_metric: {eval_metric} for {self.classifier.value}"
            raise ValueError(msg)

        model = estimator_cls(**model_params)

        # The training and validation features are handed to fit() as
        # Compressed Sparse Row matrices.
        sparse_x_train = csr_matrix(x_train)
        sparse_x_val = csr_matrix(x_val)

        # NOTE(review): eval_params["loss_fn"] is collected but never forwarded
        # to fit(); confirm whether that is intentional.
        model.fit(
            X_train=sparse_x_train,
            y_train=y_train,
            eval_set=[(x_train, y_train), (sparse_x_val, y_val)],
            eval_name=["train", "valid"],
            eval_metric=[eval_metric],
            max_epochs=eval_params["max_epochs"],
            patience=eval_params["patience"],
            weights=eval_params["weights"],
            batch_size=eval_params["batch_size"],
            virtual_batch_size=eval_params["virtual_batch_size"],
            num_workers=eval_params["num_workers"],
            drop_last=eval_params["drop_last"],
            warm_start=eval_params["warm_start"],
            compute_importance=eval_params["compute_importance"],
        )

        # save tabnet model
        model_name = f"tabnet_{Path(self.file_path.name).stem}"
        model_path = self.out_dir.joinpath(model_name)
        logger.info("Saving of trained model")
        model.save_model(model_path)

        # The attribute is only expected when importances were requested.
        importances = getattr(model, "feature_importances_", None)
        if importances is None:
            logger.info("Feature importances were not computed")
            return

        # Cast to builtin float so json can serialise NumPy scalars.
        imp_features = [round(float(value), 4) for value in importances]
        sorted_feature_importance_pairs = dict(
            sorted(zip(features, imp_features), key=lambda x: x[1], reverse=True),
        )

        # NOTE(review): the file name is fixed, so a second input file written
        # to the same out_dir overwrites the first one's importances.
        save_feat_path = self.out_dir.joinpath("feature_importances.json")
        with save_feat_path.open("w") as jf:
            logger.info("Save feature importances")
            json.dump(sorted_feature_importance_pairs, jf, indent=4)
class OptimizersFn(str, Enum):
    """Names of the ``torch.optim`` optimizers that can be selected.

    ``Default`` has the same value as ``Adam`` and is therefore an Enum alias
    of ``Adam`` rather than a separate member.
    """

    Adadelta = "Adadelta"
    Adagrad = "Adagrad"
    Adam = "Adam"
    AdamW = "AdamW"
    SparseAdam = "SparseAdam"
    Adamax = "Adamax"
    ASGD = "ASGD"
    LBFGS = "LBFGS"
    NAdam = "NAdam"
    RAdam = "RAdam"
    RMSprop = "RMSprop"
    Rprop = "Rprop"
    SGD = "SGD"
    Default = "Adam"


# Lookup from an ``OptimizersFn`` member to the torch optimizer class.
# Each key is listed exactly once; ``OptimizersFn.Default`` is an alias of
# ``OptimizersFn.Adam`` and resolves through the ``Adam`` entry.
Map_OptimizersFn = {
    OptimizersFn.Adadelta: torch.optim.Adadelta,
    OptimizersFn.Adagrad: torch.optim.Adagrad,
    OptimizersFn.Adam: torch.optim.Adam,
    OptimizersFn.AdamW: torch.optim.AdamW,
    OptimizersFn.SparseAdam: torch.optim.SparseAdam,
    OptimizersFn.Adamax: torch.optim.Adamax,
    OptimizersFn.ASGD: torch.optim.ASGD,
    OptimizersFn.LBFGS: torch.optim.LBFGS,
    OptimizersFn.NAdam: torch.optim.NAdam,
    OptimizersFn.RAdam: torch.optim.RAdam,
    OptimizersFn.RMSprop: torch.optim.RMSprop,
    OptimizersFn.Rprop: torch.optim.Rprop,
    OptimizersFn.SGD: torch.optim.SGD,
}
class TabnetParameters(BaseModel):
    """Validated hyper-parameters for a PyTorch TabNet run.

    Fields with ``ge``/``le`` bounds are range-checked by pydantic; the
    remaining fields only carry a default value.
    """

    # Proportion of the data held out when splitting.
    test_size: float = Field(0.2, ge=0.1, le=0.4)

    # Network architecture.
    n_d: int = Field(8, ge=8, le=64)  # width of the decision prediction layer
    n_a: int = Field(8, ge=8, le=64)  # width of the attention embedding
    n_steps: int = Field(3, ge=3, le=10)  # number of steps in the architecture
    gamma: float = Field(1.3, ge=1.0, le=2.0)  # feature reuse coefficient
    cat_emb_dim: int = Field(1)  # embedding size for categorical features
    n_independent: int = Field(2, ge=1, le=5)  # independent GLU layers per step
    n_shared: int = Field(2, ge=1, le=5)  # shared GLU layers per step
    epsilon: float = Field(1e-15)
    seed: int = Field(0)  # random seed
    momentum: float = Field(0.02, ge=0.01, le=0.4)  # batch-norm momentum
    clip_value: Optional[float] = Field(None)  # gradient clipping value
    lambda_sparse: float = Field(1e-3)  # extra sparsity loss coefficient

    # Optimizer and learning-rate scheduler, with their keyword arguments.
    optimizer_fn: Any = Field(torch.optim.Adam)
    optimizer_params: dict = Field({"lr": 0.02})
    scheduler_fn: Any = Field(torch.optim.lr_scheduler.StepLR)
    scheduler_params: dict = Field({"step_size": 10, "gamma": 0.95})

    device_name: str = Field("auto")  # platform used for training
    mask_type: str = Field("entmax")  # masking function for feature selection
    grouped_features: Optional[list[int]] = Field(None)
    n_shared_decoder: int = Field(1)  # shared GLU blocks in the decoder
    n_indep_decoder: int = Field(1)  # independent GLU blocks in the decoder

    # Training / evaluation settings.
    eval_metric: str = Field("auc")
    max_epochs: int = Field(200)
    patience: int = Field(10)  # epochs without improvement before stopping
    weights: int = Field(0)  # sampling parameter
    loss_fn: str = Field("MSELoss")
    batch_size: int = Field(1024)
    virtual_batch_size: int = Field(128)  # ghost batch normalization size
    num_workers: int = Field(0)
    drop_last: bool = Field(False)  # drop the incomplete last batch
    warm_start: bool = Field(False)
    compute_importance: bool = Field(True)
@pytest.fixture()
def create_dataset() -> Union[str, Path]:
    """Create a temporary input directory holding a synthetic dataset.

    A 5000-row table modelled on the "adult" census data is generated and
    written as ``adult.csv``, ``adult.arrow`` (feather) and ``adult.parquet``.

    Returns:
        Path to the directory that contains the three files.
    """
    size = 5000

    inp_dir = Path(tempfile.mkdtemp(dir=Path.cwd()))

    # Fixed seed so every test run trains on the same data.
    rng = np.random.default_rng(seed=42)

    workclass = [
        "Private",
        "Local-gov",
        "Self-emp-not-inc",
        "Federal-gov",
        "State-gov",
        "Self-emp-inc",
        "Without-pay",
        "Never-worked",
    ]

    education = [
        "11th",
        "HS-grad",
        "Assoc-acdm",
        "Some-college",
        "10th",
        "Prof-school",
        "7th-8th",
        "Bachelors",
        "Masters",
        "Doctorate",
        "5th-6th",
        "Assoc-voc",
        "9th",
        "12th",
        "1st-4th",
        "Preschool",
    ]

    marital_status = [
        "Never-married",
        "Married-civ-spouse",
        "Widowed",
        "Divorced",
        "Separated",
        "Married-spouse-absent",
        "Married-AF-spouse",
    ]

    occupation = [
        "Machine-op-inspct",
        "Farming-fishing",
        "Protective-serv",
        "?",
        "Other-service",
        "Prof-specialty",
        "Craft-repair",
        "Adm-clerical",
        "Exec-managerial",
        "Tech-support",
        "Sales",
        "Priv-house-serv",
        "Transport-moving",
        "Handlers-cleaners",
        "Armed-Forces",
    ]

    relationship = [
        "Own-child",
        "Husband",
        "Not-in-family",
        "Unmarried",
        "Wife",
        "Other-relative",
    ]

    race = ["Black", "White", "Asian-Pac-Islander", "Other", "Amer-Indian-Eskimo"]

    gender = ["Male", "Female"]

    income = ["<=50K", ">50K"]

    countries = [
        "United-States",
        "Peru",
        "Guatemala",
        "Mexico",
        "Dominican-Republic",
        "Ireland",
        "Germany",
        "Philippines",
        "Thailand",
        "Haiti",
        "El-Salvador",
        "Puerto-Rico",
        "Vietnam",
        "South",
        "Columbia",
        "Japan",
        "India",
        "Cambodia",
        "Poland",
        "Laos",
        "England",
        "Cuba",
        "Taiwan",
        "Italy",
        "Canada",
        "Portugal",
        "China",
        "Nicaragua",
        "Honduras",
        "Iran",
        "Scotland",
        "Jamaica",
        "Ecuador",
        "Yugoslavia",
        "Hungary",
        "Hong",
        "Greece",
        "Trinadad&Tobago",
        "Outlying-US(Guam-USVI-etc)",
        "France",
        "Holand-Netherlands",
    ]

    diction_1 = {
        "age": rng.integers(low=15, high=80, size=size),
        "workclass": rng.choice(workclass, size),
        "fnlwgt": rng.integers(low=12285, high=1490400, size=size),
        "education": rng.choice(education, size),
        "educational-num": rng.integers(low=1, high=16, size=size),
        "marital-status": rng.choice(marital_status, size),
        "occupation": rng.choice(occupation, size),
        "relationship": rng.choice(relationship, size),
        "race": rng.choice(race, size),
        "gender": rng.choice(gender, size),
        "capital-gain": rng.integers(low=0, high=99999, size=size),
        "capital-loss": rng.integers(low=0, high=4356, size=size),
        "hours-per-week": rng.integers(low=1, high=99, size=size),
        "native-country": rng.choice(countries, size),
        "income": rng.choice(income, size),
    }

    data = pd.DataFrame(diction_1)

    # Same table in every format the plugin is expected to read.
    data.to_csv(Path(inp_dir, "adult.csv"), index=False)
    data.to_feather(Path(inp_dir, "adult.arrow"))
    data.to_parquet(Path(inp_dir, "adult.parquet"))

    return inp_dir
+"""Test Command line Tool."""
+
+from typer.testing import CliRunner
+from pathlib import Path
+import pytest
+from polus.tabular.clustering.pytorch_tabnet.__main__ import app
+from .conftest import clean_directories
+from typing import Union
+
+
+def test_cli(
+    output_directory: Path,
+    create_dataset: Union[str, Path],
+    get_params: pytest.FixtureRequest,
+) -> None:
+    """Run the CLI on the generated csv data and expect a zero exit code."""
+
+    inp_dir = create_dataset
+
+    runner = CliRunner()
+
+    test_size, optimizer_fn, scheduler_fn, eval_metric, loss_fn, classifier = get_params
+
+    result = runner.invoke(
+        app,
+        [
+            "--inpDir",
+            str(inp_dir),  # CliRunner.invoke is typed to take string arguments
+            "--filePattern",
+            ".*.csv",
+            "--testSize",
+            str(test_size),
+            "--optimizerFn",
+            optimizer_fn,
+            "--evalMetric",
+            eval_metric,
+            "--schedulerFn",
+            scheduler_fn,
+            "--lossFn",
+            loss_fn,
+            "--targetVar",
+            "income",
+            "--classifier",
+            classifier,
+            "--outDir",
+            str(output_directory),
+        ],
+    )
+    assert result.exit_code == 0
+    clean_directories()
diff --git a/clustering/tabnet-classifier-tools/tests/test_tabnet.py b/clustering/tabnet-classifier-tools/tests/test_tabnet.py
new file mode 100644
index 0000000..0b4cf45
--- /dev/null
+++ b/clustering/tabnet-classifier-tools/tests/test_tabnet.py
@@ -0,0 +1,154 @@
+"""Pytorch TabNet tool."""
+
+import filepattern as fp
+import pytest
+import torch
+import polus.tabular.clustering.pytorch_tabnet.tabnet as tb
+from .conftest import clean_directories
+from pathlib import Path
+from typing import Union
+import numpy as np
+
+
+# @pytest.mark.skipif("not config.getoption('slow')")
+def test_convert_vaex_dataframe(
+    output_directory: Path,
+    create_dataset: Union[str, Path],
+    get_params: pytest.FixtureRequest,
+) -> None:
+    """Check that csv, arrow and parquet inputs load as a (5000, 15) dataframe."""
+
+    inp_dir = create_dataset
+    test_size, optimizer_fn, scheduler_fn, eval_metric, loss_fn, classifier = get_params
+
+    params = {
+        "test_size": test_size,
+        "n_d": 8,
+        "n_a": 8,
+        "seed": 0,
+        "optimizer_fn": optimizer_fn,
+        "optimizer_params": {"lr": 0.001},
+        "scheduler_fn": scheduler_fn,
+        "device_name": "cpu",
+        "eval_metric": eval_metric,
+        "max_epochs": 10,
+        "loss_fn": loss_fn,
+    }
+
+    patterns = [".*.csv", ".*.arrow", ".*.parquet"]
+
+    for pat in patterns:
+        fps = fp.FilePattern(inp_dir, pat)
+
+        for f in fps:
+            model = tb.PytorchTabnet(
+                **params,
+                file_path=f[1][0],
+                target_var="income",
+                classifier=classifier,
+                out_dir=output_directory,
+            )
+            df = model.convert_vaex_dataframe
+            # Guard against None before reading ``shape``.
+            assert df is not None
+            assert df.shape == (5000, 15)
+
+    clean_directories()
+
+
+def test_get_data(
+    output_directory: Path,
+    create_dataset: Union[str, Path],
+    get_params: pytest.FixtureRequest,
+) -> None:
+    """Check the types of the splits and metadata returned by ``get_data``."""
+
+    inp_dir = create_dataset
+    test_size, optimizer_fn, scheduler_fn, eval_metric, loss_fn, classifier = get_params
+
+    params = {
+        "test_size": test_size,
+        "n_d": 8,
+        "n_a": 8,
+        "seed": 0,
+        "optimizer_fn": optimizer_fn,
+        "optimizer_params": {"lr": 0.001},
+        "scheduler_fn": scheduler_fn,
+        "device_name": "cpu",
+        "eval_metric": eval_metric,
+        "max_epochs": 10,
+        "loss_fn": loss_fn,
+    }
+    fps = fp.FilePattern(inp_dir, ".*.csv")
+    file = [f[1][0] for f in fps()][0]
+
+    model = tb.PytorchTabnet(
+        **params,
+        file_path=file,
+        target_var="income",
+        classifier=classifier,
+        out_dir=output_directory,
+    )
+
+    (
+        X_train,
+        y_train,
+        X_test,
+        y_test,
+        X_val,
+        y_val,
+        cat_idxs,
+        cat_dims,
+        features,
+    ) = model.get_data
+
+    assert all(
+        isinstance(arr, np.ndarray)
+        for arr in [X_train, y_train, X_test, y_test, X_val, y_val]
+    )
+    assert all(isinstance(i, list) for i in [cat_idxs, cat_dims, features])
+    # Remove the fixture directories, as the other tests in this module do.
+    clean_directories()
+
+
+def test_fit_model(
+    output_directory: Path,
+    create_dataset: Union[str, Path],
+    get_params: pytest.FixtureRequest,
+) -> None:
+    """Fit a model and check that a .zip or .json file is written to the output."""
+
+    inp_dir = create_dataset
+    test_size, _, _, eval_metric, loss_fn, classifier = get_params
+
+    params = {
+        "test_size": test_size,
+        "n_d": 8,
+        "n_a": 8,
+        "seed": 0,
+        "optimizer_fn": torch.optim.Adam,
+        "scheduler_fn": torch.optim.lr_scheduler.StepLR,
+        "device_name": "cpu",
+        "eval_metric": eval_metric,
+        "max_epochs": 10,
+        "loss_fn": loss_fn,
+    }
+    fps = fp.FilePattern(inp_dir, ".*.csv")
+    file = [f[1][0] for f in fps()][0]
+
+    model = tb.PytorchTabnet(
+        **params,
+        file_path=file,
+        target_var="income",
+        classifier=classifier,
+        out_dir=output_directory,
+    )
+    mod_params = dict(model)
+    model.fit_model(params=mod_params)
+
+    files = [
+        f for f in Path(output_directory).iterdir() if f.suffix in [".zip", ".json"]
+    ]
+
+    assert len(files) != 0
+    clean_directories()