elm_implementations/data_utils.py at main · ppedin/elm_implementations · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import random
import numpy as np
import pickle
import pandas as pd
from ucimlrepo import fetch_ucirepo

"""
The following are lists of dataset ids that are used to import datasets from the UCI repository.
These lists are used to import datasets with a specific number of features, features of a specific type, etc.
"""
# NOTE: the string above is not the first statement of the module, so Python does not
# treat it as the module docstring; it is a plain (unused) string expression.
#
# Ids grouped by number of features (low / medium / high, going by the variable names;
# the thresholds separating the groups are not defined in this file).
ids_low_number_of_features_datasets = [186, 19, 544, 165, 42, 545, 14, 159, 484, 101, 936, 29]
ids_medium_number_of_features_datasets = [17, 320, 73, 856, 63, 151, 342, 419, 90, 857, 22, 69]
ids_high_number_of_features_datasets = [183, 579, 211, 54, 75, 74]
# Ids grouped by feature type (numerical / categorical, going by the variable names).
# An id may appear in more than one list (e.g. 54, 342, 419, 19, 73).
ids_numerical_datasets = [92, 54, 257, 149, 342, 419, 247, 161, 291, 488, 379]
ids_categorical_datasets = [19, 73, 14, 936, 101, 76, 12, 90, 44, 70, 13, 58]


class Dataloader():
    """
    Minimal batch iterator inspired by the PyTorch dataloader.

    It wraps a 2-D numpy array whose last ``num_of_targets`` columns are the targets
    and whose remaining columns are the input features.
    """
    def __init__(self, data, num_of_targets):
        """
        Store the dataset and the number of target columns.

        data: numpy array of size (num_of_instances, num_of_features + num_of_targets).
        num_of_targets: number of trailing columns of ``data`` that hold the targets.
        """
        self.dataset = data
        self.num_of_targets = num_of_targets


    def generator(self, batch_size, shuffle=True):
        """
        Yield the dataset as consecutive (input_batch, output_batch) tuples.
        Useful for Stochastic Gradient Descent.

        batch_size: number of rows per batch (must be positive). The last batch is
            smaller when the number of instances is not a multiple of batch_size.
        shuffle: if True, the row order is shuffled (with the ``random`` module) once,
            each time this method is called.

        input_batch has size (rows_in_batch, number of input features) and
        output_batch has size (rows_in_batch, number of target features).

        Raises ValueError (on first iteration) if batch_size is not positive.
        """
        if batch_size <= 0:
            # A negative step would silently produce no batches at all.
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        num_of_instances = self.dataset.shape[0]
        # Column where the targets start. Using an explicit positive split index
        # (instead of the slices ":-k" / "-k:") keeps num_of_targets == 0 correct:
        # ":-0" would select no input columns and "-0:" would select every column.
        split = self.dataset.shape[1] - self.num_of_targets

        indices = list(range(num_of_instances))  # row indices 0 .. num_of_instances - 1
        if shuffle:
            random.shuffle(indices)

        for start in range(0, num_of_instances, batch_size):
            batch_indices = indices[start:start + batch_size]
            # Fancy indexing gathers all rows of the batch in one vectorised call.
            input_batch = self.dataset[batch_indices, :split]
            output_batch = self.dataset[batch_indices, split:]
            yield input_batch, output_batch


def _split_variables_by_kind(variables, role):
    """Return (categorical_names, numeric_names) of the variables having the given role."""
    categorical, numeric = [], []
    for name, var_role, var_type in zip(variables['name'], variables['role'], variables['type']):
        if var_role != role:
            continue
        if var_type in ('Categorical', 'Binary'):
            categorical.append(name)
        else:
            numeric.append(name)
    return categorical, numeric


def _one_hot_encode(df, categorical_columns):
    """Return a new frame with the listed columns (those present in df) one-hot encoded."""
    present = [col for col in categorical_columns if col in df.columns]
    if not present:
        # Nothing to encode; still hand back a copy so later in-place
        # normalization never touches the caller's frame.
        return df.copy()
    # dtype=float: recent pandas versions emit boolean dummies by default, and a frame
    # mixing bool and float columns converts to an object-dtype NumPy array.
    return pd.get_dummies(df, columns=present, dtype=float)


def _min_max_normalize(df, columns):
    """Rescale, in place, each listed column that exists in df to the range [0, 1]."""
    for col in columns:
        if col not in df.columns:
            continue
        col_min = df[col].min()
        col_max = df[col].max()
        if col_max - col_min != 0:
            df[col] = (df[col] - col_min) / (col_max - col_min)
        else:
            # A constant column carries no information; map it to zero.
            df[col] = 0


def encode_dataset(X_df, y_df, variables):
    """
    Encodes the datasets retrieved from the UCI repository.

    This function takes as input:
      - X_df: A Pandas DataFrame containing the features.
      - y_df: A Pandas DataFrame containing the targets.
      - variables: A DataFrame with metadata about the variables (columns 'name', 'role', and 'type').

    It performs the following operations:
      1. Splits the variables with role 'Feature' / 'Target' into categorical
         (type 'Categorical' or 'Binary') and numeric (every other type).
         Variables with any other role are ignored.
      2. One-hot encodes the categorical variables (using pd.get_dummies, with float 0/1 columns).
      3. Applies min-max normalization to the numeric variables; constant columns are set to 0.
      4. Converts the processed dataframes to NumPy arrays.

    Variables listed in the metadata but absent from the corresponding dataframe are skipped.
    The input dataframes are not modified.

    Returns:
      X_df, y_df, X, y
        - X_df and y_df: The processed Pandas DataFrames.
        - X and y: The corresponding NumPy arrays.
    """
    categorical_input_variables, numeric_input_variables = _split_variables_by_kind(variables, 'Feature')
    categorical_output_variables, numeric_output_variables = _split_variables_by_kind(variables, 'Target')

    # One-hot encode the categorical variables (returns new frames)
    X_df = _one_hot_encode(X_df, categorical_input_variables)
    y_df = _one_hot_encode(y_df, categorical_output_variables)

    # Min-max normalize the numeric features and targets
    _min_max_normalize(X_df, numeric_input_variables)
    _min_max_normalize(y_df, numeric_output_variables)

    # Convert the processed DataFrames to NumPy arrays
    X = X_df.to_numpy()
    y = y_df.to_numpy()

    return X_df, y_df, X, y


def import_uci_dataset_by_id(dataset_id):
    """
    Retrieves the dataset with the given id from the UCI repository and encodes it.

    The features and targets are passed through encode_dataset and returned as
    float NumPy arrays together with the dataset metadata.

    Returns:
      - (X, y, metadata) on success, where X and y are float NumPy arrays and
        metadata is the metadata object provided by the ucimlrepo library.
      - None if the dataset cannot be fetched, or if its features, targets or variable
        information are unavailable, or if encoding fails (the error is printed).

    Raises whatever the final float conversion raises, e.g. when a column still
    holds non-numeric values after encoding.
    """
    try:
        dataset = fetch_ucirepo(id=dataset_id)
        features = dataset.data.features
        targets = dataset.data.targets
        variables = dataset.variables
        X_df, y_df, _, _ = encode_dataset(features, targets, variables)
    except Exception as e:
        print(e)
        return None

    metadata = dataset.metadata
    # Cast to float, not int: the numeric columns were min-max normalized into [0, 1],
    # so an integer cast would truncate almost every value to 0 (and fail on NaN).
    # The cast also turns boolean one-hot columns into 0.0 / 1.0.
    return X_df.astype(float).to_numpy(), y_df.astype(float).to_numpy(), metadata


def import_uci_datasets_by_criteria(type, min_number_of_features, max_number_of_features, feature_type, min_number_of_instances, max_number_of_instances, num_datasets, pickle_file_path, max_dataset_id=None):
    """
    Retrieves a list of UCI datasets that match the given criteria and pickles it.

    Dataset ids are scanned in increasing order starting from 0; every id triggers
    a call to import_uci_dataset_by_id.

    Possible criteria are:
    - type: here, we consider only "Multivariate" or "Text" or "All" (the first is for general use, the second is for text classification)
    - min_number_of_features: the minimum number of features in the dataset (counted after encoding)
    - max_number_of_features: the maximum number of features in the dataset (counted after encoding)
    - feature_type: the type of the features (here, we consider only "Numerical" or "Categorical", or "All". If "Numerical", we look for "Real" or "Integer")
    - min_number_of_instances: the minimum number of instances in the dataset
    - max_number_of_instances: the maximum number of instances in the dataset
    - num_datasets: the number of datasets to retrieve

    Other parameters:
    - pickle_file_path: path of the file where the list of retrieved datasets is pickled
    - max_dataset_id: optional highest id to try. With the default (None) the scan only
      stops once num_datasets datasets have been found, so it never terminates if fewer
      datasets match; pass a bound to guarantee termination.

    Returns a list of dictionaries (the same list that is pickled), where each dictionary has the following structure:
    - metadata: the metadata of the dataset in the form provided by the ucimlrepo library
    - X: the features of the dataset as a numpy array
    - y: the targets of the dataset as a numpy array
    """
    datasets = []

    next_id = 0  #  iterate over ids starting from 0
    while len(datasets) < num_datasets:
        if max_dataset_id is not None and next_id > max_dataset_id:
            print(f"Reached dataset id {max_dataset_id}: only {len(datasets)} of {num_datasets} datasets retrieved")
            break
        dataset_id = next_id
        next_id += 1

        try:
            result = import_uci_dataset_by_id(dataset_id)
            if result is None:
                # The import already printed the reason; move on to the next id.
                continue
            X, y, metadata = result

            number_of_instances, number_of_features = X.shape[0], X.shape[1]
            number_of_features_condition = min_number_of_features <= number_of_features <= max_number_of_features
            number_of_instances_condition = min_number_of_instances <= number_of_instances <= max_number_of_instances

            if type == "All":
                type_condition = True
            else:
                type_condition = type in metadata['characteristics']

            if feature_type == "All":
                feature_type_condition = True
            elif feature_type == "Numerical":
                feature_type_condition = "Real" in metadata['feature_types'] or "Integer" in metadata['feature_types']
            else:
                feature_type_condition = feature_type in metadata['feature_types']
        except Exception as e:
            # Best effort: a dataset that cannot be imported or inspected is skipped.
            print(f"Error importing dataset {dataset_id}: {e}. Skipping dataset..")
            continue

        # Datasets with missing values are deliberately not filtered out here.
        if number_of_features_condition and number_of_instances_condition and type_condition and feature_type_condition:
            datasets.append({"metadata": metadata, "X": X, "y": y})
            print(f"dataset {len(datasets)} retrieved")

    with open(pickle_file_path, "wb") as f:
        print(f"Saving datasets to {pickle_file_path}")
        pickle.dump(datasets, f)

    return datasets


def get_sparsity(X):
    """
    Computes the sparsity of the array X, i.e. the fraction (between 0 and 1)
    of its elements that are equal to zero.
    """
    zero_mask = X == 0
    zero_count = np.sum(zero_mask)
    return zero_count / X.size


def get_info_data(data_file_path):
    """
    Process a subset of the UCI datasets (the subset has been selected by browsing the site for datasets that differ in number of features and sparsity)
    and writes the information about each dataset to a csv file.

    The csv file will be used to select the datasets to be used for the experiments.

    data_file_path: path of the csv file to write. It gets one row per dataset that
    could be imported; datasets that fail are reported on stdout and skipped.
    Rows are ordered by increasing UCI id.
    """
    # Duplicates are removed by the set; sorting makes the row order deterministic.
    ids = sorted({186, 19, 544, 165, 42, 545, 14, 159, 484, 101, 936, 29,
                  17, 320, 73, 856, 63, 151, 342, 419, 90, 857, 22, 69,
                  183, 579, 211, 54, 75, 74,
                  92, 54, 257, 149, 342, 419, 247, 161, 291, 488, 379,
                  19, 73, 14, 936, 101, 76, 12, 90, 44, 70, 13, 58,
                  63, 75, 165, 159, 545, 73, 857, 69, 54, 291})

    columns = ["uci_id",
               "name",
               "task",
               "num_of_input_features",
               "num_of_output_features",
               "num_of_instances",
               "characteristics",
               "feature_types",
               "sparsity"]
    rows = []
    for i, dataset_id in enumerate(ids):
        print(f"Importing dataset {i+1} of {len(ids)}")
        try:
            X, y, metadata = import_uci_dataset_by_id(dataset_id)
            # Build the complete row before storing anything: if any field fails,
            # no partial record is left behind (per-column appends could end up with
            # different lengths, which makes the DataFrame construction fail).
            row = {
                "uci_id": dataset_id,
                "name": metadata["name"],
                "task": ", ".join(metadata["tasks"]),
                "num_of_input_features": X.shape[1],
                "num_of_output_features": y.shape[1],
                "num_of_instances": X.shape[0],
                "characteristics": ", ".join(metadata["characteristics"]),
                "feature_types": ", ".join(metadata["feature_types"]),
                "sparsity": get_sparsity(X),
            }
        except Exception as e:
            print(f"Error importing dataset {dataset_id}: {e}. Skipping dataset..")
            continue
        rows.append(row)

    # Passing the column list keeps the header even when no dataset was imported.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(data_file_path, index=False)


# Script entry point: when this file is executed directly (not imported), build the
# dataset summary and write it to "info_data.csv" in the current working directory.
if __name__ == "__main__":
    get_info_data("info_data.csv")