-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsampling.py
More file actions
123 lines (96 loc) · 4.39 KB
/
Copy pathsampling.py
File metadata and controls
123 lines (96 loc) · 4.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Grey Files
# Muhammad Alahmadi
# Md Mirajul Islam
from imblearn.under_sampling import OneSidedSelection
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
def one_sided_selection(x_data, y_data, random_state=0):
    """
    Rebalance the target classes with imbalanced-learn's OneSidedSelection under-sampler.

    Input:
        x_data: attributes of the set to be resampled
        y_data: target values of the set to be resampled
        random_state: random seed handed to the sampler (default 0)
    Output:
        X_resampled: attributes of the resampled data set
        y_resampled: target values of the resampled data set
    """
    # Build the sampler first so the seed it was configured with is easy to spot.
    sampler = OneSidedSelection(random_state=random_state)
    return sampler.fit_resample(x_data, y_data)
def random_undersampling(x_data, y_data, random_state=0):
    """
    Rebalance the target classes with imbalanced-learn's RandomUnderSampler.

    Input:
        x_data: attributes of the set to be resampled
        y_data: target values of the set to be resampled
        random_state: random seed handed to the sampler (default 0)
    Output:
        X_resampled: attributes of the resampled data set
        y_resampled: target values of the resampled data set
    """
    # Build the sampler first so the seed it was configured with is easy to spot.
    sampler = RandomUnderSampler(random_state=random_state)
    return sampler.fit_resample(x_data, y_data)
def smote(x_data, y_data, random_state=0):
    """
    Rebalance the target classes with imbalanced-learn's SMOTE
    (Synthetic Minority Over-sampling Technique) over-sampler.

    Input:
        x_data: attributes of the set to be resampled
        y_data: target values of the set to be resampled
        random_state: random seed handed to the sampler (default 0)
    Output:
        X_resampled: attributes of the resampled data set
        y_resampled: target values of the resampled data set
    """
    # Build the sampler first so the seed it was configured with is easy to spot.
    sampler = SMOTE(random_state=random_state)
    return sampler.fit_resample(x_data, y_data)
def near_miss_custom(x_data, y_data):
    """
    Resample the given data to better balance the target classes using our own
    version of the Near Miss - version 3 algorithm.

    Every sample of the minority class is kept. From the majority class, as many
    samples as the minority class contains are kept: the ones whose average
    Euclidean distance to all minority samples is smallest. Ties are broken in
    favour of the sample that appears first in the input.

    ASSUMPTIONS:
        1) Only 2 classes are present in y_data (if more are present, only the
           two most frequent ones are used and the rest are dropped)
        2) Only numeric attributes are present in x_data
    Input:
        x_data: array-like of attributes of the set to be resampled
        y_data: 1-D array-like of target values of the set to be resampled
    Output:
        X_resampled: numpy array of attributes of the resampled data set
                     (minority samples first, then the selected majority
                     samples ordered from closest to farthest)
        y_resampled: numpy array of target values of the resampled data set
    Raises:
        ValueError: if y_data contains fewer than two distinct classes
    """
    # Accept lists / pandas objects as well as numpy arrays.
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)

    counts = Counter(y_data.tolist()).most_common(2)
    if len(counts) < 2:
        raise ValueError(
            "near_miss_custom needs at least two distinct classes in y_data"
        )
    majority_target = counts[0][0]
    minority_target = counts[1][0]

    majority_indices = np.flatnonzero(y_data == majority_target)
    minority_indices = np.flatnonzero(y_data == minority_target)
    x_majority, y_majority = x_data[majority_indices], y_data[majority_indices]
    x_minority, y_minority = x_data[minority_indices], y_data[minority_indices]

    # Distances are computed on flattened float64 copies so that integer
    # (especially unsigned) inputs cannot wrap around or overflow; the returned
    # samples are still taken from the untouched input arrays.
    majority_points = x_majority.reshape(len(x_majority), -1).astype(np.float64)
    minority_points = x_minority.reshape(len(x_minority), -1).astype(np.float64)

    # One majority sample at a time keeps the temporary array at
    # (n_minority, n_features) instead of materialising every pairwise difference.
    avg_majority_dist = np.empty(len(majority_points))
    for row, maj_point in enumerate(majority_points):
        dists = np.sqrt(np.square(minority_points - maj_point).sum(axis=1))
        avg_majority_dist[row] = dists.sum() / len(minority_points)

    # A stable sort reproduces "repeatedly take the first minimum" without the
    # quadratic append/delete loop. NOTE: NaN distances sort last here.
    n_keep = len(y_minority)
    closest = np.argsort(avg_majority_dist, kind="stable")[:n_keep]

    X_resampled = np.concatenate((x_minority, x_majority[closest]), axis=0)
    y_resampled = np.concatenate((y_minority, y_majority[closest]))
    return X_resampled, y_resampled
# Data sets to test the resampling techniques.
# NOTE: the snippet below is wrapped in a triple-quoted string, so it is NOT
# executed when this module is imported or run. Remove the quotes to try
# near_miss_custom on a tiny hand-made set and on a 10000-sample set generated
# with make_classification (weights=[0.99]).
'''
x_data = np.array([[1, 6, 1, 6], [2, 7, 2, 7], [3, 8, 3, 8], [4, 9, 4, 9], [5, 10, 5, 10]])
y_data = np.array(['False', 'False', 'False', 'True', 'True'])
X_resampled, y_resampled = near_miss_custom(x_data, y_data)
print(X_resampled)
print(y_resampled)
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
X_resampled2, y_resampled2 = near_miss_custom(X, y)
print(Counter(y))
print(Counter(y_resampled2))
'''