You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Percentage of missing values per column
df.isna().sum() * 100 / df.shape[0]

# Visualise at which row indices the missing data occurs
fig, ax = plt.subplots(figsize=(15, 4))
sns.heatmap(df.isna().transpose(), ax=ax, cmap="crest")
plt.show()
Duplicate Values
# Count duplicate rows
# NOTE(review): both counts below are computed from the same `df`; presumably
# one of them should use the test dataframe — confirm against the rest of
# the notebook before trusting the df_test number.
train_duplicates = df.duplicated().sum()
test_duplicates = df.duplicated().sum()

# Print the results
print(f"Number of duplicate rows in df_train: {train_duplicates}")
print(f"Number of duplicate rows in df_test: {test_duplicates}")
Numerical & Categorical Feature Identification
We can identify the categorical columns by their number of unique values.
def find_categorical(df, cutoff=12):
    """Return the columns of `df` that have at most `cutoff` distinct values.

    cutoff (int): is determined when plotting the histogram distribution
        for numerical cols.
    """
    return [col for col in df.columns if len(df[col].unique()) <= cutoff]


def to_categorical(cat_cols, df):
    """Convert the columns listed in `cat_cols` to the pandas 'category'
    datatype (for a keras model) and return the dataframe."""
    for col in cat_cols:
        df[col] = df[col].astype('category')
    return df


# Step 1: identify categorical columns in the df
cat_cols = find_categorical(df)

# Step 2: convert categorical columns astype "category"
# This step can be performed later, as a categorical column with numerical
# values like (1, 2, 3) can still be used in the EDA
# df = to_categorical(cat_cols, df)

# Step 3: the numerical columns are everything that is not categorical
num_cols = list(set(df.columns) - set(cat_cols))
# Extra: you also can get a summary of the categorical columns
def summarize_categoricals(df, show_levels=False):
    """Display the uniqueness of each column.

    df: dataframe that contains only categorical features
    show_levels: when True, also include the list of unique values
        ("levels") of each column in the result.

    Returns a dataframe indexed by column name with the number of levels
    and the number of missing values (plus the levels themselves when
    `show_levels` is True).
    """
    # Fix: Series.unique() returns a numpy array, which has `.tolist()` but
    # not `.to_list()` — the original call raised AttributeError for any
    # non-categorical column.
    data = [[df[c].unique().tolist(), len(df[c].unique()), df[c].isnull().sum()]
            for c in df.columns]
    df_temp = pd.DataFrame(data, index=df.columns,
                           columns=['Levels', 'No. of Levels', 'No. of Missing Values'])
    # Drop the 'Levels' column unless explicitly requested
    return df_temp.iloc[:, (0 if show_levels else 1):]
summarize_categoricals(df[cat_cols], show_levels=True)
EDA
Univariate Analysis
Numerical Columns
For the numerical, histogram plot will help to identify the distribution and data skew (log transform if needed) & box plot to identify the outliers
Quantitative variables include categorical variables which are numeric, so we can use .select_dtypes(include=np.number) to identify the list of quantitative variables.
Removing collinearity within the input features can help a model to generalize and improves the interpretability of the model.
The function below will help to remove collinear features in a dataframe with a correlation coefficient greater than the threshold.
def remove_collinear_features(df,
                              quantitative_variables,
                              target_column,
                              threshold=0.99):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation
        coefficient greater than the threshold. Removing collinear features
        can help a model to generalize and improves the interpretability
        of the model.

    Inputs:
        df: original dataframe
        quantitative_variables: list of quantitative variables
        target_column: excluded from the check, as this function only
            removes col-linearity among the features
        threshold: features with correlations greater than this value
            are removed

    Output:
        dataframe that contains only the non-highly-collinear features
    '''
    # Correlations are computed over the quantitative features only,
    # with the target column left out.
    x = df[quantitative_variables].drop(target_column, axis=1)
    corr_matrix = x.corr()

    drop_cols = []
    # Walk the upper triangle of the correlation matrix one cell at a time
    for i in range(len(corr_matrix.columns) - 1):
        for j in range(i + 1):
            item = corr_matrix.iloc[j:(j + 1), (i + 1):(i + 2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If the correlation exceeds the threshold, mark the column
            if val >= threshold:
                print(f"({col.values[0]:15s} | {row.values[0]:15s} | {round(val[0][0], 2)}) -> Remove '{col.values[0]}'")
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns from the original df
    # (set() de-duplicates columns flagged more than once)
    return df.drop(columns=set(drop_cols), axis=1)


remove_collinear_features(
    df,
    quantitative_variables,
    target_column="Exited",
    threshold=0.8
)
Once finished, we might need to re-update the list of numerical & categorical columns
Numerical Features vs Target Variables
In the case of classification, we can specify the hue=target_columns to observe the different distribution with respect to the different class
# Bin edges / labels for the Age column (adjust the bin edges as needed)
age_bins = [0, 18, 25, 30, 40, 65, np.inf]
age_labels = ['0-18', '18-25', '25-30', '30-40', '40-65', '> 65']

# Bin the Age column (right=False -> intervals are closed on the left)
age_bin_df = pd.cut(df[['Age']].iloc[:, 0], bins=age_bins, labels=age_labels, right=False)

fig, ax = plt.subplots(figsize=(18, 4))
fig = sns.histplot(data=age_bin_df, kde=True, palette=palette)
plt.show()
Pipeline
Create custom transformers for data cleaning or feature engineering.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted


class OutlierRemover(BaseEstimator, TransformerMixin):
    """Transformer that replaces IQR outliers with NaN, column by column.

    A value is an outlier when it falls outside
    [q1 - factor*iqr, q3 + factor*iqr] of its column; a downstream imputer
    in the pipeline is expected to fill the NaNs in.
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def _outlier_detector(self, X, y=None):
        # Record the lower/upper IQR bound of a single column.
        X = pd.Series(X).copy()
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        self.lower_bound.append(q1 - (self.factor * iqr))
        self.upper_bound.append(q3 + (self.factor * iqr))

    def fit(self, X: np.ndarray, y=None):
        # If fit on another dataset, these two arrays must be reset
        self.lower_bound = []
        self.upper_bound = []
        X.apply(self._outlier_detector)
        # NOTE(review): sklearn convention stores the column *names* in
        # feature_names_in_ and the count in n_features_in_ — kept as-is
        # to avoid changing the attribute callers may read.
        self.feature_names_in_ = X.shape[1]  # required for get_feature_names_out
        # Fix: record the column names at fit time, so that
        # get_feature_names_out works even when transform has not been
        # called yet (e.g. when inspecting a fitted ColumnTransformer);
        # previously self.columns was only set inside transform.
        self.columns = X.columns
        return self

    def transform(self, X: pd.DataFrame, y=None):
        X = pd.DataFrame(X).copy()  # convert X into a pandas dataframe to use .iloc[:, i]
        for i in range(X.shape[1]):
            x = X.iloc[:, i].copy()
            # Null-out everything outside the bounds learned in fit()
            x[(x < self.lower_bound[i]) | (x > self.upper_bound[i])] = np.nan
            X.iloc[:, i] = x
        self.columns = X.columns
        return X

    def get_feature_names_out(self, feature_names):
        return [col for col in self.columns]
Depending on the set of columns, we can define different pipelines to process the data and perform feature engineering.
Default numerical pipeline
num_pipeline = make_pipeline(
    OutlierRemover(),                 # from the custom transformer
    SimpleImputer(strategy="median"),
    MinMaxScaler(),
)
For the categorical feature, we might need to use different encoders such as OneHot, Ordinal, Hash encoders
def make_cat_pipeline(encoder):
    """Generate a categorical pipeline based on the given encoder."""
    return make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        encoder,
    )
# One-hot encoder pipeline
oh_cat_pipeline = make_cat_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Ordinal encoder pipeline
ord_cat_pipeline = make_cat_pipeline(
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
)
For feature engineering, we can use the functional transformers to combine multiple columns
# Example: use a FunctionTransformer to combine the (Geography + Gender)
# columns into a single Geo-Gender column such as "SpainMale" / "GermanyFemale"
def geo_gender_name(function_transformer, feature_names_in):
    # '__GeoGender' is what gets appended to the transformer prefix
    return ["GeoGender"]  # feature names out


geo_gender_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    # X = df[['Geography','Gender']]; string-concatenate the two columns
    FunctionTransformer(lambda X: X[:, [0]] + X[:, [1]], feature_names_out=geo_gender_name),  # concat__GeoGender
    OneHotEncoder(handle_unknown='ignore'),
)
For binning, we can use the built-in Sklearn's KBinsDiscretizer
def binning_pipeline(n_bins, encode, bin_strategy):
    """Pipeline that bins continuous data into intervals (KBinsDiscretizer).

    n_bins: number of bins
    encode: {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
    bin_strategy: {'uniform', 'quantile', 'kmeans'}, default='quantile'
    """
    return make_pipeline(
        OutlierRemover(),                 # custom transformer
        SimpleImputer(strategy="median"),
        KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=bin_strategy),
    )
Full Pipeline
We can assemble all the separate pipelines, one for each set of feature columns.
Define the trainer class to perform the cross validation
class Trainer:
    """Cross-validate a list of models and compare them on a validation set."""

    def __init__(self,
                 model_list) -> None:
        # model_list: list of (unfitted) sklearn-compatible classifiers
        self.model_list = model_list

    def fit_and_evaluate(self, X_train, y_train, X_val, y_val, metrics: str, cv: int = 5) -> pd.DataFrame:
        """CV-score every model on the train set, then score it on the val set.

        metrics: an sklearn scoring string (e.g. "roc_auc")
        cv: number of stratified folds
        Returns a dataframe indexed by model name, sorted by the val score.
        """
        baseline_results = pd.DataFrame(columns=['model_name', f'{metrics}_train_cv', f'{metrics}_val'])

        # Fix: iterate over self.model_list — the original read a *global*
        # `model_list`, which only worked if such a variable happened to
        # exist at module level.
        for idx in tqdm.tqdm(range(len(self.model_list))):
            clf = self.model_list[idx]

            # cross_val_score uses the KFold strategy with default parameters for
            # making the train-test splits, which means splits into consecutive
            # chunks rather than shuffling -> shuffle=True.
            # Stratified is to ensure the class distribution is equal in each fold.
            kfold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=2024)

            # using cross_val_score
            # list of "scoring": https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
            metrics_train = np.round(
                np.mean(
                    cross_val_score(clf, X_train, y_train,
                                    scoring=metrics, cv=kfold)
                ), 3
            )

            # test on val_set
            clf.fit(X_train, y_train)
            y_pred_val = clf.predict_proba(X_val)[:, 1]
            metrics_val = self.cal_metrics(y_val, y_pred_val)

            baseline_results.loc[len(baseline_results)] = [clf.__class__.__name__, metrics_train, metrics_val]

        return baseline_results \
            .sort_values(by=f'{metrics}_val', ascending=False) \
            .set_index('model_name')

    def cal_metrics(self, y, y_pred) -> float:
        # ROC-AUC computed from the raw fpr/tpr curve
        fpr, tpr, thresholds = roc_curve(y, y_pred)
        return auc(fpr, tpr)
You can initialise the Trainer class and perform the evaluation