From a7885622da1672a11c2e86fea19d154399fdb7b4 Mon Sep 17 00:00:00 2001
From: livewaves
Date: Wed, 20 Sep 2023 10:17:34 +0200
Subject: [PATCH] Manage possibly corrupted images when loading the dataset.

---
 main.py | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/main.py b/main.py
index 1a3f2dd..f0ea4f8 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,9 @@
+# https://www.microsoft.com/en-US/download/details.aspx?id=54765
+# https://www.kaggle.com/competitions/dogs-vs-cats/data
+# This script was tested on a total dataset of fewer than 3000 images, due to CPU and memory limitations.
+# Higher performance is feasible with a GPU, e.g. on Google Colab.
+# This script handles corrupted images in the dataset.
+
 import os
 import pickle
 
@@ -11,41 +17,45 @@
 
 # prepare data
-input_dir = '/home/phillip/Desktop/todays_tutorial/19_parking_car_counter/code/clf-data'
-categories = ['empty', 'not_empty']
-
+input_dir = "./PetImages"
+categories = ['Cat', 'Dog']
 data = []
 labels = []
+
+
 for category_idx, category in enumerate(categories):
     for file in os.listdir(os.path.join(input_dir, category)):
         img_path = os.path.join(input_dir, category, file)
-        img = imread(img_path)
-        img = resize(img, (15, 15))
-        data.append(img.flatten())
-        labels.append(category_idx)
+        try:
+            img = imread(img_path)
+            img = resize(img, (20, 20))
+            flattened_img = img.flatten()
+            data.append(flattened_img)
+            labels.append(category_idx)
+        except Exception as e:
+            print(f"Error reading image {img_path}: {e}")
+            print("skipping to the next image")
+            continue  # skip this image and continue with the next one
 
 data = np.asarray(data)
 labels = np.asarray(labels)
 
-# train / test split
-x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, shuffle=True, stratify=labels)
+print("data and labels are ready for training")
 
+# train/test split
+x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2,
+                                                    shuffle=True, stratify=labels)
+print("splitting dataset is completed")
 
 # train classifier
 classifier = SVC()
-
 parameters = [{'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
-
 grid_search = GridSearchCV(classifier, parameters)
-
+print("start training")
 grid_search.fit(x_train, y_train)
 
 # test performance
 best_estimator = grid_search.best_estimator_
-
 y_prediction = best_estimator.predict(x_test)
-
 score = accuracy_score(y_prediction, y_test)
-
 print('{}% of samples were correctly classified'.format(str(score * 100)))
-
 pickle.dump(best_estimator, open('./model.p', 'wb'))
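
Note: as an alternative (or complement) to catching the exception at load
time, the broken files that ship with the PetImages archive can be detected in
a separate pass before training. A minimal sketch, assuming Pillow is
installed; find_corrupted_images is a hypothetical helper, not part of
main.py:

import os

from PIL import Image


def find_corrupted_images(input_dir, categories):
    # Return paths of files that Pillow cannot verify as images.
    corrupted = []
    for category in categories:
        folder = os.path.join(input_dir, category)
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            try:
                with Image.open(path) as img:
                    img.verify()  # integrity check; does not decode the full image
            except Exception:
                corrupted.append(path)
    return corrupted


# Report broken files up front; they could also be deleted or moved aside.
for path in find_corrupted_images("./PetImages", ['Cat', 'Dog']):
    print(f"corrupted: {path}")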
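
Note: the patched main.py still pickles the best estimator to ./model.p, so
the trained model can be reused for inference. A minimal sketch, assuming
main.py uses skimage's imread/resize (its imports are outside this diff); the
sample path ./PetImages/Cat/0.jpg is illustrative only:

import pickle

import numpy as np
from skimage.io import imread
from skimage.transform import resize

model = pickle.load(open('./model.p', 'rb'))

# Preprocess exactly as in training: resize to (20, 20) and flatten.
img = imread('./PetImages/Cat/0.jpg')
features = resize(img, (20, 20)).flatten()

# predict() expects a 2D array of samples.
prediction = model.predict(np.asarray([features]))
print(['Cat', 'Dog'][int(prediction[0])])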