From a7885622da1672a11c2e86fea19d154399fdb7b4 Mon Sep 17 00:00:00 2001
From: livewaves
Date: Wed, 20 Sep 2023 10:17:34 +0200
Subject: [PATCH] Manage possibly corrupted images when loading the dataset.

---
 main.py | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/main.py b/main.py
index 1a3f2dd..f0ea4f8 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,9 @@
+# https://www.microsoft.com/en-US/download/details.aspx?id=54765
+# https://www.kaggle.com/competitions/dogs-vs-cats/data
+# This script was tested on a total dataset of fewer than 3000 images, due to CPU and memory limitations.
+# Higher performance is feasible with a GPU, e.g. on Google Colab.
+# This script handles corrupted images in the dataset.
+
 import os
 import pickle
 
@@ -11,41 +17,45 @@
 
 # prepare data
-input_dir = '/home/phillip/Desktop/todays_tutorial/19_parking_car_counter/code/clf-data'
-categories = ['empty', 'not_empty']
-
+input_dir = "./PetImages"
+categories = ['Cat', 'Dog']
 data = []
 labels = []
+
+
 for category_idx, category in enumerate(categories):
     for file in os.listdir(os.path.join(input_dir, category)):
         img_path = os.path.join(input_dir, category, file)
-        img = imread(img_path)
-        img = resize(img, (15, 15))
-        data.append(img.flatten())
-        labels.append(category_idx)
+        try:
+            img = imread(img_path)
+            img = resize(img, (20, 20))
+            flattened_img = img.flatten()
+            data.append(flattened_img)
+            labels.append(category_idx)
+        except Exception as e:
+            print(f"Error reading image {img_path}: {e}")
+            print("skipping to the next image")
+            continue  # skip this image and continue with the next one
 
 data = np.asarray(data)
 labels = np.asarray(labels)
 
-# train / test split
-x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, shuffle=True, stratify=labels)
+print("data and labels are ready for training")
 
+# train/test split
+x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2,
+                                                    shuffle=True, stratify=labels)
+print("splitting dataset is completed")
 
 # train classifier
 classifier = SVC()
-
 parameters = [{'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
-
 grid_search = GridSearchCV(classifier, parameters)
-
+print("start training")
 grid_search.fit(x_train, y_train)
 
 # test performance
 best_estimator = grid_search.best_estimator_
-
 y_prediction = best_estimator.predict(x_test)
-
 score = accuracy_score(y_prediction, y_test)
-
 print('{}% of samples were correctly classified'.format(str(score * 100)))
-
 pickle.dump(best_estimator, open('./model.p', 'wb'))
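
Note: as an alternative (or complement) to catching the exception at load
time, the broken files that ship with the PetImages archive can be detected in
a separate pass before training. A minimal sketch, assuming Pillow is
installed; find_corrupted_images is a hypothetical helper, not part of
main.py:

import os

from PIL import Image


def find_corrupted_images(input_dir, categories):
    # Return paths of files that Pillow cannot verify as images.
    corrupted = []
    for category in categories:
        folder = os.path.join(input_dir, category)
        for file in os.listdir(folder):
            path = os.path.join(folder, file)
            try:
                with Image.open(path) as img:
                    img.verify()  # integrity check; does not decode the full image
            except Exception:
                corrupted.append(path)
    return corrupted


# Report broken files up front; they could also be deleted or moved aside.
for path in find_corrupted_images("./PetImages", ['Cat', 'Dog']):
    print(f"corrupted: {path}")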
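
Note: the patched main.py still pickles the best estimator to ./model.p, so
the trained model can be reused for inference. A minimal sketch, assuming
main.py uses skimage's imread/resize (its imports are outside this diff); the
sample path ./PetImages/Cat/0.jpg is illustrative only:

import pickle

import numpy as np
from skimage.io import imread
from skimage.transform import resize

model = pickle.load(open('./model.p', 'rb'))

# Preprocess exactly as in training: resize to (20, 20) and flatten.
img = imread('./PetImages/Cat/0.jpg')
features = resize(img, (20, 20)).flatten()

# predict() expects a 2D array of samples.
prediction = model.predict(np.asarray([features]))
print(['Cat', 'Dog'][int(prediction[0])])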