코세라(Coursera)의 DeepLearning.AI TensorFlow Developer 전문가 자격증 과정 내
Convolutional Neural Networks in TensorFlow
과정의 2주차
Augmentation: A technique to avoid overfitting
챕터의 코드 예제입니다.
1) cats and dogs dataset 다운로드
2) Training set과 Test set 디렉토리 경로 설정
3) image dataset을 training set과 test set으로 나누는 split_data 함수 구현
4) Conv2D layer 3개를 이용한 모델 구성
5) preprocessing.image namespace의 ImageDataGenerator를 이용해서, 픽셀 값을 1/255로 rescale하여 정규화한다.
이외에, rotation, shift, shear, zoom, flip 등으로 이미지를 변환하여, training set의 수를 증가시킨다. 이 변환된 이미지들은 실제 디스크에 저장되지 않고, 원본을 바꾸지도 않는다. model fit training 중에 RAM memory에서만 생성되어, training set으로만 사용된다. 이렇게 늘어난 다양한 종류의 augmentation된 training set들은 기존 training set에 지나치게 overfitting되는 학습을 막아준다.
6) flow_from_directory method를 이용해서, training data와 validation data가 있는 디렉토리를 설정하고, target image size와 class mode, batch size를 설정한다. train generator와 validation generator를 각각 선언한다.
7) fit_generator를 이용해서 모델을 훈련한다. train generator와 validation generator를 인자로 받는다.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
# ATTENTION: Please do not alter any of the provided code in the exercise. Only add your own code where indicated
# ATTENTION: Please do not add or remove any cells in the exercise. The grader will check specific cells based on the cell position.
# ATTENTION: Please use the provided epoch values when training.
# In this exercise you will train a CNN on the FULL Cats-v-dogs dataset
# This will require you doing a lot of data preprocessing because
# the dataset isn't split into training and validation for you
# This code block has all the required inputs
import os
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from shutil import copyfile
from os import getcwd
# In[2]:
# Unzip the full Cats-v-Dogs dataset into /tmp. The archive provides a
# PetImages directory containing subdirectories called 'Cat' and 'Dog'
# (that's how the original researchers structured it).
path_cats_and_dogs = f"{getcwd()}/../tmp2/cats-and-dogs.zip"
# WARNING: this removes everything under /tmp, not only output from an earlier
# run of this exercise, and raises if /tmp does not exist.
shutil.rmtree('/tmp')
local_zip = path_cats_and_dogs
# The context manager closes the archive even when extraction fails part-way.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('/tmp')
# In[3]:
# Report how many entries each extracted class directory contains.
for class_dir in ('/tmp/PetImages/Cat/', '/tmp/PetImages/Dog/'):
    entry_count = len(os.listdir(class_dir))
    print(entry_count)
# Expected Output:
# 1500
# 1500
# In[4]:
# Create the cats-v-dogs directory tree: a 'training' and a 'testing'
# directory, each with 'cats' and 'dogs' subdirectories.
# os.makedirs(..., exist_ok=True) creates any missing parent directories and
# accepts directories that already exist, so every leaf is created even when
# part of the tree is left over from a previous run. Genuine failures (for
# example a permission error) are raised instead of being silently ignored.
#YOUR CODE GOES HERE
for leaf_dir in (
    '/tmp/cats-v-dogs/training/cats',
    '/tmp/cats-v-dogs/training/dogs',
    '/tmp/cats-v-dogs/testing/cats',
    '/tmp/cats-v-dogs/testing/dogs',
):
    os.makedirs(leaf_dir, exist_ok=True)
# In[5]:
# Write a python function called split_data which takes
# a SOURCE directory containing the files
# a TRAINING directory that a portion of the files will be copied to
# a TESTING directory that a portion of the files will be copied to
# a SPLIT SIZE to determine the portion
# The files should also be randomized, so that the training set is a random
# X% of the files, and the test set is the remaining files
# SO, for example, if SOURCE is PetImages/Cat, and SPLIT SIZE is .9
# Then 90% of the images in PetImages/Cat will be copied to the TRAINING dir
# and 10% of the images will be copied to the TESTING dir
# Also -- All images should be checked, and if they have a zero file length,
# they will not be copied over
#
# os.listdir(DIRECTORY) gives you a listing of the contents of that directory
# os.path.getsize(PATH) gives you the size of the file
# copyfile(source, destination) copies a file from source to destination
# random.sample(list, len(list)) shuffles a list
def split_data(SOURCE, TRAINING, TESTING, SPLIT_SIZE):
    """Randomly split the files in SOURCE between TRAINING and TESTING.

    Zero-length files are reported on stdout and left out. The remaining file
    names are shuffled; the first ``int(count * SPLIT_SIZE)`` of them are
    copied to TRAINING and all the others are copied to TESTING. Files are
    copied, so SOURCE itself is left untouched.

    Args:
        SOURCE: Directory containing the files to split.
        TRAINING: Existing directory that receives the training share.
        TESTING: Existing directory that receives the remaining files.
        SPLIT_SIZE: Fraction of the usable files that goes to TRAINING
            (e.g. 0.9 for a 90/10 split).
    """
    # YOUR CODE STARTS HERE
    dataset = []
    for file_name in os.listdir(SOURCE):
        # os.path.join works whether or not SOURCE ends with a separator.
        if os.path.getsize(os.path.join(SOURCE, file_name)) > 0:
            dataset.append(file_name)
        else:
            print('Skipped ' + file_name)
            print('Invalid file i.e zero size')

    train_set_length = int(len(dataset) * SPLIT_SIZE)

    # Slice the *shuffled* list so the split is actually random. Taking the
    # test set as "everything after the training part" also stays correct when
    # the test share is empty (a negative-index slice such as [-0:] would
    # return the whole list instead).
    shuffled_set = random.sample(dataset, len(dataset))
    train_set = shuffled_set[:train_set_length]
    test_set = shuffled_set[train_set_length:]

    for file_name in train_set:
        copyfile(os.path.join(SOURCE, file_name),
                 os.path.join(TRAINING, file_name))

    for file_name in test_set:
        copyfile(os.path.join(SOURCE, file_name),
                 os.path.join(TESTING, file_name))
    # YOUR CODE ENDS HERE
# Source and destination directories for each class.
CAT_SOURCE_DIR = "/tmp/PetImages/Cat/"
DOG_SOURCE_DIR = "/tmp/PetImages/Dog/"
TRAINING_CATS_DIR = "/tmp/cats-v-dogs/training/cats/"
TRAINING_DOGS_DIR = "/tmp/cats-v-dogs/training/dogs/"
TESTING_CATS_DIR = "/tmp/cats-v-dogs/testing/cats/"
TESTING_DOGS_DIR = "/tmp/cats-v-dogs/testing/dogs/"

# Share of each class that goes to the training directory.
split_size = 0.9

# Split the cats first, then the dogs, with the same ratio.
for source_dir, training_dir, testing_dir in (
    (CAT_SOURCE_DIR, TRAINING_CATS_DIR, TESTING_CATS_DIR),
    (DOG_SOURCE_DIR, TRAINING_DOGS_DIR, TESTING_DOGS_DIR),
):
    split_data(source_dir, training_dir, testing_dir, split_size)
# In[6]:
# Report how many files ended up in each training/testing class directory.
for split_dir in (
    '/tmp/cats-v-dogs/training/cats/',
    '/tmp/cats-v-dogs/training/dogs/',
    '/tmp/cats-v-dogs/testing/cats/',
    '/tmp/cats-v-dogs/testing/dogs/',
):
    file_count = len(os.listdir(split_dir))
    print(file_count)
# Expected output:
# 1350
# 1350
# 150
# 150
# In[7]:
# DEFINE A KERAS MODEL TO CLASSIFY CATS V DOGS
# USE AT LEAST 3 CONVOLUTION LAYERS
# Binary cats-vs-dogs classifier for 150x150 RGB input: three
# Conv2D + MaxPooling2D stages, then a dense head ending in one sigmoid unit.
model = tf.keras.models.Sequential([
    # YOUR CODE HERE
    tf.keras.layers.Conv2D(16, (3, 3), activation='relu', input_shape=(150, 150, 3)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# The metric is registered as 'acc', so the training history exposes it under
# the keys 'acc' and 'val_acc'.
model.compile(
    optimizer=RMSprop(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['acc'],
)
# # NOTE:
#
# In the cell below you **MUST** use a batch size of 10 (`batch_size=10`) for the `train_generator` and the `validation_generator`. Using a batch size greater than 10 will exceed memory limits on the Coursera platform.
# In[8]:
# Directories produced by the train/test split.
TRAINING_DIR = "/tmp/cats-v-dogs/training"  #YOUR CODE HERE
VALIDATION_DIR = "/tmp/cats-v-dogs/testing"  #YOUR CODE HERE

# Training images are rescaled by 1/255 and augmented with the rotation,
# shift, shear, zoom and horizontal-flip settings below.
train_datagen = ImageDataGenerator(
    rescale=1.0/255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)  #YOUR CODE HERE

# NOTE: YOU MUST USE A BATCH SIZE OF 10 (batch_size=10) FOR THE
# TRAIN GENERATOR.
train_generator = train_datagen.flow_from_directory(
    TRAINING_DIR,
    target_size=(150, 150),
    class_mode='binary',
    batch_size=10,
)  #YOUR CODE HERE

# Validation images are only rescaled; no augmentation options are set.
validation_datagen = ImageDataGenerator(rescale=1.0/255)  #YOUR CODE HERE

# NOTE: YOU MUST USE A BATCH SIZE OF 10 (batch_size=10) FOR THE
# VALIDATION GENERATOR.
validation_generator = validation_datagen.flow_from_directory(
    VALIDATION_DIR,
    target_size=(150, 150),
    class_mode='binary',
    batch_size=10,
)  #YOUR CODE HERE
# Expected Output:
# Found 2700 images belonging to 2 classes.
# Found 300 images belonging to 2 classes.
# In[9]:
# Train for two epochs on the augmented training batches, evaluating on the
# validation generator; the returned History object is kept for plotting.
# NOTE(review): fit_generator is deprecated in newer TensorFlow releases in
# favour of Model.fit — confirm the installed version before switching.
history = model.fit_generator(
    train_generator,
    validation_data=validation_generator,
    epochs=2,
    verbose=1,
)
# In[10]:
# PLOT LOSS AND ACCURACY
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter: keep matplotlib's default backend.
    pass
import matplotlib.image as mpimg
import matplotlib.pyplot as plt

#-----------------------------------------------------------
# Retrieve the per-epoch metric lists recorded during training
# for the training and validation data
#-----------------------------------------------------------
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))  # One x-axis position per recorded epoch

#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------
# The series name must be passed as `label=`; as a bare positional argument it
# is parsed as plot data/format rather than as a legend entry.
plt.plot(epochs, acc, 'r', label="Training Accuracy")
plt.plot(epochs, val_acc, 'b', label="Validation Accuracy")
plt.title('Training and validation accuracy')
plt.legend()

# Start a second figure so the loss curves do not share the accuracy axes.
plt.figure()

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r', label="Training Loss")
plt.plot(epochs, val_loss, 'b', label="Validation Loss")
plt.title('Training and validation loss')
plt.legend()

# Needed to display the figures when this runs as a plain script.
plt.show()