PCA #11 본문
import os
import numpy as np
class Data:
    """Load a CSV time series and serve (samples, targets) batches from it.

    The first CSV row is treated as a header and the first column of every
    following row is dropped (only the remaining columns are parsed as
    floats). The rows are split chronologically into train / validation /
    test ranges according to ``ratio``.

    Attributes set by ``__init__``:
        header: column names taken from the first CSV row.
        float_data: raw values, ``float32`` array of shape (rows, columns).
        data: the array the generators read from. It is ``float_data`` until
            ``normalize()`` replaces it with a standardized copy.
        data_length: number of value columns per row.
        ratio: the ``ratio`` argument, stored unchanged.
        train_set_length / val_set_length: row counts of the first two splits.
    """

    def __init__(self, fname, ratio):
        """Read ``fname`` and compute the split sizes.

        Args:
            fname: path of the CSV file.
            ratio: indexable of fractions; ``ratio[0]`` is the training share
                and ``ratio[1]`` the validation share of the rows.
        """
        # The context manager closes the file even if reading fails.
        with open(fname) as f:
            data = f.read()
        lines = data.split("\n")
        self.header = lines[0].split(",")
        # Skip blank lines (e.g. the one produced by a trailing newline);
        # they would otherwise become an empty row and break the conversion.
        lines = [line for line in lines[1:] if line.strip()]
        values = [line.split(",")[1:] for line in lines]
        self.float_data = np.array(values).astype('float32')
        # Generators read self.data, so make it available before normalize().
        self.data = self.float_data
        self.data_length = self.float_data.shape[-1]
        self.ratio = ratio
        self.train_set_length = int(self.float_data.shape[0] * ratio[0])
        self.val_set_length = int(self.float_data.shape[0] * ratio[1])

    def normalize(self):
        """Standardize every column using training-range statistics only.

        ``self.data`` becomes a normalized copy; ``self.float_data`` is left
        untouched. The per-column mean and standard deviation are kept in
        ``self.mean`` and ``self.std``.
        """
        self.data = self.float_data.copy()
        mean = self.data[:self.train_set_length].mean(axis=0)
        self.data -= mean
        std = self.data[:self.train_set_length].std(axis=0)
        self.data /= std
        self.mean = mean
        self.std = std

    def get_generators(self, lookback, delay, batch_size=128, step=6):
        """Create the train / validation / test generators and step counts.

        Args:
            lookback: number of past rows each sample looks back over.
            delay: offset (in rows) used to pick the target row.
            batch_size: number of samples per yielded batch.
            step: stride, in rows, between the rows kept in a sample.
        """
        train_end = self.train_set_length
        val_end = self.train_set_length + self.val_set_length
        self.train_gen = self.generator(lookback=lookback, delay=delay,
                                        min_index=0, max_index=train_end,
                                        shuffle=True, step=step, batch_size=batch_size)
        # Validation starts where training ends so the ranges do not overlap.
        self.val_gen = self.generator(lookback=lookback, delay=delay,
                                      min_index=train_end, max_index=val_end,
                                      shuffle=False, step=step, batch_size=batch_size)
        # max_index=None lets the generator run to the end of the data.
        self.test_gen = self.generator(lookback=lookback, delay=delay,
                                       min_index=val_end, max_index=None,
                                       shuffle=False, step=step, batch_size=batch_size)
        self.train_steps = (self.train_set_length - lookback) // batch_size
        self.val_steps = (self.val_set_length - lookback) // batch_size
        self.test_steps = (len(self.data) - self.val_set_length
                           - self.train_set_length - lookback) // batch_size
        self.lookback = lookback
        self.batch_size = batch_size
        self.step = step

    def generator(self, lookback, delay, min_index, max_index, shuffle=False, batch_size=128, step=6):
        """Yield ``(samples, targets)`` batches forever.

        Args:
            lookback: number of past rows each sample looks back over.
            delay: offset (in rows) used to pick the target row.
            min_index: first row of the range this generator may draw from.
            max_index: end (exclusive) of the range of sample rows; ``None``
                means ``len(self.data) - delay - 1``.
            shuffle: draw sample rows at random instead of in order.
            batch_size: number of samples per batch.
            step: stride, in rows, between the rows kept in a sample.

        Yields:
            ``samples`` of shape (batch, lookback // step, data_length) and
            ``targets`` of shape (batch,), where each target is column 1 of
            row ``sample_row + delay - 1``.

        Raises:
            ValueError: on first iteration, if the range is too short to hold
                a single sample.
        """
        if max_index is None:
            max_index = len(self.data) - delay - 1
        if min_index + lookback >= max_index:
            raise ValueError(
                "index range [%d, %d) is too short for lookback=%d"
                % (min_index, max_index, lookback))
        i = min_index + lookback
        while True:
            if shuffle:
                rows = np.random.randint(min_index + lookback, max_index, size=batch_size)
            else:
                # Wrap around once a full batch no longer fits in the range.
                if i + batch_size >= max_index:
                    i = min_index + lookback
                rows = np.arange(i, min(i + batch_size, max_index))
                i += len(rows)
            samples = np.zeros((len(rows), lookback // step, self.data_length))
            targets = np.zeros((len(rows),))
            for j, row in enumerate(rows):
                samples[j] = self.data[row - lookback:row:step]
                targets[j] = self.data[row + delay - 1, 1]
            yield samples, targets
def main(ratio=(0.5, 0.25, 0.25), lookback=1440, step=6, delay=144, batch_size=128, epochs=20):
    """Train a SimpleRNN regressor on the Jena climate CSV and plot its losses.

    Args:
        ratio: train / validation / test fractions of the rows.
        lookback: number of past rows each sample looks back over.
        step: stride, in rows, between the rows kept in a sample.
        delay: offset (in rows) used to pick the target row.
        batch_size: samples per batch.
        epochs: number of training epochs.

    Prints the training and test loss (MAE) and shows the learning curve.
    """
    fname = "./jena_climate/jena_climate_2009_2016.csv"
    dataset = Data(fname, ratio)
    # Standardize the columns with training-range statistics before batching.
    dataset.normalize()
    # Forward the caller's step instead of a hard-coded 6.
    dataset.get_generators(lookback=lookback, delay=delay, batch_size=batch_size, step=step)

    from tensorflow.keras import layers, models
    model = models.Sequential()
    model.add(layers.SimpleRNN(32, activation='relu'))
    # The generators yield one scalar target per sample, so the model needs a
    # single-unit output layer.
    model.add(layers.Dense(1))
    model.compile(optimizer='RMSprop', loss='mae')

    # Model.fit / Model.evaluate accept Python generators directly; the
    # *_generator variants are deprecated and absent from recent Keras.
    history = model.fit(dataset.train_gen, steps_per_epoch=dataset.train_steps, epochs=epochs,
                        validation_data=dataset.val_gen, validation_steps=dataset.val_steps)
    # NOTE(review): the training loss is estimated over val_steps batches, as
    # in the original code - confirm this is intended rather than train_steps.
    train_loss = model.evaluate(dataset.train_gen, steps=dataset.val_steps)
    test_loss = model.evaluate(dataset.test_gen, steps=dataset.test_steps)
    print('train_loss: ', train_loss)
    print('test_loss: ', test_loss)

    import matplotlib.pyplot as plt
    plt.plot(range(0, epochs), history.history['loss'], 'b')
    plt.plot(range(0, epochs), history.history['val_loss'], 'g')
    plt.legend(('train_loss', 'val_loss'))
    plt.show()
• The data pre-processing in RNN(2) slides 4-6 uses the data of 10 prior days.
• If we want to utilize 20 prior days, what should we do?
하루에 144개의 data point가 있으므로(10분 간격), 기존의 10 prior days는 144 × 10 = 1440 data points에 해당한다. 20 prior days의 data를 사용하려면 144 × 20 = 2880 data points가 필요하다. 따라서 data pre-processing에서 lookback=2880 (20 prior days), step=6, delay=144로 설정하면 된다.
• Using 20 prior days, run the model of simpleANN (RNN(2)- slide 7)
• Attach its learning curve (training loss and validation loss, as shown in RNN(2) slide 11), and MAE and MAPE in the test set.
Loss graph
MAE and MAPE in the test set:
• Let’s check the number of parameters using model.summary.
• How many parameters in the simpleANN? It must be different from the number of parameters in the slide 7. Why did it happen?
10 prior days의 data를 사용하면 flatten layer의 출력은 (144/6) data points a day × 14 measures × 10 days = 3360 inputs가 된다. 32개 뉴런을 가진 hidden layer는 (3360 × 32) weights + 32 biases = 107,552 parameters를, 뉴런 1개인 output layer는 32 weights + 1 bias = 33 parameters를 가지므로 총 parameters의 수는 107,552 + 33 = 107,585개가 된다.
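하지만 20 prior days의 data를 사용하면 flatten layer의 출력은 (144/6) data points a day × 14 measures × 20 days = 6720 inputs가 된다. 32개 뉴런을 가진 hidden layer는 (6720 × 32) weights + 32 biases = 215,072 parameters를, 뉴런 1개인 output layer는 32 weights + 1 bias = 33 parameters를 가지므로 총 parameters의 수는 215,072 + 33 = 215,105개가 된다. 즉 lookback이 두 배가 되면서 flatten layer의 입력 수가 3360에서 6720으로 두 배가 되었고, 그만큼 첫 번째 layer의 weights 수가 늘어났기 때문에 parameters의 수가 달라진다.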
• Using 20 prior days, run the model of LSTM (RNN(2)- slide 9)
• Attach its learning curve (training loss and validation loss, as shown in RNN(2) slide 11), and MAE and MAPE in the test set.
Loss graph
MAE and MAPE in the test set:
• Let’s check the number of parameters using model.summary.
• How many parameters in the LSTM?
• What was “n” in the equation of the slide 9? Why is it so?
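한 timestep마다 14 measures가 입력되므로 입력 차원은 14이다. 히든뉴런 수를 $n$이라고 하면 LSTM layer의 parameters 수는 4개의 gate 각각에 대해 입력 weights $14n$, recurrent weights $n^2$, bias $n$이 있으므로 $4(n^2 + 14n + n) = 4n^2 + 60n$이고, output layer는 $n$ weights + 1 bias $= n + 1$이다.
따라서 total parameters는 $4n^2 + 61n + 1$이 되고, 이 값이 6049이므로 $4n^2 + 61n + 1 = 6049$를 풀면 $n = 32$가 나온다 ($4 \cdot 32^2 + 61 \cdot 32 + 1 = 4096 + 1952 + 1 = 6049$). 즉 "n"은 LSTM의 히든뉴런 수 32개이다.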