printf("ho_tari\n");
PCA #11
<Code>
import copy

import numpy as np


class Data():
    def __init__(self, fname, ratio):
        # Read the raw CSV, drop the header row and the date column,
        # and keep the 14 measurement columns as float32.
        f = open(fname)
        data = f.read()
        f.close()
        lines = data.split("\n")
        header = lines[0].split(",")                       # column names (unused below)
        lines = [line for line in lines[1:] if line]       # skip a possible trailing blank line
        values = [line.split(",")[1:] for line in lines]
        self.float_data = np.array(values).astype('float32')
        self.data_length = self.float_data.shape[-1]       # number of features (14)
        self.ratio = ratio
        self.train_set_length = int(self.float_data.shape[0] * ratio[0])
        self.val_set_length = int(self.float_data.shape[0] * ratio[1])
        self.normalize()

    def normalize(self):
        # Standardize every feature using the mean/std of the training split only.
        self.data = copy.deepcopy(self.float_data)
        mean = self.data[:self.train_set_length].mean(axis=0)
        self.data -= mean
        std = self.data[:self.train_set_length].std(axis=0)
        self.data /= std
        self.mean = mean
        self.std = std

    def get_generators(self, lookback, delay, batch_size=128, step=6):
        self.train_gen = self.generator(lookback=lookback, delay=delay,
                                        min_index=0, max_index=self.train_set_length,
                                        shuffle=True, step=step, batch_size=batch_size)
        self.val_gen = self.generator(lookback=lookback, delay=delay,
                                      min_index=self.train_set_length,
                                      max_index=self.train_set_length + self.val_set_length,
                                      shuffle=False, step=step, batch_size=batch_size)
        self.test_gen = self.generator(lookback=lookback, delay=delay,
                                       min_index=self.train_set_length + self.val_set_length,
                                       max_index=None,
                                       shuffle=False, step=step, batch_size=batch_size)
        self.train_steps = (self.train_set_length - lookback) // batch_size
        self.val_steps = (self.val_set_length - lookback) // batch_size
        self.test_steps = (len(self.data) - self.val_set_length - self.train_set_length - lookback) // batch_size
        self.lookback = lookback
        self.batch_size = batch_size
        self.step = step

    def generator(self, lookback, delay, min_index, max_index, shuffle=False, batch_size=128, step=6):
        # Yields (samples, targets): lookback // step timesteps of all features,
        # and the temperature (column 1) delay timesteps in the future.
        if max_index is None:
            max_index = len(self.data) - delay - 1
        i = min_index + lookback
        while True:
            if shuffle:
                rows = np.random.randint(min_index + lookback, max_index, size=batch_size)
            else:
                if i + batch_size >= max_index:
                    i = min_index + lookback
                rows = np.arange(i, min(i + batch_size, max_index))
                i += len(rows)
            samples = np.zeros((len(rows), lookback // step, self.data_length))
            targets = np.zeros((len(rows),))
            for j, row in enumerate(rows):
                indices = range(rows[j] - lookback, rows[j], step)
                samples[j] = self.data[indices]
                targets[j] = self.data[rows[j] + delay - 1][1]
            yield samples, targets


def main(ratio=[0.5, 0.25, 0.25], lookback=1440, step=6, delay=144, batch_size=128, epochs=20):
    fname = "./jena_climate/jena_climate_2009_2016.csv"
    dataset = Data(fname, ratio)
    dataset.get_generators(lookback=lookback, delay=delay, batch_size=batch_size, step=step)

    from tensorflow.keras import layers, models
    model = models.Sequential()
    model.add(layers.SimpleRNN(32, activation='relu'))
    model.add(layers.Dense(1))
    model.compile(optimizer='rmsprop', loss='mae')
    history = model.fit(dataset.train_gen, steps_per_epoch=dataset.train_steps, epochs=epochs,
                        validation_data=dataset.val_gen, validation_steps=dataset.val_steps)

    train_loss = model.evaluate(dataset.train_gen, steps=dataset.train_steps)
    test_loss = model.evaluate(dataset.test_gen, steps=dataset.test_steps)
    print('train_loss: ', train_loss)
    print('test_loss: ', test_loss)

    import matplotlib.pyplot as plt
    plt.plot(range(0, epochs), history.history['loss'], 'b')
    plt.plot(range(0, epochs), history.history['val_loss'], 'g')
    plt.legend(('train_loss', 'val_loss'))
    plt.savefig('RNN_loss.png')
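The script defines main() but never calls it; to actually run the pipeline, an entry point such as the following is needed:

if __name__ == "__main__":
    main()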
Q1
• The data pre-processing in RNN(2) slides 4-6 uses data from the 10 prior days.
• If we want to utilize 20 prior days, what should we do?
There are 144 data points per day. The original setup uses data from the 10 prior days, i.e. 144 × 10 = 1,440 data points; to use the 20 prior days we need 144 × 20 = 2,880 data points instead. Therefore the data pre-processing should use lookback = 2880 (20 prior days), step = 6, and delay = 144.
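With the main() function defined above, this is only a change of arguments, for example:

# Sketch: same pipeline, but the input window now covers the 20 prior days.
main(lookback=2880,   # 144 data points per day * 20 days
     step=6,          # keep one point per hour (every 6th record)
     delay=144)       # still predict 24 hours (144 records) ahead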
Q2
• Using 20 prior days, run the model of simpleANN (RNN(2)- slide 7)
• Attach its learning curve (training loss and validation loss, as shown in RNN(2) slide 11), and MAE and MAPE in the test set.
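A minimal sketch of how this run and the test-set metrics below could be produced with the Data class above (assumptions: the simpleANN is the Flatten / Dense(32, relu) / Dense(1) model from slide 7, the 20-day window gives an input of 2880 // 6 = 480 timesteps × 14 features, and MAPE is obtained by adding Keras's 'mape' metric):

from tensorflow.keras import layers, models

dataset = Data("./jena_climate/jena_climate_2009_2016.csv", [0.5, 0.25, 0.25])
dataset.get_generators(lookback=2880, delay=144, batch_size=128, step=6)

model = models.Sequential()
model.add(layers.Flatten(input_shape=(480, 14)))   # 2880 // 6 = 480 timesteps, 14 features
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1))
model.compile(optimizer='rmsprop', loss='mae', metrics=['mape'])   # MAE as loss, MAPE as extra metric

history = model.fit(dataset.train_gen, steps_per_epoch=dataset.train_steps, epochs=20,
                    validation_data=dataset.val_gen, validation_steps=dataset.val_steps)
test_mae, test_mape = model.evaluate(dataset.test_gen, steps=dataset.test_steps)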
Loss graph:
<learning curve: training loss and validation loss>
MAE and MAPE in the test set:
<MAE>
<MAPE>
Q3
• Let’s check the number of parameters using model.summary.
• How many parameters are in the simpleANN? It must be different from the number of parameters on slide 7. Why did this happen?
With 10 prior days of data, the flatten layer receives 144/6 = 24 data points per day × 14 measures × 10 days = 3,360 inputs. The hidden Dense layer then has 3,360 × 32 weights + 32 biases = 107,552 parameters, and the output layer adds 32 weights + 1 bias = 33 parameters, for a total of 107,585 parameters.
With 20 prior days, however, the flatten layer receives 24 × 14 × 20 = 6,720 inputs, so the hidden Dense layer has 6,720 × 32 weights + 32 biases = 215,072 parameters; the output layer still has 33 parameters, giving 215,105 parameters in total.
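As a quick check, building that model and calling model.summary() reproduces the count (a sketch; the 480 × 14 input shape comes from the 20-day window of Q1):

from tensorflow.keras import layers, models

model = models.Sequential()
model.add(layers.Flatten(input_shape=(480, 14)))   # 480 * 14 = 6,720 flattened inputs
model.add(layers.Dense(32, activation='relu'))     # 6,720 * 32 + 32 = 215,072 parameters
model.add(layers.Dense(1))                         # 32 + 1 = 33 parameters
model.summary()                                    # Total params: 215,105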
Q4
• Using 20 prior days, run the model of LSTM (RNN(2)- slide 9)
• Attach its learning curve (training loss and validation loss, as shown in RNN(2) slide 11), and MAE and MAPE in the test set.
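A minimal sketch of the LSTM variant, reusing the generators from the Q2 sketch (assumption: as on slide 9, a single LSTM layer with 32 units replaces the Flatten/Dense(32) stack, everything else unchanged):

from tensorflow.keras import layers, models

model = models.Sequential()
model.add(layers.LSTM(32, input_shape=(480, 14)))   # 32 hidden units over the 20-day window
model.add(layers.Dense(1))
model.compile(optimizer='rmsprop', loss='mae', metrics=['mape'])

history = model.fit(dataset.train_gen, steps_per_epoch=dataset.train_steps, epochs=20,
                    validation_data=dataset.val_gen, validation_steps=dataset.val_steps)
test_mae, test_mape = model.evaluate(dataset.test_gen, steps=dataset.test_steps)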
Loss graph:
<learning curve: training loss and validation loss>
MAE and MAPE in the test set:
<MAE>
<MAPE>
Q5
• Let’s check the number of parameters using model.summary.
• How many parameters are in the LSTM?
• What was “n” in the equation on slide 9? Why is it so?
There are 14 measures per timestep, so the LSTM sees 14 inputs (m = 14). The LSTM layer has 4(nm + n^2 + n) parameters and the Dense output layer has n + 1, so the total parameter count is 4n^2 + 61n + 1. model.summary() reports 6,049 parameters in total; solving 4n^2 + 61n + 1 = 6049 gives n = 32, so the LSTM has 32 hidden neurons.
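The same number can be checked with a few lines of arithmetic (m = 14 input features, n = 32 hidden units):

n, m = 32, 14
lstm_params = 4 * (n * m + n * n + n)    # input, recurrent and bias weights of the 4 LSTM gates
dense_params = n + 1                     # Dense(1) on top of the LSTM output
print(lstm_params + dense_params)        # 6049
print(4 * n ** 2 + 61 * n + 1)           # 6049 again, via the closed form with m = 14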