playdata

11/11_오토인코더를 활용한 이상데이터 찾기 - MNIST dataset.ipynb

_JAEJAE_ 2021. 11. 11. 15:55

In [59]:

import random
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [60]:

df = pd.read_csv('mnist_test.csv')

In [61]:

df.shape

Out[61]:

(10000, 785)

In [62]:

df.head(1)

Out[62]:

	label	1x1	1x2	1x3	1x4	1x5	1x6	1x7	1x8	1x9	1x10	1x11	1x12	1x13	1x14	1x15	1x16	1x17	1x18	1x19	1x20	1x21	1x22	1x23	1x24	1x25	1x26	1x27	1x28	2x1	2x2	2x3	2x4	2x5	2x6	2x7	2x8	2x9	2x10	2x11	...	27x17	27x18	27x19	27x20	27x21	27x22	27x23	27x24	27x25	27x26	27x27	27x28	28x1	28x2	28x3	28x4	28x5	28x6	28x7	28x8	28x9	28x10	28x11	28x12	28x13	28x14	28x15	28x16	28x17	28x18	28x19	28x20	28x21	28x22	28x23	28x24	28x25	28x26	28x27	28x28
0	7	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0

1 rows × 785 columns

In [63]:

anom = df[:1000].copy()
clean = df[1000:].copy()

In [64]:

for i in range(len(anom)):
    # select row from anom
    row = anom.iloc[i]
    # iterate through each element in row
    for i in range(len(row)-1):
        # add noise to element
        row[i+1] = min(255, row[i+1]+random.randint(100,200))

In [65]:

anom['label'] = 1
clean['label'] = 0

In [66]:

img = anom.iloc[:, 1:]
for i in range(10):
    plt.subplot(1, 10, i + 1)
    plt.axis('off')
    plt.imshow(img.iloc[i,:].values.reshape(28,28), cmap = "gray_r")

In [67]:

img2 = clean.iloc[:, 1:]
for i in range(10):
    plt.subplot(1, 10, i + 1)
    plt.axis('off')
    plt.imshow(img2.iloc[i,:].values.reshape(28,28), cmap = "gray_r")

In [68]:

an_test = pd.concat([anom, clean])  # join
an_test.sample(frac=1)              # shuffle
an_test.to_csv('anom.csv')          # save

In [69]:

import torch.nn as nn

class AE(nn.Module):
    def __init__(self):
        
        super(AE, self).__init__()        
        self.enc = nn.Sequential(
                    nn.Linear(784, 512),
                    nn.ReLU(),
                    nn.Linear(512, 256),
                    nn.ReLU(),
                    nn.Linear(256, 128),
                    nn.ReLU(),
                    nn.Linear(128, 64),
                    nn.ReLU(),
                    nn.Linear(64, 32),
                    nn.ReLU(),
                    nn.Linear(32, 16),
                    nn.ReLU()
        )
        self.dec = nn.Sequential(
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 784),
            nn.ReLU()
        )
    def forward(self, x):
            encode = self.enc(x)
            decode = self.dec(encode)
            return decode

In [70]:

import torch
import time
import random
import matplotlib.pyplot as plt
import torch.nn as nn
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict
from datetime import timedelta

In [71]:

batch_size = 32
lr = 1e-2         # learning rate
# 과적합을 방지하기 위해서 규제 항목 추가 
w_d = 1e-5        # weight decay
momentum = 0.9   
epochs = 15

In [72]:

class Loader(torch.utils.data.Dataset):
    def __init__(self):
        super(Loader, self).__init__()
        self.dataset = ''
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx]
        row = row.drop(labels={'label'})
        data = torch.from_numpy(np.array(row)/255).float()
        return data
    
class Train_Loader(Loader):
    def __init__(self):
        super(Train_Loader, self).__init__()
        self.dataset = pd.read_csv(
                       'mnist_train.csv',
                       index_col=False
                       )

In [73]:

train_set = Train_Loader()

train_ = torch.utils.data.DataLoader(
            train_set,
            batch_size=batch_size,
            shuffle=True,
            #num_workers=20,
            pin_memory=True,
            drop_last=True
        )

In [74]:

train_

Out[74]:

<torch.utils.data.dataloader.DataLoader at 0x7fcc70caab90>

In [75]:

metrics = defaultdict(list)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AE()
model.to(device)
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=w_d)

In [76]:

model.train()
start = time.time()
for epoch in range(epochs):
    ep_start = time.time()
    running_loss = 0.0
    for bx, (data) in enumerate(train_):
        sample = model(data.to(device))
        loss = criterion(data.to(device), sample)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    epoch_loss = running_loss/len(train_set)
    metrics['train_loss'].append(epoch_loss)
    ep_end = time.time()
    print('-----------------------------------------------')
    print('[EPOCH] {}/{}\n[LOSS] {}'.format(epoch+1,epochs,epoch_loss))
    print('Epoch Complete in {}'.format(timedelta(seconds=ep_end-ep_start)))
end = time.time()
print('-----------------------------------------------')
print('[System Complete: {}]'.format(timedelta(seconds=end-start)))

-----------------------------------------------
[EPOCH] 1/15
[LOSS] 0.0033628352807213865
Epoch Complete in 0:00:56.783185
-----------------------------------------------
[EPOCH] 2/15
[LOSS] 0.0032696612836172183
Epoch Complete in 0:00:56.611871
-----------------------------------------------
[EPOCH] 3/15
[LOSS] 0.003175515274827679
Epoch Complete in 0:00:56.700423
-----------------------------------------------
[EPOCH] 4/15
[LOSS] 0.0030771671233077846
Epoch Complete in 0:00:56.654788
-----------------------------------------------
[EPOCH] 5/15
[LOSS] 0.0029693941445400316
Epoch Complete in 0:00:56.375477
-----------------------------------------------
[EPOCH] 6/15
[LOSS] 0.0028628753361602624
Epoch Complete in 0:00:56.756814
-----------------------------------------------
[EPOCH] 7/15
[LOSS] 0.002775606703013182
Epoch Complete in 0:00:56.784614
-----------------------------------------------
[EPOCH] 8/15
[LOSS] 0.002713648401697477
Epoch Complete in 0:00:56.438546
-----------------------------------------------
[EPOCH] 9/15
[LOSS] 0.002675542098407944
Epoch Complete in 0:00:56.707990
-----------------------------------------------
[EPOCH] 10/15
[LOSS] 0.002651872635136048
Epoch Complete in 0:00:56.943371
-----------------------------------------------
[EPOCH] 11/15
[LOSS] 0.0026385190553963184
Epoch Complete in 0:00:56.485788
-----------------------------------------------
[EPOCH] 12/15
[LOSS] 0.002632019795353214
Epoch Complete in 0:00:56.587136
-----------------------------------------------
[EPOCH] 13/15
[LOSS] 0.0026265366087357203
Epoch Complete in 0:00:56.497987
-----------------------------------------------
[EPOCH] 14/15
[LOSS] 0.002623496456195911
Epoch Complete in 0:00:56.393601
-----------------------------------------------
[EPOCH] 15/15
[LOSS] 0.002622064042712251
Epoch Complete in 0:00:56.413059
-----------------------------------------------
[System Complete: 0:14:09.153094]

In [77]:

_, ax = plt.subplots(1,1,figsize=(15,10))
ax.set_title('Loss')
ax.plot(metrics['train_loss'])

Out[77]:

[<matplotlib.lines.Line2D at 0x7fcc577fa890>]

In [78]:

model.eval()
loss_dist = []
anom = pd.read_csv('anom.csv', index_col=[0])
#for bx, data in enumerate(test_):
for i in range(len(anom)):
    data = torch.from_numpy(np.array(anom.iloc[i][1:])/255).float()
    sample = model(data.to(device))
    loss = criterion(data.to(device), sample)
    loss_dist.append(loss.item())

In [79]:

loss_sc = []
for i in loss_dist:
    loss_sc.append((i,i))
plt.scatter(*zip(*loss_sc))
plt.axvline(0.3, 0.0, 1)

Out[79]:

<matplotlib.lines.Line2D at 0x7fcc57829190>

In [80]:

lower_threshold = 0.0
upper_threshold = 0.3
plt.figure(figsize=(12,6))
plt.title('Loss Distribution')
sns.distplot(loss_dist,bins=100,kde=True, color='blue')
plt.axvline(upper_threshold, 0.0, 10, color='r')
plt.axvline(lower_threshold, 0.0, 10, color='b')

/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[80]:

<matplotlib.lines.Line2D at 0x7fcc576dc990>

In [81]:

df = pd.read_csv('anom.csv', index_col=[0])
ddf = pd.DataFrame(columns=df.columns)
tp = 0
fp = 0
tn = 0
fn = 0
total_anom = 0
for i in range(len(loss_dist)):
    total_anom += df.iloc[i]['label']
    if loss_dist[i] >= upper_threshold:
        n_df = pd.DataFrame([df.iloc[i]])
        n_df['loss'] = loss_dist[i]
        ddf = pd.concat([df,n_df], sort = True)
        if float(df.iloc[i]['label']) == 1.0:
            tp += 1
        else:
            fp += 1
    else:
        if float(df.iloc[i]['label']) == 1.0:
            fn += 1
        else:
            tn += 1
print('[TP] {}\t[FP] {}\t[MISSED] {}'.format(tp, fp, total_anom-tp))
print('[TN] {}\t[FN] {}'.format(tn, fn))

[TP] 1000	[FP] 0	[MISSED] 0
[TN] 9000	[FN] 0