深層学習の学習【その8】 - Kuroyagi飼育日誌

前回は今後の方針について簡単に整理しました。
　
　
　
【記事1】
kuroyagi.hatenablog.com
　
　
　
今回は早速1の【mnist手書き判別において何をやっていたのかを実装レベルで確認しなおす】について深めていきます。

　
　
　
その前に、まずは動くようになった自作(他作微修正)のソースコードを載せておきます。これを足がかりにしないと、知識ばかりで動くコードが書けなくなるのは必至です。ということで、動くソースコードはこちら。
　
　
　
【ソースコード1】

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_mldata
import chainer
from chainer import cuda, Variable, optimizers, Chain
import chainer.functions  as F
import chainer.links as L
import sys

# Fetch the "MNIST original" dataset, using the current directory as data_home.
# NOTE(review): fetch_mldata may be unavailable in newer scikit-learn releases
# -- confirm against the installed version.
mnist = fetch_mldata("MNIST original", data_home=".")

# Mini-batch size, and the number of leading samples used for training;
# every sample after the first N is held out for evaluation.
batchsize = 100
N = 60000

# Number of passes (epochs) over the training samples.
n_epoch   = 1

# Inputs become float32 divided by 255; labels become int32.
x_all = mnist['data'].astype(np.float32) / 255
y_all = mnist['target'].astype(np.int32)

# Split both arrays at index N: [0, N) for training, [N, end) for evaluation.
x_train, x_test = x_all[:N], x_all[N:]
y_train, y_test = y_all[:N], y_all[N:]
N_test = y_test.size

class MLP(Chain):
    """Three-layer fully connected network (784 -> 1000 -> 100 -> 10).

    The two hidden layers apply ReLU followed by dropout (default ratio);
    the output layer returns its raw scores without any activation.
    """

    def __init__(self):
        super(MLP, self).__init__()
        # Register the layers inside init_scope(), the same registration
        # style used by the Classifier chain in this file, instead of the
        # deprecated keyword-argument form of Chain.__init__.
        with self.init_scope():
            self.l1 = L.Linear(784, 1000)   # layer 1: 784 inputs  -> 1000 outputs
            self.l2 = L.Linear(1000, 100)   # layer 2: 1000 inputs -> 100 outputs
            self.l3 = L.Linear(100, 10)     # layer 3: 100 inputs  -> 10 outputs

    def __call__(self, x):
        """Run the forward pass on ``x`` and return the output of ``l3``."""
        h1 = F.dropout(F.relu(self.l1(x)))
        h2 = F.dropout(F.relu(self.l2(h1)))
        return self.l3(h2)

class Classifier(Chain):
    """Wrap a predictor network and compute its classification loss.

    Calling the instance with inputs ``x`` and labels ``t`` returns the
    softmax cross-entropy loss. The most recent loss and accuracy are also
    kept on ``self.loss`` / ``self.accuracy`` so that code reading
    ``model.loss`` / ``model.accuracy`` (as the training loop in this file
    does) works with this class as well.
    """

    def __init__(self, predictor):
        super(Classifier, self).__init__()
        with self.init_scope():
            self.predictor = predictor
        # Results of the latest call; None until the chain has been called.
        self.loss = None
        self.accuracy = None

    def __call__(self, x, t):
        """Return the softmax cross-entropy loss of ``predictor(x)`` against ``t``."""
        y = self.predictor(x)
        self.loss = F.softmax_cross_entropy(y, t)
        self.accuracy = F.accuracy(y, t)
        # The bare name `report` is never imported in this file; call it
        # through the already-imported `chainer` package instead.
        chainer.report({'loss': self.loss, 'accuracy': self.accuracy}, self)
        return self.loss

# Create the Adam optimizer, build the model (Chainer's L.Classifier wrapping
# a fresh MLP instance), and attach the optimizer to the model.
optimizer = optimizers.Adam()
model = L.Classifier(MLP())
optimizer.setup(model)



# Per-mini-batch history of loss and accuracy; values are appended to these
# (initially empty) arrays during the training and evaluation passes.
train_loss, train_acc = np.array([]), np.array([])
test_loss, test_acc = np.array([]), np.array([])

# Empty placeholders for the weights of the three layers; they are
# reassigned once training has finished.
l1_W, l2_W, l3_W = (np.array([]) for _ in range(3))

# Main loop: one training pass over a shuffled training set followed by one
# evaluation pass over the held-out set, repeated n_epoch times.
for epoch in range(1, n_epoch + 1):
    print('epoch %d' % epoch)
    indexes = np.random.permutation(N)

    # ---- training pass ----
    sum_loss = 0
    sum_acc = 0
    for i in range(0, N, batchsize):
        batch_idx = indexes[i : i + batchsize]
        x = Variable(x_train[batch_idx])
        t = Variable(y_train[batch_idx])

        # cleargrads() replaces the deprecated zerograds().
        model.cleargrads()
        loss = model(x, t)
        loss.backward()
        optimizer.update()

        train_loss = np.append(train_loss, model.loss.data)
        train_acc = np.append(train_acc, model.accuracy.data)

        # Weight by the real batch length so the mean stays correct even
        # when the last batch is shorter than batchsize.
        n_batch = len(batch_idx)
        sum_loss += float(cuda.to_cpu(model.loss.data)) * n_batch
        sum_acc += float(cuda.to_cpu(model.accuracy.data)) * n_batch

    # Mean loss and accuracy over the training data.
    print('train mean loss={}, accuracy={}'.format(sum_loss / N, sum_acc / N))

    # ---- evaluation pass ----
    sum_loss = 0
    sum_acc = 0
    # Evaluate in test mode: with train=False F.dropout becomes a no-op, and
    # no_backprop_mode() skips building a graph that is never backpropagated.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        for i in range(0, N_test, batchsize):
            x = Variable(x_test[i : i + batchsize])
            t = Variable(y_test[i : i + batchsize])

            model(x, t)

            test_loss = np.append(test_loss, model.loss.data)
            test_acc = np.append(test_acc, model.accuracy.data)

            n_batch = len(x)
            sum_loss += float(cuda.to_cpu(model.loss.data)) * n_batch
            sum_acc += float(cuda.to_cpu(model.accuracy.data)) * n_batch

    # Mean loss and accuracy over the held-out data.
    print('test  mean loss={}, accuracy={}'.format(sum_loss / N_test, sum_acc / N_test))

# Keep references to the trained weight parameters of the three layers.
l1_W, l2_W, l3_W = (
    getattr(model.predictor, layer_name).W for layer_name in ('l1', 'l2', 'l3')
)


# Plot accuracy and loss histories (one point per mini-batch) in a 2x2 grid
# and save the figure as Learn_Test.png.
def _plot_history(position, values, color, title, label, ylim, legend_loc):
    """Draw one history curve into cell ``position`` of the 2x2 grid; return its axes."""
    ax = plt.subplot(2, 2, position)
    plt.plot(range(len(values)), values, color=color, linewidth=0.5)
    plt.ylim(ylim)
    plt.title(title)
    ax.patch.set_facecolor('lightgray')
    # Exactly one label per plotted line (the original listed each label twice).
    plt.legend([label], loc=legend_loc)
    ax.grid(color='white')
    plt.draw()
    return ax


nfig = 1
fig = plt.figure(nfig, figsize=(8,6))

# Top row: accuracy (train / test).
ax1 = _plot_history(1, train_acc, 'tomato', "Accuracy of digit recognition.",
                    "train_acc", [0.7, 1.0], 4)
ax2 = _plot_history(2, test_acc, 'blue', "Accuracy of digit recognition.",
                    "test_acc", [0.7, 1.0], 4)

# Bottom row: loss (train / test). The train-loss panel used to carry the
# accuracy title and the test-loss legend read "loss_acc"; both are corrected.
ax3 = _plot_history(3, train_loss, 'tomato', "Loss of digit recognition of train.",
                    "train_loss", [0, 1.0], 2)
ax4 = _plot_history(4, test_loss, 'blue', "Loss of digit recognition of test.",
                    "test_loss", [0, 1.0], 2)

plt.savefig('Learn_Test.png')

# Result
# 手書き数字データを描画する関数
plt.style.use('fivethirtyeight')


def draw_digit(data, n, ans, recog):
    """Draw one handwritten digit into the current figure.

    Args:
        data: array that can be reshaped to 28x28.
        n: unused; kept so that existing callers keep working.
        ans: label shown in the title as ``ans``.
        recog: label shown in the title as ``recog``.
    """
    size = 28
    plt.subplot()

    X, Y = np.meshgrid(range(size), range(size))
    Z = data.reshape(size, size)    # vector -> 28x28 matrix
    Z = Z[::-1, :]                  # flip vertically so row 0 is drawn at the top
    plt.xlim(0, size - 1)
    plt.ylim(0, size - 1)
    plt.title("ans=%d, recog=%d"%(ans,recog), size=15)
    plt.pcolor(X, Y, Z)
    plt.gray()
    # Hide the tick labels. Booleans are required here: the string "off" is
    # truthy and leaves the labels visible on current matplotlib versions.
    plt.tick_params(labelbottom=False, labelleft=False)

# Pick 10 random training samples, classify each one with the trained network
# and save one figure per sample showing the true and the predicted label.
cnt = 0
for cnt, idx in enumerate(np.random.permutation(N)[:10], 1):
    nfig += 1
    plt.figure(nfig)

    # Shape the sample as a batch of one 784-element float32 row.
    x = np.asarray(x_train[idx], dtype=np.float32).reshape((1, 784))
    # Predict in test mode so that F.dropout inside the network is disabled;
    # no graph is needed because nothing is backpropagated here.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        pred = model.predictor(Variable(x)).data
    draw_digit(x_train[idx], cnt, y_train[idx], pred.argmax(axis=1)[0])
    plt.savefig('Confirmation_%03.f'%cnt+'.png')

def draw_digit2(data, n, i):
    """Draw one 28x28 image into cell ``n`` of a 10x10 subplot grid.

    Args:
        data: a chainer ``Variable`` or an array that can be reshaped to 28x28.
        n: 1-based position of the cell in the 10x10 grid.
        i: integer shown as the title of the cell.
    """
    size = 28
    plt.subplot(10, 10, n)
    # Unwrap a Variable first so the reshape/flip below run on the raw array
    # instead of creating extra graph nodes; plain arrays pass through.
    if isinstance(data, Variable):
        data = data.data
    Z = data.reshape(size, size)
    Z = Z[::-1, :]                  # flip vertically so row 0 is drawn at the top
    X, Y = np.meshgrid(range(size), range(size))
    plt.xlim(0, size - 1)
    plt.ylim(0, size - 1)
    plt.pcolor(X, Y, Z)
    plt.title("%d"%i, size=9)
    plt.gray()
    # Hide the tick labels. Booleans are required here: the string "off" is
    # truthy and leaves the labels visible on current matplotlib versions.
    plt.tick_params(labelbottom=False, labelleft=False)

# Visualise the first-layer weights: draw up to 100 randomly chosen rows of
# l1_W as 28x28 images in a 10x10 grid and save the figure.
nfig += 1
plt.figure(nfig, figsize=(10,10))
cnt = 1

# Take the number of rows from l1_W itself rather than hard-coding 1000, so
# this keeps working if the size of the first layer is changed.
for cnt, i in enumerate(np.random.permutation(l1_W.shape[0])[:100], 1):
    draw_digit2(l1_W[i], cnt, i)
plt.savefig('Hidden_layer_1_randam.png')

plt.show()

　
　
　
コメントがちょっとしか追加できてないのはご了承下さい。まだ理解していないのです。
　
　
　
参考にちょうど良さそうな以下の記事があったので、【ソースコード1】と【記事2】を見比べながら進めていきます。(かつて見たことのある記事ですが、当時は良く分からなかったのでパスしていました。多分今くらいの知識になっていたら分かる部分も増えているはず…)

【記事2】
ailaby.com
　
　
　
Classifierの引数であるpredictorにはMLPが入ってくるとあるのですが、これはmodelインスタンスを生成しているところで
　
　
　
【ソースコード2】

# Build the model: the MLP instance is passed to L.Classifier as its predictor.
model = L.Classifier(MLP())

と書いてあるところでしょう。ここでもともとの

【ソースコード3】

class Classifier(Chain):                    # Chain is the superclass
    def __init__(self, predictor):          # predictor: the network to wrap
        super(Classifier, self).__init__()
        with self.init_scope():
            self.predictor = predictor      # assigned inside init_scope()

におけるコンストラクタの引数であるpredictorにMLP()が代入されているというわけですね。
　
　
　
ちなみにself.init_scopeの意味が良く分からないので調べてみたところ下記リンクに旧書式が載っていました。しかし、初心者の私にとっては本質を理解するにはちょっと情報が少ないかな…
　
　
　
【記事3】
chainer.links.Classifier — Chainer 3.2.0 documentation
　
　
　
ということでもう少し彷徨ってみると、良い感じの記事がありました。
　
　
　
【記事4】
programming-study.com
　
　
　
【記事5】
www.lifewithpython.com

【記事6】
qiita.com

【記事6】が具体的に書いてあって初心者にはありがたいですね。もとの自作ソースコードでよく分からなかったのは with self.init_scope(): の部分でした。init_scopeはpythonの機能ではなく、chainerの機能です。確かに良く見ると、init_scopeは、chainerからimportしたChainクラスをスーパークラスとするサブクラスClassifierのメソッドとして(selfを通じて)呼び出されています。

　
一連の流れを見ると以下のような流れだと思います。
　
　
　
modelインスタンスの生成時にL.Classifier(MLP())のコンストラクタを使う
　
L.Classifierはpredictorという引数を持つ
　
predictor引数にはMLP()が代入される
　
MLP()のコンストラクタは、Chainクラスを継承したMLPクラスの中で、各層の入出力サイズを引数としてL.Linearを呼び出し、全結合のネットワークを作る
　
　
　
結果として指定した入出力をもつ層をChainクラスのpredictorに設定出来たということでしょう。この点については【記事2】に分かりやすくmodelのインスタンスの構造について書いてあります。なるほど！分かってきた気がします！
　
　
　
続いて、作ったネットワークの重みとバイアスの更新です。これをしないと学習無しのポンコツDNNです。
　
　
　
optimizerが関連している記述は以下のあたりです。
　
　
　
【ソースコード4】

optimizer = optimizers.Adam()  # create the Adam optimizer

学習ループ内にも…
　
　
　
【ソースコード5】

optimizer.update()  # called once per mini-batch inside the training loop

このあたりは「ほう、そうなのか」という程度の理解なので、Adamに関しては後に回します。
　
　
　
今回はここまで！
　
　
　
今回のまとめ:
ネットワークの初期状態を設定するところまで理解できた

　
　
　
今回は関係ないですが、そのうち問題になるかもしれないこと
　
　
　
johshisha.hatenablog.com