Monday, July 24, 2017

Tensorflow batch_norm (tf.contrib.layers.batch_norm) does not work during testing with shared weights

I'm trying to use a Siamese CNN to train a stereo matching network. Training works fine, but as soon as I set is_training = False, I get very high test errors, higher than those of a randomly initialized network. What I've tried:

  • output the individual layers before and after batch_norm --> batch norm is definitely applied during both training and testing. The normalized layers in testing mode are also similar to those in training mode, so the running averages do seem to be applied.
  • output the running averages after each iteration (see the snippet right after this list) --> they definitely change
  • save the model and restore it --> the batch_norm weights are definitely saved, and they are not just 1s and 0s but seem to make sense (a restore check is sketched after the training code at the end)
  • remove the normalization layer --> this works fine, but the error converges at 0.3, so batch_norm gives better results (at least in training mode) and converges about 3 times faster
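
For reference, this is roughly how I look at the running averages. It relies on the default 'moving_mean' / 'moving_variance' variable names that tf.contrib.layers.batch_norm creates under each conv scope, and it runs inside the training loop shown further down (where sess is defined):

# Diagnostic: collect the batch_norm moving averages and watch them change
# from one training iteration to the next.
moving_stats = [v for v in tf.global_variables()
                if 'moving_mean' in v.name or 'moving_variance' in v.name]
print([v.name for v in moving_stats])
print(sess.run(moving_stats[0]))  # e.g. the moving_mean of conv1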

The training error starts around 3.2 and goes down to 0.15 after 2000 iterations. The test error starts around 3.2, goes down to 2.6, and then rises to 7. I should also mention that I'm training on a randomly generated training set and that I also test on that same set. So when I say test error, I'm actually evaluating on the training data, just with is_training = False. The difference between training and testing error therefore comes purely from the training/testing mode of batch_norm, which is why I don't see how this could be an overfitting problem.
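
To double-check what the flag actually does, I ran a small toy example that is completely separate from my network (the names x, is_train and 'toy_bn' are just for illustration): with is_training=True, tf.contrib.layers.batch_norm normalizes with the statistics of the current batch, while with is_training=False it uses the moving averages, which start at mean 0 / variance 1 and only slowly drift towards the data statistics.

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 1])
is_train = tf.placeholder(tf.bool)
y = tf.contrib.layers.batch_norm(x, center=False, scale=False,
                                 is_training=is_train, decay=0.9, scope='toy_bn')
bn_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    data = (np.random.randn(64, 1) * 5.0 + 10.0).astype(np.float32)
    # training mode: normalized with the batch statistics, mean close to 0
    train_out, _ = sess.run([y, bn_updates], feed_dict={x: data, is_train: True})
    # testing mode: the moving averages have only seen one batch, so the
    # output is far from normalized -- the same kind of train/test gap
    test_out = sess.run(y, feed_dict={x: data, is_train: False})
    print(train_out.mean(), test_out.mean())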

I think it has something to do with the fact that I share weights between the two branches of the Siamese network. But then it doesn't make sense that training works fine, and that the test error decreases in the beginning.

I would appreciate any kind of hint that could help. Thanks!

By the way, I'm using Python 3.5 and TensorFlow 1.2.0.

import numpy as np
import tensorflow as tf

def conv_relu(input, kernel_shape, bias_shape, phase, reuse, scope):
    with tf.variable_scope(scope, reuse=reuse):
        # Xavier-initialized weights/biases, shared between the branches via `reuse`
        weights = tf.get_variable("weights", kernel_shape,
                                  initializer=tf.contrib.layers.xavier_initializer_conv2d())
        biases = tf.get_variable("biases", bias_shape,
                                 initializer=tf.contrib.layers.xavier_initializer())
        conv = conv2d(input, weights)
        # batch norm: `phase` switches between batch statistics (training)
        # and the moving averages (testing)
        normal = tf.contrib.layers.batch_norm(tf.nn.bias_add(conv, biases),
                                              center=True, scale=True,
                                              is_training=phase, decay=0.9,
                                              scope='bn')

        return tf.nn.relu(normal)

def network(input, reuse, disp):
    # conv_relu_pool, conv and deconv are analogous helper functions (not shown);
    # `phase` is the global is_training placeholder defined further down
    h1_ = conv_relu(input, [3, 3, n_channels, n_units], [n_units], phase, reuse, 'conv1')
    h2_ = conv_relu_pool(h1_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv2')
    h3_ = conv_relu(h2_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv3')
    h4_ = conv_relu_pool(h3_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv4')
    h5_ = conv_relu(h4_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv5')
    h6_ = conv_relu(h5_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv6')
    h7_ = conv(h6_, [3, 3, n_units, n_units], [n_units], reuse, 'conv7')
    h8_ = deconv(h7_, [3, 3, n_units, n_units], [n_units],
                 [batch, HimSize, tf.cast(imSize/2 + (disp/2), tf.int32), n_units], reuse, 'conv8')
    h9_ = deconv(h8_, [3, 3, n_units, n_units], [n_units],
                 [batch, imSize, imSize + disp, n_units], reuse, 'conv9')
    return h9_

with tf.name_scope('ImLeft'):
    x_left = tf.placeholder(tf.float32, shape=[None, imSize, imSize, n_channels])
with tf.name_scope('ImRight'):
    x_right = tf.placeholder(tf.float32, shape=[None, imSize, imSize + MaxDisp, n_channels])
with tf.name_scope('Labels'):
    y_ = tf.placeholder(tf.float32, shape=[None, imSize, imSize])

y_onehot = tf.one_hot(tf.cast(y_, tf.int32), n_classes, axis=3)
phase = tf.placeholder(tf.bool)  # True = training mode, False = testing mode

with tf.variable_scope("siamese_network") as scope:
    # the right branch reuses all variables created by the left branch
    h9_left = network(input=x_left, reuse=False, disp=0)
    h9_right = network(input=x_right, reuse=True, disp=MaxDisp)
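
To convince myself that the weights, including the batch_norm beta/gamma and the moving averages, really are shared between the two branches, I list the variables right after building the graph (purely a diagnostic; the scope name is the one from the code above):

# Every variable (weights, biases, bn/beta, bn/gamma, bn/moving_mean,
# bn/moving_variance) should appear exactly once per conv layer, not twice.
for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="siamese_network"):
    print(v.name, v.shape)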

I then do some transformations and get the output layer:

output = tf.reshape(output_layer_t, [batch*imSize*imSize, n_classes])
with tf.name_scope('Loss'):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.reshape(y_onehot, (batch*imSize*imSize, n_classes)),
        logits=output))


# make sure the batch_norm moving-average updates run together with the train step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(cross_entropy)
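
# Sanity check (my own addition while debugging): print what is in UPDATE_OPS.
# Each batch_norm layer of each branch contributes its moving_mean/moving_variance
# update ops, and because of the variable reuse they all write into the same
# shared moving averages.
for op in update_ops:
    print(op.name)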

#run session
saver = tf.train.Saver()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())
# merged_summary_op is built from my summary ops (e.g. via tf.summary.merge_all())

for i in range(1000):
    #print(tf.get_collection(tf.GraphKeys.VARIABLES, scope="siamese_network"))
    idx = np.random.randint(160, size=batch)
    # one training step in training mode (phase=True)
    _, loss_, summary = sess.run([train_step, cross_entropy, merged_summary_op],
                                 feed_dict={x_left: trainImsLeft[idx, :, :].reshape(-1, imSize, imSize, 1),
                                            x_right: trainImsRight[idx, :, :].reshape(-1, imSize, imSize+MaxDisp, 1),
                                            y_: trainLabs[idx, :, :].reshape(-1, imSize, imSize),
                                            phase: True})

    summary_writer.add_summary(summary, i)
    if (i+1) % 2 == 0:
        print('------', i)
        # evaluate on the *same* data once in training mode and once in testing mode;
        # the only difference is the batch_norm phase
        loss1 = sess.run([cross_entropy], feed_dict={x_left: trainImsLeft.reshape(-1, imSize, imSize, 1),
                                                     x_right: trainImsRight.reshape(-1, imSize, imSize+MaxDisp, 1),
                                                     y_: trainLabs.reshape(-1, imSize, imSize),
                                                     phase: True})
        loss0 = sess.run([cross_entropy], feed_dict={x_left: trainImsLeft.reshape(-1, imSize, imSize, 1),
                                                     x_right: trainImsRight.reshape(-1, imSize, imSize+MaxDisp, 1),
                                                     y_: trainLabs.reshape(-1, imSize, imSize),
                                                     phase: False})
        print('training loss: ', loss1)
        print('testing loss: ', loss0)


saver.save(sess, MODEL_FILENAME)
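
And this is roughly the restore check from the list above: after restoring the checkpoint into a fresh session, the batch_norm variables (beta, gamma and the moving statistics under each .../bn scope) are clearly not at their initial 0/1 values anymore.

# Restore the checkpoint and print the batch_norm variables of every conv layer.
sess2 = tf.Session()
saver.restore(sess2, MODEL_FILENAME)
for v in tf.global_variables():
    if '/bn/' in v.name:  # beta, gamma, moving_mean, moving_variance
        print(v.name, sess2.run(v)[:3])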
