dimanche 11 juin 2017

Why does adding dropout layers work on validation set, but not on test set?

I'm working on a convolutional neural network in TensorFlow and am having trouble with the dropout layers. As recommended, I pass a keep_probability placeholder to the graph and set its value to 0.5 during training and to 1.0 during validation and testing. While observing the training process, the results on the validation set are good. However, when I test the network after training, it fails. Why does the network fail on the test set when it works on the validation set?

I've added the code for training, testing and for the graph itself.

Code for training the network:

with tf.Graph().as_default():
    # Probability that a neuron's output is kept during dropout.
    keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")

    global_step = tf.Variable(0, trainable=False)

    # Input tensors; every sess.run() on them below yields one batch.
    images, labels = Inputs.datasetInputs(image_filenames, label_filenames, FLAGS.batch_size)
    val_images, val_labels = Inputs.datasetInputs(val_image_filenames, val_label_filenames, FLAGS.batch_size)

    # Placeholders through which both training and validation batches are fed.
    train_data_node = tf.placeholder(tf.float32, shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 3])
    train_labels_node = tf.placeholder(tf.int64, shape=[FLAGS.batch_size, FLAGS.image_h, FLAGS.image_w, 1])
    phase_train = tf.placeholder(tf.bool, name='phase_train')

    # Symbolic tensors only; nothing is computed until sess.run().
    logits = model.inference(train_data_node, phase_train, FLAGS.batch_size, keep_probability)
    loss = model.cal_loss(logits, train_labels_node)
    # Op that trains the model on one batch of examples and updates the parameters.
    train_op = model.train(loss, global_step)

    with tf.Session() as sess:
        # Training iterations.
        for step in range(startstep, startstep + FLAGS.max_steps):
            image_batch, label_batch = sess.run(fetches=[images, labels])
            feed_dict = {
                train_data_node: image_batch,
                train_labels_node: label_batch,
                phase_train: True,
                keep_probability: 0.5,
            }

            _, loss_value = sess.run(fetches=[train_op, loss], feed_dict=feed_dict)

            if step % 10 == 0:
                # NOTE(review): `duration` is not defined anywhere in this
                # snippet; it presumably comes from timing code that was
                # elided from the post -- confirm before running.
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                # Per-class accuracy on the current training batch. This reuses
                # the training feed, so dropout (keep_probability=0.5) is
                # still active for this measurement.
                pred = sess.run(fetches=logits, feed_dict=feed_dict)

                Utils.per_class_acc(pred, label_batch)

            # Validate every 100 steps and on the final step. The final-step
            # check accounts for `startstep`, so it also fires when training
            # is resumed from a non-zero step.
            if step % 100 == 0 or (step + 1) == startstep + FLAGS.max_steps:
                # Run the validation dataset.
                total_val_loss = 0.0
                hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
                for test_step in range(TEST_ITER):
                    val_images_batch, val_labels_batch = sess.run(fetches=[val_images, val_labels])
                    # NOTE(review): validation feeds phase_train=True (the
                    # author's stated reason: mini-batches are still used in
                    # eval), whereas the separate test script feeds
                    # phase_train=False. The two evaluations therefore run the
                    # network in different modes.
                    val_feed_dict = {
                        train_data_node: val_images_batch,
                        train_labels_node: val_labels_batch,
                        phase_train: True,
                        # Dropout is turned off for evaluation: keep every activation.
                        keep_probability: 1.0,
                    }
                    _val_loss, _val_pred = sess.run(fetches=[loss, logits], feed_dict=val_feed_dict)


Code for testing the network:

# Dropout keep-probability placeholder; fed with 1.0 below so that dropout is
# disabled while testing.
keep_probability = tf.placeholder(tf.float32, name="keep_probabilty")

image_filenames, label_filenames = Inputs.get_filename_list(FLAGS.test_dir)
# NOTE(review): the image placeholder is sized with `testing_batch_size` while
# the label placeholder uses `FLAGS.test_batch_size`; confirm both hold the
# same value.
test_data_node = tf.placeholder(tf.float32, shape=[testing_batch_size, FLAGS.image_h, FLAGS.image_w, FLAGS.image_c])  # 360, 480, 3
test_labels_node = tf.placeholder(tf.int64, shape=[FLAGS.test_batch_size, FLAGS.image_h, FLAGS.image_w, 1])

phase_train = tf.placeholder(tf.bool, name='phase_train')

logits = model.inference(test_data_node, phase_train, testing_batch_size, keep_probability)
loss = model.cal_loss(logits, test_labels_node)
# Arg-max over the last (index 3) dimension of the logits.
pred = tf.argmax(logits, axis=3)

with tf.Session() as sess:
    # Load checkpoint.
    # NOTE(review): `saver` is not created in this snippet; presumably a
    # tf.train.Saver built elsewhere -- confirm.
    saver.restore(sess, FLAGS.model_ckpt_dir)

    images, labels = Inputs.get_all_test_data(image_filenames, label_filenames)
    threads = tf.train.start_queue_runners(sess=sess)
    hist = np.zeros((FLAGS.num_class, FLAGS.num_class))
    for image_batch, label_batch in zip(images, labels):
        # Maps graph elements to values.
        # NOTE(review): phase_train is False here, but True in the training
        # script's validation loop, so test and validation run the network in
        # different modes.
        feed_dict = {
            test_data_node: image_batch,
            test_labels_node: label_batch,
            phase_train: False,
            # Dropout is turned off for testing: keep every activation.
            keep_probability: 1.0,
        }

        dense_prediction, im = sess.run(fetches=[logits, pred], feed_dict=feed_dict)

The graph:

def inference(images, phase_train, batch_size, keep_prob):
  """Build the encoder-decoder graph and return the per-pixel class logits.

  The encoder is five groups of `conv_layer_with_bn` layers, each followed by
  dropout and a stride-2 max-pool. The decoder mirrors it with five
  dropout -> `deconv_layer` -> `conv_layer_with_bn` groups, and a final 1x1
  convolution maps the 64 feature channels to `FLAGS.num_class` channels.

  Args:
    images: input tensor; its dimension at index 3 is read as the number of
      input channels of the first convolution.
    phase_train: forwarded to every `conv_layer_with_bn` call and used as the
      `training` argument of every dropout layer.
    batch_size: first entry of the output shape handed to each `deconv_layer`.
    keep_prob: probability of keeping an activation; every dropout layer uses
      `rate = 1 - keep_prob`.

  Returns:
    The output of the final 1x1 convolution plus bias (no activation applied).
  """
  # ---- Encoder ----
  conv1_1 = conv_layer_with_bn(images, [7, 7, images.get_shape().as_list()[3], 64], phase_train, name="conv1_1")
  conv1_2 = conv_layer_with_bn(conv1_1, [7, 7, 64, 64], phase_train, name="conv1_2")
  dropout1 = tf.layers.dropout(conv1_2, rate=(1-keep_prob), training=phase_train, name="dropout1")
  # The arg-max indices returned by each pooling op are not used further below.
  pool1, pool1_indices = tf.nn.max_pool_with_argmax(dropout1, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool1')
  conv2_1 = conv_layer_with_bn(pool1, [7, 7, 64, 64], phase_train, name="conv2_1")
  conv2_2 = conv_layer_with_bn(conv2_1, [7, 7, 64, 64], phase_train, name="conv2_2")
  dropout2 = tf.layers.dropout(conv2_2, rate=(1-keep_prob), training=phase_train, name="dropout2")
  pool2, pool2_indices = tf.nn.max_pool_with_argmax(dropout2, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool2')
  conv3_1 = conv_layer_with_bn(pool2, [7, 7, 64, 64], phase_train, name="conv3_1")
  conv3_2 = conv_layer_with_bn(conv3_1, [7, 7, 64, 64], phase_train, name="conv3_2")
  conv3_3 = conv_layer_with_bn(conv3_2, [7, 7, 64, 64], phase_train, name="conv3_3")
  dropout3 = tf.layers.dropout(conv3_3, rate=(1-keep_prob), training=phase_train, name="dropout3")
  pool3, pool3_indices = tf.nn.max_pool_with_argmax(dropout3, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool3')
  conv4_1 = conv_layer_with_bn(pool3, [7, 7, 64, 64], phase_train, name="conv4_1")
  conv4_2 = conv_layer_with_bn(conv4_1, [7, 7, 64, 64], phase_train, name="conv4_2")
  conv4_3 = conv_layer_with_bn(conv4_2, [7, 7, 64, 64], phase_train, name="conv4_3")
  dropout4 = tf.layers.dropout(conv4_3, rate=(1-keep_prob), training=phase_train, name="dropout4")
  pool4, pool4_indices = tf.nn.max_pool_with_argmax(dropout4, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool4')
  conv5_1 = conv_layer_with_bn(pool4, [7, 7, 64, 64], phase_train, name="conv5_1")
  conv5_2 = conv_layer_with_bn(conv5_1, [7, 7, 64, 64], phase_train, name="conv5_2")
  conv5_3 = conv_layer_with_bn(conv5_2, [7, 7, 64, 64], phase_train, name="conv5_3")
  dropout5 = tf.layers.dropout(conv5_3, rate=(1-keep_prob), training=phase_train, name="dropout5")
  pool5, pool5_indices = tf.nn.max_pool_with_argmax(dropout5, ksize=[1, 2, 2, 1],
                                                    strides=[1, 2, 2, 1], padding='SAME', name='pool5')
  # ---- End of encoder ----

  # ---- Decoder ----
  # NOTE(review): the decoder passes an extra positional `True` to
  # conv_layer_with_bn; its meaning is defined by that helper, which is not
  # shown here.
  dropout5_decode = tf.layers.dropout(pool5, rate=(1-keep_prob), training=phase_train, name="dropout5_decode")
  upsample5 = deconv_layer(dropout5_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//16, FLAGS.image_w//16, 64], 2, "up5")
  conv_decode5_1 = conv_layer_with_bn(upsample5, [7, 7, 64, 64], phase_train, True, name="conv_decode5_1")
  conv_decode5_2 = conv_layer_with_bn(conv_decode5_1, [7, 7, 64, 64], phase_train, True, name="conv_decode5_2")
  conv_decode5_3 = conv_layer_with_bn(conv_decode5_2, [7, 7, 64, 64], phase_train, True, name="conv_decode5_3")

  dropout4_decode = tf.layers.dropout(conv_decode5_3, rate=(1-keep_prob), training=phase_train, name="dropout4_decode")
  upsample4 = deconv_layer(dropout4_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//8, FLAGS.image_w//8, 64], 2, "up4")
  conv_decode4_1 = conv_layer_with_bn(upsample4, [7, 7, 64, 64], phase_train, True, name="conv_decode4_1")
  conv_decode4_2 = conv_layer_with_bn(conv_decode4_1, [7, 7, 64, 64], phase_train, True, name="conv_decode4_2")
  conv_decode4_3 = conv_layer_with_bn(conv_decode4_2, [7, 7, 64, 64], phase_train, True, name="conv_decode4_3")

  dropout3_decode = tf.layers.dropout(conv_decode4_3, rate=(1-keep_prob), training=phase_train, name="dropout3_decode")
  upsample3 = deconv_layer(dropout3_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//4, FLAGS.image_w//4, 64], 2, "up3")
  conv_decode3_1 = conv_layer_with_bn(upsample3, [7, 7, 64, 64], phase_train, True, name="conv_decode3_1")
  conv_decode3_2 = conv_layer_with_bn(conv_decode3_1, [7, 7, 64, 64], phase_train, True, name="conv_decode3_2")
  conv_decode3_3 = conv_layer_with_bn(conv_decode3_2, [7, 7, 64, 64], phase_train, True, name="conv_decode3_3")

  dropout2_decode = tf.layers.dropout(conv_decode3_3, rate=(1-keep_prob), training=phase_train, name="dropout2_decode")
  upsample2 = deconv_layer(dropout2_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h//2, FLAGS.image_w//2, 64], 2, "up2")
  conv_decode2_1 = conv_layer_with_bn(upsample2, [7, 7, 64, 64], phase_train, True, name="conv_decode2_1")
  conv_decode2_2 = conv_layer_with_bn(conv_decode2_1, [7, 7, 64, 64], phase_train, True, name="conv_decode2_2")

  dropout1_decode = tf.layers.dropout(conv_decode2_2, rate=(1-keep_prob), training=phase_train, name="dropout1_deconv")
  upsample1 = deconv_layer(dropout1_decode, [2, 2, 64, 64], [batch_size, FLAGS.image_h, FLAGS.image_w, 64], 2, "up1")
  conv_decode1_1 = conv_layer_with_bn(upsample1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_1")
  conv_decode1_2 = conv_layer_with_bn(conv_decode1_1, [7, 7, 64, 64], phase_train, True, name="conv_decode1_2")
  # ---- End of decoder ----

  # ---- Classifier ----
  # 1x1 convolution producing FLAGS.num_class output channels.
  with tf.variable_scope('conv_classifier') as scope:
    shape = [1, 1, 64, FLAGS.num_class]
    # NOTE(review): in the posted snippet this call was cut off after the
    # initializer argument (no closing parenthesis). If
    # _variable_with_weight_decay takes further arguments (e.g. a weight-decay
    # value), restore them here.
    kernel = _variable_with_weight_decay(
        'weights', shape=shape,
        initializer=tf.contrib.layers.variance_scaling_initializer())  # alternative: orthogonal_initializer()

    conv = tf.nn.conv2d(conv_decode1_2, kernel, [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [FLAGS.num_class], tf.constant_initializer(0.0))
    # tf.nn.bias_add only adds the 1-D bias to the conv output; no activation
    # function is applied, so the result is the raw logits.
    conv_classifier = tf.nn.bias_add(conv, biases, name=scope.name)
  return conv_classifier

