I'm training a stereo matching network built as a Siamese CNN. Training works fine, but as soon as I set is_training = False, I get extremely high test errors, higher than those of a randomly initialized network. What I've tried so far:
- printed the individual layers before and after batch_norm --> batch norm is definitely applied during both training and testing. The normalized layers in testing mode also look similar to those in training mode, so the running averages do seem to be applied!
- printed the running averages after each iteration --> they definitely change (see the sketch right after this list)
- saved the model and restored it --> the batch_norm weights are definitely saved, and they are not just 1s and 0s but seem to make sense
- removed the normalization layer --> training works fine, but the error converges at 0.3, so batch_norm gives better results (at least in training mode) and converges about three times faster
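For reference, this is roughly how I read out the running averages (a minimal sketch; it assumes the statistics are the moving_mean / moving_variance variables that tf.contrib.layers.batch_norm creates, and that sess is the session from the training code below):

# Collect the batch-norm statistics variables by name.
bn_stats = [v for v in tf.global_variables()
            if 'moving_mean' in v.name or 'moving_variance' in v.name]
# Print their current values inside the training loop to see whether they move.
for var, value in zip(bn_stats, sess.run(bn_stats)):
    print(var.name, value.ravel()[:5])  # a few entries are enough to spot movement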
The training error starts around 3.2 and drops to 0.15 after 2000 iterations. The test error also starts around 3.2, goes down to 2.6, and then rises to 7. I should also mention that I train on a randomly generated set and test on that same set, so when I say "test error" I am evaluating on the training data with is_training = False. The difference between training and test error therefore comes purely from the training/testing mode of batch_norm, which is why I don't see how this could be an overfitting problem.
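To make the difference between the two modes concrete, here is a tiny numpy sketch of what I understand batch_norm to compute (decay=0.9 as in my code; this is only an illustration, not the actual TF implementation):

import numpy as np

x = np.random.randn(32, 8) * 3.0 + 5.0   # one batch of activations
moving_mean, moving_var = 0.0, 1.0       # running statistics start at 0 / 1
decay, eps = 0.9, 1e-3

# Training mode: normalize with the current batch statistics and
# update the running averages as a side effect.
batch_mean, batch_var = x.mean(axis=0), x.var(axis=0)
x_train = (x - batch_mean) / np.sqrt(batch_var + eps)
moving_mean = decay * moving_mean + (1 - decay) * batch_mean
moving_var = decay * moving_var + (1 - decay) * batch_var

# Testing mode (is_training=False): normalize with the running averages only.
x_test = (x - moving_mean) / np.sqrt(moving_var + eps)

print(x_train.mean(), x_train.std())  # ~0, ~1
print(x_test.mean(), x_test.std())    # far from 0 / 1 until the averages have converged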
I suspect it has something to do with the fact that I share the weights between the two branches of the Siamese network. But then it doesn't make sense that training works fine and that the test error decreases at the beginning.
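To see what the two branches actually end up sharing, I list the variables created under the siamese scope (a small sketch; the scope names follow from the code below):

# Because the right branch is built with reuse=True, it resolves to the exact same
# variables as the left branch, including the 'bn' moving statistics, so there is
# only one set of running averages that is updated from both left and right images.
for v in tf.global_variables():
    if v.name.startswith('siamese_network/conv1/'):
        print(v.name, v.shape)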
I would appreciate any kind of hint that could help. Thanks!
By the way, I'm using Python 3.5 and TensorFlow 1.2.0.
import numpy as np
import tensorflow as tf

def conv_relu(input, kernel_shape, bias_shape, phase, reuse, scope):
    with tf.variable_scope(scope, reuse=reuse):
        weights = tf.get_variable("weights", kernel_shape,
                                  initializer=tf.contrib.layers.xavier_initializer_conv2d())  # Xavier init
        biases = tf.get_variable("biases", bias_shape,
                                 initializer=tf.contrib.layers.xavier_initializer())
        conv = conv2d(input, weights)
        # Batch norm with learnable scale/offset; `phase` switches between the batch
        # statistics (training) and the moving averages (testing).
        normal = tf.contrib.layers.batch_norm(tf.nn.bias_add(conv, biases),
                                              center=True, scale=True,
                                              is_training=phase, decay=0.9,
                                              scope='bn')
        return tf.nn.relu(normal)
def network(input, reuse, disp):
    h1_ = conv_relu(input, [3, 3, n_channels, n_units], [n_units], phase, reuse, 'conv1')
    h2_ = conv_relu_pool(h1_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv2')
    h3_ = conv_relu(h2_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv3')
    h4_ = conv_relu_pool(h3_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv4')
    h5_ = conv_relu(h4_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv5')
    h6_ = conv_relu(h5_, [3, 3, n_units, n_units], [n_units], phase, reuse, 'conv6')
    h7_ = conv(h6_, [3, 3, n_units, n_units], [n_units], reuse, 'conv7')
    h8_ = deconv(h7_, [3, 3, n_units, n_units], [n_units],
                 [batch, HimSize, tf.cast(imSize/2 + (disp/2), tf.int32), n_units], reuse, 'conv8')
    h9_ = deconv(h8_, [3, 3, n_units, n_units], [n_units],
                 [batch, imSize, imSize + disp, n_units], reuse, 'conv9')
    return h9_
with tf.name_scope('ImLeft'):
    x_left = tf.placeholder(tf.float32, shape=[None, imSize, imSize, n_channels])
with tf.name_scope('ImRight'):
    x_right = tf.placeholder(tf.float32, shape=[None, imSize, imSize + MaxDisp, n_channels])
with tf.name_scope('Labels'):
    y_ = tf.placeholder(tf.float32, shape=[None, imSize, imSize])
    y_onehot = tf.one_hot(tf.cast(y_, tf.int32), n_classes, axis=3)

phase = tf.placeholder(tf.bool)  # True = training mode, False = testing mode for batch_norm

with tf.variable_scope("siamese_network") as scope:
    h9_left = network(input=x_left, reuse=False, disp=0)
    h9_right = network(input=x_right, reuse=True, disp=MaxDisp)  # shares all variables with the left branch
I then do some transformations and get the output layer:
output = tf.reshape(output_layer_t, [batch * imSize * imSize, n_classes])

with tf.name_scope('Loss'):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.reshape(y_onehot, (batch * imSize * imSize, n_classes)), logits=output))

# Make sure the batch_norm moving-average updates run together with each training step.
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_step = tf.train.AdamOptimizer(learn_rate).minimize(cross_entropy)
# Run the session
saver = tf.train.Saver()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
summary_writer = tf.summary.FileWriter(logs_path, graph=tf.get_default_graph())

for i in range(1000):
    # print(tf.get_collection(tf.GraphKeys.VARIABLES, scope="siamese_network"))
    idx = np.random.randint(160, size=batch)
    _, loss_, summary = sess.run([train_step, cross_entropy, merged_summary_op],
                                 feed_dict={x_left: trainImsLeft[idx, :, :].reshape(-1, imSize, imSize, 1),
                                            x_right: trainImsRight[idx, :, :].reshape(-1, imSize, imSize + MaxDisp, 1),
                                            y_: trainLabs[idx, :, :].reshape(-1, imSize, imSize),
                                            phase: True})
    summary_writer.add_summary(summary, i)

    if (i + 1) % 2 == 0:
        print('------', i)
        # Same data, training mode (batch statistics)
        loss1 = sess.run([cross_entropy],
                         feed_dict={x_left: trainImsLeft.reshape(-1, imSize, imSize, 1),
                                    x_right: trainImsRight.reshape(-1, imSize, imSize + MaxDisp, 1),
                                    y_: trainLabs.reshape(-1, imSize, imSize),
                                    phase: True})
        # Same data, testing mode (moving averages)
        loss0 = sess.run([cross_entropy],
                         feed_dict={x_left: trainImsLeft.reshape(-1, imSize, imSize, 1),
                                    x_right: trainImsRight.reshape(-1, imSize, imSize + MaxDisp, 1),
                                    y_: trainLabs.reshape(-1, imSize, imSize),
                                    phase: False})
        print('training loss: ', loss1)
        print('testing loss: ', loss0)

saver.save(sess, MODEL_FILENAME)