mini batch

GuangxiaoSong · GuangxiaoSong · commit 2b8703a735c1 · 2016-12-10T12:53:21.000+08:00
diff --git a/0503_1_tf_TFrecords_input.py b/0503_1_tf_TFrecords_input.py
@@ -29,11 +29,11 @@ def read_and_decode(filename):
 
     return X, y
 
-img, label = read_and_decode("data/merge/scat_data_test.tfrecords")
+img, label = read_and_decode("data/tvtsets/test_scat_data.tfrecords")
 
 #使用shuffle_batch可以随机打乱输入
 img_batch, label_batch = tf.train.shuffle_batch([img, label],
-                                                batch_size=2, capacity=2000,
+                                                batch_size=20, capacity=2000,
                                                 min_after_dequeue=1000)
 init = tf.global_variables_initializer()
 
diff --git a/0503_2_tf_TFrecords_single_input.py b/0503_2_tf_TFrecords_single_input.py
@@ -0,0 +1,63 @@
+# -*- coding:utf-8 -*-
+
+"""
+@author: Songgx
+@file: 0503_2_tf_TFrecords_single_input.py
+@time: 12/1/16 7:33 PM
+"""
+
+from __future__ import print_function
+import tensorflow as tf
+
+# https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/
+
+def read_and_decode_single_example(filename):
+    """Build graph ops that read one example per run from a TFRecords file.
+
+    Args:
+        filename: path of the TFRecords file to read.
+
+    Returns:
+        (label, audio): a scalar int64 label tensor and the float32
+        'features' values as a SparseTensor (parsed as variable-length).
+    """
+    # A filename queue lets the data set be split across several files;
+    # num_epochs=None cycles through the list indefinitely.
+    filename_queue = tf.train.string_input_producer([filename], num_epochs=None)
+    # The reader is symbolic: each run of `read` yields the next record.
+    reader = tf.TFRecordReader()
+    _, serialized_example = reader.read(filename_queue)
+    # The sibling scripts read this same file with the key 'features'.
+    features = tf.parse_single_example(
+        serialized_example,
+        features={
+            'label': tf.FixedLenFeature([], tf.int64),
+            'features': tf.VarLenFeature(tf.float32)
+        })
+    return features['label'], features['features']
+
+# Symbolic label and audio tensors; nothing is read until they are run.
+label, audio = read_and_decode_single_example("data/tvtsets/test_scat_data.tfrecords")
+
+sess = tf.Session()
+
+# Required; the note at the end of this file explains why.
+init = tf.global_variables_initializer()
+sess.run(init)
+tf.train.start_queue_runners(sess=sess)
+
+# Each run of the same tensors pulls the next record from the reader.
+# First example from the file:
+label_val_1, audio_val_1 = sess.run([label, audio])
+# Second example from the file:
+label_val_2, audio_val_2 = sess.run([label, audio])
+
+'''
+The fact that this works requires a fair bit of effort behind the scenes.
+First, it is important to remember that TensorFlow’s graphs contain state.
+It is this state that allows the TFRecordReader to remember the location of the tfrecord
+it’s reading and always return the next one. This is why for almost all TensorFlow work
+we need to initialize the graph. We can use the helper function tf.global_variables_initializer(),
+which constructs an op that initializes the state on the graph when you run it.
+
+'''
diff --git a/0503_3_tf_TFrecords_minibatch_input.py b/0503_3_tf_TFrecords_minibatch_input.py
@@ -0,0 +1,76 @@
+# -*- coding:utf-8 -*-
+
+"""
+@author: Songgx
+@file: 0503_3_tf_TFrecords_minibatch_input.py
+@time: 12/1/16 7:33 PM
+"""
+
+from __future__ import print_function
+import tensorflow as tf
+
+# https://indico.io/blog/tensorflow-data-inputs-part1-placeholders-protobufs-queues/
+
+def read_and_decode(filename):
+    """Return symbolic (X, y) for one example read from a TFRecords file.
+
+    X is the 8660-element float32 'features' vector and y is the 'label'
+    cast to int32.
+    """
+    queue = tf.train.string_input_producer([filename])
+    _, record = tf.TFRecordReader().read(queue)
+    # Both fields have a known, fixed length; tf.VarLenFeature would be the
+    # alternative for fields whose length is not known in advance.
+    spec = {
+        'label': tf.FixedLenFeature([], tf.int64),
+        'features': tf.FixedLenFeature([8660], tf.float32),
+    }
+    parsed = tf.parse_single_example(record, features=spec)
+    X = tf.cast(parsed['features'], tf.float32)
+    return X, tf.cast(parsed['label'], tf.int32)
+
+img, label = read_and_decode("data/tvtsets/test_scat_data.tfrecords")
+
+# shuffle_batch draws randomly shuffled mini-batches from the example queue.
+img_batch, label_batch = tf.train.shuffle_batch([img, label],
+                                                batch_size=20, capacity=2000,
+                                                min_after_dequeue=1000)
+
+# Simple model: a single linear layer producing 10 class logits.
+w = tf.get_variable("w1", [8660, 10])
+y_pred = tf.matmul(img_batch, w)
+# Keyword arguments are accepted by old releases and required from TF 1.0 on.
+loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_pred,
+                                                      labels=label_batch)
+
+# loss_mean is only used for monitoring.
+loss_mean = tf.reduce_mean(loss)
+train_op = tf.train.AdamOptimizer().minimize(loss)
+
+sess = tf.Session()
+# Build the init op only after every variable (w and the Adam slots) exists.
+init = tf.global_variables_initializer()
+sess.run(init)
+tf.train.start_queue_runners(sess=sess)
+
+for i in range(200):
+    # Batches come from the input queue, so nothing is passed via feed_dict.
+    _, loss_val = sess.run([train_op, loss_mean])
+    print(loss_val)
+'''
+with tf.Session() as sess:
+    sess.run(init)
+    coord = tf.train.Coordinator()
+    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
+    try:
+        for i in range(10):
+            val, l= sess.run([img_batch, label_batch])
+            print(val[-10:], l)
+    except tf.errors.OutOfRangeError:
+        print ('Done reading')
+    finally:
+        coord.request_stop()
+
+    coord.join(threads)
+    sess.close()
+'''
diff --git a/0504_tf_full_connect_NN.py b/0504_tf_full_connect_NN.py
@@ -24,20 +24,42 @@ def count_column_num(fname, field_delim):
         # the last column is the class number -->  -1
         return len(line)
 
+
 def dense_to_one_hot(labels_dense, num_classes=10):
     """Convert class labels from scalars to one-hot vectors."""
     num_labels = labels_dense.shape[0]
     index_offset = np.arange(num_labels) * num_classes
     labels_one_hot = np.zeros((num_labels, num_classes))
     labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
     return labels_one_hot
+
+
+def read_and_decode(filename):
+    """Return symbolic (X, y) for one example read from a TFRecords file.
+
+    X is the 8660-element float32 'features' vector and y is the 'label'
+    cast to int32.
+    """
+    queue = tf.train.string_input_producer([filename])
+    _, record = tf.TFRecordReader().read(queue)
+    # Both fields have a known, fixed length; tf.VarLenFeature would be the
+    # alternative for fields whose length is not known in advance.
+    spec = {
+        'label': tf.FixedLenFeature([], tf.int64),
+        'features': tf.FixedLenFeature([8660], tf.float32),
+    }
+    parsed = tf.parse_single_example(record, features=spec)
+    X = tf.cast(parsed['features'], tf.float32)
+    return X, tf.cast(parsed['label'], tf.int32)
+
+
 # Parameters
 learning_rate = 0.001
 training_epochs = 10000
 display_step = 1
 num_threads = 4
-csv_file_path = "data/merge/scat_data.txt"
-training_file_path = "data/merge/scat_data.tfrecords"
+csv_file_path = "data/tvtsets/training_scat_data.txt"
+training_file_path = "data/tvtsets/training_scat_data.tfrecords"
 column_num = count_column_num(csv_file_path, " ")
 # file_length = file_len(csv_file_path)
 # Network Parameters
@@ -106,8 +128,6 @@ def multilayer_perceptron(x, weights, biases):
             features_array = np.reshape(features_array, (1, n_input))
             label_array = dense_to_one_hot(np.array([label]), num_classes = n_classes)
 
-            with open("0504_log.txt", "w") as f:
-                f.write("features: {}, label: {}".format(features_array, label_array))
             _, c = sess.run([optimizer, cost], feed_dict={x: features_array, y: label_array})
         # Display logs per epoch step
         if epoch % display_step == 0:
diff --git a/data/0203_convert_to_TFrecords.py b/data/0203_convert_to_TFrecords.py
@@ -17,8 +17,8 @@
 # 2,5,3,3,6,7,7,5,1,1
 
 
-def convert_tfrecords(input_filename, output_filename):
-    current_path = os.getcwd() + "/merge/"
+def convert_tfrecords(input_filename, output_filename, data_folder):
+    current_path = os.getcwd() + data_folder
     input_file = os.path.join(current_path, input_filename)
     output_file = os.path.join(current_path, output_filename)
     print("Start to convert {} to {}".format(input_file, output_file))
@@ -43,4 +43,14 @@ def convert_tfrecords(input_filename, output_filename):
     print("Successfully convert {} to {}".format(input_file, output_file))
 
 
-convert_tfrecords("scat_data_test.txt", "scat_data_test.tfrecords")
+# convert_tfrecords("scat_data_test.txt", "scat_data_test.tfrecords", "/merge/")
+
+if __name__ == "__main__":
+    # Convert every .txt file under tvtsets/ into a .tfrecords file.
+    for root, dirs, files in os.walk("tvtsets"):
+        for fn in files:
+            if fn.endswith(".txt"):
+                # Name relative to tvtsets/ so files in subfolders resolve too.
+                txt_name = os.path.relpath(os.path.join(root, fn), "tvtsets")
+                tfrecords_name = txt_name[:-len(".txt")] + ".tfrecords"
+                convert_tfrecords(txt_name, tfrecords_name, "/tvtsets/")