Commit f2186be: Initial commit
philkr committed Dec 29, 2015
Showing 5 changed files with 353 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
**/__pycache__
**/*.pyc
caffe
*.kdev4
26 changes: 26 additions & 0 deletions LICENSE
@@ -0,0 +1,26 @@
Copyright (c) 2016, Philipp Krähenbühl
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of the FreeBSD Project.
35 changes: 35 additions & 0 deletions README.md
@@ -0,0 +1,35 @@
# Data-dependent initialization of convolutional neural networks

Created by Philipp Krähenbühl.

### Introduction

This code implements the initialization presented in our [arXiv tech report](http://arxiv.org/abs/1511.06856), which is under submission at ICLR 2016.

*This is a reimplementation and currently work in progress. Use at your own risk.*

### License

This code is released under the BSD License (refer to the LICENSE file for details).

### Citing

If you find our initialization useful in your research, please consider citing:

    @article{krahenbuhl2015data,
      title={Data-dependent Initializations of Convolutional Neural Networks},
      author={Kr{\"a}henb{\"u}hl, Philipp and Doersch, Carl and Donahue, Jeff and Darrell, Trevor},
      journal={arXiv preprint arXiv:1511.06856},
      year={2015}
    }

### Setup

Check out the project and create a symlink to caffe in the `magic_init` directory:
```Shell
ln -s path/to/caffe caffe
```

### Examples

*Will follow soon*
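In the meantime, a minimal invocation sketch (paths are hypothetical; assumes a compiled pycaffe is importable, and note that only `-t elwise` is actually implemented in this commit):
```Shell
python magic_init.py deploy.prototxt init.caffemodel -d 'images/*.jpg' -t elwise -nit 10
```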
61 changes: 61 additions & 0 deletions load.py
@@ -0,0 +1,61 @@
import caffe

def parseProtoString(s):
    from google.protobuf import text_format
    from caffe.proto import caffe_pb2 as pb
    proto_net = pb.NetParameter()
    text_format.Merge(s, proto_net)
    return proto_net


def get_param(l, exclude=set(['top', 'bottom', 'name', 'type'])):
    # Recursively convert a protobuf message into a dict of keyword arguments,
    # skipping the fields in `exclude`; repeated fields become lists
    if not hasattr(l, 'ListFields'):
        if hasattr(l, '__delitem__'):
            return list(l)
        return l
    r = dict()
    for f, v in l.ListFields():
        if f.name not in exclude:
            r[f.name] = get_param(v, [])
    return r

class ProtoDesc:
    def __init__(self, prototxt):
        self.prototxt = prototxt
        self.parsed_proto = parseProtoString(open(self.prototxt, 'r').read())
        # Guess the input dimension
        self.input_dim = (3, 227, 227)
        net = self.parsed_proto
        if len(net.input_dim) > 0:
            self.input_dim = net.input_dim[1:]
        else:
            # Fall back to the crop size of the first transform_param found
            lrs = net.layer
            cs = [l.transform_param.crop_size for l in lrs
                  if l.HasField('transform_param')]
            if len(cs):
                self.input_dim = (3, cs[0], cs[0])

    def __call__(self, clip=None, **inputs):
        from caffe import layers as L
        from collections import OrderedDict
        net = self.parsed_proto
        blobs = OrderedDict(inputs)
        for l in net.layer:
            if l.name not in inputs:
                in_place = l.top == l.bottom
                param = get_param(l)
                assert all([b in blobs for b in l.bottom]), \
                    "Some bottoms not found: " + ', '.join([b for b in l.bottom if b not in blobs])
                tops = getattr(L, l.type)(*[blobs[b] for b in l.bottom],
                                          ntop=len(l.top), in_place=in_place,
                                          name=l.name, **param)
                if len(l.top) <= 1:
                    tops = [tops]
                for i, t in enumerate(l.top):
                    blobs[t] = tops[i]
            if l.name == clip:
                break
        return list(blobs.values())[-1]
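# Usage sketch (hypothetical file name): ProtoDesc replays a prototxt into a
# NetSpec graph, optionally rebinding input blobs by layer name:
#   import load
#   from caffe import NetSpec
#   model = load.ProtoDesc('deploy.prototxt')
#   net = NetSpec()
#   net.out = model()  # or model(data=net.data, label=net.label)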


227 changes: 227 additions & 0 deletions magic_init.py
@@ -0,0 +1,227 @@

INPUT_LAYERS = ['Data', 'ImageData']
PARAMETER_LAYERS = ['Convolution', 'InnerProduct']
SUPPORTED_LAYERS = ['ReLU', 'Sigmoid', 'LRN', 'Pooling']
# Use 'Dropout' at your own risk
# Unless Jon merges #2865, 'Split' cannot be supported
UNSUPPORTED_LAYERS = ['Split']

def forward(net, i, NIT, data, output_names):
    # Create the top data if needed
    output = {t: [None]*NIT for t in output_names}
    for it in range(NIT):
        for b in data:
            net.blobs[b].data[...] = data[b][it]
        net._forward(i, i)
        for t in output_names:
            output[t][it] = 1*net.blobs[t].data  # 1* forces a copy
    return output
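# Usage sketch (hypothetical blob names, shapes, and layer index): `data` maps
# each bottom blob to a list of NIT batches; the result has the same layout
# per requested top:
#   data = {'data': [np.random.randn(16, 3, 227, 227) for _ in range(10)]}
#   conv1 = forward(net, 1, 10, data, ['conv1'])['conv1']  # list of 10 arrays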

def flattenData(data):
    # Merge a list of (N, C, ...) batches into one (C, N*...) matrix, one row
    # per channel
    import numpy as np
    return np.concatenate([d.swapaxes(0, 1).reshape((d.shape[1], -1)) for d in data], axis=1)

def gatherInputData(net, layer_id, bottom_data, top_name):
    # This function gathers all input data.
    # In order not to replicate all the internal functionality of convolutions
    # (e.g. padding, ...), we gather the data in the output space and use
    # random Gaussian weights. The output of this function is W and D, where
    # the input data is I = D * W^-1 [with some abuse of tensor notation].
    # If we now compute an initialization A for D, we simply multiply A by W
    # to obtain the proper initialization in the input space.
    import numpy as np
    l = net.layers[layer_id]
    NIT = len(list(bottom_data.values())[0])
    # How many times do we need to over-sample to get a full basis
    # (out of random projections)?
    OS = int(np.ceil(np.prod(l.blobs[0].data.shape[1:]) / l.blobs[0].data.shape[0]))
    # Note: this could cause some memory issues in the FC layers
    W, D = [], []
    for i in range(OS):
        d = l.blobs[0].data
        d[...] = np.random.normal(0, 1, d.shape)
        W.append(1*d)
        D.append(np.concatenate(forward(net, layer_id, NIT, bottom_data, [top_name])[top_name], axis=0))
    return np.concatenate(W, axis=0), np.concatenate(D, axis=1)
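# Worked example (hypothetical conv layer): for weights of shape (64, 64, 3, 3)
# the input space has 64*3*3 = 576 dimensions, but a single random draw only
# projects it onto 64 outputs, so OS = ceil(576/64) = 9 random weight draws
# are stacked to span a (roughly) full basis.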

def initializeWeight(D, type, N_OUT):
    import numpy as np  # needed here: this module has no top-level numpy import
    # TODO: Compute the initialization using D [pca, zca, kmeans, ...]
    return np.random.normal(0, 1, (N_OUT, D.shape[1]))


def initializeLayer(net, layer_id, bottom_data, top_name, bias=0, type='elwise'):
    import numpy as np
    l = net.layers[layer_id]
    NIT = len(list(bottom_data.values())[0])

    for p in l.blobs:
        p.data[...] = 0
    # Initialize the weights [k-means, ...]
    if type == 'elwise':
        d = l.blobs[0].data
        d[...] = np.random.normal(0, 1, d.shape)
    else:  # Use the input data
        # Gather the input data
        T, D = gatherInputData(net, layer_id, bottom_data, top_name)

        # Figure out the output dimensionality of d
        d = l.blobs[0].data

        # Prepare the data: (NIT*N, OS*N_out, ...) -> (samples, OS*N_out)
        D = D.swapaxes(0, 1).reshape((D.shape[1], -1)).T

        # Compute the weights
        W = initializeWeight(D, type, N_OUT=d.shape[0])

        # Multiply the weights by the random basis
        # NOTE: This matrix multiplication is a bit large; if it's too slow,
        # reduce the oversampling in gatherInputData
        d[...] = np.dot(W, T.reshape((T.shape[0], -1))).reshape(d.shape)

    # Scale the mean and initialize the bias
    top_data = forward(net, layer_id, NIT, bottom_data, [top_name])[top_name]
    flat_data = flattenData(top_data)
    mu = flat_data.mean(axis=1)
    std = flat_data.std(axis=1)
    l.blobs[0].data[...] /= std.reshape((-1,)+(1,)*(len(l.blobs[0].data.shape)-1))
    for b in l.blobs[1:]:
        b.data[...] = -mu / std + bias
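# Why the rescaling above works (sketch): Convolution/InnerProduct are linear,
# so dividing the weights by the per-channel std and setting the bias to
# -mu/std + bias maps an output channel with statistics (mu_j, std_j) to one
# with mean ~= bias and std ~= 1 over the sampled batches.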

def magicInitialize(net, bias=0, NIT=10, type='elwise', bottom_names={}, top_names={}):
    # Visit layers in prototxt order; initialize each parameter layer from the
    # activations produced by the (already initialized) layers below it, and
    # keep activations alive only while a later layer still consumes them
    import numpy as np
    # In which layer was a certain blob first produced
    first_produced = {}
    # When was a blob last used
    last_used = {}
    # Make sure all layers are supported, and compute the range each blob is used in
    for i, (n, l) in enumerate(zip(net._layer_names, net.layers)):
        if l.type in UNSUPPORTED_LAYERS:
            print("WARNING: Layer type '%s' not supported! Things might go very wrong..." % l.type)
        elif l.type not in SUPPORTED_LAYERS + PARAMETER_LAYERS + INPUT_LAYERS:
            print("Unknown layer type '%s'. Double-check that it is supported." % l.type)
        for t in top_names[n]:
            if t not in first_produced:
                first_produced[t] = i
        for b in bottom_names[n]:
            last_used[b] = i

    active_data = {}
    # Read all the input data
    for i, (n, l) in enumerate(zip(net._layer_names, net.layers)):
        # Initialize the layer
        if len(l.blobs) > 0:
            assert l.type in PARAMETER_LAYERS, "Unsupported parameter layer"
            assert len(top_names[n]) == 1, "Exactly one output supported"
            if np.sum(np.abs(l.blobs[0].data)) <= 1e-10:
                # Fill the parameters
                initializeLayer(net, i, {b: active_data[b] for b in bottom_names[n]},
                                top_names[n][0], bias, type)

        # TODO: Estimate and rescale the values [TODO: Record and undo this scaling above]

        # Run the network forward
        new_data = forward(net, i, NIT, {b: active_data[b] for b in bottom_names[n]}, top_names[n])
        active_data.update(new_data)

        # Delete all unused data
        for k in list(active_data):
            if k not in last_used or last_used[k] == i:
                del active_data[k]

        print('%-3d %-10s\t%-10s' % (i, n, l.type), '\t\t', ', '.join(list(active_data)))
        print([np.mean(np.abs(d)) for d in active_data.values()])



def netFromString(s, t=None):
    import caffe
    from tempfile import NamedTemporaryFile
    if t is None:
        t = caffe.TEST
    f = NamedTemporaryFile('w')
    f.write(s)
    f.flush()
    r = caffe.Net(f.name, t)
    f.close()
    return r

def layerTypes(net_proto):
    return {l.name: l.type for l in net_proto.layer}

def layerTops(net_proto):
    return {l.name: list(l.top) for l in net_proto.layer}

def layerBottoms(net_proto):
    return {l.name: list(l.bottom) for l in net_proto.layer}

def getFileList(f):
    from glob import glob
    from os import path
    return [p for p in glob(f) if path.isfile(p)]

def main():
    from argparse import ArgumentParser
    from os import path

    parser = ArgumentParser()
    parser.add_argument('prototxt')
    parser.add_argument('output_caffemodel')
    parser.add_argument('-l', '--load', help='Load a pretrained model and rescale it [bias and type are not supported]')
    parser.add_argument('-d', '--data', default=None, help='Image list to use [default prototxt data]')
    parser.add_argument('-b', '--bias', type=float, default=0.1, help='Bias')
    parser.add_argument('-t', '--type', default='elwise', help='Type: elwise, pca, zca, kmeans, rand (random input patches)')
    parser.add_argument('-z', action='store_true', help='Zero all weights and reinitialize')
    parser.add_argument('-cs', action='store_true', help='Correct for scaling')
    parser.add_argument('-q', action='store_true', help='Quiet execution')
    parser.add_argument('-s', type=float, default=1.0, help='Scale the input [only custom data "-d"]')
    parser.add_argument('-bs', type=int, default=16, help='Batch size [only custom data "-d"]')
    parser.add_argument('-nit', type=int, default=10, help='Number of iterations')
    parser.add_argument('--gpu', type=int, default=0, help='Which GPU to run on')
    args = parser.parse_args()

    if args.q:
        from os import environ
        environ['GLOG_minloglevel'] = '2'
    import caffe, load
    from caffe import NetSpec, layers as L

    caffe.set_mode_gpu()
    if args.gpu is not None:
        caffe.set_device(args.gpu)

    model = load.ProtoDesc(args.prototxt)
    net = NetSpec()
    if args.data is not None:
        fl = getFileList(args.data)
        if len(fl) == 0:
            print("Unknown data type for '%s'" % args.data)
            exit(1)
        from tempfile import NamedTemporaryFile
        f = NamedTemporaryFile('w')
        f.write('\n'.join([path.abspath(i) + ' 0' for i in fl]))
        f.flush()
        net.data, net.label = L.ImageData(source=f.name, batch_size=args.bs,
                                          new_width=model.input_dim[-1],
                                          new_height=model.input_dim[-1],
                                          transform_param=dict(mean_value=[104, 117, 123], scale=args.s),
                                          ntop=2)
        net.out = model(data=net.data, label=net.label)
    else:
        net.out = model()

    net_proto = net.to_proto()
    n = netFromString('force_backward:true\n' + str(net_proto), caffe.TRAIN)
    layer_top = layerTops(net_proto)
    layer_bottoms = layerBottoms(net_proto)

    if args.load is not None:
        n.copy_from(args.load)
        # TODO: Rescale the existing layers? (magicFix is not implemented yet)

    if args.z:
        # Zero out all layers
        for l in n.layers:
            for b in l.blobs:
                b.data[...] = 0

    magicInitialize(n, args.bias, NIT=args.nit, type=args.type, top_names=layer_top, bottom_names=layer_bottoms)
    if args.cs:
        # NOTE: calibrateGradientRatio is not defined anywhere in this commit,
        # so passing -cs will raise a NameError
        calibrateGradientRatio(n)
    n.save(args.output_caffemodel)

if __name__ == "__main__":
    main()
