-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathGradient_analysis.py
366 lines (286 loc) · 13.9 KB
/
Gradient_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created by
@author Jacob Goldman-Wetzler
Modified by
@author Joel Hayford
"""
# ================ Load libraries and dependencies =========================
import tensorflow as tf
import matplotlib.pyplot as plt
import deepxde as dde
import numpy as np
import matplotlib.patches as mpatches
from matplotlib.colors import ListedColormap
import pandas as pd
import matplotlib as mpl
# Default font size for every figure created after this point (some plotting
# sections further down override it again via plt.rcParams).
mpl.rcParams['font.size'] = 18
# Seed value handed to DeepXDE's random-seed setter on the next line.
SEED = 0xdde
dde.config.set_random_seed(SEED)
# ============================================================================
# ======================= Define functions =================================
# ============================================================================
def f1(x):
    """Evaluate the target function ``x * sin(5 * x)``.

    ``x`` may be a scalar or a NumPy array; the expression is applied
    element-wise through ``np.sin``.
    """
    angle = 5 * x
    return x * np.sin(angle)
def get_gradients_of_weights(model16):
    """Return the gradients of the MSE training loss as one flat array.

    The network of ``model16`` is evaluated on the model's own training
    inputs, the "MSE" loss from ``dde.losses`` is taken against the training
    targets, and the loss is differentiated with respect to
    ``model16.net.trainable_weights``. The per-variable gradients are
    flattened and concatenated in the order TensorFlow returns them.

    Despite the parameter name, nothing here depends on the model's
    precision; the function is used for the float32 model as well.
    """
    mse = dde.losses.get("MSE")
    inputs, targets = model16.data.train_x, model16.data.train_y
    with tf.GradientTape() as tape:
        # The forward pass and the loss must both run under the tape so that
        # they are recorded for differentiation.
        loss = mse(targets, model16.net.call(inputs))
    per_variable = tape.gradient(loss, model16.net.trainable_weights)
    flat_parts = [grad.numpy().ravel() for grad in per_variable]
    return np.concatenate(flat_parts)
def get_weights(model16):
    """Return every weight array of ``model16.net`` as one flat 1-D array.

    The arrays come from ``model16.net.get_weights()``; each one is flattened
    and they are concatenated in the order in which they are returned.
    """
    flat_parts = []
    for array in model16.net.get_weights():
        flat_parts.append(array.flatten())
    return np.concatenate(flat_parts)
def cos_sim_and_dist_of_vectors(g16, g32):
    """Compare two vectors by cosine similarity and Euclidean distance.

    Parameters
    ----------
    g16, g32 : array-like
        1-D vectors of equal length (in this script: the float16 and the
        float32 gradients or weights of the same iteration).

    Returns
    -------
    csim : float
        Cosine similarity of the two vectors, or NaN when either vector has
        zero norm (the similarity is undefined in that case).
    dist : float
        Euclidean (L2) distance between the two vectors.
    """
    # Promote both inputs to float64 before reducing. When the norm of a
    # float16 vector is taken in its own dtype, the sum of squares is rounded
    # back to float16 and underflows to 0 for small entries (e.g. values
    # around 1e-4), which turns the similarity into inf/NaN even though the
    # vectors themselves are perfectly representable.
    v16 = np.asarray(g16, dtype=np.float64)
    v32 = np.asarray(g32, dtype=np.float64)

    def cosine_similarity(vector1, vector2):
        denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        if denominator == 0:
            # Undefined for a zero vector; return NaN explicitly instead of
            # relying on a 0/0 division and its RuntimeWarning.
            return np.nan
        return np.dot(vector1, vector2) / denominator

    csim = cosine_similarity(v16, v32)
    dist = np.linalg.norm(v16 - v32)
    return csim, dist
class SaveGradientsCallback(dde.callbacks.Callback):
    """Callback that records the weights and loss gradients of its model.

    Every call to :meth:`on_epoch_begin` appends one ``(weights, gradients)``
    tuple to ``list_of_weights``; both entries are the flat arrays produced by
    ``get_weights`` and ``get_gradients_of_weights``.
    """

    def __init__(self):
        super().__init__()
        # Despite its name, each element is a (weights, gradients) pair.
        self.list_of_weights = []

    def on_epoch_begin(self):
        """Snapshot the current weights, then the gradients at those weights."""
        snapshot = (get_weights(self.model), get_gradients_of_weights(self.model))
        self.list_of_weights.append(snapshot)
# ============================================================================
# =============== Create the float32 and float16 models ====================
# ============================================================================
# Build one model per precision with the same recipe. The default float type
# is switched *before* anything is constructed so that the geometry, data and
# network of each model are created under the intended precision.
built = {}
for precision in ('float32', 'float16'):
    dde.config.set_default_float(precision)
    geom = dde.geometry.Interval(-1, 1)
    data = dde.data.Function(geom, f1, 16, 100)
    net = dde.nn.FNN([1, 10, 10, 1], "tanh", "Glorot uniform")
    built[precision] = dde.Model(data, net)
    built[precision].compile("adam", lr=0.001, metrics=["l2 relative error"])
    # Zero-iteration run; presumably forces the network variables to be
    # created so the weights can be copied below -- TODO confirm.
    built[precision].train(iterations=0)
model32, model16 = built['float32'], built['float16']
# ========== start both models from the same (float32-derived) weights =======
print(model32.net.__dict__)
for i, layer in enumerate(model32.net.denses):
    # Cast every array of the float32 layer to float16, then load the result
    # into the layer at the same position in the float16 network.
    half_weights = [tf.cast(w, dtype=tf.float16) for w in layer.get_weights()]
    model16.net.denses[i].set_weights(half_weights)
# ============================================================================
# =============== Train the float32 and float16 models =====================
# ============================================================================
# Train the float32 model first, then the float16 model. For each one the
# default float type is set before a fresh recording callback is created and
# the 10,000-iteration run is started.
recorders = {}
for precision, model in (('float32', model32), ('float16', model16)):
    dde.config.set_default_float(precision)
    recorders[precision] = SaveGradientsCallback()
    # Only the results of the last (float16) run stay bound to these names.
    losshistory, train_state = model.train(
        iterations=10_000,
        callbacks=[recorders[precision]],
    )
cback32, cback16 = recorders['float32'], recorders['float16']
# ============================================================================
# ========================== Process the data ==============================
# ============================================================================
# Per-iteration comparison metrics, filled by the processing loop below:
# first the ones for the gradients, then the ones for the weights.
cos_similarity_grad, grad_mags, dist_grad = [], [], []
cos_similarity_weights, mags_weights, dist_weights = [], [], []
def adjust_gradients_for_float16(gradients, learning_rate):
    """Return ``gradients`` scaled by ``learning_rate``.

    The product is computed as ``gradients * learning_rate`` with no further
    processing; for a NumPy array the scaling is element-wise.
    """
    return gradients * learning_rate
def adjust_gradients_for_float32(gradients, learning_rate):
    """Return ``gradients`` scaled by ``learning_rate``.

    The product is computed as ``gradients * learning_rate`` with no further
    processing; for a NumPy array the scaling is element-wise.
    """
    return gradients * learning_rate
# Learning-rate-scaled gradients and raw weights, one entry per recorded
# snapshot, for each precision.
adjusted_gradients_16_all = []
adjusted_gradients_32_all = []
weights_16_all = []
weights_32_all = []
# Pre-set the counters so the summary prints after the loop still work when
# no snapshot was recorded (the loop body would then never assign them).
zero_count_16 = nan_count_16 = 0
zero_count_32 = nan_count_32 = 0
for (weights16, grads16), (weights32, grads32) in zip(cback16.list_of_weights, cback32.list_of_weights):
    # calculate the metrics for the gradients
    csim, dist = cos_sim_and_dist_of_vectors(grads16, grads32)
    # The norms are reduced in float64: taking the norm of a float16 vector
    # in its own dtype rounds the sum of squares to float16, which underflows
    # for small gradients and would distort the magnitude curves.
    grad_mags.append([
        np.linalg.norm(np.asarray(grads16, dtype=np.float64)),
        np.linalg.norm(np.asarray(grads32, dtype=np.float64)),
    ])
    cos_similarity_grad.append(csim)
    dist_grad.append(dist)
    # calculate the metrics for the weights
    csim, dist = cos_sim_and_dist_of_vectors(weights16, weights32)
    mags_weights.append([
        np.linalg.norm(np.asarray(weights16, dtype=np.float64)),
        np.linalg.norm(np.asarray(weights32, dtype=np.float64)),
    ])
    cos_similarity_weights.append(csim)
    dist_weights.append(dist)
    weights_16_all.append(weights16)
    weights_32_all.append(weights32)
    # Scale the gradients by the learning rate used for training and count
    # how many entries end up exactly zero or NaN. The counts are vectorised;
    # they are overwritten every iteration, so the values printed after the
    # loop describe the last snapshot only.
    adjusted_gradients_16 = adjust_gradients_for_float16(grads16, learning_rate=0.001)
    adjusted_gradients_16_all.append(adjusted_gradients_16)
    zero_count_16 = np.count_nonzero(adjusted_gradients_16 == 0)
    nan_count_16 = np.count_nonzero(np.isnan(adjusted_gradients_16))
    adjusted_gradients_32 = adjust_gradients_for_float32(grads32, learning_rate=0.001)
    adjusted_gradients_32_all.append(adjusted_gradients_32)
    zero_count_32 = np.count_nonzero(adjusted_gradients_32 == 0)
    nan_count_32 = np.count_nonzero(np.isnan(adjusted_gradients_32))
print(f"Gradients_16 adjusted to zero: {zero_count_16}")
print(f"Gradients_16 adjusted to NaN: {nan_count_16}")
# Convert the lists of NumPy arrays into lists of lists, replacing NaNs with a string for clarity.
# NOTE: mixing the string 'NaN' with numbers makes np.where return a string
# array, so every value in a row is stored as text.
grads_16_lists = [list(np.where(np.isnan(grad), 'NaN', grad)) for grad in adjusted_gradients_16_all]
grads_32_lists = [list(np.where(np.isnan(grad), 'NaN', grad)) for grad in adjusted_gradients_32_all]
# Convert lists of lists into Pandas DataFrames (one row per snapshot)
df_grads_16 = pd.DataFrame(grads_16_lists)
df_grads_32 = pd.DataFrame(grads_32_lists)
# Save to CSV
df_grads_16.to_csv('adjusted_gradients_float16.csv', index=False, header=False)
df_grads_32.to_csv('adjusted_gradients_float32.csv', index=False, header=False)
print(f"Gradients_32 adjusted to zero: {zero_count_32}")
print(f"Gradients_32 adjusted to NaN: {nan_count_32}")
# ============================================================================
# ======================= Visualize the results ============================
# ============================================================================
# Column 0 holds the float16 values, column 1 the float32 values.
grad_mags = np.array(grad_mags)
mags_weights = np.array(mags_weights)
# One integer x position per recorded snapshot. Deriving the axis from the
# data keeps it in step with however many snapshots were collected and avoids
# the fractional spacing of np.linspace(0, 10_000, 10_000).
epochaxis = np.arange(len(grad_mags))
fig, ax = plt.subplots()
ax.plot(epochaxis, grad_mags[:, 1], 'r', label="Float32")
ax.plot(epochaxis, grad_mags[:, 0], 'b', label="Float16")
ax.set_yscale('log')
plt.xlabel('No. of iterations')
plt.ylabel('$L^2$ norm of gradients')
plt.xlim(left=0)
plt.legend(loc=(0.5, 0.8), frameon=False)
# Set the linewidth of the figure border to 1.5
for axis in ['top', 'bottom', 'left', 'right']:
    plt.gca().spines[axis].set_linewidth(1.5)
plt.savefig("mags1632grads.pdf", format='pdf', bbox_inches='tight')
plt.show()
iterations = range(len(adjusted_gradients_16_all))
# Calculate mean absolute gradient at each iteration
mean_abs_grads_16 = [np.mean(np.abs(grads)) for grads in adjusted_gradients_16_all]
mean_abs_grads_32 = [np.mean(np.abs(grads)) for grads in adjusted_gradients_32_all]
# NOTE(review): the colours are swapped relative to the figure above
# (float16 is red here, blue there) -- confirm whether that is intended.
plt.figure(figsize=(12, 6))
plt.plot(iterations, mean_abs_grads_16, label='Mean Abs Gradient Float16', color='red')
plt.plot(iterations, mean_abs_grads_32, label='Mean Abs Gradient Float32', color='blue')
plt.xlabel('Iterations')
plt.ylabel('Mean Absolute Gradient')
plt.title('Mean Absolute Adjusted Gradients Over Iterations')
plt.legend()
plt.grid(True)
plt.savefig("zero_grads.pdf", format='pdf', bbox_inches='tight')
plt.show()
# Function to create and plot binary heatmap
def plot_binary_heatmap(adjusted_gradients_all, filename):
    """Plot which gradient components are exactly zero at each iteration.

    Parameters
    ----------
    adjusted_gradients_all : sequence of 1-D arrays
        One gradient vector per iteration. Shorter vectors are padded with
        zeros up to the longest one.
    filename : str
        Path the figure is written to (always in PDF format).

    Raises
    ------
    ValueError
        If ``adjusted_gradients_all`` is empty.

    Notes
    -----
    Rows of the image are parameter indices, columns are iterations. Red
    marks a non-zero component and blue a zero one. The function also sets
    the global ``plt.rcParams['font.size']`` to 24.
    """
    num_iterations = len(adjusted_gradients_all)
    if num_iterations == 0:
        raise ValueError("adjusted_gradients_all must contain at least one gradient vector")
    max_len = max(len(grad) for grad in adjusted_gradients_all)
    binary_grads = np.zeros((max_len, num_iterations))
    for i, grad in enumerate(adjusted_gradients_all):
        # Set to 1 if gradient component is not zero, otherwise leave as 0
        binary_grads[:len(grad), i] = np.where(grad != 0, 1, 0)
    cmap = ListedColormap(['blue', 'red'])
    # Plotting
    plt.rcParams['font.size'] = 24
    plt.figure(figsize=(10, 8))
    # Pin the colour range to [0, 1]. With automatic scaling a matrix that is
    # entirely 1 (no zero gradients at all) is drawn in the first colour and
    # would be shown as "Zero derivative".
    plt.imshow(binary_grads, aspect='auto', cmap=cmap, interpolation='nearest', vmin=0, vmax=1)
    # Create custom legends
    red_patch = mpatches.Patch(color='red', label='Non-zero derivative')
    blue_patch = mpatches.Patch(color='blue', label='Zero derivative')
    plt.legend(handles=[red_patch, blue_patch], loc='upper center', bbox_to_anchor=(0.5, 1.09), ncol=2, frameon=False, fontsize=20)
    plt.ylabel('Network parameter index')
    plt.xlabel('No. of iterations')
    # Span the recorded iterations instead of a hard-coded 10000.
    plt.xlim(0, num_iterations)
    # Label the rows starting from 1 instead of 0
    plt.yticks(ticks=np.arange(0, max_len, step=20), labels=np.arange(1, max_len + 1, step=20))
    plt.savefig(filename, format='pdf', bbox_inches='tight')
    plt.show()
def plot_weight_updates_binary_heatmap(weights_all, filename):
    """Plot which weights changed between consecutive snapshots.

    Parameters
    ----------
    weights_all : sequence of 1-D arrays
        One flat weight vector per iteration.
    filename : str
        Path the figure is written to (always in PDF format).

    Raises
    ------
    ValueError
        If ``weights_all`` is empty.

    Notes
    -----
    Rows of the image are parameter indices; column ``i - 1`` compares
    snapshot ``i`` with snapshot ``i - 1``. Red marks a weight whose value
    differs from the previous snapshot, blue one that is identical. The
    function also sets the global ``plt.rcParams['font.size']`` to 24.
    """
    num_iterations = len(weights_all)
    if num_iterations == 0:
        raise ValueError("weights_all must contain at least one weight vector")
    max_len = max(len(weights) for weights in weights_all)
    # Initialize the binary matrix (one column per pair of consecutive snapshots)
    binary_updates = np.zeros((max_len, num_iterations - 1))
    for i in range(1, num_iterations):
        # Calculate the difference between consecutive weight arrays
        diff = weights_all[i][:len(weights_all[i - 1])] - weights_all[i - 1]
        # Mark as 1 (update occurred) if difference is non-zero
        binary_updates[:len(diff), i - 1] = np.where(diff != 0, 1, 0)
    cmap = ListedColormap(['blue', 'red'])
    # Plotting
    plt.rcParams['font.size'] = 24
    plt.figure(figsize=(10, 8))
    # Pin the colour range to [0, 1]. With automatic scaling a matrix that is
    # entirely 1 (every weight updated at every step) is drawn in the first
    # colour and would be shown as "Not updated".
    plt.imshow(binary_updates, aspect='auto', cmap=cmap, interpolation='nearest', vmin=0, vmax=1)
    # Create custom legends
    red_patch = mpatches.Patch(color='red', label='Updated')
    blue_patch = mpatches.Patch(color='blue', label='Not updated')
    plt.legend(handles=[red_patch, blue_patch], loc='upper center', bbox_to_anchor=(0.5, 1.09), ncol=2, frameon=False, fontsize=20)
    plt.ylabel('Network parameter index')
    plt.xlabel('No. of iterations')
    # Span the recorded iterations instead of a hard-coded 10000.
    plt.xlim(0, num_iterations)
    plt.yticks(ticks=np.arange(0, max_len, step=20), labels=np.arange(1, max_len + 1, step=20))
    plt.savefig(filename, format='pdf', bbox_inches='tight')
    plt.show()
# Quick sanity check of the recorded float16 weight history.
print(type(weights_16_all))  # Should be <class 'list'>
if weights_16_all:
    print(type(weights_16_all[0]))  # Should be <class 'numpy.ndarray'> or similar array type
    print(weights_16_all[0].shape)  # Should show the shape of the array, e.g., (100,) for 100 weights

# Zero / non-zero map of the scaled gradients, float16 first, then float32.
plot_binary_heatmap(adjusted_gradients_16_all, 'binary_float16_grads_heatmap.pdf')
plot_binary_heatmap(adjusted_gradients_32_all, 'binary_float32_grads_heatmap.pdf')

# Updated / not-updated map of the float16 weights.
plot_weight_updates_binary_heatmap(weights_16_all, 'binary_float16_weights_heatmap.pdf')
# Percentage of weights that did not change between consecutive snapshots.
# rtol is set to 0 explicitly: np.isclose defaults to rtol=1e-5, which would
# count any weight that moved by less than 0.001 % of its value as "not
# updated" and disagree with the exact comparison used for the heatmap.
mean_weights_16 = [
    np.mean(np.isclose(current, previous, rtol=0, atol=1e-10)) * 100
    for previous, current in zip(weights_16_all, weights_16_all[1:])
]
mean_weights_32 = [
    np.mean(np.isclose(current, previous, rtol=0, atol=1e-10)) * 100
    for previous, current in zip(weights_32_all, weights_32_all[1:])
]
# Percentage of learning-rate-scaled gradient components that are exactly zero
mean_grads = [(np.mean(grad == 0) * 100) for grad in adjusted_gradients_16_all]
mean_grads_32 = [(np.mean(grad == 0) * 100) for grad in adjusted_gradients_32_all]
# x axes: the weight statistics start at 1 because each value compares a
# snapshot with its predecessor; the gradient statistics start at 0.
iterations1 = list(range(1, len(weights_16_all)))
iterations = list(range(len(mean_grads)))
# Plot, per precision, the share of weights that were not updated together
# with the share of zero gradient components. The two figures differ only in
# their data, lower y limit, corner label, legend position and output name.
# NOTE(review): neither output name carries a ".pdf" suffix although the
# format is PDF, unlike every other figure in this script -- confirm whether
# that is intended.
for tag, pct_not_updated, pct_zero_grad, y_floor, legend_loc, out_name in (
    ('float16', mean_weights_16, mean_grads, 0, 'right', 'weight percentages'),
    ('float32', mean_weights_32, mean_grads_32, -2, 'upper right', 'weight percentages_float32'),
):
    plt.rcParams['font.size'] = 24
    plt.figure(figsize=(10, 8))
    plt.plot(iterations1, pct_not_updated, label="Not updated", color='red')
    plt.plot(iterations, pct_zero_grad, label="Zero derivative", color='blue')
    plt.ylabel('Network parameters (%)')
    plt.xlabel('No. of iterations')
    # Set the x-axis to start at 0
    plt.xlim(left=0)
    plt.ylim(y_floor, 100)
    # Boxed precision label in the upper-left corner of the axes
    plt.text(
        0.04, 0.95, tag,
        transform=plt.gca().transAxes,
        fontsize=24,
        verticalalignment='top',
        bbox=dict(facecolor='none', edgecolor='black', boxstyle='round,pad=0.5'),
    )
    leg1 = plt.legend(loc=legend_loc, frameon=False)
    plt.savefig(out_name, format='pdf', bbox_inches='tight')
    plt.show()