|
| 1 | +# |
| 2 | +# Copyright 2024 National Technology & Engineering Solutions of Sandia, LLC |
| 3 | +# (NTESS). Under the terms of Contract DE-NA0003525 with NTESS, the U.S. |
| 4 | +# Government retains certain rights in this software. |
| 5 | +# |
| 6 | +# See LICENSE for full license details |
| 7 | +# |
| 8 | + |
| 9 | +""" |
| 10 | +Utility functions for calibrating input ranges for crossbar inputs and ADCs |
| 11 | +based on profiled data. Compatible with both PyTorch and Keras interfaces. |
| 12 | +These calibration methods are not guaranteed to be optimal. |
| 13 | +""" |
| 14 | + |
import os

import numpy as np
import numpy.typing as npt
from scipy.optimize import minimize

from simulator.backend import ComputeBackend
from simulator.parameters.core_parameters import BitSlicedCoreStyle, CoreStyle
# Shared array namespace: the functions in this module call their array
# routines (asarray, percentile, max/min, linalg.norm, ...) through `xp`.
xp = ComputeBackend()
| 22 | + |
| 23 | + |
def calibrate_input_limits(
    all_xbar_inputs: list,
    Nbits: int = 0,
    norm_ord: float = 1.0,
) -> npt.ArrayLike:
    """Optimizes the input range for all layers in a network given profiled
    input values. This function is intended for use with ResNet CNNs where
    all but the first layer is preceded by a ReLU, so the lower limit of
    every range is fixed at 0.

    Args:
        all_xbar_inputs: list of arrays, each array contains profiled input
            values for a layer
        Nbits: quantization resolution used in optimizer
            Set to 0 to set range based on max profiled value
        norm_ord: power of the error norm used for the loss function in optimizer
    Returns:
        NumPy array of shape (number of layers, 2) containing the (min, max)
        range for the inputs of every layer
    """

    input_limits = np.zeros((len(all_xbar_inputs), 2))

    for k, layer_inputs in enumerate(all_xbar_inputs):
        # Work on a flat array so that the error norm used by the optimizer is
        # a vector norm regardless of how the profiled data was shaped.
        xbar_inputs_k = xp.asarray(layer_inputs).ravel()

        if Nbits > 0:
            # Optimize the input percentile, parameterized as
            # 100 * (1 - 10**eta), starting from eta = -4 (99.99%).
            result = minimize(
                quantizationError_ReLU,
                -4,
                args=(xbar_inputs_k, Nbits, norm_ord),
                method="nelder-mead",
                tol=0.1,
            )
            percentile_max = 100 * (1 - pow(10, result.x[0]))
            # The optimizer is unconstrained; apply the same [0, 100] clipping
            # as the loss function so the percentile call cannot be handed an
            # out-of-range value.
            percentile_max = min(max(percentile_max, 0.0), 100.0)
            xmax = xp.percentile(xbar_inputs_k, percentile_max)
        else:
            xmax = xp.max(xbar_inputs_k)

        # Column 0 (the lower limit) stays at its initial value of 0.
        input_limits[k, 1] = float(xmax)

    return input_limits
| 68 | + |
| 69 | + |
def calibrate_adc_limits(
    analog_layers: list,
    all_adc_inputs: list,
    Nbits: int = 0,
    norm_ord: float = 1.0,
    bitslice_pct: float = 99.99,
    Nrows=None,
) -> npt.ArrayLike:
    """Optimizes the ADC input range for all layers in a network given profiled
    input values.

    Args:
        analog_layers: list of Torch analog modules or Keras analog layers containing
            params that will be used to decide how to calibrate
        all_adc_inputs: list of arrays, each array contains profiled input
            values for a layer
        Nbits: quantization resolution used in optimizer
            Set to 0 to set range based on max profiled value
        norm_ord: power of the error norm used for the loss function in optimizer
            (Used for unsliced core only)
        bitslice_pct: desired percentile coverage of input distribution that is used to
            find ADC ranges. (Used for bitsliced core only)
        Nrows: row count forwarded to optimize_adc_limits_bitsliced, given either
            as a single number applied to every layer or as a sequence indexed
            by layer position. (Used for bitsliced core only)
    Returns:
        If no layer uses a bit sliced core: NumPy array of shape
        (number of layers, 2) containing the (min, max) range of every layer.
        Otherwise: list with one entry per layer, where a bit sliced layer's
        entry is the array returned by optimize_adc_limits_bitsliced and an
        unsliced layer's entry is its (min, max) array.
    Raises:
        ValueError: if a bit sliced layer is encountered and Nrows was not given.
    """

    n_layers = len(all_adc_inputs)
    is_sliced = [
        layer.params.core.style == CoreStyle.BITSLICED for layer in analog_layers
    ]

    # Decide the container from ALL layers rather than only the first one, so a
    # network that mixes sliced and unsliced layers does not fail on assignment.
    if any(is_sliced):
        # Allows non-uniform bit slice width across layers
        adc_limits = [None] * n_layers
    else:
        adc_limits = np.zeros((n_layers, 2))

    for k, layer in enumerate(analog_layers):
        adc_inputs_k = xp.asarray(all_adc_inputs[k])

        if not is_sliced[k]:
            adc_limits[k] = optimize_adc_limits_unsliced(
                adc_inputs_k, Nbits=Nbits, norm_ord=norm_ord
            )
            continue

        # NOTE(review): this branch previously read a name `Nrows` that was
        # never defined anywhere in the module. Where the row count lives on a
        # layer object is not visible from this file, so it is taken from the
        # caller; replace with the layer attribute if one exists.
        if Nrows is None:
            raise ValueError(
                "Nrows must be provided to calibrate ADC limits for layers "
                "that use a bit sliced core."
            )
        nrows_k = Nrows if np.isscalar(Nrows) else Nrows[k]

        adc_limits[k] = optimize_adc_limits_bitsliced(
            adc_inputs_k,
            layer.params.core.bit_sliced.num_slices,
            style=layer.params.core.bit_sliced.style,
            Nrows=nrows_k,
            pct=bitslice_pct,
        )

    return adc_limits
| 123 | + |
| 124 | + |
def optimize_adc_limits_unsliced(
    adc_inputs_k: npt.ArrayLike,
    Nbits: int = 0,
    norm_ord: float = 1.0,
) -> npt.ArrayLike:
    """Optimizes the ADC input range for one layer which does not use weight
    bit slicing.

    Args:
        adc_inputs_k: profiled ADC input values for the layer (any shape; the
            values are flattened before use)
        Nbits: quantization resolution used in optimizer
            Set to 0 to set range based on min/max profiled value
        norm_ord: power of the error norm used for the loss function in optimizer
    Returns:
        NumPy array containing the (min, max) range for the layer
    """

    # Although input bit slices are profiled separately, the current calibration
    # method does not resolve data by input bit.
    # Converting first lets plain lists be passed, as the annotation allows.
    adc_inputs_k = xp.asarray(adc_inputs_k).flatten()

    if Nbits > 0:
        # Optimize the lower/upper percentiles, parameterized as
        # 100 * 10**eta and 100 * (1 - 10**eta), both starting at eta = -4.
        result = minimize(
            quantizationError_minMax,
            (-4, -4),
            args=(adc_inputs_k, Nbits, norm_ord),
            method="nelder-mead",
            tol=0.1,
        )
        eta_min, eta_max = result.x
        percentile_min = 100 * pow(10, eta_min)
        percentile_max = 100 * (1 - pow(10, eta_max))
        # The optimizer is unconstrained; apply the same [0, 100] clipping as
        # the loss function before asking for the percentiles.
        percentile_min = min(max(percentile_min, 0.0), 100.0)
        percentile_max = min(max(percentile_max, 0.0), 100.0)
        xmin = xp.percentile(adc_inputs_k, percentile_min)
        xmax = xp.percentile(adc_inputs_k, percentile_max)
    else:
        xmin = xp.min(adc_inputs_k)
        xmax = xp.max(adc_inputs_k)

    return np.array([float(xmin), float(xmax)])
| 157 | + |
| 158 | + |
def optimize_adc_limits_bitsliced(
    adc_inputs_k: npt.ArrayLike,
    num_slices: int = 2,
    style: int = BitSlicedCoreStyle.BALANCED,
    Nrows: int = 1,
    pct: float = 99.99,
) -> npt.ArrayLike:
    """
    Optimizes the ADC input range for one layer which uses weight bit slicing.
    To reduce the overhead of bit slice digital post-processing, this method ensures
    that the ratio of the ADC limits of any two bit slices must be a power of 2.

    Args:
        adc_inputs_k: profiled ADC input values for the layer, indexed by
            weight bit slice along the first axis
        num_slices: number of weight bit slices
        style: bit slicing style; OFFSET is not supported
        Nrows: value the profiled inputs are divided by before analysis; the
            returned limits are +/- Nrows scaled by a power of 2
        pct: percentile of the input distribution that the limits must cover
    Returns:
        NumPy array of shape (num_slices, 2) containing the (min, max) ADC
        range of every bit slice
    Raises:
        NotImplementedError: if style is BitSlicedCoreStyle.OFFSET
    """

    # NOTE: Although input bit slices are profiled separately, the current calibration
    # method does not resolve data by input bit

    if style == BitSlicedCoreStyle.OFFSET:
        raise NotImplementedError(
            "ADC limits auto-calibration with weight bit slicing OFFSET "
            + "style not been implemented yet."
        )

    adc_limits_k = np.zeros((num_slices, 2))

    for i_slice in range(num_slices):
        # Out-of-place division: an in-place `/=` fails when the profiled data
        # has an integer dtype.
        adc_inputs_ik = adc_inputs_k[i_slice, :, :].flatten() / Nrows

        # Find the percentile extreme values of the ADC input distribution
        p_neg = xp.percentile(adc_inputs_ik, 100 - pct)
        p_pos = xp.percentile(adc_inputs_ik, pct)
        p_out = float(xp.maximum(xp.abs(p_neg), xp.abs(p_pos)))

        # Compute how much the ADC limits can be divided from the maximum possible,
        # and still cover the percentile extreme values
        if p_out == 0:
            # Nothing to cover (log2(1/0) is undefined): keep the full range.
            clip_power_i = 0
        else:
            clip_power_i = int(np.floor(np.log2(1.0 / p_out)))

        # A float power keeps this valid when clip_power_i is negative
        # (p_out > 1, where an integer power would raise) or very large
        # (where an integer power would overflow).
        limit_i = Nrows * 2.0 ** (-clip_power_i)
        adc_limits_k[i_slice, 0] = -limit_i
        adc_limits_k[i_slice, 1] = limit_i

    return adc_limits_k
| 199 | + |
| 200 | + |
| 201 | + |
def quantizationError_ReLU(eta, x, Nbits, norm_ord):
    """Quantizes values over a range from 0 to a high percentile value of
    the data and returns the resulting quantization error. The percentile is
    only applied on large positive values, assuming ReLU activation is used.

    Args:
        eta: parameter that controls the percentile used for clipping
            (to be optimized); the percentile is 100 * (1 - 10**eta).
            May be a scalar or a one-element array.
        x: data values to be quantized
        Nbits: quantization resolution in bits
        norm_ord: power of the error norm used for the loss function
    Returns:
        Norm of the difference between x and its clipped, quantized version,
        as a Python float
    """

    # scipy's minimize hands the parameter over as a one-element array while
    # direct callers may pass a plain number; reduce both to a float scalar.
    eta = np.asarray(eta, dtype=float).ravel()[0]

    # Clip
    P = 100 * (1 - pow(10, eta))
    P = min(max(P, 0.0), 100.0)
    x_min = 0  # assume ReLU
    x_maxP = xp.percentile(x, P)
    x_clipped = x.clip(x_min, x_maxP)

    # Quantize
    span = x_maxP - x_min
    if span > 0:
        qmult = (2**Nbits - 1) / span
        x_Q = (x_clipped - x_min) * qmult
        x_Q = xp.rint(x_Q, out=x_Q)
        x_Q /= qmult
        x_Q += x_min
    else:
        # Degenerate range (e.g. all-zero data): every value maps onto the
        # single clipping level. Dividing by the zero-width range would
        # instead turn the loss into NaN.
        x_Q = x_clipped

    # Flatten so the norm is always a vector norm; for 2-D data linalg.norm
    # would otherwise compute a matrix norm.
    err = xp.linalg.norm((x - x_Q).ravel(), ord=norm_ord)
    return float(err)
| 231 | + |
| 232 | + |
def quantizationError_minMax(etas, x, Nbits, norm_ord):
    """Quantizes values over a range bounded by a lower and an upper
    percentile of the data and returns the resulting quantization error.

    Args:
        etas: pair of parameters that control the lower and upper percentile
            used for clipping (to be optimized); the percentiles are
            100 * 10**etas[0] and 100 * (1 - 10**etas[1])
        x: data values to be quantized
        Nbits: quantization resolution in bits
        norm_ord: power of the error norm used for the loss function
    Returns:
        Norm of the difference between x and its clipped, quantized version,
        as a Python float
    """

    # Accept a tuple, list or array and work with float scalars from here on.
    etaMin, etaMax = np.asarray(etas, dtype=float).ravel()

    # Clip
    P_min = min(max(100 * pow(10, etaMin), 0.0), 100.0)
    P_max = min(max(100 * (1 - pow(10, etaMax)), 0.0), 100.0)
    x_min = xp.percentile(x, P_min)
    x_max = xp.percentile(x, P_max)
    x_clipped = x.clip(x_min, x_max)

    # Quantize
    span = x_max - x_min
    if span > 0:
        qmult = (2**Nbits - 1) / span
        x_Q = (x_clipped - x_min) * qmult
        x_Q = xp.rint(x_Q, out=x_Q)
        x_Q /= qmult
        x_Q += x_min
    else:
        # Degenerate range (e.g. constant data): every value maps onto the
        # single clipping level. Dividing by the zero-width range would
        # instead turn the loss into NaN.
        x_Q = x_clipped

    # Flatten so the norm is always a vector norm; for 2-D data linalg.norm
    # would otherwise compute a matrix norm.
    err = xp.linalg.norm((x - x_Q).ravel(), ord=norm_ord)
    return float(err)
0 commit comments