@@ -88,72 +88,72 @@ def load_tf_weights_in_convbert(model, config, tf_checkpoint_path):
     group_dense_name = "dense"
 
     for j in range(config.num_hidden_layers):
-        param_mapping[f"encoder.layer.{j}.attention.self.query.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/query/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.query.bias"] = (
-            f"electra/encoder/layer_{j}/attention/self/query/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.key.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/key/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.key.bias"] = (
-            f"electra/encoder/layer_{j}/attention/self/key/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.value.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/value/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.value.bias"] = (
-            f"electra/encoder/layer_{j}/attention/self/value/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.weight"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.self.conv_out_layer.bias"] = (
-            f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.output.dense.weight"] = (
-            f"electra/encoder/layer_{j}/attention/output/dense/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.weight"] = (
-            f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.output.dense.bias"] = (
-            f"electra/encoder/layer_{j}/attention/output/dense/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.attention.output.LayerNorm.bias"] = (
-            f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta"
-        )
-        param_mapping[f"encoder.layer.{j}.intermediate.dense.weight"] = (
-            f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.intermediate.dense.bias"] = (
-            f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.output.dense.weight"] = (
-            f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel"
-        )
-        param_mapping[f"encoder.layer.{j}.output.dense.bias"] = (
-            f"electra/encoder/layer_{j}/output/{group_dense_name}/bias"
-        )
-        param_mapping[f"encoder.layer.{j}.output.LayerNorm.weight"] = (
-            f"electra/encoder/layer_{j}/output/LayerNorm/gamma"
-        )
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.query.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/query/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.query.bias"
+        ] = f"electra/encoder/layer_{j}/attention/self/query/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.key.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/key/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.key.bias"
+        ] = f"electra/encoder/layer_{j}/attention/self/key/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.value.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/value/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.value.bias"
+        ] = f"electra/encoder/layer_{j}/attention/self/value/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.key_conv_attn_layer.depthwise.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/depthwise_kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.key_conv_attn_layer.pointwise.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/pointwise_kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.key_conv_attn_layer.bias"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_key/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.conv_kernel_layer.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.conv_kernel_layer.bias"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_kernel/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.conv_out_layer.weight"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.self.conv_out_layer.bias"
+        ] = f"electra/encoder/layer_{j}/attention/self/conv_attn_point/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.output.dense.weight"
+        ] = f"electra/encoder/layer_{j}/attention/output/dense/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.attention.output.LayerNorm.weight"
+        ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/gamma"
+        param_mapping[
+            f"encoder.layer.{j}.attention.output.dense.bias"
+        ] = f"electra/encoder/layer_{j}/attention/output/dense/bias"
+        param_mapping[
+            f"encoder.layer.{j}.attention.output.LayerNorm.bias"
+        ] = f"electra/encoder/layer_{j}/attention/output/LayerNorm/beta"
+        param_mapping[
+            f"encoder.layer.{j}.intermediate.dense.weight"
+        ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.intermediate.dense.bias"
+        ] = f"electra/encoder/layer_{j}/intermediate/{group_dense_name}/bias"
+        param_mapping[
+            f"encoder.layer.{j}.output.dense.weight"
+        ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/kernel"
+        param_mapping[
+            f"encoder.layer.{j}.output.dense.bias"
+        ] = f"electra/encoder/layer_{j}/output/{group_dense_name}/bias"
+        param_mapping[
+            f"encoder.layer.{j}.output.LayerNorm.weight"
+        ] = f"electra/encoder/layer_{j}/output/LayerNorm/gamma"
         param_mapping[f"encoder.layer.{j}.output.LayerNorm.bias"] = f"electra/encoder/layer_{j}/output/LayerNorm/beta"
 
     for param in model.named_parameters():
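
For context, a minimal sketch of how a mapping like param_mapping is then consumed by the loop that follows the hunk, assuming a TF checkpoint on disk. The function and argument names (copy_tf_weights, checkpoint_path) are illustrative, not part of this diff, and the real ConvBERT converter also reshapes the depthwise/pointwise conv kernels, which is omitted here for brevity:

# Hedged sketch only: PyTorch parameter name -> TF variable name lookup,
# then a load-and-copy per parameter.
from operator import attrgetter

import torch


def copy_tf_weights(model, param_mapping, checkpoint_path):
    import tensorflow as tf  # only needed for the one-off conversion

    for param_name, _ in model.named_parameters():
        tf_name = param_mapping[param_name]  # e.g. ".../query/kernel"
        array = tf.train.load_variable(checkpoint_path, tf_name)
        value = torch.from_numpy(array)
        # TF stores dense kernels as (in_features, out_features);
        # torch.nn.Linear weights are (out_features, in_features).
        if tf_name.endswith("/kernel"):
            value = value.T
        attrgetter(param_name)(model).data = value

The dot-separated PyTorch names built above double as attribute paths, which is why a plain attrgetter suffices to reach each parameter.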