 from tests.tests_tensorflow import generate_data_tf


-def assign_feature_value(
-    dataset: tf.data.Dataset, feature_key: str, value: int
+def assign_value_to_column(
+    dataset: tf.data.Dataset, column_name: str, value: int
 ) -> tf.data.Dataset:
-    """Assign a value to a feature for every sample in a tf.data.Dataset
+    """Assign a value to a column for every sample in a tf.data.Dataset

     Args:
         dataset (tf.data.Dataset): tf.data.Dataset to assign the value to
-        feature_key (str): Feature to assign the value to
+        column_name (str): Column to assign the value to
         value (int): Value to assign

     Returns:
         tf.data.Dataset
     """
     assert isinstance(dataset.element_spec, dict), "dataset elements must be dicts"

-    def assign_value_to_feature(x):
-        x[feature_key] = value
+    def assign_value_to_column(x):
+        x[column_name] = value
         return x

-    dataset = dataset.map(assign_value_to_feature)
+    dataset = dataset.map(assign_value_to_column)
     return dataset
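For reviewers skimming the rename: a minimal sketch of how `assign_value_to_column` is meant to be used on a dict-structured dataset (the toy dataset below is made up for illustration):

```python
import tensorflow as tf

# Hypothetical dict-structured dataset; the helper asserts element_spec is a dict.
ds = tf.data.Dataset.from_tensor_slices({"x": [1.0, 2.0, 3.0]})
ds = assign_value_to_column(ds, "label", 0)  # adds/overwrites a "label" column
for item in ds.take(1):
    print(item["x"].numpy(), item["label"].numpy())  # -> 1.0 0
```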
@@ -72,24 +72,24 @@ def get_dataset_length(dataset: tf.data.Dataset) -> int:
     return int(cardinality)


-def get_feature_from_ds(dataset: tf.data.Dataset, feature_key: str) -> np.ndarray:
-    """Get a feature from a tf.data.Dataset
+def get_column_from_ds(dataset: tf.data.Dataset, column_name: str) -> np.ndarray:
+    """Get a column from a tf.data.Dataset

     !!! note
         This function can be a bit time consuming since it needs to iterate
         over the whole dataset.

     Args:
-        dataset (tf.data.Dataset): tf.data.Dataset to get the feature from
-        feature_key (str): Feature value to get
+        dataset (tf.data.Dataset): tf.data.Dataset to get the column from
+        column_name (str): Column value to get

     Returns:
-        np.ndarray: Feature values for dataset
+        np.ndarray: Column values for dataset
     """
-    features = dataset.map(lambda x: x[feature_key])
-    features = list(features.as_numpy_iterator())
-    features = np.array(features)
-    return features
+    columns = dataset.map(lambda x: x[column_name])
+    columns = list(columns.as_numpy_iterator())
+    columns = np.array(columns)
+    return columns


 def test_instanciate_tf_datahandler():
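As the `!!! note` warns, `get_column_from_ds` materializes the column by iterating over the entire dataset, so it is suited to small test datasets. A round-trip sketch on made-up data:

```python
import numpy as np
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices({"x": [1, 2, 3], "label": [0, 1, 0]})
labels = get_column_from_ds(ds, "label")  # iterates the full dataset
assert isinstance(labels, np.ndarray)
assert labels.tolist() == [0, 1, 0]
```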
@@ -179,11 +179,11 @@ def test_load_tensorflow_datasets(dataset_name, train):
     # dummy item
     for item in dataset.take(1):
         dummy_item = item
-    dummy_keys = list(dummy_item.keys())
+    dummy_columns = list(dummy_item.keys())
     dummy_shapes = [v.shape for v in dummy_item.values()]

-    # check keys
-    assert list(dataset.element_spec.keys()) == dummy_keys == ["image", "label"]
+    # check columns
+    assert list(dataset.element_spec.keys()) == dummy_columns == ["image", "label"]

     # check output shape
     assert (
@@ -230,18 +230,18 @@ def test_load_arrays_and_custom(x_shape, num_labels, num_samples, one_hot):
     for dataset_id in [tuple_np, dict_np, tuple_tf, dict_tf, tensor_ds_tf]:
         ds = handler.load_dataset(dataset_id, columns=["key_a", "key_b"])

-        # check registered keys, shapes
-        output_keys = list(ds.element_spec.keys())
+        # check registered columns, shapes
+        output_columns = list(ds.element_spec.keys())
         output_shapes = [ds.element_spec[key].shape for key in ds.element_spec.keys()]
-        assert output_keys == ["key_a", "key_b"]
+        assert output_columns == ["key_a", "key_b"]
         assert output_shapes == [
             tf.TensorShape(x_shape),
             tf.TensorShape([num_labels] if one_hot else []),
         ]
-        # check item keys, shapes
+        # check item columns, shapes
         for item in ds.take(1):
             dummy_item = item
-        assert list(dummy_item.keys()) == output_keys
+        assert list(dummy_item.keys()) == output_columns
         assert list(map(lambda x: x.shape, dummy_item.values())) == output_shapes
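The assertions above lean on a standard tf.data property: a dataset built from a dict exposes `element_spec` as a dict of `tf.TensorSpec`s, so column names and per-sample shapes can be checked without iterating. A self-contained sketch with made-up shapes:

```python
import numpy as np
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(
    {"key_a": np.zeros((10, 32, 32, 3)), "key_b": np.zeros((10,))}
)
# Per-sample specs: the leading batch dimension (10) is sliced away.
assert list(ds.element_spec.keys()) == ["key_a", "key_b"]
assert ds.element_spec["key_a"].shape == tf.TensorShape([32, 32, 3])
assert ds.element_spec["key_b"].shape == tf.TensorShape([])
```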
@@ -276,29 +276,29 @@ def test_data_handler_full_pipeline(x_shape, num_samples, num_labels, one_hot):
     num_samples_b = get_dataset_length(dataset_b)
     assert num_samples == (num_samples_a + num_samples_b)

-    # assign feature, map, get feature
+    # assign column, map, get column
     def map_fn_a(item):
-        item["new_feature"] -= 3
+        item["new_column"] -= 3
         return item

     def map_fn_b(item):
-        item["new_feature"] = item["new_feature"] * 3 + 2
+        item["new_column"] = item["new_column"] * 3 + 2
         return item

-    dataset_a = assign_feature_value(dataset_a, "new_feature", 0)
+    dataset_a = assign_value_to_column(dataset_a, "new_column", 0)
     dataset_a = dataset_a.map(map_fn_a)
-    features_a = tf.convert_to_tensor(get_feature_from_ds(dataset_a, "new_feature"))
-    assert tf.reduce_all(features_a == tf.convert_to_tensor([-3] * num_samples_a))
+    columns_a = tf.convert_to_tensor(get_column_from_ds(dataset_a, "new_column"))
+    assert tf.reduce_all(columns_a == tf.convert_to_tensor([-3] * num_samples_a))

-    dataset_b = assign_feature_value(dataset_b, "new_feature", 1)
+    dataset_b = assign_value_to_column(dataset_b, "new_column", 1)
     dataset_b = dataset_b.map(map_fn_b)
-    features_b = tf.convert_to_tensor(get_feature_from_ds(dataset_b, "new_feature"))
-    assert tf.reduce_all(features_b == tf.convert_to_tensor([5] * num_samples_b))
+    columns_b = tf.convert_to_tensor(get_column_from_ds(dataset_b, "new_column"))
+    assert tf.reduce_all(columns_b == tf.convert_to_tensor([5] * num_samples_b))

     # concatenate two sub datasets
     dataset_c = handler.merge(dataset_a, dataset_b)
-    features_c = tf.convert_to_tensor(get_feature_from_ds(dataset_c, "new_feature"))
-    assert tf.reduce_all(features_c == tf.concat([features_a, features_b], axis=0))
+    columns_c = tf.convert_to_tensor(get_column_from_ds(dataset_c, "new_column"))
+    assert tf.reduce_all(columns_c == tf.concat([columns_a, columns_b], axis=0))

     # prepare dataloader
     loader = handler.prepare(dataset_c, 64, shuffle=True)
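The expected constants follow from composing the helpers: `dataset_a` has 0 assigned and `map_fn_a` subtracts 3 (0 - 3 = -3), while `dataset_b` has 1 assigned and `map_fn_b` computes 1 * 3 + 2 = 5. The same composition on a toy dataset, using only the helpers defined in this file:

```python
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices({"x": [0.0, 1.0]})
ds = assign_value_to_column(ds, "new_column", 1)  # every sample gets 1
ds = ds.map(lambda item: {**item, "new_column": item["new_column"] * 3 + 2})
assert get_column_from_ds(ds, "new_column").tolist() == [5, 5]
```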
@@ -406,13 +406,13 @@ def test_split_by_class(in_labels, out_labels, one_hot, expected_output):
     len_inds = get_dataset_length(in_dataset)
     len_outds = get_dataset_length(out_dataset)

-    classes = get_feature_from_ds(dataset, "label")
+    classes = get_column_from_ds(dataset, "label")
     classes = np.unique(classes, axis=0)

-    classes_in = get_feature_from_ds(in_dataset, "label")
+    classes_in = get_column_from_ds(in_dataset, "label")
     classes_in = np.unique(classes_in, axis=0)

-    classes_out = get_feature_from_ds(out_dataset, "label")
+    classes_out = get_column_from_ds(out_dataset, "label")
     classes_out = np.unique(classes_out, axis=0)

     assert len_ds == expected_output[0]
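`split_by_class` belongs to the handler under test; the label-based partition it checks can be pictured with plain `tf.data.Dataset.filter` on the label column. This is only a sketch of the idea under that assumption, not the handler's actual implementation:

```python
import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices({"x": [0.0, 1.0, 2.0], "label": [0, 1, 0]})
in_labels = tf.constant([0])  # hypothetical in-distribution label set

def is_in(item):
    # True if the sample's label appears in in_labels
    return tf.reduce_any(tf.equal(item["label"], in_labels))

in_dataset = ds.filter(is_in)
out_dataset = ds.filter(lambda item: tf.logical_not(is_in(item)))
assert get_column_from_ds(in_dataset, "label").tolist() == [0, 0]
assert get_column_from_ds(out_dataset, "label").tolist() == [1]
```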