Skip to content

Commit c128250

Browse files
committed
Add failing tests
1 parent 3b88c65 commit c128250

File tree

5 files changed

+303
-0
lines changed

5 files changed

+303
-0
lines changed

tests/integration/cag/test_inequality.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,119 @@ def test_inequality_with_nan():
836836
)
837837

838838

839+
@pytest.mark.skip(reason='This test is failing because of the time component in the B column.')
def test_inequality_unequal_datetime_formats_strings():
    """Test Inequality on string datetimes where only the high column has a time.

    Column ``A`` holds date-only strings while column ``B`` holds date-and-time
    strings. The sampled columns are expected to keep an object dtype, respect
    ``A <= B`` and retain a non-midnight time in at least one ``B`` value.
    """
    # Setup
    low_format = '%Y-%m-%d'
    high_format = '%Y-%m-%d %H:%M:%S'
    data = pd.DataFrame({
        'A': ['2020-01-01', '2020-01-02', '2020-01-03'],
        'B': ['2020-01-02 10:00:00', '2020-01-03 13:00:00', '2020-01-04 6:00:00'],
    })
    metadata = Metadata.load_from_dict({
        'columns': {
            'A': {'sdtype': 'datetime', 'datetime_format': low_format},
            'B': {'sdtype': 'datetime', 'datetime_format': high_format},
        }
    })
    pattern = Inequality(low_column_name='A', high_column_name='B')

    # Run
    synthesizer = run_copula(data, metadata, [pattern])
    sample = synthesizer.sample(10)

    # Assert
    for column in ('A', 'B'):
        assert is_object_dtype(sample[column].dtype)

    low = pd.to_datetime(sample['A'], format=low_format)
    high = pd.to_datetime(sample['B'], format=high_format)
    assert all(low <= high)
    # The time component of ``B`` must survive sampling.
    assert any(high.dt.time.astype(str) != '00:00:00')
866+
867+
868+
@pytest.mark.skip(reason='The formats dont match, column B has a time component.')
def test_inequality_unequal_datetime_formats():
    """Test Inequality on datetime columns with unequal formats and missing values.

    Column ``A`` is declared with a date-only format and column ``B`` with a
    date-and-time format; each column contains one missing value. Wherever both
    sampled values are present, ``A <= B`` must hold, and at least one present
    ``B`` value must keep a non-midnight time.
    """
    # Setup
    data = pd.DataFrame({
        'A': pd.to_datetime(['2020-01-01', np.nan, '2020-01-02']),
        'B': pd.to_datetime(['2020-01-02 10:00:00', '2020-01-03 13:00:00', np.nan]),
    })
    metadata = Metadata.load_from_dict({
        'columns': {
            'A': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
            'B': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S'},
        }
    })
    pattern = Inequality(low_column_name='A', high_column_name='B')

    # Run
    sample = run_copula(data, metadata, [pattern]).sample(10)

    # Assert
    col_A = pd.to_datetime(sample['A'], format='%Y-%m-%d')
    col_B = pd.to_datetime(sample['B'], format='%Y-%m-%d %H:%M:%S')

    # Any comparison against NaT evaluates to False, so the constraint can only be
    # checked on rows where both values are present.
    both_present = col_A.notna() & col_B.notna()
    assert all(col_A[both_present] <= col_B[both_present])

    # Missing values never stringify to '00:00:00'; drop them so they cannot make
    # this check pass on their own.
    assert any(col_B.dropna().dt.time.astype(str) != '00:00:00')
892+
893+
894+
@pytest.mark.skip(reason='Timezone not supported for object dtype.')
def test_inequality_unequal_datetime_formats_timezone_aware():
    """Test Inequality on timezone-aware datetime strings with unequal formats.

    Both columns carry a ``UTC`` suffix; column ``A`` is date-only and column ``B``
    also has a time component. The sampled columns are expected to keep an object
    dtype, respect ``A <= B`` and retain a non-midnight time in at least one ``B``
    value.
    """
    # Setup
    low_format = '%Y-%m-%d %Z'
    high_format = '%Y-%m-%d %H:%M:%S %Z'
    data = pd.DataFrame({
        'A': ['2020-01-01 UTC', '2020-01-02 UTC', '2020-01-03 UTC'],
        'B': ['2020-01-02 10:00:00 UTC', '2020-01-03 13:00:00 UTC', '2020-01-04 6:00:00 UTC'],
    })
    metadata = Metadata.load_from_dict({
        'columns': {
            'A': {'sdtype': 'datetime', 'datetime_format': low_format},
            'B': {'sdtype': 'datetime', 'datetime_format': high_format},
        }
    })
    pattern = Inequality(low_column_name='A', high_column_name='B')

    # Run
    sample = run_copula(data, metadata, [pattern]).sample(10)

    # Assert
    assert is_object_dtype(sample['A'].dtype)
    assert is_object_dtype(sample['B'].dtype)

    # Parse with the same formats declared in the metadata: a format without ``%Z``
    # cannot consume the trailing timezone name of the strings.
    col_A = pd.to_datetime(sample['A'], format=low_format)
    col_B = pd.to_datetime(sample['B'], format=high_format)
    assert all(col_A <= col_B)
    assert any(col_B.dt.time.astype(str) != '00:00:00')
921+
922+
923+
@pytest.mark.skip(reason='Only one column has timezone.')
def test_inequality_unequal_datetime_formats_unequal_timezone():
    """Test Inequality when only the high column is timezone-aware.

    Column ``A`` holds tz-naive dates and column ``B`` holds UTC timestamps; each
    column contains one missing value. Wherever both sampled values are present,
    ``A <= B`` must hold, and at least one present ``B`` value must keep a
    non-midnight time.
    """
    # Setup
    data = pd.DataFrame({
        'A': pd.to_datetime(['2020-01-01', np.nan, '2020-01-02']),
        'B': pd.to_datetime(['2020-01-02 10:00:00 UTC', '2020-01-03 13:00:00 UTC', np.nan]),
    })
    metadata = Metadata.load_from_dict({
        'columns': {
            'A': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
            'B': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S %Z'},
        }
    })
    pattern = Inequality(low_column_name='A', high_column_name='B')

    # Run
    sample = run_copula(data, metadata, [pattern]).sample(10)

    # Assert
    # NOTE(review): the input columns are datetime64, not strings, and the sibling
    # test that fits on datetime64 columns does not assert an object dtype.
    # Confirm that object dtype is really the expected output here.
    assert is_object_dtype(sample['A'].dtype)
    assert is_object_dtype(sample['B'].dtype)

    # ``utc=True`` puts both columns on the UTC timeline; comparing a tz-naive
    # column with a tz-aware one raises a TypeError. Column ``B`` is parsed with
    # the ``%Z`` format declared in the metadata.
    col_A = pd.to_datetime(sample['A'], format='%Y-%m-%d', utc=True)
    col_B = pd.to_datetime(sample['B'], format='%Y-%m-%d %H:%M:%S %Z', utc=True)

    # Any comparison against NaT evaluates to False, so the constraint can only be
    # checked on rows where both values are present.
    both_present = col_A.notna() & col_B.notna()
    assert all(col_A[both_present] <= col_B[both_present])

    # Drop missing values so they cannot satisfy the time-component check.
    assert any(col_B.dropna().dt.time.astype(str) != '00:00:00')
950+
951+
839952
def test_validate_cag(data, metadata, pattern):
840953
"""Test validate_cag works with synthetic data generated with Inequality."""
841954
# Setup

tests/integration/cag/test_range.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import numpy as np
44
import pandas as pd
55
import pytest
6+
from pandas.api.types import is_object_dtype
67

78
from sdv.cag import Range
89
from sdv.cag._errors import PatternNotMetError
@@ -328,6 +329,62 @@ def test_range_pattern_datetime_nans(metadata_datetime, pattern):
328329
assert (diff.dt.total_seconds() < 1e-6).all()
329330

330331

332+
@pytest.mark.skip(reason='Issue #2275 needs to be implemented for the Range constraint.')
def test_range_with_timestamp_and_date():
    """Test that the range pattern passes for different datetime formats.

    The low column is a timestamp string while the middle and high columns are
    date-only strings. The synthetic columns are expected to keep an object dtype
    and, compared at date granularity, to satisfy ``low <= middle <= high``.
    """
    # Setup
    datetime_formats = {
        'SUBMISSION_TIMESTAMP': '%Y-%m-%d %H:%M:%S',
        'DUE_DATE': '%Y-%m-%d',
        'DUE_DATE_2': '%Y-%m-%d',
    }
    data = pd.DataFrame(
        data={
            'SUBMISSION_TIMESTAMP': [
                '2016-07-10 17:04:00',
                '2016-07-11 13:23:00',
                '2016-07-12 08:45:30',
                '2016-07-11 12:00:00',
                '2016-07-12 10:30:00',
            ],
            'DUE_DATE': ['2016-07-10', '2016-07-11', '2016-07-12', '2016-07-13', '2016-07-14'],
            'DUE_DATE_2': ['2016-07-11', '2016-07-12', '2016-07-13', '2016-07-14', '2016-07-15'],
        }
    )

    metadata = Metadata.load_from_dict({
        'tables': {
            'table': {
                'columns': {
                    column: {'sdtype': 'datetime', 'datetime_format': datetime_format}
                    for column, datetime_format in datetime_formats.items()
                }
            }
        }
    })
    pattern = Range(
        low_column_name='SUBMISSION_TIMESTAMP',
        middle_column_name='DUE_DATE',
        high_column_name='DUE_DATE_2',
        strict_boundaries=False,
    )

    # Run
    synthesizer = run_copula(data, metadata, [pattern])
    synthetic_data = synthesizer.sample(num_rows=10)

    # Assert
    dates = {}
    for column, datetime_format in datetime_formats.items():
        assert is_object_dtype(synthetic_data[column].dtype)
        parsed = pd.to_datetime(synthetic_data[column], format=datetime_format)
        # Compare at date granularity: only the low column has a time component.
        dates[column] = parsed.dt.date

    # Check both bounds of the range, not only the low/middle pair.
    low_above_middle = dates['SUBMISSION_TIMESTAMP'] > dates['DUE_DATE']
    middle_above_high = dates['DUE_DATE'] > dates['DUE_DATE_2']
    invalid_rows = synthetic_data[low_above_middle | middle_above_high]
    assert invalid_rows.empty
386+
387+
331388
def test_range_pattern_with_multi_table(data_multi, metadata_multi):
332389
"""Test that Range pattern works with multi-table data."""
333390
# Setup

tests/unit/cag/test_fixed_combinations.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,35 @@ def test__transform_with_categorical_dtype(self):
394394
expected_out_a = pd.Series(['a', 'b', 'c'], name='a')
395395
pd.testing.assert_series_equal(expected_out_a, out['a'])
396396

397+
@pytest.mark.skip(reason='KeyError raised. It used to throw MissingConstraintColumnError.')
def test_transform_not_all_columns_provided(self):
    """Test ``FixedCombinations.transform`` when the constraint columns are missing.

    The pattern is fitted on columns ``b`` and ``c`` but the data passed to
    ``transform`` only contains column ``a``.
    """
    # Setup
    categorical = {'sdtype': 'categorical'}
    metadata = Metadata.load_from_dict({
        'tables': {
            'table': {
                'columns': {
                    'a': dict(categorical),
                    'b': dict(categorical),
                    'c': dict(categorical),
                }
            }
        }
    })
    fit_data = pd.DataFrame({
        'a': ['a', 'b', 'c'],
        'b': ['d', 'e', 'f'],
        'c': ['g', 'h', 'i'],
    })
    instance = FixedCombinations(column_names=['b', 'c'])
    instance.fit(fit_data, metadata)
    incomplete_data = pd.DataFrame({'a': ['a', 'b', 'c']})

    # Run and Assert
    with pytest.raises(PatternNotMetError):
        # Neither of the fitted columns ('b', 'c') is present in this data.
        instance.transform(incomplete_data)
425+
397426
def test__reverse_transform(self):
398427
"""Test the ``FixedCombinations.reverse_transform`` method."""
399428
# Setup

tests/unit/cag/test_inequality.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,42 @@ def test_reverse_transform_datetime_dtype_is_object(self):
879879
expected_out['b'] = expected_out['b'].astype(np.dtype('O'))
880880
pd.testing.assert_frame_equal(out, expected_out)
881881

882+
@pytest.mark.skip(reason='Nans are not getting reversed correctly.')
def test_reverse_transform_nans(self):
    """Test ``reverse_transform`` puts nans back according to the nan component column.

    The ``a#b.nan_component`` column names, per row, which of the original columns
    are expected to be nan in the reversed data (``'None'`` meaning neither).
    """
    # Setup
    float_dtype = np.dtype('float')
    log_diff = np.log(2)
    transformed = {
        'table': pd.DataFrame({
            'a': [1.0, 2.0, 3.0, 2.0],
            'a#b': [log_diff, log_diff, log_diff, log_diff],
            'a#b.nan_component': ['b', 'a', 'None', 'a, b'],
        })
    }
    instance = Inequality(low_column_name='a', high_column_name='b', table_name='table')
    instance._dtype = float_dtype
    instance._original_data_columns = {'table': ['a', 'b']}
    instance._dtypes = {'table': {'a': float_dtype, 'b': float_dtype}}

    # Run
    reversed_tables = instance.reverse_transform(transformed)

    # Assert
    out = reversed_tables['table']
    expected_out = pd.DataFrame({
        'a': [1, np.nan, 3, np.nan],
        'b': [np.nan, 2, 4, np.nan],
    })
    pd.testing.assert_frame_equal(out, expected_out)
917+
882918
def test__is_valid(self):
883919
"""Test it checks if the data is valid."""
884920
# Setup

tests/unit/cag/test_range.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1031,6 +1031,46 @@ def test_reverse_transform_datetime_dtype_is_object(self):
10311031
expected_out['c'] = expected_out['c'].astype(np.dtype('O'))
10321032
pd.testing.assert_frame_equal(out, expected_out)
10331033

1034+
@pytest.mark.skip(reason='Nans are not getting reversed correctly.')
def test_reverse_transform_nans(self):
    """Test ``reverse_transform`` puts nans back according to the nan component column.

    The ``a#b#c.nan_component`` column names, per row, which of the original
    columns are expected to be nan in the reversed data (``'None'`` meaning none).
    """
    # Setup
    float_dtype = np.dtype('float')
    log_diff = np.log(2)
    transformed = {
        'table': pd.DataFrame({
            'a': [1.0, 2.0, 3.0, 2.0],
            'a#b': [log_diff, log_diff, log_diff, log_diff],
            'b#c': [log_diff, log_diff, log_diff, log_diff],
            'a#b#c.nan_component': ['b, c', 'a', 'None', 'a, b, c'],
        })
    }
    instance = Range(
        low_column_name='a',
        middle_column_name='b',
        high_column_name='c',
        table_name='table',
    )
    instance._dtype = float_dtype
    instance._original_data_columns = {'table': ['a', 'b', 'c']}
    instance._dtypes = {'table': {'a': float_dtype, 'b': float_dtype, 'c': float_dtype}}

    # Run
    reversed_tables = instance.reverse_transform(transformed)

    # Assert
    out = reversed_tables['table']
    expected_out = pd.DataFrame({
        'a': [1, np.nan, 3, np.nan],
        'b': [np.nan, 2, 4, np.nan],
        'c': [np.nan, 3, 5, np.nan],
    })
    pd.testing.assert_frame_equal(out, expected_out)
1073+
10341074
def test_is_valid(self):
10351075
"""Test it checks if the data is valid."""
10361076
# Setup
@@ -1143,3 +1183,31 @@ def test_is_valid_datetimes_strings(self):
11431183
out = out['table']
11441184
expected_out = [True, False, True]
11451185
np.testing.assert_array_equal(expected_out, out)
1186+
1187+
@pytest.mark.skip(reason='Strings with nans not supported.')
def test_is_valid_datetimes_strings_with_nans(self):
    """Test ``is_valid`` on datetime strings when some of the values are nan.

    The second row has ``a`` later than ``b`` and is expected to be invalid; the
    third row has nans in ``a`` and ``c`` and is expected to be valid.
    """
    # Setup
    frame = pd.DataFrame({
        'a': ['2020-05-17', '2021-09-01', np.nan],
        'b': ['2020-05-18', '2020-09-02', '2020-09-02'],
        'c': ['2020-05-29', '2021-09-03', np.nan],
        'col': [7, 8, 9],
    })
    table_data = {'table': frame}
    instance = Range(
        low_column_name='a',
        middle_column_name='b',
        high_column_name='c',
        table_name='table',
    )
    instance._fitted = True

    # Run
    validity = instance.is_valid(table_data)

    # Assert
    out = validity['table']
    expected_out = [True, False, True]
    np.testing.assert_array_equal(expected_out, out)

0 commit comments

Comments
 (0)