Add failing tests

fealho · fealho · commit c128250472bf · 2025-05-21T06:57:09.000-07:00
diff --git a/tests/integration/cag/test_inequality.py b/tests/integration/cag/test_inequality.py
@@ -836,6 +836,119 @@ def test_inequality_with_nan():
     )
 
 
+@pytest.mark.skip(reason='This test is failing because of the time component in the B column.')
+def test_inequality_unequal_datetime_formats_strings():
+    """Test the inequality pattern on string columns with unequal datetime formats."""
+    # Setup
+    data = pd.DataFrame({
+        'A': ['2020-01-01', '2020-01-02', '2020-01-03'],
+        'B': ['2020-01-02 10:00:00', '2020-01-03 13:00:00', '2020-01-04 6:00:00'],
+    })
+    metadata = Metadata.load_from_dict({
+        'columns': {
+            'A': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+            'B': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S'},
+        }
+    })
+    pattern = Inequality(low_column_name='A', high_column_name='B')
+
+    # Run
+    sample = run_copula(data, metadata, [pattern]).sample(10)
+
+    # Assert
+    assert is_object_dtype(sample['A'].dtype)
+    assert is_object_dtype(sample['B'].dtype)
+
+    col_A = pd.to_datetime(sample['A'], format='%Y-%m-%d')
+    col_B = pd.to_datetime(sample['B'], format='%Y-%m-%d %H:%M:%S')
+    assert all(col_A <= col_B)
+    assert any(col_B.dt.time.astype(str) != '00:00:00')  # some B value has a non-midnight time
+
+
+@pytest.mark.skip(reason='The formats dont match, column B has a time component.')
+def test_inequality_unequal_datetime_formats():
+    """Test the inequality pattern on datetime columns with NaNs and unequal formats."""
+    # Setup
+    data = pd.DataFrame({
+        'A': pd.to_datetime(['2020-01-01', np.nan, '2020-01-02']),
+        'B': pd.to_datetime(['2020-01-02 10:00:00', '2020-01-03 13:00:00', np.nan]),
+    })
+    metadata = Metadata.load_from_dict({
+        'columns': {
+            'A': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+            'B': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S'},
+        }
+    })
+    pattern = Inequality(low_column_name='A', high_column_name='B')
+
+    # Run
+    sample = run_copula(data, metadata, [pattern]).sample(10)
+
+    # Assert
+    col_A = pd.to_datetime(sample['A'], format='%Y-%m-%d')
+    col_B = pd.to_datetime(sample['B'], format='%Y-%m-%d %H:%M:%S')
+    assert all(col_A <= col_B)
+    assert any(col_B.dt.time.astype(str) != '00:00:00')  # some B value has a non-midnight time
+
+
+@pytest.mark.skip(reason='Timezone not supported for object dtype.')
+def test_inequality_unequal_datetime_formats_timezone_aware():
+    """Test that the inequality pattern works with timezone-aware datetime strings."""
+    # Setup
+    data = pd.DataFrame({
+        'A': ['2020-01-01 UTC', '2020-01-02 UTC', '2020-01-03 UTC'],
+        'B': ['2020-01-02 10:00:00 UTC', '2020-01-03 13:00:00 UTC', '2020-01-04 6:00:00 UTC'],
+    })
+    metadata = Metadata.load_from_dict({
+        'columns': {
+            'A': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %Z'},
+            'B': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S %Z'},
+        }
+    })
+    pattern = Inequality(low_column_name='A', high_column_name='B')
+
+    # Run
+    sample = run_copula(data, metadata, [pattern]).sample(10)
+
+    # Assert
+    assert is_object_dtype(sample['A'].dtype)
+    assert is_object_dtype(sample['B'].dtype)
+    # Parse with the metadata formats: without %Z the trailing ' UTC' is left unconverted.
+    col_A = pd.to_datetime(sample['A'], format='%Y-%m-%d %Z')
+    col_B = pd.to_datetime(sample['B'], format='%Y-%m-%d %H:%M:%S %Z')
+    assert all(col_A <= col_B)
+    assert any(col_B.dt.time.astype(str) != '00:00:00')
+
+
+@pytest.mark.skip(reason='Only one column has timezone.')
+def test_inequality_unequal_datetime_formats_unequal_timezone():
+    """Test the inequality pattern when only the high column is timezone-aware."""
+    # Setup
+    data = pd.DataFrame({
+        'A': pd.to_datetime(['2020-01-01', np.nan, '2020-01-02']),
+        'B': pd.to_datetime(['2020-01-02 10:00:00 UTC', '2020-01-03 13:00:00 UTC', np.nan]),
+    })
+    metadata = Metadata.load_from_dict({
+        'columns': {
+            'A': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+            'B': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S %Z'},
+        }
+    })
+    pattern = Inequality(low_column_name='A', high_column_name='B')
+
+    # Run
+    sample = run_copula(data, metadata, [pattern]).sample(10)
+
+    # Assert
+    assert is_object_dtype(sample['A'].dtype)  # NOTE(review): inputs are not strings; confirm
+    assert is_object_dtype(sample['B'].dtype)
+
+    col_A = pd.to_datetime(sample['A'], format='%Y-%m-%d')
+    col_B = pd.to_datetime(sample['B'], format='%Y-%m-%d %H:%M:%S')
+    assert all(col_A <= col_B)
+    assert any(col_B.dt.time.astype(str) != '00:00:00')
+
+
 def test_validate_cag(data, metadata, pattern):
     """Test validate_cag works with synthetic data generated with Inequality."""
     # Setup
diff --git a/tests/integration/cag/test_range.py b/tests/integration/cag/test_range.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from pandas.api.types import is_object_dtype
 
 from sdv.cag import Range
 from sdv.cag._errors import PatternNotMetError
@@ -328,6 +329,62 @@ def test_range_pattern_datetime_nans(metadata_datetime, pattern):
     assert (diff.dt.total_seconds() < 1e-6).all()
 
 
+@pytest.mark.skip(reason='Issue #2275 needs to be implemented for the Range constraint.')
+def test_range_with_timestamp_and_date():
+    """Test the range pattern with a timestamp low column and date-only middle/high columns."""
+    # Setup
+    data = pd.DataFrame(
+        data={
+            'SUBMISSION_TIMESTAMP': [
+                '2016-07-10 17:04:00',
+                '2016-07-11 13:23:00',
+                '2016-07-12 08:45:30',
+                '2016-07-11 12:00:00',
+                '2016-07-12 10:30:00',
+            ],
+            'DUE_DATE': ['2016-07-10', '2016-07-11', '2016-07-12', '2016-07-13', '2016-07-14'],
+            'DUE_DATE_2': ['2016-07-11', '2016-07-12', '2016-07-13', '2016-07-14', '2016-07-15'],
+        }
+    )
+
+    metadata = Metadata.load_from_dict({
+        'tables': {
+            'table': {
+                'columns': {
+                    'SUBMISSION_TIMESTAMP': {
+                        'sdtype': 'datetime',
+                        'datetime_format': '%Y-%m-%d %H:%M:%S',
+                    },
+                    'DUE_DATE': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+                    'DUE_DATE_2': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d'},
+                }
+            }
+        }
+    })
+    pattern = Range(
+        low_column_name='SUBMISSION_TIMESTAMP',
+        middle_column_name='DUE_DATE',
+        high_column_name='DUE_DATE_2',
+        strict_boundaries=False,
+    )
+
+    # Run
+    synthesizer = run_copula(data, metadata, [pattern])
+    synthetic_data = synthesizer.sample(num_rows=10)
+
+    # Assert
+    assert is_object_dtype(synthetic_data['SUBMISSION_TIMESTAMP'].dtype)
+    synthetic_data['SUBMISSION_TIMESTAMP'] = pd.to_datetime(
+        synthetic_data['SUBMISSION_TIMESTAMP'], format='%Y-%m-%d %H:%M:%S'
+    )
+    assert is_object_dtype(synthetic_data['DUE_DATE'].dtype)
+    synthetic_data['DUE_DATE'] = pd.to_datetime(synthetic_data['DUE_DATE'], format='%Y-%m-%d')
+    invalid_rows = synthetic_data[  # rows whose submission date falls after the due date
+        synthetic_data['SUBMISSION_TIMESTAMP'].dt.date > synthetic_data['DUE_DATE'].dt.date
+    ]
+    assert invalid_rows.empty
+
+
 def test_range_pattern_with_multi_table(data_multi, metadata_multi):
     """Test that Range pattern works with multi-table data."""
     # Setup
diff --git a/tests/unit/cag/test_fixed_combinations.py b/tests/unit/cag/test_fixed_combinations.py
@@ -394,6 +394,35 @@ def test__transform_with_categorical_dtype(self):
         expected_out_a = pd.Series(['a', 'b', 'c'], name='a')
         pd.testing.assert_series_equal(expected_out_a, out['a'])
 
+    @pytest.mark.skip(reason='KeyError raised. It used to throw MissingConstraintColumnError.')
+    def test_transform_not_all_columns_provided(self):
+        """Test the ``FixedCombinations.transform`` method when not all columns are provided."""
+        # Setup
+        metadata = Metadata.load_from_dict({
+            'tables': {
+                'table': {
+                    'columns': {
+                        'a': {'sdtype': 'categorical'},
+                        'b': {'sdtype': 'categorical'},
+                        'c': {'sdtype': 'categorical'},
+                    }
+                }
+            }
+        })
+        table_data = pd.DataFrame({
+            'a': ['a', 'b', 'c'],
+            'b': ['d', 'e', 'f'],
+            'c': ['g', 'h', 'i'],
+        })
+        columns = ['b', 'c']
+        instance = FixedCombinations(column_names=columns)
+        instance.fit(table_data, metadata)
+
+        # Run and Assert
+        with pytest.raises(PatternNotMetError):
+            # Only 'a' is passed, so the pattern columns 'b' and 'c' are missing
+            instance.transform(pd.DataFrame({'a': ['a', 'b', 'c']}))
+
     def test__reverse_transform(self):
         """Test the ``FixedCombinations.reverse_transform`` method."""
         # Setup
diff --git a/tests/unit/cag/test_inequality.py b/tests/unit/cag/test_inequality.py
@@ -879,6 +879,42 @@ def test_reverse_transform_datetime_dtype_is_object(self):
         expected_out['b'] = expected_out['b'].astype(np.dtype('O'))
         pd.testing.assert_frame_equal(out, expected_out)
 
+    @pytest.mark.skip(reason='Nans are not getting reversed correctly.')
+    def test_reverse_transform_nans(self):
+        """Test it restores the nans listed in the ``nan_component`` column when reversing."""
+        # Setup
+        transformed = {
+            'table': pd.DataFrame({
+                'a': [1.0, 2.0, 3.0, 2.0],
+                'a#b': [np.log(2)] * 4,
+                'a#b.nan_component': ['b', 'a', 'None', 'a, b'],
+            })
+        }
+        instance = Inequality(
+            low_column_name='a',
+            high_column_name='b',
+            table_name='table',
+        )
+        instance._dtype = np.dtype('float')
+        instance._original_data_columns = {'table': ['a', 'b']}
+        instance._dtypes = {
+            'table': {
+                'a': np.dtype('float'),
+                'b': np.dtype('float'),
+            }
+        }
+
+        # Run
+        out = instance.reverse_transform(transformed)
+
+        # Assert
+        out = out['table']
+        expected_out = pd.DataFrame({
+            'a': [1, np.nan, 3, np.nan],
+            'b': [np.nan, 2, 4, np.nan],
+        })
+        pd.testing.assert_frame_equal(out, expected_out)
+
     def test__is_valid(self):
         """Test it checks if the data is valid."""
         # Setup
diff --git a/tests/unit/cag/test_range.py b/tests/unit/cag/test_range.py
@@ -1031,6 +1031,46 @@ def test_reverse_transform_datetime_dtype_is_object(self):
         expected_out['c'] = expected_out['c'].astype(np.dtype('O'))
         pd.testing.assert_frame_equal(out, expected_out)
 
+    @pytest.mark.skip(reason='Nans are not getting reversed correctly.')
+    def test_reverse_transform_nans(self):
+        """Test it restores the nans listed in the ``nan_component`` column when reversing."""
+        # Setup
+        transformed = {
+            'table': pd.DataFrame({
+                'a': [1.0, 2.0, 3.0, 2.0],
+                'a#b': [np.log(2)] * 4,
+                'b#c': [np.log(2)] * 4,
+                'a#b#c.nan_component': ['b, c', 'a', 'None', 'a, b, c'],
+            })
+        }
+        instance = Range(
+            low_column_name='a',
+            middle_column_name='b',
+            high_column_name='c',
+            table_name='table',
+        )
+        instance._dtype = np.dtype('float')
+        instance._original_data_columns = {'table': ['a', 'b', 'c']}
+        instance._dtypes = {
+            'table': {
+                'a': np.dtype('float'),
+                'b': np.dtype('float'),
+                'c': np.dtype('float'),
+            }
+        }
+
+        # Run
+        out = instance.reverse_transform(transformed)
+
+        # Assert
+        out = out['table']
+        expected_out = pd.DataFrame({
+            'a': [1, np.nan, 3, np.nan],
+            'b': [np.nan, 2, 4, np.nan],
+            'c': [np.nan, 3, 5, np.nan],
+        })
+        pd.testing.assert_frame_equal(out, expected_out)
+
     def test_is_valid(self):
         """Test it checks if the data is valid."""
         # Setup
@@ -1143,3 +1183,31 @@ def test_is_valid_datetimes_strings(self):
         out = out['table']
         expected_out = [True, False, True]
         np.testing.assert_array_equal(expected_out, out)
+
+    @pytest.mark.skip(reason='Strings with nans not supported.')
+    def test_is_valid_datetimes_strings_with_nans(self):
+        """Test it checks if the data is valid when it contains datetime strings and nans."""
+        # Setup
+        table_data = {
+            'table': pd.DataFrame({
+                'a': ['2020-05-17', '2021-09-01', np.nan],
+                'b': ['2020-05-18', '2020-09-02', '2020-09-02'],
+                'c': ['2020-05-29', '2021-09-03', np.nan],
+                'col': [7, 8, 9],
+            })
+        }
+        instance = Range(
+            low_column_name='a',
+            middle_column_name='b',
+            high_column_name='c',
+            table_name='table',
+        )
+        instance._fitted = True
+
+        # Run
+        out = instance.is_valid(table_data)
+
+        # Assert
+        out = out['table']
+        expected_out = [True, False, True]
+        np.testing.assert_array_equal(expected_out, out)