From d7494763e967f5a1ee2eb089af04717f883c176e Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Fri, 7 Feb 2025 15:40:00 -0500 Subject: [PATCH 01/16] Add methods to create data generation specs from files --- dbldatagen/data_generator.py | 61 +++++++++++++++++++++++++++ tests/files/test_generator_spec.json | 15 +++++++ tests/files/test_generator_spec.txt | 15 +++++++ tests/files/test_generator_spec.yml | 23 +++++++++++ tests/test_quick_tests.py | 62 +++++++++++++++++++++++++++- 5 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 tests/files/test_generator_spec.json create mode 100644 tests/files/test_generator_spec.txt create mode 100644 tests/files/test_generator_spec.yml diff --git a/dbldatagen/data_generator.py b/dbldatagen/data_generator.py index 12015438..464a5827 100644 --- a/dbldatagen/data_generator.py +++ b/dbldatagen/data_generator.py @@ -6,9 +6,11 @@ This file defines the `DataGenError` and `DataGenerator` classes """ import copy +import json import logging import re +import yaml from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType from ._version import _get_spark_version @@ -869,6 +871,17 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable)) return self + def withColumns(self, columns): + """ Adds a set of columns to the synthetic generation specification. + + :param columns: A list of column generation specifications as dictionaries + :returns: A modified in-place instance of a data generator allowing for chaining of calls + following a builder pattern + """ + for column in columns: + self.withColumn(**column) + return self + def _mkSqlStructFromList(self, fields): """ Create a SQL struct expression from a list of fields @@ -1604,3 +1617,51 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, result = HtmlUtils.formatCodeAsHtml(results) return result + + @staticmethod + def fromDict(options): + """ Creates a data generator from a dictionary of options. + + :param options: Dictionary with data generator options (e.g. "name", "rows") + :return: A data generator with the specified options + """ + return DataGenerator(**options) + + @staticmethod + def fromFile(path): + """ Creates a data generator from options loaded from a JSON or YAML file. + + :param path: File path to a JSON or YAML file containing data generation options + :return: A data generator with the specified options + """ + if path.endswith("yml") or path.endswith("yaml"): + return DataGenerator.fromYaml(path) + if path.endswith("json"): + return DataGenerator.fromJson(path) + raise ValueError("File type must be '.json' or '.yml'") + + @staticmethod + def fromJson(path): + """ Creates a data generator from options loaded from a JSON file. + + :param path: File path to a JSON file containing data generation options + :return: A data generator with the specified options + """ + with open(path, "r", encoding="utf-8") as f: + options = json.load(f) + generator = options.get("generator") + columns = options.get("columns", None) + return DataGenerator.fromDict(generator).withColumns(columns) + + @staticmethod + def fromYaml(path): + """ Creates a data generator from options loaded from a YAML file. + + :param path: File path to a YAML file containing data generation options + :return: A data generator with the specified options + """ + with open(path, "r", encoding="utf-8") as f: + options = yaml.safe_load(f) + generator = options.get("generator") + columns = options.get("columns") + return DataGenerator.fromDict(generator).withColumns(columns) diff --git a/tests/files/test_generator_spec.json b/tests/files/test_generator_spec.json new file mode 100644 index 00000000..118682e7 --- /dev/null +++ b/tests/files/test_generator_spec.json @@ -0,0 +1,15 @@ +{ + "generator": { + "name": "test_data_generator", + "rows": 1000, + "partitions": 10, + "randomSeedMethod": "fixed", + "randomSeed": 42, + "random": true + }, + "columns": [ + {"colName": "col1", "colType": "int", "min": 0, "max": 100}, + {"colName": "col2", "colType": "float", "min": 0.0, "max": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} + ] +} \ No newline at end of file diff --git a/tests/files/test_generator_spec.txt b/tests/files/test_generator_spec.txt new file mode 100644 index 00000000..118682e7 --- /dev/null +++ b/tests/files/test_generator_spec.txt @@ -0,0 +1,15 @@ +{ + "generator": { + "name": "test_data_generator", + "rows": 1000, + "partitions": 10, + "randomSeedMethod": "fixed", + "randomSeed": 42, + "random": true + }, + "columns": [ + {"colName": "col1", "colType": "int", "min": 0, "max": 100}, + {"colName": "col2", "colType": "float", "min": 0.0, "max": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} + ] +} \ No newline at end of file diff --git a/tests/files/test_generator_spec.yml b/tests/files/test_generator_spec.yml new file mode 100644 index 00000000..9e3043b8 --- /dev/null +++ b/tests/files/test_generator_spec.yml @@ -0,0 +1,23 @@ +generator: + name: test_data_generator + rows: 1000 + partitions: 10 + randomSeedMethod: fixed + randomSeed: 42 + random: true +columns: + - colName: col1 + colType: int + min: 0 + max: 100 + - colName: col2 + colType: float + min: 0.0 + max: 100.0 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true \ No newline at end of file diff --git a/tests/test_quick_tests.py b/tests/test_quick_tests.py index de83daa3..befd8b8e 100644 --- a/tests/test_quick_tests.py +++ b/tests/test_quick_tests.py @@ -1,6 +1,8 @@ from datetime import timedelta, datetime import pytest +import json +import yaml from pyspark.sql.types import ( StructType, StructField, IntegerType, StringType, FloatType, DateType, DecimalType, DoubleType, ByteType, ShortType, LongType @@ -8,7 +10,7 @@ import dbldatagen as dg -from dbldatagen import DataGenerator +from dbldatagen import DataGenerator, ColumnGenerationSpec from dbldatagen import NRange, DateRange schema = StructType([ @@ -754,3 +756,61 @@ def test_random_generation_without_range_values(self, columnSpecOptions): def test_version_info(self): # test access to version info without explicit import print("Data generator version", dg.__version__) + + def test_multi_column_generation(self): + column_specs = [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ] + df_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumns(column_specs).build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + def test_generation_from_dictionary(self): + dg_spec = { + "name": "test_data_generator", + "rows": 1000, + "partitions": 10, + "randomSeedMethod": "fixed", + "randomSeed": 42, + "random": True + } + gen_from_dict = DataGenerator.fromDict(dg_spec) + assert gen_from_dict.name == dg_spec.get("name") + assert gen_from_dict.rowCount == dg_spec.get("rows") + assert gen_from_dict.partitions == dg_spec.get("partitions") + assert gen_from_dict.random == dg_spec.get("random") + assert gen_from_dict.randomSeed == dg_spec.get("randomSeed") + + def test_generation_from_file(self): + path = "tests/files/test_generator_spec.json" + with open(path, "r") as f: + options = json.load(f) + gen_options = options.get("generator") + gen_from_json = DataGenerator.fromFile(path) + assert gen_from_json.name == gen_options.get("name") + assert gen_from_json.rowCount == gen_options.get("rows") + assert gen_from_json.partitions == gen_options.get("partitions") + assert gen_from_json.random == gen_options.get("random") + assert gen_from_json.randomSeed == gen_options.get("randomSeed") + + df_from_json = gen_from_json.build() + assert df_from_json.columns == ["col1", "col2", "col3"] + + path = "tests/files/test_generator_spec.yml" + with open(path, "r") as f: + options = yaml.safe_load(f) + gen_options = options.get("generator") + gen_from_yaml = DataGenerator.fromFile(path) + assert gen_from_yaml.name == gen_options.get("name") + assert gen_from_yaml.rowCount == gen_options.get("rows") + assert gen_from_yaml.partitions == gen_options.get("partitions") + assert gen_from_yaml.random == gen_options.get("random") + assert gen_from_yaml.randomSeed == gen_options.get("randomSeed") + + df_from_json = gen_from_json.build() + assert df_from_json.columns == ["col1", "col2", "col3"] + + path = "tests/files/test_generator_spec.txt" + with pytest.raises(ValueError): + DataGenerator.fromFile(path) # Loading from .txt should raise a ValueError From c9c4e9333771caf775b890cdb2658ec1a584895c Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Fri, 7 Feb 2025 16:18:39 -0500 Subject: [PATCH 02/16] Update tests and documentation --- docs/source/generating_column_data.rst | 45 ++++++++++++++++++++++++++ docs/source/options_and_features.rst | 6 ++++ tests/files/test_generator_spec.json | 4 +-- tests/files/test_generator_spec.yml | 8 ++--- tests/test_quick_tests.py | 7 ++-- 5 files changed, 60 insertions(+), 10 deletions(-) diff --git a/docs/source/generating_column_data.rst b/docs/source/generating_column_data.rst index 7efe68a7..8033651e 100644 --- a/docs/source/generating_column_data.rst +++ b/docs/source/generating_column_data.rst @@ -182,3 +182,48 @@ This has several implications: SQL expression. To enforce the dependency, you must use the `baseColumn` attribute to indicate the dependency. +Creating data generation specs from files +----------------------------------------- + +``DataGenerator.fromFile("file_path")`` will return a ``DataGenerator`` with ``ColumnGenerationSpecs`` from definitions +in a JSON or YAML file. Use the ``"generator"`` key to specify ``DataGenerator`` options and the ``"columns"`` key to +specify ``ColumnGenerationSpec`` options. + +**JSON Example:** + +.. code-block:: JSON + { + "generator": { + "name": "test_data_generator", + "rows": 1000, + "partitions": 10 + }, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} + ] + } + +**YAML Example:** +.. code-block:: YAML +generator: + name: test_data_generator + rows: 1000 + partitions: 10 +columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 1000 + - colName: col2 + colType: float + minValue: -10.0 + maxValue: 10.0 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true diff --git a/docs/source/options_and_features.rst b/docs/source/options_and_features.rst index 2727c273..af590c5a 100644 --- a/docs/source/options_and_features.rst +++ b/docs/source/options_and_features.rst @@ -128,6 +128,12 @@ representing the column - for example "email_0", "email_1" etc. If you specify the attribute ``structType="array"``, the multiple columns will be combined into a single array valued column. +Generating columns from Python dictionaries +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can generate columns from Python dictionaries using ``withColumns(column_options)``. Each dictionary should contain +keys which match the ``withColumn`` arguments (e.g. ``"colName"``, ``"colType"``). + Generating random values ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/tests/files/test_generator_spec.json b/tests/files/test_generator_spec.json index 118682e7..1ea25a3b 100644 --- a/tests/files/test_generator_spec.json +++ b/tests/files/test_generator_spec.json @@ -8,8 +8,8 @@ "random": true }, "columns": [ - {"colName": "col1", "colType": "int", "min": 0, "max": 100}, - {"colName": "col2", "colType": "float", "min": 0.0, "max": 100.0}, + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 1000}, + {"colName": "col2", "colType": "float", "minValue": -10.0, "maxValue": 10.0}, {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} ] } \ No newline at end of file diff --git a/tests/files/test_generator_spec.yml b/tests/files/test_generator_spec.yml index 9e3043b8..6772114c 100644 --- a/tests/files/test_generator_spec.yml +++ b/tests/files/test_generator_spec.yml @@ -8,12 +8,12 @@ generator: columns: - colName: col1 colType: int - min: 0 - max: 100 + minValue: 0 + maxValue: 1000 - colName: col2 colType: float - min: 0.0 - max: 100.0 + minValue: -10.0 + maxValue: 10.0 - colName: col3 colType: string values: diff --git a/tests/test_quick_tests.py b/tests/test_quick_tests.py index befd8b8e..6ae7d9a0 100644 --- a/tests/test_quick_tests.py +++ b/tests/test_quick_tests.py @@ -1,7 +1,6 @@ from datetime import timedelta, datetime - -import pytest import json +import pytest import yaml from pyspark.sql.types import ( StructType, StructField, IntegerType, StringType, FloatType, DateType, DecimalType, DoubleType, ByteType, @@ -784,7 +783,7 @@ def test_generation_from_dictionary(self): def test_generation_from_file(self): path = "tests/files/test_generator_spec.json" - with open(path, "r") as f: + with open(path, "r", encoding="utf-8") as f: options = json.load(f) gen_options = options.get("generator") gen_from_json = DataGenerator.fromFile(path) @@ -798,7 +797,7 @@ def test_generation_from_file(self): assert df_from_json.columns == ["col1", "col2", "col3"] path = "tests/files/test_generator_spec.yml" - with open(path, "r") as f: + with open(path, "r", encoding="utf-8") as f: options = yaml.safe_load(f) gen_options = options.get("generator") gen_from_yaml = DataGenerator.fromFile(path) From cb2f355b29ff309df2898e0f7bcdddde43adc9b3 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Mon, 10 Feb 2025 17:22:16 -0500 Subject: [PATCH 03/16] Add toDict and fromDict methods for constraints --- dbldatagen/constraints/chained_relation.py | 10 ++++++++++ dbldatagen/constraints/constraint.py | 20 +++++++++++++++++++ .../constraints/literal_range_constraint.py | 12 +++++++++++ .../literal_relation_constraint.py | 11 ++++++++++ dbldatagen/constraints/negative_values.py | 10 ++++++++++ dbldatagen/constraints/positive_values.py | 10 ++++++++++ .../constraints/ranged_values_constraint.py | 12 +++++++++++ dbldatagen/constraints/sql_expr.py | 9 +++++++++ dbldatagen/constraints/unique_combinations.py | 9 +++++++++ 9 files changed, 103 insertions(+) diff --git a/dbldatagen/constraints/chained_relation.py b/dbldatagen/constraints/chained_relation.py index ea189506..185e3a09 100644 --- a/dbldatagen/constraints/chained_relation.py +++ b/dbldatagen/constraints/chained_relation.py @@ -57,3 +57,13 @@ def _generateFilterExpression(self): # ... and combine them using logical `and` operation return self.mkCombinedConstraintExpression(filters) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": self.__class__.__name__, + "columns": self._columns, + "relation": self._relation + } diff --git a/dbldatagen/constraints/constraint.py b/dbldatagen/constraints/constraint.py index e9291098..73d3fc75 100644 --- a/dbldatagen/constraints/constraint.py +++ b/dbldatagen/constraints/constraint.py @@ -133,6 +133,26 @@ def filterExpression(self): self._calculatedFilterExpression = True return self._filterExpression + @classmethod + def fromDict(cls, constraint): + """ Creates a Constraint from a Python dictionary. + :param constraint: Constraint definition as a Python dictionary + :return: Constraint object + """ + inner_obj = constraint.copy() + constraint_type = inner_obj.pop("type") + for c in cls.__subclasses__(): + if c.__name__ == constraint_type: + return c(**inner_obj) + raise ValueError(f"Unknown constraint type: {constraint_type}") + + @abstractmethod + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + raise NotImplementedError("Method toDict must be implemented in derived class") + class NoFilterMixin: """ Mixin class to indicate that constraint has no filter expression diff --git a/dbldatagen/constraints/literal_range_constraint.py b/dbldatagen/constraints/literal_range_constraint.py index 3076cf6b..c08228f1 100644 --- a/dbldatagen/constraints/literal_range_constraint.py +++ b/dbldatagen/constraints/literal_range_constraint.py @@ -43,3 +43,15 @@ def _generateFilterExpression(self): # ... and combine them using logical `and` operation return self.mkCombinedConstraintExpression(filters) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": self.__class__.__name__, + "columns": self._columns, + "lowValue": self._lowValue, + "highValue": self._highValue, + "strict": self._strict + } diff --git a/dbldatagen/constraints/literal_relation_constraint.py b/dbldatagen/constraints/literal_relation_constraint.py index 1ec629bd..b22ec9e9 100644 --- a/dbldatagen/constraints/literal_relation_constraint.py +++ b/dbldatagen/constraints/literal_relation_constraint.py @@ -35,3 +35,14 @@ def _generateFilterExpression(self): filters = [self._generate_relation_expression(col, self._relation, literalValue) for col in expressions] return self.mkCombinedConstraintExpression(filters) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": self.__class__.__name__, + "columns": self._columns, + "relation": self._relation, + "value": self._value + } diff --git a/dbldatagen/constraints/negative_values.py b/dbldatagen/constraints/negative_values.py index 22d43ddb..f43f04d0 100644 --- a/dbldatagen/constraints/negative_values.py +++ b/dbldatagen/constraints/negative_values.py @@ -35,3 +35,13 @@ def _generateFilterExpression(self): filters = [col.isNotNull() & (col <= 0) for col in expressions] return self.mkCombinedConstraintExpression(filters) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": "NegativeValues", + "columns": self._columns, + "strict": self._strict + } diff --git a/dbldatagen/constraints/positive_values.py b/dbldatagen/constraints/positive_values.py index 42aae7cb..dc6b67c1 100644 --- a/dbldatagen/constraints/positive_values.py +++ b/dbldatagen/constraints/positive_values.py @@ -36,3 +36,13 @@ def _generateFilterExpression(self): filters = [col.isNotNull() & (col >= 0) for col in expressions] return self.mkCombinedConstraintExpression(filters) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": self.__class__.__name__, + "columns": self._columns, + "strict": self._strict + } diff --git a/dbldatagen/constraints/ranged_values_constraint.py b/dbldatagen/constraints/ranged_values_constraint.py index b2b9df49..8d2e901b 100644 --- a/dbldatagen/constraints/ranged_values_constraint.py +++ b/dbldatagen/constraints/ranged_values_constraint.py @@ -42,3 +42,15 @@ def _generateFilterExpression(self): # ... and combine them using logical `and` operation return self.mkCombinedConstraintExpression(filters) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": "RangedValues", + "columns": self._columns, + "lowValue": self._lowValue, + "highValue": self._highValue, + "strict": self._strict + } diff --git a/dbldatagen/constraints/sql_expr.py b/dbldatagen/constraints/sql_expr.py index 91855330..f8e48c74 100644 --- a/dbldatagen/constraints/sql_expr.py +++ b/dbldatagen/constraints/sql_expr.py @@ -28,3 +28,12 @@ def __init__(self, expr: str): def _generateFilterExpression(self): """ Generate a SQL filter expression that may be used for filtering""" return F.expr(self._expr) + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": self.__class__.__name__, + "expr": self._expr + } diff --git a/dbldatagen/constraints/unique_combinations.py b/dbldatagen/constraints/unique_combinations.py index 3bea785d..1122e0b1 100644 --- a/dbldatagen/constraints/unique_combinations.py +++ b/dbldatagen/constraints/unique_combinations.py @@ -79,3 +79,12 @@ def transformDataframe(self, dataGenerator, dataFrame): results = dataFrame.dropDuplicates(columnsToEvaluate) return results + + def toDict(self): + """ Returns a Python dictionary representation of a Constraint. + :return: Python dictionary representing the constraint + """ + return { + "type": self.__class__.__name__, + "columns": self._columns + } From b1b1f1a20c9fe023291aa259d4914d0a9c0ba5c0 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Mon, 10 Feb 2025 17:22:56 -0500 Subject: [PATCH 04/16] Add toDict method for column generation specs --- dbldatagen/column_generation_spec.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py index 8330062f..a9df64ef 100644 --- a/dbldatagen/column_generation_spec.py +++ b/dbldatagen/column_generation_spec.py @@ -119,7 +119,7 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix if EXPR_OPTION not in kwargs: raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred") - elif type(colType) == str: + elif isinstance(colType, str): colType = SchemaParser.columnTypeFromString(colType) assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType" @@ -1300,3 +1300,22 @@ def makeGenerationExpressions(self): retval = F.slice(retval, F.lit(1), F.expr(expr_str)) return retval + + def toDict(self): + """ Creates a dictionary from a ColumnGenerationSpec. + :return: A dictionary representation of the ColumnGenerationSpec + """ + return { + "colName": self.name, + "colType": self.datatype.simpleString(), + "minValue": self.min, + "maxValue": self.max, + "step": self.step, + "values": self.values, + "expr": self.expr, + "prefix": self.prefix, + "random": self.random, + "nullable": self.nullable, + "omit": self.omit, + "implicit": self.implicit + } From b9e8ea9adfe7217b34fa2e7c85d88a18aab46b0c Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Mon, 10 Feb 2025 17:24:33 -0500 Subject: [PATCH 05/16] Add methods to create data generators from config --- dbldatagen/data_generator.py | 96 ++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/dbldatagen/data_generator.py b/dbldatagen/data_generator.py index 464a5827..936cbabe 100644 --- a/dbldatagen/data_generator.py +++ b/dbldatagen/data_generator.py @@ -871,7 +871,7 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable)) return self - def withColumns(self, columns): + def withColumnDefinitions(self, columns): """ Adds a set of columns to the synthetic generation specification. :param columns: A list of column generation specifications as dictionaries @@ -1219,6 +1219,12 @@ def _getColumnDataTypes(self, columns): """ return [self._columnSpecsByName[colspec].datatype for colspec in columns] + def getColumnGenerationSpecs(self): + return self._allColumnSpecs + + def getConstraints(self): + return self._constraints + def withConstraint(self, constraint): """Add a constraint to control the data generation @@ -1268,6 +1274,17 @@ def withSqlConstraint(self, sqlExpression: str): self.withConstraint(SqlExpr(sqlExpression)) return self + def withConstraintDefinitions(self, constraints): + """ Adds a set of constraints to the synthetic generation specification. + + :param constraints: A list of constraints as dictionaries + :returns: A modified in-place instance of a data generator allowing for chaining of calls + following a builder pattern + """ + for c in constraints: + self.withConstraint(Constraint.fromDict(c)) + return self + def computeBuildPlan(self): """ prepare for building by computing a pseudo build plan @@ -1621,47 +1638,62 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, @staticmethod def fromDict(options): """ Creates a data generator from a dictionary of options. - :param options: Dictionary with data generator options (e.g. "name", "rows") :return: A data generator with the specified options """ - return DataGenerator(**options) + generator = options["generator"] + columns = options.get("columns", []) + constraints = options.get("constraints", []) + return ( + DataGenerator(**generator) + .withColumnDefinitions(columns) + .withConstraintDefinitions(constraints) + ) + + def toDict(self): + """ Creates a dictionary from a DataGenerator. + :return: A dictionary representation of the DataGenerator + """ + generator = { + "name": self.name, + "rows": self.rowCount, + "partitions": self.partitions, + "random": self.random, + "randomSeed": self.randomSeed, + "startingId": self.starting_id, + } + return { + "generator": generator, + "columns": [column.toDict() for column in self.getColumnGenerationSpecs()], + "constraints": [constraint.toDict() for constraint in self.getConstraints()] + } @staticmethod - def fromFile(path): - """ Creates a data generator from options loaded from a JSON or YAML file. - - :param path: File path to a JSON or YAML file containing data generation options + def fromJson(options): + """ Creates a data generator from a JSON string. + :param options: A JSON string containing data generation options :return: A data generator with the specified options """ - if path.endswith("yml") or path.endswith("yaml"): - return DataGenerator.fromYaml(path) - if path.endswith("json"): - return DataGenerator.fromJson(path) - raise ValueError("File type must be '.json' or '.yml'") - - @staticmethod - def fromJson(path): - """ Creates a data generator from options loaded from a JSON file. + options = json.loads(options) + return DataGenerator.fromDict(options) - :param path: File path to a JSON file containing data generation options - :return: A data generator with the specified options + def toJson(self): + """ Returns the JSON string representation of a data generator. + :return: A JSON string representation of the DataGenerator """ - with open(path, "r", encoding="utf-8") as f: - options = json.load(f) - generator = options.get("generator") - columns = options.get("columns", None) - return DataGenerator.fromDict(generator).withColumns(columns) + return json.dumps(self.toDict()) @staticmethod - def fromYaml(path): - """ Creates a data generator from options loaded from a YAML file. - - :param path: File path to a YAML file containing data generation options + def fromYaml(options): + """ Creates a data generator from a YAML string. + :param options: A YAML string containing data generation options :return: A data generator with the specified options """ - with open(path, "r", encoding="utf-8") as f: - options = yaml.safe_load(f) - generator = options.get("generator") - columns = options.get("columns") - return DataGenerator.fromDict(generator).withColumns(columns) + options = yaml.safe_load(options) + return DataGenerator.fromDict(options) + + def toYaml(self): + """ Returns the YAML string representation of a data generator. + :return: A YAML string representation of the DataGenerator + """ + return yaml.dump(self.toDict()) From 3c9b8511ca3f827dc162c0dd949fc46ec73346bf Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Mon, 10 Feb 2025 17:25:08 -0500 Subject: [PATCH 06/16] Clean-up messages from linter --- dbldatagen/text_generators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 965350be..ae20a63e 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -77,7 +77,7 @@ def __str__(self): return f"TextGenerator(randomSeed={self._randomSeed})" def __eq__(self, other): - return type(self) == type(other) and self._randomSeed == other._randomSeed + return isinstance(self, type(other)) and self._randomSeed == other._randomSeed def withRandomSeed(self, seed): """ Set the random seed for the text generator @@ -260,7 +260,7 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2" mapping_length, mappings = v assert isinstance(mapping_length, int), "mapping length must be of type int" - assert isinstance(mappings, (list, np.ndarray)),\ + assert isinstance(mappings, (list, np.ndarray)), \ "mappings are lists or numpy arrays" assert mapping_length == 0 or len(mappings) == mapping_length, "mappings must match mapping_length" @@ -277,7 +277,7 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None): assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2" mapping_length, mappings = v assert isinstance(mapping_length, int), "mapping length must be of type int" - assert mappings is None or isinstance(mappings, (list, np.ndarray)),\ + assert mappings is None or isinstance(mappings, (list, np.ndarray)), \ "mappings are lists or numpy arrays" # for escaped mappings, the mapping can be None in which case the mapping is to the number itself From 1dc8b131e0a2ec91372a0f4242d9b9fc67ab10ea Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Mon, 10 Feb 2025 17:25:31 -0500 Subject: [PATCH 07/16] Fix tests for creating data generators from config --- tests/files/test_generator_spec.json | 15 - tests/files/test_generator_spec.txt | 15 - tests/files/test_generator_spec.yml | 23 -- tests/test_generation_from_config.py | 403 +++++++++++++++++++++++++++ tests/test_quick_tests.py | 112 +++++--- 5 files changed, 480 insertions(+), 88 deletions(-) delete mode 100644 tests/files/test_generator_spec.json delete mode 100644 tests/files/test_generator_spec.txt delete mode 100644 tests/files/test_generator_spec.yml create mode 100644 tests/test_generation_from_config.py diff --git a/tests/files/test_generator_spec.json b/tests/files/test_generator_spec.json deleted file mode 100644 index 1ea25a3b..00000000 --- a/tests/files/test_generator_spec.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "generator": { - "name": "test_data_generator", - "rows": 1000, - "partitions": 10, - "randomSeedMethod": "fixed", - "randomSeed": 42, - "random": true - }, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 1000}, - {"colName": "col2", "colType": "float", "minValue": -10.0, "maxValue": 10.0}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} - ] -} \ No newline at end of file diff --git a/tests/files/test_generator_spec.txt b/tests/files/test_generator_spec.txt deleted file mode 100644 index 118682e7..00000000 --- a/tests/files/test_generator_spec.txt +++ /dev/null @@ -1,15 +0,0 @@ -{ - "generator": { - "name": "test_data_generator", - "rows": 1000, - "partitions": 10, - "randomSeedMethod": "fixed", - "randomSeed": 42, - "random": true - }, - "columns": [ - {"colName": "col1", "colType": "int", "min": 0, "max": 100}, - {"colName": "col2", "colType": "float", "min": 0.0, "max": 100.0}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true} - ] -} \ No newline at end of file diff --git a/tests/files/test_generator_spec.yml b/tests/files/test_generator_spec.yml deleted file mode 100644 index 6772114c..00000000 --- a/tests/files/test_generator_spec.yml +++ /dev/null @@ -1,23 +0,0 @@ -generator: - name: test_data_generator - rows: 1000 - partitions: 10 - randomSeedMethod: fixed - randomSeed: 42 - random: true -columns: - - colName: col1 - colType: int - minValue: 0 - maxValue: 1000 - - colName: col2 - colType: float - minValue: -10.0 - maxValue: 10.0 - - colName: col3 - colType: string - values: - - a - - b - - c - random: true \ No newline at end of file diff --git a/tests/test_generation_from_config.py b/tests/test_generation_from_config.py new file mode 100644 index 00000000..2e059307 --- /dev/null +++ b/tests/test_generation_from_config.py @@ -0,0 +1,403 @@ +from contextlib import nullcontext as does_not_raise +import json +import pytest +import yaml +import dbldatagen as dg + +spark = dg.SparkSingleton.getLocalInstance("unit tests") + + +class TestGenerationFromConfig: + @pytest.mark.parametrize("expectation, columns", [ + (does_not_raise(), [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ]), + (does_not_raise(), [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ]), + ]) + def test_column_definitions_from_dict(self, columns, expectation): + with expectation: + # Test the options set on the ColumnGenerationSpecs: + gen_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumnDefinitions(columns) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + @pytest.mark.parametrize("expectation, constraints", [ + (does_not_raise(), [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True} + ]), + (does_not_raise(), [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": False}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": "0"} + ]), + (pytest.raises(ValueError), [ # Testing an invalid "relation" value + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "+", "value": "0"} + ]), + (pytest.raises(ValueError), [ # Testing an invalid "type" value + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": False}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": "0"} + ]), + (does_not_raise(), [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "NegativeValues", "columns": ["col1", "col2"], "strict": False}, + {"type": "ChainedRelation", "columns": ["col1", "col2"], "relation": ">"}, + {"type": "RangedValues", "columns": ["col2"], "lowValue": 0, "highValue": 100, "strict": True}, + {"type": "UniqueCombinations", "columns": ["col1", "col2"]} + ]), + ]) + def test_constraint_definitions_from_dict(self, constraints, expectation): + with expectation: + # Test the options set on the ColumnGenerationSpecs: + columns = [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ] + gen_from_dicts = dg.DataGenerator(rows=100, partitions=1) \ + .withColumnDefinitions(columns) \ + .withConstraintDefinitions(constraints) + + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + @pytest.mark.parametrize("expectation, options", [ + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 1000}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": "0"}] + }), + (pytest.raises(KeyError), # Testing a dictionary missing a "generator" object + {"columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (pytest.raises(ValueError), # Testing an invalid "type" value + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": 0}] + }), + ]) + def test_generator_from_dict(self, options, expectation): + with expectation: + # Test the options set on the DataGenerator: + gen_from_dicts = dg.DataGenerator.fromDict(options) + generator = options.get("generator") + for key in generator: + assert gen_from_dicts.toDict()["generator"][key] == generator[key] + + # Test the options set on the ColumnGenerationSpecs: + columns = options.get("columns", []) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the options set on the Constraints: + constraints = options.get("constraints", []) + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + @pytest.mark.parametrize("expectation, json_options", [ + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 1000}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": 0}] + }'''), + (pytest.raises(KeyError), # Testing a JSON object missing the "generator" key + '''{"columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (pytest.raises(ValueError), # Testing an invalid "type" value + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": 0}] + }'''), + ]) + def test_generator_from_json(self, json_options, expectation): + options = json.loads(json_options) + with expectation: + # Test the options set on the DataGenerator: + gen_from_dicts = dg.DataGenerator.fromJson(json_options) + generator = options.get("generator") + for key in generator: + assert gen_from_dicts.toDict()["generator"][key] == generator[key] + + # Test the options set on the ColumnGenerationSpecs: + columns = options.get("columns", []) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the options set on the Constraints: + constraints = options.get("constraints", []) + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] + + @pytest.mark.parametrize("expectation, yaml_options", [ + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 1000 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true + constraints: + - type: LiteralRange + columns: + - col1 + lowValue: -1000 + highValue: 1000 + strict: true + - type: PositiveValues + columns: + - col1 + - col2 + strict: true + - type: SqlExpr + expr: col1 > 0 + - type: LiteralRelation + columns: + - col2 + relation: "<>" + value: 0'''), + (pytest.raises(KeyError), # Testing a YAML object missing the "generator" key + '''--- + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (pytest.raises(ValueError), # Testing an invalid "type" value + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true + constraints: + - type: LiteralRange + columns: + - col1 + lowValue: -1000 + highValue: 1000 + strict: true + - type: PositiveValues + columns: + - col1 + - col2 + strict: true + - type: SqlExpr + expr: col1 > 0 + - type: Equivalent + columns: + - col2 + value: 0''') + ]) + def test_generator_from_yaml(self, yaml_options, expectation): + options = yaml.safe_load(yaml_options) + with expectation: + # Test the options set on the DataGenerator: + gen_from_dicts = dg.DataGenerator.fromYaml(yaml_options) + generator = options.get("generator") + for key in generator: + assert gen_from_dicts.toDict()["generator"][key] == generator[key] + + # Test the options set on the ColumnGenerationSpecs: + columns = options.get("columns", []) + for column in columns: + column_spec = gen_from_dicts.getColumnSpec(column["colName"]) + for key in column.keys(): + assert column_spec.toDict()[key] == column[key] + + # Test the options set on the Constraints: + constraints = options.get("constraints", []) + constraint_specs = [constraint.toDict() for constraint in gen_from_dicts.getConstraints()] + for constraint in constraints: + assert constraint in constraint_specs + + # Test the data generated after building the DataFrame: + df_from_dicts = gen_from_dicts.build() + assert df_from_dicts.columns == ["col1", "col2", "col3"] diff --git a/tests/test_quick_tests.py b/tests/test_quick_tests.py index 6ae7d9a0..f5ed9c67 100644 --- a/tests/test_quick_tests.py +++ b/tests/test_quick_tests.py @@ -9,8 +9,9 @@ import dbldatagen as dg -from dbldatagen import DataGenerator, ColumnGenerationSpec +from dbldatagen import DataGenerator from dbldatagen import NRange, DateRange +from dbldatagen.constraints import PositiveValues schema = StructType([ StructField("site_id", IntegerType(), True), @@ -762,7 +763,7 @@ def test_multi_column_generation(self): {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} ] - df_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumns(column_specs).build() + df_from_dicts = dg.DataGenerator(rows=100, partitions=1).withColumnDefinitions(column_specs).build() assert df_from_dicts.columns == ["col1", "col2", "col3"] def test_generation_from_dictionary(self): @@ -774,42 +775,83 @@ def test_generation_from_dictionary(self): "randomSeed": 42, "random": True } - gen_from_dict = DataGenerator.fromDict(dg_spec) + gen_from_dict = DataGenerator.fromDict({"generator": dg_spec}) assert gen_from_dict.name == dg_spec.get("name") assert gen_from_dict.rowCount == dg_spec.get("rows") assert gen_from_dict.partitions == dg_spec.get("partitions") assert gen_from_dict.random == dg_spec.get("random") assert gen_from_dict.randomSeed == dg_spec.get("randomSeed") - def test_generation_from_file(self): - path = "tests/files/test_generator_spec.json" - with open(path, "r", encoding="utf-8") as f: - options = json.load(f) - gen_options = options.get("generator") - gen_from_json = DataGenerator.fromFile(path) - assert gen_from_json.name == gen_options.get("name") - assert gen_from_json.rowCount == gen_options.get("rows") - assert gen_from_json.partitions == gen_options.get("partitions") - assert gen_from_json.random == gen_options.get("random") - assert gen_from_json.randomSeed == gen_options.get("randomSeed") - - df_from_json = gen_from_json.build() - assert df_from_json.columns == ["col1", "col2", "col3"] - - path = "tests/files/test_generator_spec.yml" - with open(path, "r", encoding="utf-8") as f: - options = yaml.safe_load(f) - gen_options = options.get("generator") - gen_from_yaml = DataGenerator.fromFile(path) - assert gen_from_yaml.name == gen_options.get("name") - assert gen_from_yaml.rowCount == gen_options.get("rows") - assert gen_from_yaml.partitions == gen_options.get("partitions") - assert gen_from_yaml.random == gen_options.get("random") - assert gen_from_yaml.randomSeed == gen_options.get("randomSeed") - - df_from_json = gen_from_json.build() - assert df_from_json.columns == ["col1", "col2", "col3"] - - path = "tests/files/test_generator_spec.txt" - with pytest.raises(ValueError): - DataGenerator.fromFile(path) # Loading from .txt should raise a ValueError + def test_to_dict(self): + gen = ( + dg.DataGenerator(rows=1000, name="test_data_generator", partitions=1) + .withColumn("val1", "int", minValue=0, maxValue=100, step=1, random=True) + .withConstraint(PositiveValues(columns=["val1"], strict=True)) + ) + gen_dict = gen.toDict() + assert gen_dict["generator"]["rows"] == 1000 + assert gen_dict["generator"]["name"] == "test_data_generator" + assert gen_dict["generator"]["partitions"] == 1 + + column = gen_dict["columns"][1] + assert column["colName"] == "val1" + assert column["colType"] == "int" + assert column["minValue"] == 0 + assert column["maxValue"] == 100 + assert column["step"] == 1 + assert column["random"] + + constraint = gen_dict["constraints"][0] + assert constraint["type"] == "PositiveValues" + assert constraint["columns"] == ["val1"] + assert constraint["strict"] + + def test_to_json(self): + gen = ( + dg.DataGenerator(rows=1000, name="test_data_generator", partitions=1) + .withColumn("val1", "int", minValue=0, maxValue=100, step=1, random=True) + .withConstraint(PositiveValues(columns=["val1"], strict=True)) + ) + gen_json = gen.toJson() + gen_dict = json.loads(gen_json) + assert gen_dict["generator"]["rows"] == 1000 + assert gen_dict["generator"]["name"] == "test_data_generator" + assert gen_dict["generator"]["partitions"] == 1 + + column = gen_dict["columns"][1] + assert column["colName"] == "val1" + assert column["colType"] == "int" + assert column["minValue"] == 0 + assert column["maxValue"] == 100 + assert column["step"] == 1 + assert column["random"] + + constraint = gen_dict["constraints"][0] + assert constraint["type"] == "PositiveValues" + assert constraint["columns"] == ["val1"] + assert constraint["strict"] + + def test_to_yaml(self): + gen = ( + dg.DataGenerator(rows=1000, name="test_data_generator", partitions=1) + .withColumn("val1", "int", minValue=0, maxValue=100, step=1, random=True) + .withConstraint(PositiveValues(columns=["val1"], strict=True)) + ) + gen_yaml = gen.toYaml() + gen_dict = yaml.safe_load(gen_yaml) + assert gen_dict["generator"]["rows"] == 1000 + assert gen_dict["generator"]["name"] == "test_data_generator" + assert gen_dict["generator"]["partitions"] == 1 + + column = gen_dict["columns"][1] + assert column["colName"] == "val1" + assert column["colType"] == "int" + assert column["minValue"] == 0 + assert column["maxValue"] == 100 + assert column["step"] == 1 + assert column["random"] + + constraint = gen_dict["constraints"][0] + assert constraint["type"] == "PositiveValues" + assert constraint["columns"] == ["val1"] + assert constraint["strict"] From 039b7687138a6aabc7907a52d73b94cd03f259c9 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Mon, 10 Feb 2025 17:26:08 -0500 Subject: [PATCH 08/16] Show the Java version during build steps --- .github/workflows/push.yml | 3 +++ .github/workflows/release.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 5c915304..dbe9b94a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -27,6 +27,9 @@ jobs: # restore-keys: | # ${{ runner.os }}-go- + - name: Check Java version + run: java -version + - name: Set up Python 3.8 uses: actions/setup-python@v5 with: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e7ec05de..e755fb61 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -19,6 +19,9 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Check Java version + run: java -version + - name: Set up Python 3.8 uses: actions/setup-python@v5 with: From b011c197269cc650ee87cdd4ca547fb214032da7 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 09:14:46 -0500 Subject: [PATCH 09/16] Test git action --- .github/workflows/push.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index dbe9b94a..a619a445 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -30,6 +30,9 @@ jobs: - name: Check Java version run: java -version + - name: Set up Java 8 + run: sudo update-alternatives --config java + - name: Set up Python 3.8 uses: actions/setup-python@v5 with: From 72e1b2c351060273834316e5310c1b137ec30185 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 09:17:21 -0500 Subject: [PATCH 10/16] Test git action --- .github/workflows/push.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index a619a445..21ea44dd 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -27,11 +27,10 @@ jobs: # restore-keys: | # ${{ runner.os }}-go- - - name: Check Java version - run: java -version - - - name: Set up Java 8 - run: sudo update-alternatives --config java + - name: Set Java 8 + run: | + sudo update-alternatives --set java /usr/lib/jvm/temurin-8-jdk-amd64/bin/java + java -version - name: Set up Python 3.8 uses: actions/setup-python@v5 From 900121f3be5767d5a3f7a7ae1f56d2d5077d6297 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 09:27:56 -0500 Subject: [PATCH 11/16] Clean-up formatting --- tests/test_generation_from_config.py | 496 +++++++++++++-------------- 1 file changed, 248 insertions(+), 248 deletions(-) diff --git a/tests/test_generation_from_config.py b/tests/test_generation_from_config.py index 2e059307..4a99fb6a 100644 --- a/tests/test_generation_from_config.py +++ b/tests/test_generation_from_config.py @@ -9,17 +9,17 @@ class TestGenerationFromConfig: @pytest.mark.parametrize("expectation, columns", [ - (does_not_raise(), [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} - ]), - (does_not_raise(), [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} - ]), - ]) + (does_not_raise(), [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ]), + (does_not_raise(), [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True} + ]), + ]) def test_column_definitions_from_dict(self, columns, expectation): with expectation: # Test the options set on the ColumnGenerationSpecs: @@ -81,51 +81,51 @@ def test_constraint_definitions_from_dict(self, constraints, expectation): assert constraint in constraint_specs @pytest.mark.parametrize("expectation, options", [ - (does_not_raise(), - {"generator": {"name": "test_generator", "rows": 1000}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] - }), - (does_not_raise(), - {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] - }), - (does_not_raise(), - {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], - "constraints": [ - {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, - {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, - {"type": "SqlExpr", "expr": "col1 > 0"}, - {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": "0"}] - }), - (pytest.raises(KeyError), # Testing a dictionary missing a "generator" object - {"columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] - }), - (pytest.raises(ValueError), # Testing an invalid "type" value - {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], - "constraints": [ - {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, - {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, - {"type": "SqlExpr", "expr": "col1 > 0"}, - {"type": "Equivalent", "columns": ["col2"], "value": 0}] - }), - ]) + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 1000}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (does_not_raise(), + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": "0"}] + }), + (pytest.raises(KeyError), # Testing a dictionary missing a "generator" object + {"columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] + }), + (pytest.raises(ValueError), # Testing an invalid "type" value + {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": True}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": True}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": 0}] + }), + ]) def test_generator_from_dict(self, options, expectation): with expectation: # Test the options set on the DataGenerator: @@ -152,51 +152,51 @@ def test_generator_from_dict(self, options, expectation): assert df_from_dicts.columns == ["col1", "col2", "col3"] @pytest.mark.parametrize("expectation, json_options", [ - (does_not_raise(), - '''{"generator": {"name": "test_generator", "rows": 1000}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] - }'''), - (does_not_raise(), - '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 1000}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (does_not_raise(), + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, "columns": [ {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] - }'''), - (does_not_raise(), - '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], - "constraints": [ - {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, - {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, - {"type": "SqlExpr", "expr": "col1 > 0"}, - {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": 0}] - }'''), - (pytest.raises(KeyError), # Testing a JSON object missing the "generator" key - '''{"columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] - }'''), - (pytest.raises(ValueError), # Testing an invalid "type" value - '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, - "columns": [ - {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, - {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, - {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], "constraints": [ - {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, - {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, - {"type": "SqlExpr", "expr": "col1 > 0"}, - {"type": "Equivalent", "columns": ["col2"], "value": 0}] + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "LiteralRelation", "columns": ["col2"], "relation": "<>", "value": 0}] }'''), - ]) + (pytest.raises(KeyError), # Testing a JSON object missing the "generator" key + '''{"columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}] + }'''), + (pytest.raises(ValueError), # Testing an invalid "type" value + '''{"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, + "columns": [ + {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": true}, + {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, + {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": true}], + "constraints": [ + {"type": "LiteralRange", "columns": ["col1"], "lowValue": -1000, "highValue": 1000, "strict": true}, + {"type": "PositiveValues", "columns": ["col1", "col2"], "strict": true}, + {"type": "SqlExpr", "expr": "col1 > 0"}, + {"type": "Equivalent", "columns": ["col2"], "value": 0}] + }'''), + ]) def test_generator_from_json(self, json_options, expectation): options = json.loads(json_options) with expectation: @@ -224,158 +224,158 @@ def test_generator_from_json(self, json_options, expectation): assert df_from_dicts.columns == ["col1", "col2", "col3"] @pytest.mark.parametrize("expectation, yaml_options", [ - (does_not_raise(), - '''--- - generator: - name: test_generator - rows: 10000 - randomSeed: 42 - columns: - - colName: col1 - colType: int - minValue: 0 - maxValue: 100 - step: 2 - random: true - - colName: col2 - colType: float - minValue: 0 - maxValue: 100 - step: 1.5 - - colName: col3 - colType: string - values: - - a - - b - - c - random: true'''), - (does_not_raise(), - '''--- - generator: - name: test_generator - rows: 1000 - columns: - - colName: col1 - colType: int - minValue: 0 - maxValue: 100 - - colName: col2 - colType: float - minValue: 0 - maxValue: 100 - - colName: col3 - colType: string - values: - - a - - b - - c - random: true'''), - (does_not_raise(), - '''--- - generator: - name: test_generator - rows: 10000 - randomSeed: 42 - columns: - - colName: col1 - colType: int - minValue: 0 - maxValue: 100 - step: 2 - random: true - - colName: col2 - colType: float - minValue: 0 - maxValue: 100 - step: 1.5 - - colName: col3 - colType: string - values: - - a - - b - - c - random: true - constraints: - - type: LiteralRange - columns: - - col1 - lowValue: -1000 - highValue: 1000 - strict: true - - type: PositiveValues - columns: - - col1 - - col2 - strict: true - - type: SqlExpr - expr: col1 > 0 - - type: LiteralRelation - columns: - - col2 - relation: "<>" - value: 0'''), - (pytest.raises(KeyError), # Testing a YAML object missing the "generator" key - '''--- - columns: - - colName: col1 - colType: int - minValue: 0 - maxValue: 100 - - colName: col2 - colType: float - minValue: 0 - maxValue: 100 - - colName: col3 - colType: string - values: - - a - - b - - c - random: true'''), - (pytest.raises(ValueError), # Testing an invalid "type" value - '''--- - generator: - name: test_generator - rows: 10000 - randomSeed: 42 - columns: - - colName: col1 - colType: int - minValue: 0 - maxValue: 100 - step: 2 - random: true - - colName: col2 - colType: float - minValue: 0 - maxValue: 100 - step: 1.5 - - colName: col3 - colType: string - values: - - a - - b - - c - random: true - constraints: - - type: LiteralRange - columns: - - col1 - lowValue: -1000 - highValue: 1000 - strict: true - - type: PositiveValues - columns: - - col1 - - col2 - strict: true - - type: SqlExpr - expr: col1 > 0 - - type: Equivalent - columns: - - col2 - value: 0''') - ]) + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 1000 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (does_not_raise(), + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true + constraints: + - type: LiteralRange + columns: + - col1 + lowValue: -1000 + highValue: 1000 + strict: true + - type: PositiveValues + columns: + - col1 + - col2 + strict: true + - type: SqlExpr + expr: col1 > 0 + - type: LiteralRelation + columns: + - col2 + relation: "<>" + value: 0'''), + (pytest.raises(KeyError), # Testing a YAML object missing the "generator" key + '''--- + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true'''), + (pytest.raises(ValueError), # Testing an invalid "type" value + '''--- + generator: + name: test_generator + rows: 10000 + randomSeed: 42 + columns: + - colName: col1 + colType: int + minValue: 0 + maxValue: 100 + step: 2 + random: true + - colName: col2 + colType: float + minValue: 0 + maxValue: 100 + step: 1.5 + - colName: col3 + colType: string + values: + - a + - b + - c + random: true + constraints: + - type: LiteralRange + columns: + - col1 + lowValue: -1000 + highValue: 1000 + strict: true + - type: PositiveValues + columns: + - col1 + - col2 + strict: true + - type: SqlExpr + expr: col1 > 0 + - type: Equivalent + columns: + - col2 + value: 0''') + ]) def test_generator_from_yaml(self, yaml_options, expectation): options = yaml.safe_load(yaml_options) with expectation: From 07f4b7f8add62ac7f4b7a0871542db39c7c96e52 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 09:33:07 -0500 Subject: [PATCH 12/16] Clean-up formatting --- tests/test_generation_from_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_generation_from_config.py b/tests/test_generation_from_config.py index 4a99fb6a..2be92afc 100644 --- a/tests/test_generation_from_config.py +++ b/tests/test_generation_from_config.py @@ -112,7 +112,7 @@ def test_constraint_definitions_from_dict(self, constraints, expectation): {"colName": "col1", "colType": "int", "minValue": 0, "maxValue": 100, "step": 2, "random": True}, {"colName": "col2", "colType": "float", "minValue": 0.0, "maxValue": 100.0, "step": 1.5}, {"colName": "col3", "colType": "string", "values": ["a", "b", "c"], "random": True}] - }), + }), (pytest.raises(ValueError), # Testing an invalid "type" value {"generator": {"name": "test_generator", "rows": 10000, "randomSeed": 42}, "columns": [ From adcc9b2420d9d4860eaa3056e13cbfa48e746039 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 09:56:17 -0500 Subject: [PATCH 13/16] Test --- .github/workflows/push.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 21ea44dd..828bd7b4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -31,6 +31,7 @@ jobs: run: | sudo update-alternatives --set java /usr/lib/jvm/temurin-8-jdk-amd64/bin/java java -version + sudo update-alternatives --config java - name: Set up Python 3.8 uses: actions/setup-python@v5 @@ -50,6 +51,9 @@ jobs: - name: Install dependencies run: pipenv install --dev + - name: Check Java version + run: + - name: Lint run: | pipenv run prospector --profile prospector.yaml From 83ce7baa8ce091c90863abd9d404ec14c2d4a2fc Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 09:58:03 -0500 Subject: [PATCH 14/16] Test --- .github/workflows/push.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 828bd7b4..6abdc337 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -51,9 +51,6 @@ jobs: - name: Install dependencies run: pipenv install --dev - - name: Check Java version - run: - - name: Lint run: | pipenv run prospector --profile prospector.yaml From 607c17fae6acd1a860e5c726e61dcdd73c708ef1 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 10:36:13 -0500 Subject: [PATCH 15/16] Test --- .github/workflows/push.yml | 4 +++- makefile | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6abdc337..b8c74861 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -56,7 +56,9 @@ jobs: pipenv run prospector --profile prospector.yaml - name: Run tests - run: make test + run: | + java -version + make test - name: Publish test coverage to coverage site uses: codecov/codecov-action@v4 diff --git a/makefile b/makefile index e76e0952..17e2bfc1 100644 --- a/makefile +++ b/makefile @@ -71,6 +71,10 @@ prep-doc-release: # Tests test: export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES +test: export SPARK_MASTER_HOST='localhost' + +test: export SPARK_LOCAL_IP=127.0.0.1 + dev-test: export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES dev-test: export SPARK_MASTER_HOST='localhost' From 0f585b3937fe855e2001dd9f472384f8c87376f9 Mon Sep 17 00:00:00 2001 From: Greg Hansen Date: Tue, 11 Feb 2025 12:17:37 -0500 Subject: [PATCH 16/16] Try actions on ubuntu-22.04 --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index b8c74861..9d23a1e8 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -9,7 +9,7 @@ on: jobs: tests: # Ubuntu latest no longer installs Python 3.9 by default so install it - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4