Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ on:
jobs:
tests:
# Ubuntu latest no longer installs Python 3.9 by default so install it
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -27,6 +27,12 @@ jobs:
# restore-keys: |
# ${{ runner.os }}-go-

- name: Set Java 8
run: |
sudo update-alternatives --set java /usr/lib/jvm/temurin-8-jdk-amd64/bin/java
java -version
sudo update-alternatives --config java

- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
Expand All @@ -50,7 +56,9 @@ jobs:
pipenv run prospector --profile prospector.yaml

- name: Run tests
run: make test
run: |
java -version
make test

- name: Publish test coverage to coverage site
uses: codecov/codecov-action@v4
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v4

- name: Check Java version
run: java -version

- name: Set up Python 3.8
uses: actions/setup-python@v5
with:
Expand Down
21 changes: 20 additions & 1 deletion dbldatagen/column_generation_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
if EXPR_OPTION not in kwargs:
raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred")

elif type(colType) == str:
elif isinstance(colType, str):
colType = SchemaParser.columnTypeFromString(colType)

assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"
Expand Down Expand Up @@ -1300,3 +1300,22 @@ def makeGenerationExpressions(self):
retval = F.slice(retval, F.lit(1), F.expr(expr_str))

return retval

def toDict(self):
    """ Creates a dictionary from a ColumnGenerationSpec.

    The column type is rendered through ``simpleString()`` of the spec's datatype; every
    other entry is copied from the attribute of the spec that holds it.

    :return: A dictionary representation of the ColumnGenerationSpec
    """
    spec = dict(
        colName=self.name,
        colType=self.datatype.simpleString(),
        minValue=self.min,
        maxValue=self.max,
        step=self.step,
        values=self.values,
        expr=self.expr,
        prefix=self.prefix,
        random=self.random,
        nullable=self.nullable,
        omit=self.omit,
        implicit=self.implicit,
    )
    return spec
10 changes: 10 additions & 0 deletions dbldatagen/constraints/chained_relation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,13 @@ def _generateFilterExpression(self):

# ... and combine them using logical `and` operation
return self.mkCombinedConstraintExpression(filters)

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry carries the name of the concrete class; ``columns`` and
    ``relation`` are taken from the instance.

    :return: Python dictionary representing the constraint
    """
    definition = {"type": self.__class__.__name__}
    definition["columns"] = self._columns
    definition["relation"] = self._relation
    return definition
20 changes: 20 additions & 0 deletions dbldatagen/constraints/constraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,26 @@ def filterExpression(self):
self._calculatedFilterExpression = True
return self._filterExpression

@classmethod
def fromDict(cls, constraint):
    """ Creates a Constraint from a Python dictionary.

    The ``type`` entry names the constraint class to build; all remaining entries are
    passed to that class as keyword arguments. The input dictionary is not modified.

    :param constraint: Constraint definition as a Python dictionary
    :return: Constraint object
    :raises KeyError: If the definition has no ``type`` entry
    :raises ValueError: If no subclass with the requested name exists
    """
    inner_obj = constraint.copy()
    constraint_type = inner_obj.pop("type")

    # `__subclasses__()` only reports direct children, so walk the hierarchy one
    # generation at a time. Direct subclasses are still checked first, which keeps
    # the previous result whenever a direct subclass matches.
    generation = cls.__subclasses__()
    while generation:
        for candidate in generation:
            if candidate.__name__ == constraint_type:
                return candidate(**inner_obj)
        generation = [child for parent in generation for child in parent.__subclasses__()]

    raise ValueError(f"Unknown constraint type: {constraint_type}")

@abstractmethod
def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    This base version always raises; derived classes supply the actual dictionary.

    :return: Python dictionary representing the constraint
    :raises NotImplementedError: Always, when the base implementation is invoked
    """
    message = "Method toDict must be implemented in derived class"
    raise NotImplementedError(message)


class NoFilterMixin:
""" Mixin class to indicate that constraint has no filter expression
Expand Down
12 changes: 12 additions & 0 deletions dbldatagen/constraints/literal_range_constraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,15 @@ def _generateFilterExpression(self):

# ... and combine them using logical `and` operation
return self.mkCombinedConstraintExpression(filters)

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry carries the name of the concrete class; the columns, both range
    bounds and the strict flag are taken from the instance.

    :return: Python dictionary representing the constraint
    """
    definition = dict(
        type=self.__class__.__name__,
        columns=self._columns,
        lowValue=self._lowValue,
        highValue=self._highValue,
        strict=self._strict,
    )
    return definition
11 changes: 11 additions & 0 deletions dbldatagen/constraints/literal_relation_constraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,14 @@ def _generateFilterExpression(self):
filters = [self._generate_relation_expression(col, self._relation, literalValue) for col in expressions]

return self.mkCombinedConstraintExpression(filters)

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry carries the name of the concrete class; the columns, the relation
    and the literal value are taken from the instance.

    :return: Python dictionary representing the constraint
    """
    definition = {"type": self.__class__.__name__}
    definition["columns"] = self._columns
    definition["relation"] = self._relation
    definition["value"] = self._value
    return definition
10 changes: 10 additions & 0 deletions dbldatagen/constraints/negative_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,13 @@ def _generateFilterExpression(self):
filters = [col.isNotNull() & (col <= 0) for col in expressions]

return self.mkCombinedConstraintExpression(filters)

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry is derived from the concrete class rather than hard-coded, matching
    the other constraint classes and the class-name lookup performed by
    ``Constraint.fromDict``.

    :return: Python dictionary representing the constraint
    """
    return {
        "type": self.__class__.__name__,
        "columns": self._columns,
        "strict": self._strict
    }
10 changes: 10 additions & 0 deletions dbldatagen/constraints/positive_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,13 @@ def _generateFilterExpression(self):
filters = [col.isNotNull() & (col >= 0) for col in expressions]

return self.mkCombinedConstraintExpression(filters)

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry carries the name of the concrete class; the columns and the strict
    flag are taken from the instance.

    :return: Python dictionary representing the constraint
    """
    definition = dict(
        type=self.__class__.__name__,
        columns=self._columns,
        strict=self._strict,
    )
    return definition
12 changes: 12 additions & 0 deletions dbldatagen/constraints/ranged_values_constraint.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,15 @@ def _generateFilterExpression(self):

# ... and combine them using logical `and` operation
return self.mkCombinedConstraintExpression(filters)

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry is derived from the concrete class rather than hard-coded, matching
    the other constraint classes and the class-name lookup performed by
    ``Constraint.fromDict``.

    :return: Python dictionary representing the constraint
    """
    return {
        "type": self.__class__.__name__,
        "columns": self._columns,
        "lowValue": self._lowValue,
        "highValue": self._highValue,
        "strict": self._strict
    }
9 changes: 9 additions & 0 deletions dbldatagen/constraints/sql_expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,12 @@ def __init__(self, expr: str):
def _generateFilterExpression(self):
    """ Generate a SQL filter expression that may be used for filtering

    :return: The result of ``F.expr`` applied to the stored expression string
    """
    filterExpression = F.expr(self._expr)
    return filterExpression

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry carries the name of the concrete class and ``expr`` the stored
    expression string.

    :return: Python dictionary representing the constraint
    """
    definition = dict(type=self.__class__.__name__, expr=self._expr)
    return definition
9 changes: 9 additions & 0 deletions dbldatagen/constraints/unique_combinations.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,12 @@ def transformDataframe(self, dataGenerator, dataFrame):
results = dataFrame.dropDuplicates(columnsToEvaluate)

return results

def toDict(self):
    """ Returns a Python dictionary representation of a Constraint.

    The ``type`` entry carries the name of the concrete class and ``columns`` the
    columns stored on the instance.

    :return: Python dictionary representing the constraint
    """
    definition = {"type": self.__class__.__name__}
    definition["columns"] = self._columns
    return definition
93 changes: 93 additions & 0 deletions dbldatagen/data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
This file defines the `DataGenError` and `DataGenerator` classes
"""
import copy
import json
import logging
import re

import yaml
from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType

from ._version import _get_spark_version
Expand Down Expand Up @@ -869,6 +871,17 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable))
return self

def withColumnDefinitions(self, columns):
    """ Adds a set of columns to the synthetic generation specification.

    Each dictionary is expanded into keyword arguments for a call to ``withColumn``;
    the definitions are applied in the order given.

    :param columns: A list of column generation specifications as dictionaries
    :returns: A modified in-place instance of a data generator allowing for chaining of calls
              following a builder pattern
    """
    for definition in columns:
        self.withColumn(**definition)

    return self

def _mkSqlStructFromList(self, fields):
"""
Create a SQL struct expression from a list of fields
Expand Down Expand Up @@ -1206,6 +1219,12 @@ def _getColumnDataTypes(self, columns):
"""
return [self._columnSpecsByName[colspec].datatype for colspec in columns]

def getColumnGenerationSpecs(self):
    """ Returns the column generation specs held by the data generator.

    :return: The object stored in ``_allColumnSpecs`` (the same object, not a copy)
    """
    columnSpecs = self._allColumnSpecs
    return columnSpecs

def getConstraints(self):
    """ Returns the constraints held by the data generator.

    :return: The object stored in ``_constraints`` (the same object, not a copy)
    """
    constraints = self._constraints
    return constraints

def withConstraint(self, constraint):
"""Add a constraint to control the data generation

Expand Down Expand Up @@ -1255,6 +1274,17 @@ def withSqlConstraint(self, sqlExpression: str):
self.withConstraint(SqlExpr(sqlExpression))
return self

def withConstraintDefinitions(self, constraints):
    """ Adds a set of constraints to the synthetic generation specification.

    Each dictionary is turned into a constraint object with ``Constraint.fromDict`` and
    then registered through ``withConstraint``, in the order given.

    :param constraints: A list of constraints as dictionaries
    :returns: A modified in-place instance of a data generator allowing for chaining of calls
              following a builder pattern
    """
    for definition in constraints:
        constraint = Constraint.fromDict(definition)
        self.withConstraint(constraint)

    return self

def computeBuildPlan(self):
""" prepare for building by computing a pseudo build plan

Expand Down Expand Up @@ -1604,3 +1634,66 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
result = HtmlUtils.formatCodeAsHtml(results)

return result

@staticmethod
def fromDict(options):
    """ Creates a data generator from a dictionary of options.

    :param options: Dictionary with a required "generator" entry holding the keyword arguments
                    for the DataGenerator constructor (e.g. "name", "rows"), plus optional
                    "columns" and "constraints" lists of definition dictionaries
    :return: A data generator with the specified options
    :raises KeyError: If ``options`` has no "generator" entry
    """
    # NOTE(review): reviewer request - add explicit tests for this method covering
    #   1 - simple options
    #   2 - composite (object-valued) options, e.g. DateRange and Distribution objects
    generator = options["generator"]
    columns = options.get("columns", [])
    constraints = options.get("constraints", [])
    return (
        DataGenerator(**generator)
        .withColumnDefinitions(columns)
        .withConstraintDefinitions(constraints)
    )

def toDict(self):
    """ Creates a dictionary from a DataGenerator.

    The result has three entries: "generator" with the generator-level options, "columns"
    with the ``toDict()`` output of each column generation spec, and "constraints" with
    the ``toDict()`` output of each constraint.

    :return: A dictionary representation of the DataGenerator
    """
    generatorOptions = dict(
        name=self.name,
        rows=self.rowCount,
        partitions=self.partitions,
        random=self.random,
        randomSeed=self.randomSeed,
        startingId=self.starting_id,
    )

    columnDefinitions = []
    for columnSpec in self.getColumnGenerationSpecs():
        columnDefinitions.append(columnSpec.toDict())

    constraintDefinitions = []
    for constraint in self.getConstraints():
        constraintDefinitions.append(constraint.toDict())

    return {
        "generator": generatorOptions,
        "columns": columnDefinitions,
        "constraints": constraintDefinitions,
    }

@staticmethod
def fromJson(options):
    """ Creates a data generator from a JSON string.

    The string is parsed with ``json.loads`` and the result is handed to
    ``DataGenerator.fromDict``.

    :param options: A JSON string containing data generation options
    :return: A data generator with the specified options
    """
    parsedOptions = json.loads(options)
    return DataGenerator.fromDict(parsedOptions)

def toJson(self):
    """ Returns the JSON string representation of a data generator.

    The string is produced by passing the result of ``toDict()`` to ``json.dumps``.

    :return: A JSON string representation of the DataGenerator
    """
    definition = self.toDict()
    return json.dumps(definition)

@staticmethod
def fromYaml(options):
    """ Creates a data generator from a YAML string.

    The string is parsed with ``yaml.safe_load`` and the result is handed to
    ``DataGenerator.fromDict``.

    :param options: A YAML string containing data generation options
    :return: A data generator with the specified options
    """
    parsedOptions = yaml.safe_load(options)
    return DataGenerator.fromDict(parsedOptions)

def toYaml(self):
    """ Returns the YAML string representation of a data generator.

    The string is produced by passing the result of ``toDict()`` to ``yaml.dump``.

    :return: A YAML string representation of the DataGenerator
    """
    # NOTE(review): fromYaml parses with yaml.safe_load; yaml.dump may emit Python-specific
    # tags for non-basic values that safe_load rejects - consider yaml.safe_dump; TODO confirm
    definition = self.toDict()
    return yaml.dump(definition)
6 changes: 3 additions & 3 deletions dbldatagen/text_generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __str__(self):
return f"TextGenerator(randomSeed={self._randomSeed})"

def __eq__(self, other):
return type(self) == type(other) and self._randomSeed == other._randomSeed
return isinstance(self, type(other)) and self._randomSeed == other._randomSeed

def withRandomSeed(self, seed):
""" Set the random seed for the text generator
Expand Down Expand Up @@ -260,7 +260,7 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None):
assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2"
mapping_length, mappings = v
assert isinstance(mapping_length, int), "mapping length must be of type int"
assert isinstance(mappings, (list, np.ndarray)),\
assert isinstance(mappings, (list, np.ndarray)), \
"mappings are lists or numpy arrays"
assert mapping_length == 0 or len(mappings) == mapping_length, "mappings must match mapping_length"

Expand All @@ -277,7 +277,7 @@ def __init__(self, template, escapeSpecialChars=False, extendedWordList=None):
assert v is not None and isinstance(v, tuple) and len(v) == 2, "value must be tuple of length 2"
mapping_length, mappings = v
assert isinstance(mapping_length, int), "mapping length must be of type int"
assert mappings is None or isinstance(mappings, (list, np.ndarray)),\
assert mappings is None or isinstance(mappings, (list, np.ndarray)), \
"mappings are lists or numpy arrays"

# for escaped mappings, the mapping can be None in which case the mapping is to the number itself
Expand Down
Loading