diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 332fada15d9dc..ce8d0ebbdd8ac 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -19,6 +19,8 @@ jobs: metadata-ingestion-general: runs-on: ubuntu-latest + env: + SPARK_VERSION: 3.0.3 strategy: matrix: python-version: ["3.6", "3.9.9"] @@ -46,6 +48,8 @@ jobs: metadata-ingestion-by-version: runs-on: ubuntu-latest + env: + SPARK_VERSION: 3.0.3 strategy: matrix: python-version: ["3.6", "3.9.9"] diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c6419708b3683..e089d610f9328 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -99,6 +99,21 @@ def get_long_description(): "cryptography", } +data_lake_base = { + *aws_common, + "parse>=1.19.0", + "pyarrow>=6.0.1", + "tableschema>=1.20.2", + "ujson>=4.3.0", + "types-ujson>=4.2.1", + "smart-open[s3]>=5.2.1", +} + +data_lake_profiling = { + "pydeequ==1.0.1", + "pyspark==3.0.3", +} + # Note: for all of these, framework_common will be added. plugins: Dict[str, Set[str]] = { # Sink plugins. @@ -118,7 +133,7 @@ def get_long_description(): "clickhouse-usage": sql_common | {"clickhouse-sqlalchemy==0.1.8"}, "datahub-lineage-file": set(), "datahub-business-glossary": set(), - "data-lake": {*aws_common, "pydeequ==1.0.1", "pyspark==3.0.3", "parse==1.19.0"}, + "data-lake": {*data_lake_base, *data_lake_profiling}, "dbt": {"requests"}, "druid": sql_common | {"pydruid>=0.6.2"}, # Starting with 7.14.0 python client is checking if it is connected to elasticsearch client. 
If its not it throws @@ -196,6 +211,7 @@ def get_long_description(): *base_requirements, *framework_common, *mypy_stubs, + *data_lake_base, "black>=21.12b0", "coverage>=5.1", "flake8>=3.8.3", diff --git a/metadata-ingestion/source_docs/data_lake.md b/metadata-ingestion/source_docs/data_lake.md index f832b05683354..fef3a9454d74a 100644 --- a/metadata-ingestion/source_docs/data_lake.md +++ b/metadata-ingestion/source_docs/data_lake.md @@ -10,7 +10,7 @@ This source is in **Beta** and under active development. Not yet considered read ## Setup -To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Because the files are read using PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed. +To install this plugin, run `pip install 'acryl-datahub[data-lake]'`. Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details). The data lake connector extracts schemas and profiles from a variety of file formats (see below for an exhaustive list). Individual files are ingested as tables, and profiles are computed similar to the [SQL profiler](./sql_profiles.md). @@ -37,7 +37,7 @@ If you would like to write a more complicated function for resolving file names, Extracts: - Row and column counts for each table -- For each column, if applicable: +- For each column, if profiling is enabled: - null counts and proportions - distinct counts and proportions - minimum, maximum, mean, median, standard deviation, some quantile values @@ -47,20 +47,25 @@ This connector supports both local files as well as those stored on AWS S3 (whic - CSV - TSV -- Parquet - JSON +- Parquet - Apache Avro +Schemas for Parquet and Avro files are extracted as provided. + +Schemas for schemaless formats (CSV, TSV, JSON) are inferred. 
For CSV and TSV files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details)) +JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance. +We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object. + :::caution If you are ingesting datasets from AWS S3, we recommend running the ingestion on a server in the same region to avoid high egress costs. ::: -| Capability | Status | Details | -| -----------| ------ | ---- | -| Platform Instance | 🛑 | [link](../../docs/platform-instances.md) | - +| Capability | Status | Details | +| ----------------- | ------ | ---------------------------------------- | +| Platform Instance | 🛑 | [link](../../docs/platform-instances.md) | ## Quickstart recipe @@ -99,6 +104,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe. | `aws_config.aws_access_key_id` | | Autodetected | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html | | `aws_config.aws_secret_access_key` | | Autodetected | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html | | `aws_config.aws_session_token` | | Autodetected | See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html | +| `max_rows` | | `100` | Maximum number of rows to use when inferring schemas for TSV and CSV files. | | `schema_patterns.allow` | | `*` | List of regex patterns for tables to ingest. Defaults to all. | | `schema_patterns.deny` | | | List of regex patterns for tables to not ingest. Defaults to none. | | `schema_patterns.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching of tables to ingest. | @@ -121,9 +127,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe. 
## Compatibility -Files are read using PySpark and profiles are computed with PyDeequ. -We currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` environment variable to be set for PySpark. -The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). +Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/). diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index c45685f4d6d4a..4f8d3251e93b9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -256,9 +256,8 @@ def process_dataflow_node( # append S3 format if different ones exist if len(s3_formats[s3_uri]) > 1: node_urn = make_s3_urn( - s3_uri, + f"{s3_uri}.{node_args.get('format')}", self.env, - suffix=node_args.get("format"), ) else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py index 57df31807435b..fe232aa47f93c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py @@ -1,17 +1,35 @@ -from typing import Optional +import os +S3_PREFIXES = ["s3://", "s3n://", "s3a://"] -def make_s3_urn(s3_uri: str, env: str, suffix: Optional[str] = None) -> str: - if not s3_uri.startswith("s3://"): - raise 
ValueError("S3 URIs should begin with 's3://'") +def is_s3_uri(uri: str) -> bool: + return any(uri.startswith(prefix) for prefix in S3_PREFIXES) + + +def strip_s3_prefix(s3_uri: str) -> str: # remove S3 prefix (s3://) - s3_name = s3_uri[5:] + for s3_prefix in S3_PREFIXES: + if s3_uri.startswith(s3_prefix): + plain_base_path = s3_uri[len(s3_prefix) :] + return plain_base_path + + raise ValueError( + f"Not an S3 URI. Must start with one of the following prefixes: {str(S3_PREFIXES)}" + ) + + +def make_s3_urn(s3_uri: str, env: str) -> str: + + s3_name = strip_s3_prefix(s3_uri) if s3_name.endswith("/"): s3_name = s3_name[:-1] - if suffix is not None: - return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name}_{suffix},{env})" + name, extension = os.path.splitext(s3_name) + + if extension != "": + extension = extension[1:] # remove the dot + return f"urn:li:dataset:(urn:li:dataPlatform:s3,{name}_{extension},{env})" return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{env})" diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py index b8669b22b7c9c..53c6388c1621d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/__init__.py @@ -30,16 +30,18 @@ TimestampType, ) from pyspark.sql.utils import AnalysisException +from smart_open import open as smart_open from datahub.emitter.mce_builder import make_data_platform_urn, make_dataset_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.aws.s3_util import is_s3_uri, make_s3_urn, strip_s3_prefix from datahub.ingestion.source.data_lake.config import 
DataLakeSourceConfig from datahub.ingestion.source.data_lake.profiling import _SingleTableProfiler from datahub.ingestion.source.data_lake.report import DataLakeSourceReport +from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.com.linkedin.pegasus2avro.schema import ( @@ -49,7 +51,6 @@ NullTypeClass, NumberTypeClass, RecordTypeClass, - SchemaField, SchemaFieldDataType, SchemaMetadata, StringTypeClass, @@ -171,6 +172,9 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext): for config_flag in profiling_flags_to_report }, ) + self.init_spark() + + def init_spark(self): conf = SparkConf() @@ -201,7 +205,6 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext): # see https://hadoop.apache.org/docs/r3.0.3/hadoop-aws/tools/hadoop-aws/index.html#Changing_Authentication_Providers if all(x is not None for x in aws_provided_credentials): - conf.set( "spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider", @@ -233,7 +236,7 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext): ) conf.set("spark.jars.excludes", pydeequ.f2j_maven_coord) - conf.set("spark.driver.memory", config.spark_driver_memory) + conf.set("spark.driver.memory", self.source_config.spark_driver_memory) self.spark = SparkSession.builder.config(conf=conf).getOrCreate() @@ -243,7 +246,7 @@ def create(cls, config_dict, ctx): return cls(config, ctx) - def read_file(self, file: str) -> Optional[DataFrame]: + def read_file_spark(self, file: str) -> Optional[DataFrame]: extension = os.path.splitext(file)[1] @@ -294,7 +297,7 @@ def read_file(self, file: str) -> Optional[DataFrame]: return df.toDF(*(c.replace(".", "_") for c in df.columns)) def get_table_schema( - self, dataframe: DataFrame, 
file_path: str, table_name: str + self, file_path: str, table_name: str ) -> Iterable[MetadataWorkUnit]: data_platform_urn = make_data_platform_urn(self.source_config.platform) @@ -305,7 +308,7 @@ def get_table_schema( dataset_name = os.path.basename(file_path) # if no path spec is provided and the file is in S3, then use the S3 path to construct an URN - if self.source_config.platform == "s3" and self.source_config.path_spec is None: + if is_s3_uri(file_path) and self.source_config.path_spec is None: dataset_urn = make_s3_urn(file_path, self.source_config.env) dataset_snapshot = DatasetSnapshot( @@ -319,25 +322,53 @@ def get_table_schema( ) dataset_snapshot.aspects.append(dataset_properties) - column_fields = [] + if file_path.startswith("s3a://"): + if self.source_config.aws_config is None: + raise ValueError("AWS config is required for S3 file sources") - for field in dataframe.schema.fields: + s3_client = self.source_config.aws_config.get_s3_client() - field = SchemaField( - fieldPath=field.name, - type=get_column_type(self.report, dataset_name, field.dataType), - nativeDataType=str(field.dataType), - recursive=False, - ) + file = smart_open(file_path, "rb", transport_params={"client": s3_client}) - column_fields.append(field) + else: + file = open(file_path, "rb") + + fields = [] + + try: + if file_path.endswith(".parquet"): + fields = parquet.ParquetInferrer().infer_schema(file) + elif file_path.endswith(".csv"): + fields = csv_tsv.CsvInferrer( + max_rows=self.source_config.max_rows + ).infer_schema(file) + elif file_path.endswith(".tsv"): + fields = csv_tsv.TsvInferrer( + max_rows=self.source_config.max_rows + ).infer_schema(file) + elif file_path.endswith(".json"): + fields = json.JsonInferrer().infer_schema(file) + elif file_path.endswith(".avro"): + fields = avro.AvroInferrer().infer_schema(file) + else: + self.report.report_warning( + file_path, f"file {file_path} has unsupported extension" + ) + file.close() + except Exception as e: + 
self.report.report_warning( + file_path, f"could not infer schema for file {file_path}: {e}" + ) + file.close() + + fields = sorted(fields, key=lambda f: f.fieldPath) schema_metadata = SchemaMetadata( schemaName=dataset_name, platform=data_platform_urn, version=0, hash="", - fields=column_fields, + fields=fields, platformSchema=OtherSchemaClass(rawSchema=""), ) @@ -348,10 +379,16 @@ def get_table_schema( self.report.report_workunit(wu) yield wu - def get_table_name(self, relative_path: str) -> str: + def get_table_name(self, relative_path: str, full_path: str) -> str: if self.source_config.path_spec is None: - return relative_path + name, extension = os.path.splitext(full_path) + + if extension != "": + extension = extension[1:] # remove the dot + return f"{name}_{extension}" + + return name def warn(): self.report.report_warning( @@ -382,24 +419,28 @@ def ingest_table( self, full_path: str, relative_path: str ) -> Iterable[MetadataWorkUnit]: - table_name = self.get_table_name(relative_path) - - table = self.read_file(full_path) - - # if table is not readable, skip - if table is None: - return + table_name = self.get_table_name(relative_path, full_path) # yield the table schema first logger.debug( f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}" ) - yield from self.get_table_schema(table, full_path, table_name) + yield from self.get_table_schema(full_path, table_name) # If profiling is not enabled, skip the rest if not self.source_config.profiling.enabled: return + # read in the whole table with Spark for profiling + table = self.read_file_spark(full_path) + + # if table is not readable, skip + if table is None: + self.report.report_warning( + table_name, f"unable to read table {table_name} from file {full_path}" + ) + return + with PerfTimer() as timer: # init PySpark analysis object logger.debug( @@ -460,10 +501,7 @@ def ingest_table( def get_workunits_s3(self) -> Iterable[MetadataWorkUnit]: - for s3_prefix in 
S3_PREFIXES: - if self.source_config.base_path.startswith(s3_prefix): - plain_base_path = self.source_config.base_path.lstrip(s3_prefix) - break + plain_base_path = strip_s3_prefix(self.source_config.base_path) # append a trailing slash if it's not there so prefix filtering works if not plain_base_path.endswith("/"): @@ -531,11 +569,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: with PerfTimer() as timer: # check if file is an s3 object - if any( - self.source_config.base_path.startswith(s3_prefix) - for s3_prefix in S3_PREFIXES - ): - + if is_s3_uri(self.source_config.base_path): yield from self.get_workunits_s3() else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py index df59fe8bd5ce2..004af7302043d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake/config.py @@ -29,6 +29,8 @@ class DataLakeSourceConfig(ConfigModel): spark_driver_memory: str = "4g" + max_rows: int = 100 + @pydantic.root_validator() def ensure_profiling_pattern_is_passed_to_profiling( cls, values: Dict[str, Any] diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index 3ee30246050e6..7a6782b552f01 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -1,13 +1,9 @@ import logging -from collections import Counter from dataclasses import dataclass, field -from typing import Any -from typing import Counter as CounterType from typing import Dict, Iterable, List, Optional, Tuple, Type, Union, ValuesView import bson import pymongo -from mypy_extensions import TypedDict from packaging import version from pydantic import PositiveInt, validator from pymongo.mongo_client import MongoClient @@ -17,6 +13,10 @@ from 
datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.schema_inference.object import ( + SchemaDescription, + construct_schema, +) from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent from datahub.metadata.com.linkedin.pegasus2avro.schema import ( @@ -116,181 +116,6 @@ def report_dropped(self, name: str) -> None: } -def is_nullable_doc(doc: Dict[str, Any], field_path: Tuple) -> bool: - """ - Check if a nested field is nullable in a document from a collection. - - Parameters - ---------- - doc: - document to check nullability for - field_path: - path to nested field to check, ex. ('first_field', 'nested_child', '2nd_nested_child') - """ - - field = field_path[0] - - # if field is inside - if field in doc: - - value = doc[field] - - if value is None: - return True - - # if no fields left, must be non-nullable - if len(field_path) == 1: - return False - - # otherwise, keep checking the nested fields - remaining_fields = field_path[1:] - - # if dictionary, check additional level of nesting - if isinstance(value, dict): - return is_nullable_doc(doc[field], remaining_fields) - - # if list, check if any member is missing field - if isinstance(value, list): - - # count empty lists of nested objects as nullable - if len(value) == 0: - return True - - return any(is_nullable_doc(x, remaining_fields) for x in doc[field]) - - # any other types to check? - # raise ValueError("Nested type not 'list' or 'dict' encountered") - return True - - return True - - -def is_nullable_collection( - collection: Iterable[Dict[str, Any]], field_path: Tuple -) -> bool: - """ - Check if a nested field is nullable in a collection. 
- - Parameters - ---------- - collection: - collection to check nullability for - field_path: - path to nested field to check, ex. ('first_field', 'nested_child', '2nd_nested_child') - """ - - return any(is_nullable_doc(doc, field_path) for doc in collection) - - -class BasicSchemaDescription(TypedDict): - types: CounterType[type] # field types and times seen - count: int # times the field was seen - - -class SchemaDescription(BasicSchemaDescription): - delimited_name: str # collapsed field name - # we use 'mixed' to denote mixed types, so we need a str here - type: Union[type, str] # collapsed type - nullable: bool # if field is ever missing - - -def construct_schema( - collection: Iterable[Dict[str, Any]], delimiter: str -) -> Dict[Tuple[str, ...], SchemaDescription]: - """ - Construct (infer) a schema from a collection of documents. - - For each field (represented as a tuple to handle nested items), reports the following: - - `types`: Python types of field values - - `count`: Number of times the field was encountered - - `type`: type of the field if `types` is just a single value, otherwise `mixed` - - `nullable`: if field is ever null/missing - - `delimited_name`: name of the field, joined by a given delimiter - - Parameters - ---------- - collection: - collection to construct schema over. - delimiter: - string to concatenate field names by - """ - - schema: Dict[Tuple[str, ...], BasicSchemaDescription] = {} - - def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> None: - """ - Recursively update the schema with a document, which may/may not contain nested fields. 
- - Parameters - ---------- - doc: - document to scan - parent_prefix: - prefix of fields that the document is under, pass an empty tuple when initializing - """ - - for key, value in doc.items(): - - new_parent_prefix = parent_prefix + (key,) - - # if nested value, look at the types within - if isinstance(value, dict): - - append_to_schema(value, new_parent_prefix) - - # if array of values, check what types are within - if isinstance(value, list): - - for item in value: - - # if dictionary, add it as a nested object - if isinstance(item, dict): - append_to_schema(item, new_parent_prefix) - - # don't record None values (counted towards nullable) - if value is not None: - - if new_parent_prefix not in schema: - - schema[new_parent_prefix] = { - "types": Counter([type(value)]), - "count": 1, - } - - else: - - # update the type count - schema[new_parent_prefix]["types"].update({type(value): 1}) - schema[new_parent_prefix]["count"] += 1 - - for document in collection: - append_to_schema(document, ()) - - extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {} - - for field_path in schema.keys(): - - field_types = schema[field_path]["types"] - - field_type: Union[str, type] = "mixed" - - # if single type detected, mark that as the type to go with - if len(field_types.keys()) == 1: - field_type = next(iter(field_types)) - - field_extended: SchemaDescription = { - "types": schema[field_path]["types"], - "count": schema[field_path]["count"], - "nullable": is_nullable_collection(collection, field_path), - "delimited_name": delimiter.join(field_path), - "type": field_type, - } - - extended_schema[field_path] = field_extended - - return extended_schema - - def construct_schema_pymongo( collection: pymongo.collection.Collection, delimiter: str, diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff 
--git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/avro.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/avro.py new file mode 100644 index 0000000000000..4e97b58674a6c --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/avro.py @@ -0,0 +1,17 @@ +from typing import IO, List + +from avro.datafile import DataFileReader +from avro.io import DatumReader + +from datahub.ingestion.extractor import schema_util +from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField + + +class AvroInferrer(SchemaInferenceBase): + def infer_schema(self, file: IO[bytes]) -> List[SchemaField]: + + reader = DataFileReader(file, DatumReader()) + fields = schema_util.avro_schema_to_mce_fields(reader.schema) + + return fields diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/base.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/base.py new file mode 100644 index 0000000000000..dfec16f5289fd --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/base.py @@ -0,0 +1,15 @@ +from typing import IO, List + +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField + + +class SchemaInferenceBase: + """ + Base class for file schema inference. + """ + + def infer_schema(self, file: IO[bytes]) -> List[SchemaField]: + """ + Infer schema from file. 
+ """ + raise NotImplementedError("infer_schema not implemented") diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py new file mode 100644 index 0000000000000..54f7dfb5b903c --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/csv_tsv.py @@ -0,0 +1,74 @@ +from typing import IO, Dict, List, Type + +from tableschema import Table + +from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + ArrayTypeClass, + BooleanTypeClass, + DateTypeClass, + NullTypeClass, + NumberTypeClass, + RecordTypeClass, + SchemaField, + SchemaFieldDataType, + StringTypeClass, + TimeTypeClass, + UnionTypeClass, +) + +# see https://github.com/frictionlessdata/tableschema-py/blob/main/tableschema/schema.py#L545 +tableschema_type_map: Dict[str, Type] = { + "duration": TimeTypeClass, + "geojson": RecordTypeClass, + "geopoint": RecordTypeClass, + "object": RecordTypeClass, + "array": ArrayTypeClass, + "datetime": TimeTypeClass, + "time": TimeTypeClass, + "date": DateTypeClass, + "integer": NumberTypeClass, + "number": NumberTypeClass, + "boolean": BooleanTypeClass, + "string": StringTypeClass, + "any": UnionTypeClass, +} + + +def get_table_schema_fields(table: Table, max_rows: int) -> List[SchemaField]: + table.infer(limit=max_rows) + + fields: List[SchemaField] = [] + + for raw_field in table.schema.fields: + mapped_type: Type = tableschema_type_map.get(raw_field.type, NullTypeClass) + + field = SchemaField( + fieldPath=raw_field.name, + type=SchemaFieldDataType(mapped_type()), + nativeDataType=str(raw_field.type), + recursive=False, + ) + fields.append(field) + + return fields + + +class CsvInferrer(SchemaInferenceBase): + def __init__(self, max_rows: int): + self.max_rows = max_rows + + def infer_schema(self, file: IO[bytes]) -> List[SchemaField]: + # infer schema 
of a csv file without reading the whole file + table = Table(file, format="csv") + return get_table_schema_fields(table, max_rows=self.max_rows) + + +class TsvInferrer(SchemaInferenceBase): + def __init__(self, max_rows: int): + self.max_rows = max_rows + + def infer_schema(self, file: IO[bytes]) -> List[SchemaField]: + # infer schema of a tsv file without reading the whole file + table = Table(file, format="tsv") + return get_table_schema_fields(table, max_rows=self.max_rows) diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py new file mode 100644 index 0000000000000..9d690d8304b77 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/json.py @@ -0,0 +1,59 @@ +from typing import IO, Dict, List, Type, Union + +import ujson + +from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase +from datahub.ingestion.source.schema_inference.object import construct_schema +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + ArrayTypeClass, + BooleanTypeClass, + NullTypeClass, + NumberTypeClass, + RecordTypeClass, + SchemaField, + SchemaFieldDataType, + StringTypeClass, + UnionTypeClass, +) + +_field_type_mapping: Dict[Union[Type, str], Type] = { + list: ArrayTypeClass, + bool: BooleanTypeClass, + type(None): NullTypeClass, + int: NumberTypeClass, + float: NumberTypeClass, + str: StringTypeClass, + dict: RecordTypeClass, + "mixed": UnionTypeClass, +} + + +class JsonInferrer(SchemaInferenceBase): + def infer_schema(self, file: IO[bytes]) -> List[SchemaField]: + + datastore = ujson.load(file) + + if not isinstance(datastore, list): + datastore = [datastore] + + schema = construct_schema(datastore, delimiter=".") + fields: List[SchemaField] = [] + + for schema_field in sorted(schema.values(), key=lambda x: x["delimited_name"]): + mapped_type = _field_type_mapping.get(schema_field["type"], NullTypeClass) + + 
native_type = schema_field["type"] + + if isinstance(native_type, type): + native_type = native_type.__name__ + + field = SchemaField( + fieldPath=schema_field["delimited_name"], + nativeDataType=native_type, + type=SchemaFieldDataType(type=mapped_type()), + nullable=schema_field["nullable"], + recursive=False, + ) + fields.append(field) + + return fields diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py new file mode 100644 index 0000000000000..1dd54978fd3ce --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py @@ -0,0 +1,168 @@ +from collections import Counter +from typing import Any +from typing import Counter as CounterType +from typing import Dict, Sequence, Tuple, Union + +from mypy_extensions import TypedDict + + +class BasicSchemaDescription(TypedDict): + types: CounterType[type] # field types and times seen + count: int # times the field was seen + + +class SchemaDescription(BasicSchemaDescription): + delimited_name: str # collapsed field name + # we use 'mixed' to denote mixed types, so we need a str here + type: Union[type, str] # collapsed type + nullable: bool # if field is ever missing + + +def is_field_nullable(doc: Dict[str, Any], field_path: Tuple) -> bool: + """ + Check if a nested field is nullable in a document from a collection. + + Parameters + ---------- + doc: + document to check nullability for + field_path: + path to nested field to check, ex. 
('first_field', 'nested_child', '2nd_nested_child') + """ + + if not field_path: + return True + + field = field_path[0] + + # if field is inside + if field in doc: + value = doc[field] + + if value is None: + return True + # if no fields left, must be non-nullable + if len(field_path) == 1: + return False + + # otherwise, keep checking the nested fields + remaining_fields = field_path[1:] + + # if dictionary, check additional level of nesting + if isinstance(value, dict): + return is_field_nullable(doc[field], remaining_fields) + # if list, check if any member is missing field + if isinstance(value, list): + # count empty lists of nested objects as nullable + if len(value) == 0: + return True + return any(is_field_nullable(x, remaining_fields) for x in doc[field]) + + # any other types to check? + # raise ValueError("Nested type not 'list' or 'dict' encountered") + return True + + return True + + +def is_nullable_collection( + collection: Sequence[Dict[str, Any]], field_path: Tuple +) -> bool: + """ + Check if a nested field is nullable in a collection. + + Parameters + ---------- + collection: + collection to check nullability for + field_path: + path to nested field to check, ex. ('first_field', 'nested_child', '2nd_nested_child') + """ + + return any(is_field_nullable(doc, field_path) for doc in collection) + + +def construct_schema( + collection: Sequence[Dict[str, Any]], delimiter: str +) -> Dict[Tuple[str, ...], SchemaDescription]: + """ + Construct (infer) a schema from a collection of documents. + + For each field (represented as a tuple to handle nested items), reports the following: + - `types`: Python types of field values + - `count`: Number of times the field was encountered + - `type`: type of the field if `types` is just a single value, otherwise `mixed` + - `nullable`: if field is ever null/missing + - `delimited_name`: name of the field, joined by a given delimiter + + Parameters + ---------- + collection: + collection to construct schema over. 
+ delimiter: + string to concatenate field names by + """ + + schema: Dict[Tuple[str, ...], BasicSchemaDescription] = {} + + def append_to_schema(doc: Dict[str, Any], parent_prefix: Tuple[str, ...]) -> None: + """ + Recursively update the schema with a document, which may/may not contain nested fields. + + Parameters + ---------- + doc: + document to scan + parent_prefix: + prefix of fields that the document is under, pass an empty tuple when initializing + """ + + for key, value in doc.items(): + new_parent_prefix = parent_prefix + (key,) + + # if nested value, look at the types within + if isinstance(value, dict): + append_to_schema(value, new_parent_prefix) + # if array of values, check what types are within + if isinstance(value, list): + for item in value: + # if dictionary, add it as a nested object + if isinstance(item, dict): + append_to_schema(item, new_parent_prefix) + + # don't record None values (counted towards nullable) + if value is not None: + if new_parent_prefix not in schema: + schema[new_parent_prefix] = { + "types": Counter([type(value)]), + "count": 1, + } + + else: + # update the type count + schema[new_parent_prefix]["types"].update({type(value): 1}) + schema[new_parent_prefix]["count"] += 1 + + for document in collection: + append_to_schema(document, ()) + + extended_schema: Dict[Tuple[str, ...], SchemaDescription] = {} + + for field_path in schema.keys(): + field_types = schema[field_path]["types"] + field_type: Union[str, type] = "mixed" + + # if single type detected, mark that as the type to go with + if len(field_types.keys()) == 1: + field_type = next(iter(field_types)) + field_extended: SchemaDescription = { + "types": schema[field_path]["types"], + "count": schema[field_path]["count"], + "nullable": is_nullable_collection(collection, field_path), + "delimited_name": delimiter.join(field_path), + "type": field_type, + } + + extended_schema[field_path] = field_extended + + return extended_schema diff --git 
from typing import IO, Any, Callable, Dict, List, Type

import pyarrow
import pyarrow.parquet

from datahub.ingestion.source.schema_inference.base import SchemaInferenceBase
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    ArrayTypeClass,
    BooleanTypeClass,
    BytesTypeClass,
    DateTypeClass,
    NullTypeClass,
    NumberTypeClass,
    RecordTypeClass,
    SchemaField,
    SchemaFieldDataType,
    StringTypeClass,
    TimeTypeClass,
    UnionTypeClass,
)

# see https://arrow.apache.org/docs/python/api/datatypes.html#type-checking
#
# NOTE: order matters. `map_pyarrow_type` returns the class of the FIRST checker
# that matches, so a specific checker must be listed before any broader checker
# that also accepts the same type and maps to a different class. In particular
# `is_temporal` accepts date, time, timestamp and duration types, so it has to
# come after the date checkers or dates would be reported as TimeTypeClass.
pyarrow_type_map: Dict[Callable[[Any], bool], Type] = {
    pyarrow.types.is_boolean: BooleanTypeClass,
    pyarrow.types.is_integer: NumberTypeClass,
    pyarrow.types.is_signed_integer: NumberTypeClass,
    pyarrow.types.is_unsigned_integer: NumberTypeClass,
    pyarrow.types.is_int8: NumberTypeClass,
    pyarrow.types.is_int16: NumberTypeClass,
    pyarrow.types.is_int32: NumberTypeClass,
    pyarrow.types.is_int64: NumberTypeClass,
    pyarrow.types.is_uint8: NumberTypeClass,
    pyarrow.types.is_uint16: NumberTypeClass,
    pyarrow.types.is_uint32: NumberTypeClass,
    pyarrow.types.is_uint64: NumberTypeClass,
    pyarrow.types.is_floating: NumberTypeClass,
    pyarrow.types.is_float16: NumberTypeClass,
    pyarrow.types.is_float32: NumberTypeClass,
    pyarrow.types.is_float64: NumberTypeClass,
    pyarrow.types.is_decimal: NumberTypeClass,
    pyarrow.types.is_list: ArrayTypeClass,
    pyarrow.types.is_large_list: ArrayTypeClass,
    pyarrow.types.is_struct: RecordTypeClass,
    pyarrow.types.is_union: UnionTypeClass,
    pyarrow.types.is_nested: RecordTypeClass,
    # specific temporal checkers first ...
    pyarrow.types.is_date: DateTypeClass,
    pyarrow.types.is_date32: DateTypeClass,
    pyarrow.types.is_date64: DateTypeClass,
    pyarrow.types.is_timestamp: TimeTypeClass,
    pyarrow.types.is_time: TimeTypeClass,
    pyarrow.types.is_time32: TimeTypeClass,
    pyarrow.types.is_time64: TimeTypeClass,
    # ... then the catch-all for the remaining temporal types
    pyarrow.types.is_temporal: TimeTypeClass,
    pyarrow.types.is_null: NullTypeClass,
    pyarrow.types.is_binary: BytesTypeClass,
    pyarrow.types.is_unicode: StringTypeClass,
    pyarrow.types.is_string: StringTypeClass,
    pyarrow.types.is_large_binary: BytesTypeClass,
    pyarrow.types.is_large_unicode: StringTypeClass,
    pyarrow.types.is_large_string: StringTypeClass,
    pyarrow.types.is_fixed_size_binary: BytesTypeClass,
    pyarrow.types.is_map: RecordTypeClass,
    pyarrow.types.is_dictionary: RecordTypeClass,
}


def map_pyarrow_type(pyarrow_type: pyarrow.DataType) -> Type:
    """
    Map a pyarrow data type to the corresponding DataHub schema type class.

    The checkers in `pyarrow_type_map` are tried in insertion order and the class
    of the first one that accepts `pyarrow_type` is returned.

    Parameters
    ----------
        pyarrow_type:
            pyarrow data type (an element of `schema.types`) to map

    Returns
    -------
        The mapped type class (not an instance), or `NullTypeClass` when no checker matches.
    """

    for checker, mapped_type in pyarrow_type_map.items():

        if checker(pyarrow_type):
            return mapped_type

    return NullTypeClass


class ParquetInferrer(SchemaInferenceBase):
    """Schema inferrer for Parquet files; the schema is taken from the file as provided."""

    def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
        """
        Build one `SchemaField` per column of a Parquet file.

        Parameters
        ----------
            file:
                binary file object positioned on a Parquet file

        Returns
        -------
            Fields in the order reported by the Parquet schema, each with the mapped
            DataHub type and the pyarrow type's string form as `nativeDataType`.
        """
        # infer schema of a parquet file without reading the whole file:
        # only the schema is requested from pyarrow, the row data is not loaded here
        schema = pyarrow.parquet.read_schema(file, memory_map=True)

        fields: List[SchemaField] = []

        for name, pyarrow_type in zip(schema.names, schema.types):
            mapped_type = map_pyarrow_type(pyarrow_type)

            field = SchemaField(
                fieldPath=name,
                # the map stores classes, so instantiate the one that was selected
                type=SchemaFieldDataType(mapped_type()),
                nativeDataType=str(pyarrow_type),
                recursive=False,
            )

            fields.append(field)

        return fields
+ "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "3", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Br \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Ca \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Cl \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Cond (\u00b5S/cm)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "DO (mg/L)", + "jsonPath": null, + "nullable": false, + "description": 
null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "DOC [mg/L C]", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "F \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "K \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Lat (\u00b0N)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Long (\u00b0W)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Mg \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + 
"type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "NH3-N \n(mg N/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "NO3-N+NO2-N \n(mg N/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Na \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "PO4-P \n(mg P/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Park ID", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "SO4-S \n(mg/L)", + "jsonPath": null, + "nullable": false, + "description": 
null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "SUVA, 254nm", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, { "fieldPath": "Sampling Date", "jsonPath": null, @@ -49,7 +389,498 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Secchi Depth (m)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Site ID", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "TDN \n(mg N/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "TDP \n(mg P/L)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { 
+ "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "UV Absorbance, 254nm", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "Water Temp (\u00b0C)", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "d18O", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "dD", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "field29", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "pH", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + } + ], + "primaryKeys": null, + "foreignKeysSpecs": null, + "foreignKeys": null + } + } + ] + } + }, + "proposedDelta": null, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "data-lake-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:data-lake-test,cases_2.NPS,PROD)", + "entityKeyAspect": null, + "changeType": "UPSERT", + "aspectName": "datasetProfile", + "aspect": { + "value": "{\"timestampMillis\": 1615443388097, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 33, \"columnCount\": 14, \"fieldProfiles\": [{\"fieldPath\": \"Sampling Date\", \"uniqueCount\": 12, \"uniqueProportion\": 0.36363636363636365, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"(mg N/L)\\\"\", \"frequency\": 3}, {\"value\": \"(mg P/L)\\\"\", \"frequency\": 2}, {\"value\": \"(mg/L)\\\"\", \"frequency\": 8}, {\"value\": \"6/21/2013\", \"frequency\": 3}, {\"value\": \"6/22/2013\", \"frequency\": 2}, {\"value\": \"6/23/2013\", \"frequency\": 1}, {\"value\": \"6/26/2014\", \"frequency\": 4}, {\"value\": \"6/27/2013\", \"frequency\": 1}, {\"value\": \"8/6/2014\", \"frequency\": 1}, {\"value\": \"8/7/2014\", \"frequency\": 3}, {\"value\": \"8/8/2014\", \"frequency\": 1}, {\"value\": \"9/16/2013\", \"frequency\": 4}], \"sampleValues\": [\"(mg/L)\\\"\", \"8/6/2014\", \"8/7/2014\", \"8/7/2014\", \"(mg/L)\\\"\", \"(mg N/L)\\\"\", \"(mg/L)\\\"\", \"9/16/2013\", \"6/22/2013\", \"(mg P/L)\\\"\", \"6/21/2013\", \"9/16/2013\", \"6/26/2014\", \"6/27/2013\", \"6/23/2013\", \"(mg N/L)\\\"\", \"6/26/2014\", \"9/16/2013\", \"(mg P/L)\\\"\", \"9/16/2013\"]}, 
{\"fieldPath\": \"Site ID\", \"uniqueCount\": 34, \"uniqueProportion\": 1.0303030303030303, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"Br \", \"frequency\": 1}, {\"value\": \"Ca \", \"frequency\": 1}, {\"value\": \"Cl \", \"frequency\": 1}, {\"value\": \"Desperation Lake\", \"frequency\": 1}, {\"value\": \"Devil Mountain Lake\", \"frequency\": 1}, {\"value\": \"F \", \"frequency\": 1}, {\"value\": \"Feniak Lake\", \"frequency\": 1}, {\"value\": \"Imuruk Lake\", \"frequency\": 1}, {\"value\": \"Iniakuk Lake\", \"frequency\": 1}, {\"value\": \"K \", \"frequency\": 1}, {\"value\": \"Kurupa Lake\", \"frequency\": 1}, {\"value\": \"Kuzitrin Lake\", \"frequency\": 1}, {\"value\": \"Lake Kangilipak\", \"frequency\": 1}, {\"value\": \"Lake Matcharak\", \"frequency\": 1}, {\"value\": \"Lake Narvakrak\", \"frequency\": 1}, {\"value\": \"Lake Selby\", \"frequency\": 1}, {\"value\": \"Lava Lake\", \"frequency\": 1}, {\"value\": \"Mg \", \"frequency\": 1}, {\"value\": \"NO3-N+NO2-N \", \"frequency\": 1}, {\"value\": \"Na \", \"frequency\": 1}, {\"value\": \"North Killeak Lake\", \"frequency\": 1}, {\"value\": \"Nutavukti Lake\", \"frequency\": 1}, {\"value\": \"Okoklik Lake\", \"frequency\": 1}, {\"value\": \"PO4-P \", \"frequency\": 1}, {\"value\": \"SO4-S \", \"frequency\": 1}, {\"value\": \"Summit Lake\", \"frequency\": 1}, {\"value\": \"TDN \", \"frequency\": 1}, {\"value\": \"TDP \", \"frequency\": 1}, {\"value\": \"Takahula Lake\", \"frequency\": 1}, {\"value\": \"Walker Lake\", \"frequency\": 1}, {\"value\": \"White Fish Lake\", \"frequency\": 1}, {\"value\": \"Wild Lake\", \"frequency\": 1}, {\"value\": \"d18O\", \"frequency\": 1}], \"sampleValues\": [\"Br \", \"Kurupa Lake\", \"Lake Selby\", \"Nutavukti Lake\", \"Ca \", \"NO3-N+NO2-N \", \"SO4-S \", \"Lake Kangilipak\", \"Devil Mountain Lake\", \"TDN \", \"Imuruk Lake\", \"Feniak Lake\", \"Iniakuk Lake\", \"Lake Narvakrak\", \"North Killeak Lake\", \"TDP \", \"Lake 
Matcharak\", \"Okoklik Lake\", \"Cl \", \"Desperation Lake\"]}, {\"fieldPath\": \"Park ID\", \"uniqueCount\": 4, \"uniqueProportion\": 0.19047619047619047, \"nullCount\": 12, \"nullProportion\": 0.36363636363636365, \"distinctValueFrequencies\": [{\"value\": \"BELA\", \"frequency\": 6}, {\"value\": \"GAAR\", \"frequency\": 9}, {\"value\": \"NOAT\", \"frequency\": 5}, {\"value\": \"NullValue\", \"frequency\": 12}, {\"value\": \"dD\", \"frequency\": 1}], \"sampleValues\": [\"None\", \"GAAR\", \"GAAR\", \"GAAR\", \"None\", \"None\", \"None\", \"NOAT\", \"BELA\", \"None\", \"BELA\", \"NOAT\", \"GAAR\", \"NOAT\", \"BELA\", \"None\", \"GAAR\", \"NOAT\", \"None\", \"NOAT\"]}, {\"fieldPath\": \"Lat (\\ufffdN)\", \"uniqueCount\": 19, \"uniqueProportion\": 0.95, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"65.3845\", \"frequency\": 1}, {\"value\": \"65.58905\", \"frequency\": 1}, {\"value\": \"65.58996\", \"frequency\": 1}, {\"value\": \"66.32813\", \"frequency\": 1}, {\"value\": \"66.37863\", \"frequency\": 1}, {\"value\": \"66.40362\", \"frequency\": 1}, {\"value\": \"66.89298\", \"frequency\": 1}, {\"value\": \"67.01337\", \"frequency\": 1}, {\"value\": \"67.06375\", \"frequency\": 1}, {\"value\": \"67.1257\", \"frequency\": 1}, {\"value\": \"67.35014\", \"frequency\": 1}, {\"value\": \"67.50282\", \"frequency\": 1}, {\"value\": \"67.74715\", \"frequency\": 1}, {\"value\": \"67.9999\", \"frequency\": 1}, {\"value\": \"68.00064\", \"frequency\": 1}, {\"value\": \"68.01392\", \"frequency\": 1}, {\"value\": \"68.07008\", \"frequency\": 1}, {\"value\": \"68.24775\", \"frequency\": 1}, {\"value\": \"68.33031\", \"frequency\": 1}, {\"value\": \"68.35879\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"68.35879\", \"66.89298\", \"67.01337\", \"None\", \"None\", \"None\", \"68.00064\", \"66.40362\", \"None\", \"65.58905\", \"68.24775\", \"67.1257\", \"67.9999\", 
\"66.32813\", \"None\", \"67.74715\", \"68.01392\", \"None\", \"68.33031\"]}, {\"fieldPath\": \"Long (\\ufffdW)\", \"uniqueCount\": 20, \"uniqueProportion\": 1.0, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-158.31337\", \"frequency\": 1}, {\"value\": \"-158.74584\", \"frequency\": 1}, {\"value\": \"-159.15975\", \"frequency\": 1}, {\"value\": \"-159.27007\", \"frequency\": 1}, {\"value\": \"-161.72104\", \"frequency\": 1}, {\"value\": \"-163.18683\", \"frequency\": 1}, {\"value\": \"-163.21787\", \"frequency\": 1}, {\"value\": \"-163.90928\", \"frequency\": 1}, {\"value\": \"-164.10563\", \"frequency\": 1}, {\"value\": \"-164.48818\", \"frequency\": 1}, {\"value\": \"-164.7452\", \"frequency\": 1}, {\"value\": \"150.47092\", \"frequency\": 1}, {\"value\": \"151.57256\", \"frequency\": 1}, {\"value\": \"153.20834\", \"frequency\": 1}, {\"value\": \"153.66048\", \"frequency\": 1}, {\"value\": \"154.34215\", \"frequency\": 1}, {\"value\": \"154.60695\", \"frequency\": 1}, {\"value\": \"154.73245\", \"frequency\": 1}, {\"value\": \"155.65584\", \"frequency\": 1}, {\"value\": \"156.21262\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"154.60695\", \"155.65584\", \"154.73245\", \"None\", \"None\", \"None\", \"-159.15975\", \"-164.48818\", \"None\", \"-163.18683\", \"-158.31337\", \"153.20834\", \"-161.72104\", \"-164.10563\", \"None\", \"156.21262\", \"-159.27007\", \"None\", \"-158.74584\"]}, {\"fieldPath\": \"Water Temp (\\ufffdC)\", \"uniqueCount\": 16, \"uniqueProportion\": 0.8, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 4}, {\"value\": \"11.34\", \"frequency\": 1}, {\"value\": \"11.9\", \"frequency\": 1}, {\"value\": \"12.05\", \"frequency\": 1}, {\"value\": \"15.1\", \"frequency\": 1}, {\"value\": \"15.3\", \"frequency\": 1}, {\"value\": \"17.38\", \"frequency\": 1}, 
{\"value\": \"17.6\", \"frequency\": 1}, {\"value\": \"18.3\", \"frequency\": 1}, {\"value\": \"2.95\", \"frequency\": 1}, {\"value\": \"20.18\", \"frequency\": 1}, {\"value\": \"4.51\", \"frequency\": 1}, {\"value\": \"5.36\", \"frequency\": 1}, {\"value\": \"6.46\", \"frequency\": 2}, {\"value\": \"8.06\", \"frequency\": 1}, {\"value\": \"9.3\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"9.3\", \"15.1\", \"17.6\", \"None\", \"None\", \"None\", \"5.36\", \"6.46\", \"None\", \"17.38\", \"4.51\", \"-\", \"18.3\", \"11.34\", \"None\", \"-\", \"6.46\", \"None\", \"2.95\"]}, {\"fieldPath\": \"Cond (\\ufffdS/cm)\", \"uniqueCount\": 16, \"uniqueProportion\": 0.8, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 4}, {\"value\": \"107\", \"frequency\": 1}, {\"value\": \"108\", \"frequency\": 1}, {\"value\": \"129.2\", \"frequency\": 1}, {\"value\": \"132.4\", \"frequency\": 1}, {\"value\": \"1351\", \"frequency\": 1}, {\"value\": \"15\", \"frequency\": 1}, {\"value\": \"163.1\", \"frequency\": 1}, {\"value\": \"227.3\", \"frequency\": 1}, {\"value\": \"26\", \"frequency\": 1}, {\"value\": \"452\", \"frequency\": 1}, {\"value\": \"61\", \"frequency\": 2}, {\"value\": \"75\", \"frequency\": 1}, {\"value\": \"75.7\", \"frequency\": 1}, {\"value\": \"83\", \"frequency\": 1}, {\"value\": \"92\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"227.3\", \"163.1\", \"132.4\", \"None\", \"None\", \"None\", \"61\", \"107\", \"None\", \"26\", \"108\", \"-\", \"83\", \"1351\", \"None\", \"-\", \"75\", \"None\", \"61\"]}, {\"fieldPath\": \"pH\", \"uniqueCount\": 8, \"uniqueProportion\": 0.4, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 13}, {\"value\": \"6.44\", \"frequency\": 1}, {\"value\": \"7.31\", \"frequency\": 1}, 
{\"value\": \"7.42\", \"frequency\": 1}, {\"value\": \"7.45\", \"frequency\": 1}, {\"value\": \"7.69\", \"frequency\": 1}, {\"value\": \"7.82\", \"frequency\": 1}, {\"value\": \"8.04\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"-\", \"-\", \"-\", \"None\", \"None\", \"None\", \"-\", \"7.69\", \"None\", \"6.44\", \"-\", \"-\", \"7.31\", \"8.04\", \"None\", \"-\", \"-\", \"None\", \"-\"]}, {\"fieldPath\": \"DO (mg/L)\", \"uniqueCount\": 6, \"uniqueProportion\": 0.3157894736842105, \"nullCount\": 14, \"nullProportion\": 0.42424242424242425, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 14}, {\"value\": \"7.6\", \"frequency\": 1}, {\"value\": \"7.9\", \"frequency\": 1}, {\"value\": \"8.5\", \"frequency\": 1}, {\"value\": \"8.9\", \"frequency\": 1}, {\"value\": \"9.3\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 14}], \"sampleValues\": [\"None\", \"8.9\", \"8.5\", \"7.6\", \"None\", \"None\", \"None\", \"-\", \"-\", \"None\", \"-\", \"-\", \"-\", \"-\", \"-\", \"None\", \"None\", \"-\", \"None\", \"-\"]}, {\"fieldPath\": \"Secchi Depth (m)\", \"uniqueCount\": 4, \"uniqueProportion\": 0.2, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 16}, {\"value\": \"10\", \"frequency\": 2}, {\"value\": \"4.1\", \"frequency\": 1}, {\"value\": \"7\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"-\", \"-\", \"-\", \"None\", \"None\", \"None\", \"-\", \"-\", \"None\", \"-\", \"-\", \"7\", \"-\", \"-\", \"None\", \"10\", \"-\", \"None\", \"-\"]}, {\"fieldPath\": \"UV Absorbance, 254nm\", \"uniqueCount\": 20, \"uniqueProportion\": 1.0, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"0.02\", \"frequency\": 1}, {\"value\": \"0.044\", \"frequency\": 1}, {\"value\": \"0.048\", \"frequency\": 1}, {\"value\": \"0.059\", 
\"frequency\": 1}, {\"value\": \"0.06\", \"frequency\": 1}, {\"value\": \"0.062\", \"frequency\": 1}, {\"value\": \"0.063\", \"frequency\": 1}, {\"value\": \"0.076\", \"frequency\": 1}, {\"value\": \"0.091\", \"frequency\": 1}, {\"value\": \"0.095\", \"frequency\": 1}, {\"value\": \"0.104\", \"frequency\": 1}, {\"value\": \"0.107\", \"frequency\": 1}, {\"value\": \"0.117\", \"frequency\": 1}, {\"value\": \"0.151\", \"frequency\": 1}, {\"value\": \"0.154\", \"frequency\": 1}, {\"value\": \"0.186\", \"frequency\": 1}, {\"value\": \"0.191\", \"frequency\": 1}, {\"value\": \"0.212\", \"frequency\": 1}, {\"value\": \"0.223\", \"frequency\": 1}, {\"value\": \"0.436\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"0.044\", \"0.107\", \"0.117\", \"None\", \"None\", \"None\", \"0.212\", \"0.091\", \"None\", \"0.151\", \"0.048\", \"0.095\", \"0.154\", \"0.104\", \"None\", \"0.076\", \"0.191\", \"None\", \"0.062\"]}, {\"fieldPath\": \"DOC [mg/L C]\", \"uniqueCount\": 19, \"uniqueProportion\": 0.95, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"1.3\", \"frequency\": 1}, {\"value\": \"1.8\", \"frequency\": 1}, {\"value\": \"12.3\", \"frequency\": 1}, {\"value\": \"2.0\", \"frequency\": 1}, {\"value\": \"2.1\", \"frequency\": 2}, {\"value\": \"2.4\", \"frequency\": 1}, {\"value\": \"2.7\", \"frequency\": 1}, {\"value\": \"3.3\", \"frequency\": 1}, {\"value\": \"3.4\", \"frequency\": 1}, {\"value\": \"4.2\", \"frequency\": 1}, {\"value\": \"4.3\", \"frequency\": 1}, {\"value\": \"4.5\", \"frequency\": 1}, {\"value\": \"4.7\", \"frequency\": 1}, {\"value\": \"5.1\", \"frequency\": 1}, {\"value\": \"5.8\", \"frequency\": 1}, {\"value\": \"6.5\", \"frequency\": 1}, {\"value\": \"7.8\", \"frequency\": 1}, {\"value\": \"8.3\", \"frequency\": 1}, {\"value\": \"8.5\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"2.1\", 
\"4.2\", \"4.5\", \"None\", \"None\", \"None\", \"8.5\", \"3.4\", \"None\", \"4.7\", \"1.8\", \"3.3\", \"5.8\", \"4.3\", \"None\", \"5.1\", \"7.8\", \"None\", \"2.1\"]}, {\"fieldPath\": \"SUVA, 254nm\", \"uniqueCount\": 12, \"uniqueProportion\": 0.6, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"1.5\", \"frequency\": 2}, {\"value\": \"2.1\", \"frequency\": 1}, {\"value\": \"2.2\", \"frequency\": 2}, {\"value\": \"2.4\", \"frequency\": 2}, {\"value\": \"2.5\", \"frequency\": 2}, {\"value\": \"2.6\", \"frequency\": 3}, {\"value\": \"2.7\", \"frequency\": 2}, {\"value\": \"2.9\", \"frequency\": 2}, {\"value\": \"3.1\", \"frequency\": 1}, {\"value\": \"3.2\", \"frequency\": 1}, {\"value\": \"3.4\", \"frequency\": 1}, {\"value\": \"3.5\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"2.1\", \"2.5\", \"2.6\", \"None\", \"None\", \"None\", \"2.5\", \"2.6\", \"None\", \"3.2\", \"2.7\", \"2.9\", \"2.7\", \"2.4\", \"None\", \"1.5\", \"2.4\", \"None\", \"2.9\"]}, {\"fieldPath\": \"NH3-N \", \"uniqueCount\": 8, \"uniqueProportion\": 0.4, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"*0.001\", \"frequency\": 7}, {\"value\": \"*0.002\", \"frequency\": 4}, {\"value\": \"*0.003\", \"frequency\": 1}, {\"value\": \"*0.004\", \"frequency\": 2}, {\"value\": \"*0.007\", \"frequency\": 2}, {\"value\": \"*0.009\", \"frequency\": 1}, {\"value\": \"0.019\", \"frequency\": 1}, {\"value\": \"0.135\", \"frequency\": 2}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"*0.001\", \"*0.002\", \"*0.003\", \"None\", \"None\", \"None\", \"*0.007\", \"*0.001\", \"None\", \"0.135\", \"*0.001\", \"*0.001\", \"*0.009\", \"*0.002\", \"None\", \"*0.001\", \"*0.004\", \"None\", \"*0.001\"]}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": 
"data-lake-test", + "registryName": null, + "registryVersion": null, + "properties": null + } +}, +{ + "auditHeader": null, + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:data-lake-test,cases_2.pokemon_abilities_json,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": {}, + "externalUrl": null, + "description": "", + "uri": null, + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "pokemon_abilities_json.json", + "platform": "urn:li:dataPlatform:data-lake-test", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown", + "impersonator": null + }, + "deleted": null, + "dataset": null, + "cluster": null, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "effect_changes", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } + } + }, + "nativeDataType": "list", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.effect_entries", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } + } + }, + "nativeDataType": "list", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.effect_entries.effect", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + 
"nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.effect_entries.language", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "dict", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.effect_entries.language.name", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.effect_entries.language.url", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.version_group", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "dict", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.version_group.name", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_changes.version_group.url", + "jsonPath": null, + 
"nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_entries", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } + } + }, + "nativeDataType": "list", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_entries.effect", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_entries.language", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "dict", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_entries.language.name", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "effect_entries.language.url", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -57,7 +888,7 @@ "jsonProps": null }, { - "fieldPath": 
"Site ID", + "fieldPath": "effect_entries.short_effect", "jsonPath": null, "nullable": false, "description": null, @@ -66,7 +897,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -74,16 +905,18 @@ "jsonProps": null }, { - "fieldPath": "Park ID", + "fieldPath": "flavor_text_entries", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } } }, - "nativeDataType": "StringType", + "nativeDataType": "list", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -91,16 +924,16 @@ "jsonProps": null }, { - "fieldPath": "Lat (\ufffdN)", + "fieldPath": "flavor_text_entries.flavor_text", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "DoubleType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -108,16 +941,16 @@ "jsonProps": null }, { - "fieldPath": "Long (\ufffdW)", + "fieldPath": "flavor_text_entries.language", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.RecordType": {} } }, - "nativeDataType": "DoubleType", + "nativeDataType": "dict", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -125,7 +958,7 @@ "jsonProps": null }, { - "fieldPath": "Water Temp (\ufffdC)", + "fieldPath": "flavor_text_entries.language.name", "jsonPath": null, "nullable": false, "description": null, @@ -134,7 +967,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, 
"glossaryTerms": null, @@ -142,7 +975,7 @@ "jsonProps": null }, { - "fieldPath": "Cond (\ufffdS/cm)", + "fieldPath": "flavor_text_entries.language.url", "jsonPath": null, "nullable": false, "description": null, @@ -151,7 +984,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -159,7 +992,24 @@ "jsonProps": null }, { - "fieldPath": "pH", + "fieldPath": "flavor_text_entries.version_group", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "dict", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "flavor_text_entries.version_group.name", "jsonPath": null, "nullable": false, "description": null, @@ -168,7 +1018,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -176,7 +1026,7 @@ "jsonProps": null }, { - "fieldPath": "DO (mg/L)", + "fieldPath": "flavor_text_entries.version_group.url", "jsonPath": null, "nullable": false, "description": null, @@ -185,7 +1035,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -193,7 +1043,24 @@ "jsonProps": null }, { - "fieldPath": "Secchi Depth (m)", + "fieldPath": "generation", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "dict", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "generation.name", "jsonPath": null, "nullable": 
false, "description": null, @@ -202,7 +1069,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -210,16 +1077,16 @@ "jsonProps": null }, { - "fieldPath": "UV Absorbance, 254nm", + "fieldPath": "generation.url", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "DoubleType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -227,7 +1094,7 @@ "jsonProps": null }, { - "fieldPath": "DOC [mg/L C]", + "fieldPath": "id", "jsonPath": null, "nullable": false, "description": null, @@ -236,7 +1103,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DoubleType", + "nativeDataType": "int", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -244,16 +1111,16 @@ "jsonProps": null }, { - "fieldPath": "SUVA, 254nm", + "fieldPath": "is_main_series", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.BooleanType": {} } }, - "nativeDataType": "DoubleType", + "nativeDataType": "bool", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -261,7 +1128,7 @@ "jsonProps": null }, { - "fieldPath": "NH3-N ", + "fieldPath": "name", "jsonPath": null, "nullable": false, "description": null, @@ -270,101 +1137,26 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, "isPartOfKey": false, "jsonProps": null - } - ], - "primaryKeys": null, - "foreignKeysSpecs": null, - "foreignKeys": null - } - } - ] - } - }, - "proposedDelta": null, - "systemMetadata": { - "lastObserved": 
1615443388097, - "runId": "data-lake-test", - "registryName": null, - "registryVersion": null, - "properties": null - } -}, -{ - "auditHeader": null, - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:data-lake-test,cases_2.NPS,PROD)", - "entityKeyAspect": null, - "changeType": "UPSERT", - "aspectName": "datasetProfile", - "aspect": { - "value": "{\"timestampMillis\": 1615443388097, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 33, \"columnCount\": 14, \"fieldProfiles\": [{\"fieldPath\": \"Sampling Date\", \"uniqueCount\": 12, \"uniqueProportion\": 0.36363636363636365, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"(mg N/L)\\\"\", \"frequency\": 3}, {\"value\": \"(mg P/L)\\\"\", \"frequency\": 2}, {\"value\": \"(mg/L)\\\"\", \"frequency\": 8}, {\"value\": \"6/21/2013\", \"frequency\": 3}, {\"value\": \"6/22/2013\", \"frequency\": 2}, {\"value\": \"6/23/2013\", \"frequency\": 1}, {\"value\": \"6/26/2014\", \"frequency\": 4}, {\"value\": \"6/27/2013\", \"frequency\": 1}, {\"value\": \"8/6/2014\", \"frequency\": 1}, {\"value\": \"8/7/2014\", \"frequency\": 3}, {\"value\": \"8/8/2014\", \"frequency\": 1}, {\"value\": \"9/16/2013\", \"frequency\": 4}], \"sampleValues\": [\"(mg/L)\\\"\", \"8/6/2014\", \"8/7/2014\", \"8/7/2014\", \"(mg/L)\\\"\", \"(mg N/L)\\\"\", \"(mg/L)\\\"\", \"9/16/2013\", \"6/22/2013\", \"(mg P/L)\\\"\", \"6/21/2013\", \"9/16/2013\", \"6/26/2014\", \"6/27/2013\", \"6/23/2013\", \"(mg N/L)\\\"\", \"6/26/2014\", \"9/16/2013\", \"(mg P/L)\\\"\", \"9/16/2013\"]}, {\"fieldPath\": \"Site ID\", \"uniqueCount\": 34, \"uniqueProportion\": 1.0303030303030303, \"nullCount\": 0, \"nullProportion\": 0.0, \"distinctValueFrequencies\": [{\"value\": \"Br \", \"frequency\": 1}, {\"value\": \"Ca \", \"frequency\": 1}, {\"value\": \"Cl \", \"frequency\": 1}, {\"value\": \"Desperation Lake\", \"frequency\": 1}, {\"value\": \"Devil Mountain Lake\", 
\"frequency\": 1}, {\"value\": \"F \", \"frequency\": 1}, {\"value\": \"Feniak Lake\", \"frequency\": 1}, {\"value\": \"Imuruk Lake\", \"frequency\": 1}, {\"value\": \"Iniakuk Lake\", \"frequency\": 1}, {\"value\": \"K \", \"frequency\": 1}, {\"value\": \"Kurupa Lake\", \"frequency\": 1}, {\"value\": \"Kuzitrin Lake\", \"frequency\": 1}, {\"value\": \"Lake Kangilipak\", \"frequency\": 1}, {\"value\": \"Lake Matcharak\", \"frequency\": 1}, {\"value\": \"Lake Narvakrak\", \"frequency\": 1}, {\"value\": \"Lake Selby\", \"frequency\": 1}, {\"value\": \"Lava Lake\", \"frequency\": 1}, {\"value\": \"Mg \", \"frequency\": 1}, {\"value\": \"NO3-N+NO2-N \", \"frequency\": 1}, {\"value\": \"Na \", \"frequency\": 1}, {\"value\": \"North Killeak Lake\", \"frequency\": 1}, {\"value\": \"Nutavukti Lake\", \"frequency\": 1}, {\"value\": \"Okoklik Lake\", \"frequency\": 1}, {\"value\": \"PO4-P \", \"frequency\": 1}, {\"value\": \"SO4-S \", \"frequency\": 1}, {\"value\": \"Summit Lake\", \"frequency\": 1}, {\"value\": \"TDN \", \"frequency\": 1}, {\"value\": \"TDP \", \"frequency\": 1}, {\"value\": \"Takahula Lake\", \"frequency\": 1}, {\"value\": \"Walker Lake\", \"frequency\": 1}, {\"value\": \"White Fish Lake\", \"frequency\": 1}, {\"value\": \"Wild Lake\", \"frequency\": 1}, {\"value\": \"d18O\", \"frequency\": 1}], \"sampleValues\": [\"Br \", \"Kurupa Lake\", \"Lake Selby\", \"Nutavukti Lake\", \"Ca \", \"NO3-N+NO2-N \", \"SO4-S \", \"Lake Kangilipak\", \"Devil Mountain Lake\", \"TDN \", \"Imuruk Lake\", \"Feniak Lake\", \"Iniakuk Lake\", \"Lake Narvakrak\", \"North Killeak Lake\", \"TDP \", \"Lake Matcharak\", \"Okoklik Lake\", \"Cl \", \"Desperation Lake\"]}, {\"fieldPath\": \"Park ID\", \"uniqueCount\": 4, \"uniqueProportion\": 0.19047619047619047, \"nullCount\": 12, \"nullProportion\": 0.36363636363636365, \"distinctValueFrequencies\": [{\"value\": \"BELA\", \"frequency\": 6}, {\"value\": \"GAAR\", \"frequency\": 9}, {\"value\": \"NOAT\", \"frequency\": 5}, {\"value\": 
\"NullValue\", \"frequency\": 12}, {\"value\": \"dD\", \"frequency\": 1}], \"sampleValues\": [\"None\", \"GAAR\", \"GAAR\", \"GAAR\", \"None\", \"None\", \"None\", \"NOAT\", \"BELA\", \"None\", \"BELA\", \"NOAT\", \"GAAR\", \"NOAT\", \"BELA\", \"None\", \"GAAR\", \"NOAT\", \"None\", \"NOAT\"]}, {\"fieldPath\": \"Lat (\\ufffdN)\", \"uniqueCount\": 19, \"uniqueProportion\": 0.95, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"65.3845\", \"frequency\": 1}, {\"value\": \"65.58905\", \"frequency\": 1}, {\"value\": \"65.58996\", \"frequency\": 1}, {\"value\": \"66.32813\", \"frequency\": 1}, {\"value\": \"66.37863\", \"frequency\": 1}, {\"value\": \"66.40362\", \"frequency\": 1}, {\"value\": \"66.89298\", \"frequency\": 1}, {\"value\": \"67.01337\", \"frequency\": 1}, {\"value\": \"67.06375\", \"frequency\": 1}, {\"value\": \"67.1257\", \"frequency\": 1}, {\"value\": \"67.35014\", \"frequency\": 1}, {\"value\": \"67.50282\", \"frequency\": 1}, {\"value\": \"67.74715\", \"frequency\": 1}, {\"value\": \"67.9999\", \"frequency\": 1}, {\"value\": \"68.00064\", \"frequency\": 1}, {\"value\": \"68.01392\", \"frequency\": 1}, {\"value\": \"68.07008\", \"frequency\": 1}, {\"value\": \"68.24775\", \"frequency\": 1}, {\"value\": \"68.33031\", \"frequency\": 1}, {\"value\": \"68.35879\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"68.35879\", \"66.89298\", \"67.01337\", \"None\", \"None\", \"None\", \"68.00064\", \"66.40362\", \"None\", \"65.58905\", \"68.24775\", \"67.1257\", \"67.9999\", \"66.32813\", \"None\", \"67.74715\", \"68.01392\", \"None\", \"68.33031\"]}, {\"fieldPath\": \"Long (\\ufffdW)\", \"uniqueCount\": 20, \"uniqueProportion\": 1.0, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-158.31337\", \"frequency\": 1}, {\"value\": \"-158.74584\", \"frequency\": 1}, {\"value\": \"-159.15975\", \"frequency\": 
1}, {\"value\": \"-159.27007\", \"frequency\": 1}, {\"value\": \"-161.72104\", \"frequency\": 1}, {\"value\": \"-163.18683\", \"frequency\": 1}, {\"value\": \"-163.21787\", \"frequency\": 1}, {\"value\": \"-163.90928\", \"frequency\": 1}, {\"value\": \"-164.10563\", \"frequency\": 1}, {\"value\": \"-164.48818\", \"frequency\": 1}, {\"value\": \"-164.7452\", \"frequency\": 1}, {\"value\": \"150.47092\", \"frequency\": 1}, {\"value\": \"151.57256\", \"frequency\": 1}, {\"value\": \"153.20834\", \"frequency\": 1}, {\"value\": \"153.66048\", \"frequency\": 1}, {\"value\": \"154.34215\", \"frequency\": 1}, {\"value\": \"154.60695\", \"frequency\": 1}, {\"value\": \"154.73245\", \"frequency\": 1}, {\"value\": \"155.65584\", \"frequency\": 1}, {\"value\": \"156.21262\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"154.60695\", \"155.65584\", \"154.73245\", \"None\", \"None\", \"None\", \"-159.15975\", \"-164.48818\", \"None\", \"-163.18683\", \"-158.31337\", \"153.20834\", \"-161.72104\", \"-164.10563\", \"None\", \"156.21262\", \"-159.27007\", \"None\", \"-158.74584\"]}, {\"fieldPath\": \"Water Temp (\\ufffdC)\", \"uniqueCount\": 16, \"uniqueProportion\": 0.8, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 4}, {\"value\": \"11.34\", \"frequency\": 1}, {\"value\": \"11.9\", \"frequency\": 1}, {\"value\": \"12.05\", \"frequency\": 1}, {\"value\": \"15.1\", \"frequency\": 1}, {\"value\": \"15.3\", \"frequency\": 1}, {\"value\": \"17.38\", \"frequency\": 1}, {\"value\": \"17.6\", \"frequency\": 1}, {\"value\": \"18.3\", \"frequency\": 1}, {\"value\": \"2.95\", \"frequency\": 1}, {\"value\": \"20.18\", \"frequency\": 1}, {\"value\": \"4.51\", \"frequency\": 1}, {\"value\": \"5.36\", \"frequency\": 1}, {\"value\": \"6.46\", \"frequency\": 2}, {\"value\": \"8.06\", \"frequency\": 1}, {\"value\": \"9.3\", \"frequency\": 1}, {\"value\": \"NullValue\", 
\"frequency\": 13}], \"sampleValues\": [\"None\", \"9.3\", \"15.1\", \"17.6\", \"None\", \"None\", \"None\", \"5.36\", \"6.46\", \"None\", \"17.38\", \"4.51\", \"-\", \"18.3\", \"11.34\", \"None\", \"-\", \"6.46\", \"None\", \"2.95\"]}, {\"fieldPath\": \"Cond (\\ufffdS/cm)\", \"uniqueCount\": 16, \"uniqueProportion\": 0.8, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 4}, {\"value\": \"107\", \"frequency\": 1}, {\"value\": \"108\", \"frequency\": 1}, {\"value\": \"129.2\", \"frequency\": 1}, {\"value\": \"132.4\", \"frequency\": 1}, {\"value\": \"1351\", \"frequency\": 1}, {\"value\": \"15\", \"frequency\": 1}, {\"value\": \"163.1\", \"frequency\": 1}, {\"value\": \"227.3\", \"frequency\": 1}, {\"value\": \"26\", \"frequency\": 1}, {\"value\": \"452\", \"frequency\": 1}, {\"value\": \"61\", \"frequency\": 2}, {\"value\": \"75\", \"frequency\": 1}, {\"value\": \"75.7\", \"frequency\": 1}, {\"value\": \"83\", \"frequency\": 1}, {\"value\": \"92\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"227.3\", \"163.1\", \"132.4\", \"None\", \"None\", \"None\", \"61\", \"107\", \"None\", \"26\", \"108\", \"-\", \"83\", \"1351\", \"None\", \"-\", \"75\", \"None\", \"61\"]}, {\"fieldPath\": \"pH\", \"uniqueCount\": 8, \"uniqueProportion\": 0.4, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 13}, {\"value\": \"6.44\", \"frequency\": 1}, {\"value\": \"7.31\", \"frequency\": 1}, {\"value\": \"7.42\", \"frequency\": 1}, {\"value\": \"7.45\", \"frequency\": 1}, {\"value\": \"7.69\", \"frequency\": 1}, {\"value\": \"7.82\", \"frequency\": 1}, {\"value\": \"8.04\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"-\", \"-\", \"-\", \"None\", \"None\", \"None\", \"-\", \"7.69\", \"None\", \"6.44\", \"-\", \"-\", \"7.31\", \"8.04\", 
\"None\", \"-\", \"-\", \"None\", \"-\"]}, {\"fieldPath\": \"DO (mg/L)\", \"uniqueCount\": 6, \"uniqueProportion\": 0.3157894736842105, \"nullCount\": 14, \"nullProportion\": 0.42424242424242425, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 14}, {\"value\": \"7.6\", \"frequency\": 1}, {\"value\": \"7.9\", \"frequency\": 1}, {\"value\": \"8.5\", \"frequency\": 1}, {\"value\": \"8.9\", \"frequency\": 1}, {\"value\": \"9.3\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 14}], \"sampleValues\": [\"None\", \"8.9\", \"8.5\", \"7.6\", \"None\", \"None\", \"None\", \"-\", \"-\", \"None\", \"-\", \"-\", \"-\", \"-\", \"-\", \"None\", \"None\", \"-\", \"None\", \"-\"]}, {\"fieldPath\": \"Secchi Depth (m)\", \"uniqueCount\": 4, \"uniqueProportion\": 0.2, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"-\", \"frequency\": 16}, {\"value\": \"10\", \"frequency\": 2}, {\"value\": \"4.1\", \"frequency\": 1}, {\"value\": \"7\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"-\", \"-\", \"-\", \"None\", \"None\", \"None\", \"-\", \"-\", \"None\", \"-\", \"-\", \"7\", \"-\", \"-\", \"None\", \"10\", \"-\", \"None\", \"-\"]}, {\"fieldPath\": \"UV Absorbance, 254nm\", \"uniqueCount\": 20, \"uniqueProportion\": 1.0, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"0.02\", \"frequency\": 1}, {\"value\": \"0.044\", \"frequency\": 1}, {\"value\": \"0.048\", \"frequency\": 1}, {\"value\": \"0.059\", \"frequency\": 1}, {\"value\": \"0.06\", \"frequency\": 1}, {\"value\": \"0.062\", \"frequency\": 1}, {\"value\": \"0.063\", \"frequency\": 1}, {\"value\": \"0.076\", \"frequency\": 1}, {\"value\": \"0.091\", \"frequency\": 1}, {\"value\": \"0.095\", \"frequency\": 1}, {\"value\": \"0.104\", \"frequency\": 1}, {\"value\": \"0.107\", \"frequency\": 1}, {\"value\": \"0.117\", \"frequency\": 1}, 
{\"value\": \"0.151\", \"frequency\": 1}, {\"value\": \"0.154\", \"frequency\": 1}, {\"value\": \"0.186\", \"frequency\": 1}, {\"value\": \"0.191\", \"frequency\": 1}, {\"value\": \"0.212\", \"frequency\": 1}, {\"value\": \"0.223\", \"frequency\": 1}, {\"value\": \"0.436\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"0.044\", \"0.107\", \"0.117\", \"None\", \"None\", \"None\", \"0.212\", \"0.091\", \"None\", \"0.151\", \"0.048\", \"0.095\", \"0.154\", \"0.104\", \"None\", \"0.076\", \"0.191\", \"None\", \"0.062\"]}, {\"fieldPath\": \"DOC [mg/L C]\", \"uniqueCount\": 19, \"uniqueProportion\": 0.95, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"1.3\", \"frequency\": 1}, {\"value\": \"1.8\", \"frequency\": 1}, {\"value\": \"12.3\", \"frequency\": 1}, {\"value\": \"2.0\", \"frequency\": 1}, {\"value\": \"2.1\", \"frequency\": 2}, {\"value\": \"2.4\", \"frequency\": 1}, {\"value\": \"2.7\", \"frequency\": 1}, {\"value\": \"3.3\", \"frequency\": 1}, {\"value\": \"3.4\", \"frequency\": 1}, {\"value\": \"4.2\", \"frequency\": 1}, {\"value\": \"4.3\", \"frequency\": 1}, {\"value\": \"4.5\", \"frequency\": 1}, {\"value\": \"4.7\", \"frequency\": 1}, {\"value\": \"5.1\", \"frequency\": 1}, {\"value\": \"5.8\", \"frequency\": 1}, {\"value\": \"6.5\", \"frequency\": 1}, {\"value\": \"7.8\", \"frequency\": 1}, {\"value\": \"8.3\", \"frequency\": 1}, {\"value\": \"8.5\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"2.1\", \"4.2\", \"4.5\", \"None\", \"None\", \"None\", \"8.5\", \"3.4\", \"None\", \"4.7\", \"1.8\", \"3.3\", \"5.8\", \"4.3\", \"None\", \"5.1\", \"7.8\", \"None\", \"2.1\"]}, {\"fieldPath\": \"SUVA, 254nm\", \"uniqueCount\": 12, \"uniqueProportion\": 0.6, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"1.5\", \"frequency\": 2}, {\"value\": \"2.1\", 
\"frequency\": 1}, {\"value\": \"2.2\", \"frequency\": 2}, {\"value\": \"2.4\", \"frequency\": 2}, {\"value\": \"2.5\", \"frequency\": 2}, {\"value\": \"2.6\", \"frequency\": 3}, {\"value\": \"2.7\", \"frequency\": 2}, {\"value\": \"2.9\", \"frequency\": 2}, {\"value\": \"3.1\", \"frequency\": 1}, {\"value\": \"3.2\", \"frequency\": 1}, {\"value\": \"3.4\", \"frequency\": 1}, {\"value\": \"3.5\", \"frequency\": 1}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"2.1\", \"2.5\", \"2.6\", \"None\", \"None\", \"None\", \"2.5\", \"2.6\", \"None\", \"3.2\", \"2.7\", \"2.9\", \"2.7\", \"2.4\", \"None\", \"1.5\", \"2.4\", \"None\", \"2.9\"]}, {\"fieldPath\": \"NH3-N \", \"uniqueCount\": 8, \"uniqueProportion\": 0.4, \"nullCount\": 13, \"nullProportion\": 0.3939393939393939, \"distinctValueFrequencies\": [{\"value\": \"*0.001\", \"frequency\": 7}, {\"value\": \"*0.002\", \"frequency\": 4}, {\"value\": \"*0.003\", \"frequency\": 1}, {\"value\": \"*0.004\", \"frequency\": 2}, {\"value\": \"*0.007\", \"frequency\": 2}, {\"value\": \"*0.009\", \"frequency\": 1}, {\"value\": \"0.019\", \"frequency\": 1}, {\"value\": \"0.135\", \"frequency\": 2}, {\"value\": \"NullValue\", \"frequency\": 13}], \"sampleValues\": [\"None\", \"*0.001\", \"*0.002\", \"*0.003\", \"None\", \"None\", \"None\", \"*0.007\", \"*0.001\", \"None\", \"0.135\", \"*0.001\", \"*0.001\", \"*0.009\", \"*0.002\", \"None\", \"*0.001\", \"*0.004\", \"None\", \"*0.001\"]}]}", - "contentType": "application/json" - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "data-lake-test", - "registryName": null, - "registryVersion": null, - "properties": null - } -}, -{ - "auditHeader": null, - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:data-lake-test,cases_2.pokemon_abilities_json,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - 
"customProperties": {}, - "externalUrl": null, - "description": "", - "uri": null, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "pokemon_abilities_json.json", - "platform": "urn:li:dataPlatform:data-lake-test", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown", - "impersonator": null - }, - "deleted": null, - "dataset": null, - "cluster": null, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.OtherSchema": { - "rawSchema": "" - } - }, - "fields": [ + }, { - "fieldPath": "effect_changes", + "fieldPath": "names", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } } }, - "nativeDataType": "ArrayType(StructType(List(StructField(effect_entries,ArrayType(StructType(List(StructField(effect,StringType,true),StructField(language,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true))),true),true),StructField(version_group,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true))),true)", + "nativeDataType": "list", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -372,16 +1164,16 @@ "jsonProps": null }, { - "fieldPath": "effect_entries", + "fieldPath": "names.language", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.RecordType": {} } }, - "nativeDataType": "ArrayType(StructType(List(StructField(effect,StringType,true),StructField(language,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true),StructField(short_effect,StringType,true))),true)", + "nativeDataType": "dict", "recursive": 
false, "globalTags": null, "glossaryTerms": null, @@ -389,16 +1181,16 @@ "jsonProps": null }, { - "fieldPath": "flavor_text_entries", + "fieldPath": "names.language.name", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "ArrayType(StructType(List(StructField(flavor_text,StringType,true),StructField(language,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true),StructField(version_group,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true))),true)", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -406,16 +1198,16 @@ "jsonProps": null }, { - "fieldPath": "generation", + "fieldPath": "names.language.url", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StructType(List(StructField(name,StringType,true),StructField(url,StringType,true)))", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -423,16 +1215,16 @@ "jsonProps": null }, { - "fieldPath": "id", + "fieldPath": "names.name", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -440,7 +1232,26 @@ "jsonProps": null }, { - "fieldPath": "is_main_series", + "fieldPath": "pokemon", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } + } + }, + "nativeDataType": "list", + "recursive": 
false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "pokemon.is_hidden", "jsonPath": null, "nullable": false, "description": null, @@ -449,7 +1260,7 @@ "com.linkedin.pegasus2avro.schema.BooleanType": {} } }, - "nativeDataType": "BooleanType", + "nativeDataType": "bool", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -457,7 +1268,24 @@ "jsonProps": null }, { - "fieldPath": "name", + "fieldPath": "pokemon.pokemon", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "dict", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "pokemon.pokemon.name", "jsonPath": null, "nullable": false, "description": null, @@ -466,7 +1294,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -474,16 +1302,16 @@ "jsonProps": null }, { - "fieldPath": "names", + "fieldPath": "pokemon.pokemon.url", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "ArrayType(StructType(List(StructField(language,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true),StructField(name,StringType,true))),true)", + "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -491,16 +1319,16 @@ "jsonProps": null }, { - "fieldPath": "pokemon", + "fieldPath": "pokemon.slot", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": 
"ArrayType(StructType(List(StructField(is_hidden,BooleanType,true),StructField(pokemon,StructType(List(StructField(name,StringType,true),StructField(url,StringType,true))),true),StructField(slot,LongType,true))),true)", + "nativeDataType": "int", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -594,7 +1422,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -611,7 +1439,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -628,7 +1456,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -645,7 +1473,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -662,7 +1490,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -756,7 +1584,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -773,7 +1601,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -790,7 +1618,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -807,7 +1635,7 @@ 
"com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -824,7 +1652,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -841,7 +1669,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -858,7 +1686,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -875,7 +1703,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -892,7 +1720,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -909,7 +1737,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -926,7 +1754,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -943,7 +1771,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -960,7 +1788,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, 
"glossaryTerms": null, @@ -977,7 +1805,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -994,7 +1822,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1011,7 +1839,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1028,7 +1856,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1045,7 +1873,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1062,7 +1890,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1079,7 +1907,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1096,7 +1924,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1113,7 +1941,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1130,7 +1958,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": 
"integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1147,7 +1975,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1164,7 +1992,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1181,7 +2009,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1198,7 +2026,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1215,7 +2043,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1232,7 +2060,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1249,7 +2077,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1266,7 +2094,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1351,16 +2179,16 @@ }, "fields": [ { - "fieldPath": "1st chord", + "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { 
"com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "double", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1368,16 +2196,16 @@ "jsonProps": null }, { - "fieldPath": "2nd chord", + "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "long", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1385,16 +2213,16 @@ "jsonProps": null }, { - "fieldPath": "3rd chord", + "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "long", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1402,16 +2230,16 @@ "jsonProps": null }, { - "fieldPath": "4th chord", + "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "long", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1419,16 +2247,16 @@ "jsonProps": null }, { - "fieldPath": "Progression Quality", + "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "DoubleType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1522,7 
+2350,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1539,7 +2367,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1556,7 +2384,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1573,7 +2401,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1590,7 +2418,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1681,10 +2509,46 @@ "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NullType": {} + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": null + } + } + }, + "nativeDataType": "list", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "countries.code", + "jsonPath": null, + "nullable": true, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "str", + "recursive": false, + "globalTags": null, + "glossaryTerms": null, + "isPartOfKey": false, + "jsonProps": null + }, + { + "fieldPath": "countries.name", + "jsonPath": null, + "nullable": false, + "description": null, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "ArrayType(StructType(List(StructField(code,StringType,true),StructField(name,StringType,true))),true)", 
+ "nativeDataType": "str", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1769,16 +2633,16 @@ }, "fields": [ { - "fieldPath": "weight", + "fieldPath": "[version=2.0].[type=Root].[type=long].height", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "long", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1786,16 +2650,16 @@ "jsonProps": null }, { - "fieldPath": "height", + "fieldPath": "[version=2.0].[type=Root].[type=long].weight", "jsonPath": null, - "nullable": false, + "nullable": true, "description": null, "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "long", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1880,7 +2744,7 @@ }, "fields": [ { - "fieldPath": "name", + "fieldPath": "color", "jsonPath": null, "nullable": false, "description": null, @@ -1889,7 +2753,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1897,16 +2761,16 @@ "jsonProps": null }, { - "fieldPath": "weight", + "fieldPath": "healthy", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.BooleanType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "boolean", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1923,7 +2787,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "IntegerType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1931,7 +2795,7 @@ "jsonProps": null }, { - "fieldPath": "color", + "fieldPath": "name", "jsonPath": null, 
"nullable": false, "description": null, @@ -1940,7 +2804,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -1948,16 +2812,16 @@ "jsonProps": null }, { - "fieldPath": "healthy", + "fieldPath": "weight", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "BooleanType", + "nativeDataType": "integer", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -2042,7 +2906,7 @@ }, "fields": [ { - "fieldPath": "name", + "fieldPath": "color", "jsonPath": null, "nullable": false, "description": null, @@ -2051,7 +2915,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -2059,16 +2923,16 @@ "jsonProps": null }, { - "fieldPath": "weight", + "fieldPath": "healthy", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.BooleanType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "bool", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -2085,7 +2949,7 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "LongType", + "nativeDataType": "int64", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -2093,7 +2957,7 @@ "jsonProps": null }, { - "fieldPath": "color", + "fieldPath": "name", "jsonPath": null, "nullable": false, "description": null, @@ -2102,7 +2966,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "StringType", + "nativeDataType": "string", "recursive": false, "globalTags": null, "glossaryTerms": null, @@ -2110,16 +2974,16 @@ 
"jsonProps": null }, { - "fieldPath": "healthy", + "fieldPath": "weight", "jsonPath": null, "nullable": false, "description": null, "type": { "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "BooleanType", + "nativeDataType": "int64", "recursive": false, "globalTags": null, "glossaryTerms": null, diff --git a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py new file mode 100644 index 0000000000000..cbd5be9e7d832 --- /dev/null +++ b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py @@ -0,0 +1,130 @@ +import tempfile +from typing import List, Type + +import avro.schema +import pandas as pd +import ujson +from avro import schema as avro_schema +from avro.datafile import DataFileWriter +from avro.io import DatumWriter + +from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + BooleanTypeClass, + NumberTypeClass, + SchemaField, + StringTypeClass, +) +from tests.unit.test_schema_util import assert_field_paths_match + +expected_field_paths = [ + "boolean_field", + "integer_field", + "string_field", +] + +expected_field_paths_avro = [ + "[version=2.0].[type=test].[type=boolean].boolean_field", + "[version=2.0].[type=test].[type=int].integer_field", + "[version=2.0].[type=test].[type=string].string_field", +] + +expected_field_types = [BooleanTypeClass, NumberTypeClass, StringTypeClass] + +test_table = pd.DataFrame( + { + "boolean_field": [True, False, True], + "integer_field": [1, 2, 3], + "string_field": ["a", "b", "c"], + } +) + + +def assert_field_types_match( + fields: List[SchemaField], expected_field_types: List[Type] +) -> None: + assert len(fields) == len(expected_field_types) + for field, expected_type in zip(fields, expected_field_types): + assert isinstance(field.type.type, 
expected_type) + + +def test_infer_schema_csv(): + with tempfile.TemporaryFile(mode="w+b") as file: + file.write(bytes(test_table.to_csv(index=False, header=True), encoding="utf-8")) + file.seek(0) + + fields = csv_tsv.CsvInferrer(max_rows=100).infer_schema(file) + fields.sort(key=lambda x: x.fieldPath) + + assert_field_paths_match(fields, expected_field_paths) + assert_field_types_match(fields, expected_field_types) + + +def test_infer_schema_tsv(): + with tempfile.TemporaryFile(mode="w+b") as file: + file.write( + bytes( + test_table.to_csv(index=False, header=True, sep="\t"), encoding="utf-8" + ) + ) + file.seek(0) + + fields = csv_tsv.TsvInferrer(max_rows=100).infer_schema(file) + fields.sort(key=lambda x: x.fieldPath) + + assert_field_paths_match(fields, expected_field_paths) + assert_field_types_match(fields, expected_field_types) + + +def test_infer_schema_json(): + with tempfile.TemporaryFile(mode="w+b") as file: + file.write(bytes(test_table.to_json(orient="records"), encoding="utf-8")) + file.seek(0) + + fields = json.JsonInferrer().infer_schema(file) + fields.sort(key=lambda x: x.fieldPath) + + assert_field_paths_match(fields, expected_field_paths) + assert_field_types_match(fields, expected_field_types) + + +def test_infer_schema_parquet(): + with tempfile.TemporaryFile(mode="w+b") as file: + test_table.to_parquet(file) + file.seek(0) + + fields = parquet.ParquetInferrer().infer_schema(file) + fields.sort(key=lambda x: x.fieldPath) + + assert_field_paths_match(fields, expected_field_paths) + assert_field_types_match(fields, expected_field_types) + + +def test_infer_schema_avro(): + with tempfile.TemporaryFile(mode="w+b") as file: + schema = avro_schema.parse( + ujson.dumps( + { + "type": "record", + "name": "test", + "fields": [ + {"name": "boolean_field", "type": "boolean"}, + {"name": "integer_field", "type": "int"}, + {"name": "string_field", "type": "string"}, + ], + } + ) + ) + writer = DataFileWriter(file, DatumWriter(), schema) + records = 
test_table.to_dict(orient="records") + for record in records: + writer.append(record) + writer.sync() + + file.seek(0) + + fields = avro.AvroInferrer().infer_schema(file) + fields.sort(key=lambda x: x.fieldPath) + + assert_field_paths_match(fields, expected_field_paths_avro) + assert_field_types_match(fields, expected_field_types) diff --git a/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json b/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json index 405fc1e621f8b..8c5da056cc2d9 100644 --- a/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json +++ b/metadata-ingestion/tests/unit/sagemaker/sagemaker_mces_golden.json @@ -321,7 +321,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-input-bucket/file.txt,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-input-bucket/file_txt,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -346,7 +346,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-output-bucket/file.txt,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-output-bucket/file_txt,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -370,7 +370,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/input-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/input-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -396,7 +396,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/output-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/output-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -422,7 +422,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/model-artifact.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/model-artifact.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -446,7 +446,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/output-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/output-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -470,7 +470,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/data-source.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/data-source.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -494,7 +494,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/category-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/category-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -518,7 +518,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-dataset.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-dataset.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -542,7 +542,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -566,7 +566,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,processing-job/input-data.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,processing-job/input-data.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -595,7 +595,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/input-dataset.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/input-dataset.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -623,7 +623,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/output-data.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/output-data.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -647,7 +647,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/checkpoint-config.tar.gz,PROD)", + 
"urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/checkpoint-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -671,7 +671,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-hook-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-hook-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -695,7 +695,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/tensorboard-output-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/tensorboard-output-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -719,7 +719,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -743,7 +743,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-rule-config.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-rule-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -767,7 +767,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-rule-config.tar.gz,PROD)", + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-rule-config.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -791,7 +791,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/input-data-source.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/input-data-source.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -818,7 +818,7 @@ "auditHeader": null, "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/output.tar.gz,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/output.tar_gz,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { @@ -911,10 +911,10 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-input-bucket/file.txt,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-input-bucket/file_txt,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-output-bucket/file.txt,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,auto-ml-job-output-bucket/file_txt,PROD)" ], "inputDatajobs": [] } @@ -993,10 +993,10 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/input-config.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/input-config.tar_gz,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/output-config.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,compilation-job-bucket/output-config.tar_gz,PROD)" ], "inputDatajobs": [ 
"urn:li:dataJob:(urn:li:dataFlow:(sagemaker,compilation:a-compilation-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:compilation-job/a-compilation-job)" @@ -1077,8 +1077,8 @@ "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/model-artifact.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/output-config.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/model-artifact.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,edge-packaging-bucket/output-config.tar_gz,PROD)" ], "inputDatajobs": [] } @@ -1236,12 +1236,12 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/category-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/data-source.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/category-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/data-source.tar_gz,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-dataset.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,labeling-job/output-dataset.tar_gz,PROD)" ], "inputDatajobs": [] } @@ -1325,7 +1325,7 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,processing-job/input-data.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,processing-job/input-data.tar_gz,PROD)" ], "outputDatasets": [], "inputDatajobs": [ @@ -1432,16 +1432,16 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/input-dataset.tar.gz,PROD)" + 
"urn:li:dataset:(urn:li:dataPlatform:s3,training-job/input-dataset.tar_gz,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/checkpoint-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-hook-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-rule-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/output-data.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-rule-config.tar.gz,PROD)", - "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/tensorboard-output-config.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/checkpoint-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-hook-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/debug-rule-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/output-data.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/profiler-rule-config.tar_gz,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:s3,training-job/tensorboard-output-config.tar_gz,PROD)" ], "inputDatajobs": [] } @@ -1524,10 +1524,10 @@ { "com.linkedin.pegasus2avro.datajob.DataJobInputOutput": { "inputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/input-data-source.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/input-data-source.tar_gz,PROD)" ], "outputDatasets": [ - "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/output.tar.gz,PROD)" + "urn:li:dataset:(urn:li:dataPlatform:s3,transform-job/output.tar_gz,PROD)" ], "inputDatajobs": [ "urn:li:dataJob:(urn:li:dataFlow:(sagemaker,auto_ml:an-auto-ml-job,PROD),arn:aws:sagemaker:us-west-2:123412341234:auto-ml-job/an-auto-ml-job)", @@ -1812,4 
+1812,4 @@ "proposedDelta": null, "systemMetadata": null } -] +] \ No newline at end of file diff --git a/metadata-ingestion/tox.ini b/metadata-ingestion/tox.ini index 8669c5efd7587..978d33818ba45 100644 --- a/metadata-ingestion/tox.ini +++ b/metadata-ingestion/tox.ini @@ -18,6 +18,7 @@ python = # see more here -> https://github.com/tox-dev/tox/issues/1105#issuecomment-448596282 [testenv] +passenv = SPARK_VERSION deps = .[dev] commands =