diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d15acc2..9dba8a0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,9 @@ on: push: branches: [ main ] pull_request: - branches: [ main ] + branches: + - main + - api_upload_update workflow_dispatch: jobs: @@ -15,7 +17,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8, 3.9, "3.10"] + python-version: [3.9, '3.10', 3.11, 3.12] services: @@ -53,7 +55,7 @@ jobs: pytest -s tests/ # Run coverage only once - - if: ${{ matrix.python-version == '3.9'}} + - if: ${{ matrix.python-version == '3.10'}} name: Get Coverage for badge run: | # Run coverage save the results diff --git a/pyproject.toml b/pyproject.toml index 039dca5..e6cd6e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ dependencies = [ "snowexsql==0.6.0rc1", "snowmicropyn", - "insitupy~=0.3.0", + "insitupy~=0.4", "boto3<1.24", "rasterio<1.4", "timezonefinder<7", diff --git a/scripts/upload/add_bsu_gpr.py b/scripts/upload/add_bsu_gpr.py index 26d7e77..ed12a87 100644 --- a/scripts/upload/add_bsu_gpr.py +++ b/scripts/upload/add_bsu_gpr.py @@ -8,49 +8,39 @@ """ -import time -from os.path import abspath, expanduser, join +from os.path import abspath, expanduser -import pandas as pd - -from snowexsql.db import get_db -from snowex_db.upload import * +from snowexsql.db import db_session_with_credentials +from snowex_db.upload.points import PointDataCSV def main(): - file = '../download/data/SNOWEX/SNEX20_BSU_GPR.001/2020.01.28/SNEX20_BSU_GPR_pE_01282020_01292020_02042020.csv' + file = ('../download/data/SNOWEX/SNEX20_BSU_GPR.001/' + '2020.01.28/SNEX20_BSU_GPR_pE_01282020_01292020_02042020.csv') kwargs = { - # Keyword argument to upload depth measurements - 'depth_is_metadata': False, - # Constant Metadata for the GPR data - 'site_name': 'Grand Mesa', - 'observers': 'Tate Meehan', - 'instrument': 'pulse EKKO Pro multi-polarization 1 GHz GPR', - 'in_timezone': 'UTC', - 'out_timezone': 'UTC', - 'epsg': 26912, - 'doi': 'https://doi.org/10.5067/Q2LFK0QSVGS2' + 'campaign_name': 'Grand Mesa', + 'observer': 'Tate Meehan', + 'instrument': 'gpr', + 'instrument_model': 'pulse EKKO Pro multi-polarization 1 GHz GPR', + 'timezone': 'UTC', + 'doi': 'https://doi.org/10.5067/Q2LFK0QSVGS2', + 'name': 'BSU GPR Data', } # Break out the path and make it an absolute path file = abspath(expanduser(file)) - # Grab a db connection to a local db named snowex - db_name = 'localhost/snowex' - engine, session = get_db(db_name, credentials='./credentials.json') - - # Instantiate the point uploader - csv = PointDataCSV(file, **kwargs) - # Push it to the database - csv.submit(session) - - # Close out the session with the DB - session.close() + # Grab a db connection + with db_session_with_credentials() as (_engine, session): + # Instantiate the point uploader + csv = PointDataCSV(file, **kwargs) + # Push it to the database + csv.submit(session) # return the number of errors for run.py can report it - return len(csv.errors) + # return len(csv.errors) if __name__ == '__main__': diff --git a/scripts/upload/add_pits_bulk_properties.py b/scripts/upload/add_pits_bulk_properties.py index bed5120..00d9e30 100644 --- a/scripts/upload/add_pits_bulk_properties.py +++ b/scripts/upload/add_pits_bulk_properties.py @@ -9,16 +9,14 @@ import pandas as pd -from snowex_db.upload import PointDataCSV -from snowex_db import db_session +from snowexsql.db import db_session_with_credentials +from snowex_db.upload.points import PointDataCSV 
def main(): """ Add bulk SWE, Depth, Density for 2020 and 2021 timeseries pits """ - db_name = 'localhost/snowex' - debug = True # Point to the downloaded data from data_dir = abspath('../download/data/SNOWEX/') @@ -35,14 +33,18 @@ def main(): }, # Preliminary data from 2023 Alaska pits { + # TODO: update this "DOI": "preliminary_alaska_pits", "path": "../SNEX23_preliminary/Data/SnowEx23_SnowPits_AKIOP_Summary_SWE_v01.csv" } ] + # start a db session + # look through the pit summary files for info in path_details: doi = info["DOI"] file_path = join(data_dir, info["path"]) # Read csv and dump new one without the extra header lines + # that make parsing impossible df = pd.read_csv( file_path, skiprows=list(range(32)) + [33] @@ -61,16 +63,15 @@ def main(): df.to_csv(new_name, index=False) # Submit SWE file data as point data - with db_session( - db_name, credentials='credentials.json' - ) as (session, engine): - pcsv = PointDataCSV( - new_name, doi=doi, debug=debug, - depth_is_metadata=False, - row_based_crs=True, - row_based_timezone=True + with db_session_with_credentials() as (_engine, session): + u = PointDataCSV( + new_name, + doi=doi, + row_based_timezone=True, + derived=True ) - pcsv.submit(session) + + u.submit(session) if __name__ == '__main__': diff --git a/snowex_db/metadata.py b/snowex_db/metadata.py index ef38f56..1a61852 100644 --- a/snowex_db/metadata.py +++ b/snowex_db/metadata.py @@ -3,15 +3,20 @@ to describing data. """ import logging +import pandas as pd +import pytz + from dataclasses import dataclass -from typing import Union +from typing import Tuple, Union -from insitupy.io.metadata import MetaDataParser from insitupy.profiles.metadata import ProfileMetaData +from insitupy.campaigns.snowex.snowex_metadata import SnowExMetaDataParser from snowexsql.db import get_table_attributes from snowexsql.tables import Site -from .interpretation import * +from .interpretation import ( + manage_degree_values, convert_cardinal_to_degree, add_date_time_keys +) from .projection import add_geom, reproject_point_in_dict from .string_management import * from .utilities import assign_default_kwargs, get_logger @@ -119,12 +124,13 @@ class SnowExProfileMetadata(ProfileMetaData): wind: Union[str, None] = None -class ExtendedSnowExMetadataParser(MetaDataParser): +class ExtendedSnowExMetadataParser(SnowExMetaDataParser): """ Extend the parser to update the parsing function """ - def parse(self): + def parse(self, filename: str) \ + -> Tuple[SnowExProfileMetadata, list, dict, int]: """ Parse the file and return a metadata object.
We can override these methods as needed to parse the different @@ -132,12 +138,15 @@ def parse(self): This populates self.rough_obj + Args: + filename: Path to the file from which to parse metadata + Returns: (metadata object, column list, position of header in file) """ ( meta_lines, columns, columns_map, header_position - ) = self.find_header_info(self._fname) + ) = self.find_header_info(filename) self._rough_obj = self._preparse_meta(meta_lines) # Create a standard metadata object metadata = SnowExProfileMetadata( diff --git a/snowex_db/point_data.py b/snowex_db/point_data.py index bef0e31..bf29726 100644 --- a/snowex_db/point_data.py +++ b/snowex_db/point_data.py @@ -1,18 +1,17 @@ import logging -from pathlib import Path from typing import List +import geopandas as gpd import numpy as np import pandas as pd -import geopandas as gpd -from insitupy.campaigns.snowex import SnowExProfileData -from insitupy.io.dates import DateManager +from insitupy.io.dates import DateTimeManager from insitupy.io.locations import LocationManager +from insitupy.io.metadata import MetaDataParser from insitupy.io.yaml_codes import YamlCodes - from insitupy.profiles.base import MeasurementData from insitupy.profiles.metadata import ProfileMetaData -from insitupy.variables import MeasurementDescription, ExtendableVariables +from insitupy.variables import MeasurementDescription +from timezonefinder import TimezoneFinder from .point_metadata import PointSnowExMetadataParser @@ -21,45 +20,24 @@ class SnowExPointData(MeasurementData): OUT_TIMEZONE = "UTC" - DEFAULT_METADATA_VARIABLE_FILES = SnowExProfileData.DEFAULT_METADATA_VARIABLE_FILES - DEFAULT_PRIMARY_VARIABLE_FILES = MeasurementData.DEFAULT_PRIMARY_VARIABLE_FILES + [ - Path(__file__).parent.joinpath( - "./point_primary_variable_overrides.yaml" - ) - ] + META_PARSER = PointSnowExMetadataParser def __init__( - self, input_df: pd.DataFrame, metadata: ProfileMetaData, - variable: MeasurementDescription, - original_file=None, meta_parser=None, allow_map_failure=False, + self, variable: MeasurementDescription = None, + meta_parser: MetaDataParser = None, row_based_timezone=False, timezone=None ): """ - Take df of layered data (SMP, pit, etc) Args: - input_df: dataframe of data - Should include depth and optional bottom depth - Should include sample or sample_a, sample_b, etc - metadata: ProfileMetaData object - variable: description of variable - original_file: optional track original file - meta_parser: MetaDataParser object. 
This will hold our variables - map and units map - allow_map_failures: if a mapping fails, warn us and use the - original string (default False) + See MeasurementData.__init__ row_based_timezone: does each row have a unique timezone implied timezone: input timezone for the whole file """ self._row_based_timezone = row_based_timezone self._in_timezone = timezone - super().__init__( - input_df, metadata, variable, - original_file=original_file, - meta_parser=meta_parser, - allow_map_failure=allow_map_failure - ) + super().__init__(variable, meta_parser) @staticmethod def read_csv_dataframe(profile_filename, columns, header_position): @@ -80,7 +58,8 @@ def read_csv_dataframe(profile_filename, columns, header_position): profile_filename, header=0, skiprows=header_position, names=columns, - encoding='latin' + encoding='latin', + dtype=str # treat all columns as strings to get weird date format ) if "flags" in df.columns: # Max length of the flags column @@ -96,7 +75,7 @@ def _get_location(self, row): """ try: lat, lon, *_ = LocationManager.parse(row) - except ValueError as e: + except ValueError: if self.metadata is not None: LOG.warning( f"Row {row.name} does not have a valid location. " @@ -106,9 +85,7 @@ def _get_location(self, row): else: raise RuntimeError("No valid location found in row or metadata.") - row["latitude"] = lat - row["longitude"] = lon - return row + return lat, lon def _get_datetime(self, row): """ @@ -118,8 +95,11 @@ def _get_datetime(self, row): """ tz = self._in_timezone if self._row_based_timezone: - # TODO: do we have to look it up? - raise NotImplementedError("?") + # Look up the timezone for the location and apply that + timezone_str = TimezoneFinder().timezone_at( + lat=row["latitude"], lng=row["longitude"] + ) + tz = timezone_str # e.g., 'America/Denver' try: datetime = None # In case we found a date entry that has date and time @@ -127,12 +107,13 @@ def _get_datetime(self, row): str_date = str( row[YamlCodes.DATE_TIME].replace('T', '-') ) + datetime = pd.to_datetime(str_date) if datetime is None: - datetime = DateManager.handle_separate_datetime(row) + datetime = DateTimeManager.handle_separate_datetime(row) - result = DateManager.adjust_timezone( + result = DateTimeManager.adjust_timezone( datetime, in_timezone=tz, out_timezone=self.OUT_TIMEZONE @@ -142,47 +123,60 @@ def _get_datetime(self, row): result = self.metadata.date_time else: raise e - row["datetime"] = result - return row - - @classmethod - def _get_campaign(cls, row): - """ - fill in the campaign info for a row - Args: - row: pandas row - """ - row["campaign"] = row.get(YamlCodes.SITE_NAME) - return row + return result - def _format_df(self, input_df): + def _format_df(self): """ Format the incoming df with the column headers and other info we want This will filter to a single measurement as well as the expected shared columns like depth """ - self._set_column_mappings(input_df) + self._set_column_mappings() + + # If the variable is real (not -1), check columns + if self.variable.code != "-1": + # Verify the sample column exists and rename to variable + self._check_sample_columns() - # Verify the sample column exists and rename to variable - df = self._check_sample_columns(input_df) + columns = self._df.columns.tolist() - df = df.apply(self._get_campaign, axis=1) + # If we do not have a geometry column, we need to parse + # the raw df, otherwise we assume this has been done already, + # likely on the first read of the file + + # Get the campaign name + if "campaign" not in self._df.columns: + 
self._df["campaign"] = self._df.get(YamlCodes.SITE_NAME) + # TODO: How do we speed this up? + # campaign should be very quick with a df level logic + # but the other ones will take morelogic # parse the location - df = df.apply(self._get_location, axis=1) - # Parse the datetime - df = df.apply(self._get_datetime, axis=1) + self._df[["latitude", "longitude"]] = self._df.apply( + self._get_location, axis=1, result_type="expand" + ) + # If the datetime isn't already parsed, parse it + if ( + "datetime" in self._df.columns.tolist() + and pd.api.types.is_datetime64_any_dtype( + self._df["datetime"] + ) + ): + LOG.debug("not parsing date") + else: + # Parse the datetime + self._df["datetime"] = self._df.apply( + self._get_datetime, axis=1, result_type="expand" + ) location = gpd.points_from_xy( - df["longitude"], df["latitude"] + self._df["longitude"], self._df["latitude"] ) - df = df.drop(columns=["longitude", "latitude"]) + # self._df = self._df.drop(columns=["longitude", "latitude"]) - df = gpd.GeoDataFrame( - df, geometry=location + self._df = gpd.GeoDataFrame( + self._df, geometry=location ).set_crs("EPSG:4326") - df = df.replace(-9999, np.NaN) - - return df + self._df = self._df.replace(-9999, np.NaN) class PointDataCollection: @@ -205,17 +199,12 @@ def series(self) -> List[SnowExPointData]: @classmethod def _read_csv( - cls, fname, columns, column_mapping, header_pos, - metadata: ProfileMetaData, meta_parser: PointSnowExMetadataParser, + cls, fname, meta_parser: PointSnowExMetadataParser, timezone=None, row_based_timezone=False ) -> List[SnowExPointData]: """ Args: fname: path to csv - columns: columns for dataframe - column_mapping: mapping of column name to variable description - header_pos: skiprows for pd.read_csv - metadata: metadata for each object meta_parser: parser for the metadata timezone: input timezone row_based_timezone: is the timezone row based? @@ -224,63 +213,83 @@ def _read_csv( a list of ProfileData objects """ + # parse the file for metadata before parsing the individual + # variables + all_file = cls.DATA_CLASS( + variable=None, # we do not have a variable yet + meta_parser=meta_parser, + timezone=timezone, row_based_timezone=row_based_timezone + ) + all_file.from_csv(fname) + result = [] - # TODO: how does the metadata parser fit into this? 
- if columns is None and header_pos is None: - LOG.warning(f"File {fname} is empty of rows") - df = pd.DataFrame() - else: - df = cls.DATA_CLASS.read_csv_dataframe( - fname, columns, header_pos, - ) shared_column_options = [ - meta_parser.primary_variables.entries["INSTRUMENT"], + # TODO: could we make this a 'shared' option in the definition + meta_parser.primary_variables.entries["CAMPAIGN"], + meta_parser.primary_variables.entries["COMMENTS"], meta_parser.primary_variables.entries["DATE"], - meta_parser.primary_variables.entries["TIME"], meta_parser.primary_variables.entries["DATETIME"], - meta_parser.primary_variables.entries["UTCDOY"], - meta_parser.primary_variables.entries["UTCTOD"], - meta_parser.primary_variables.entries["UTCYEAR"], - meta_parser.primary_variables.entries["LATITUDE"], - meta_parser.primary_variables.entries["LONGITUDE"], meta_parser.primary_variables.entries["EASTING"], - meta_parser.primary_variables.entries["NORTHING"], meta_parser.primary_variables.entries["ELEVATION"], + meta_parser.primary_variables.entries["FLAGS"], + meta_parser.primary_variables.entries["INSTRUMENT"], meta_parser.primary_variables.entries["INSTRUMENT_MODEL"], - meta_parser.primary_variables.entries["UTM_ZONE"], + meta_parser.primary_variables.entries["LATITUDE"], + meta_parser.primary_variables.entries["LONGITUDE"], + meta_parser.primary_variables.entries["NORTHING"], meta_parser.primary_variables.entries["PIT_ID"], + meta_parser.primary_variables.entries["TIME"], + meta_parser.primary_variables.entries["UTCDOY"], + meta_parser.primary_variables.entries["UTCTOD"], + meta_parser.primary_variables.entries["UTCYEAR"], + meta_parser.primary_variables.entries["UTM_ZONE"], meta_parser.primary_variables.entries["VERSION_NUMBER"], ] shared_columns = [ - c for c, v in column_mapping.items() + c for c, v in all_file.meta_columns_map.items() if v in shared_column_options ] variable_columns = [ - c for c in column_mapping.keys() if c not in shared_columns + c for c in all_file.meta_columns_map.keys() if c not in shared_columns + ] + # Filter out ignore columns + variable_columns = [ + v for v in variable_columns + if all_file.meta_columns_map[v].code != "ignore" ] # Create an object for each measurement for column in variable_columns: - target_df = df.loc[:, shared_columns + [column]] - result.append(cls.DATA_CLASS( - target_df, metadata, - column_mapping[column], # variable is a MeasurementDescription - original_file=fname, + points = cls.DATA_CLASS( + variable=all_file.meta_columns_map[column], meta_parser=meta_parser, timezone=timezone, row_based_timezone=row_based_timezone - )) - - return result + ) + # IMPORTANT - Metadata needs to be set before assigning the + # dataframe as information from the metadata is used to format_df + # the information + points.metadata = all_file.metadata + df_columns = all_file.df.columns.tolist() + # The df setter filters some columns, so adjust our shared columns + df_shared_columns = [ + c for c in shared_columns if c in df_columns + ] + # run the whole file through the df setter + points.df = all_file.df.loc[:, df_shared_columns + [column]].copy() + # -------- + result.append(points) + + return result, all_file.metadata @classmethod def from_csv( cls, fname, timezone="US/Mountain", header_sep=",", site_id=None, campaign_name=None, allow_map_failure=False, units_map=None, row_based_timezone=False, - metadata_variable_files=None, - primary_variable_files=None, + metadata_variable_file=None, + primary_variable_file=None, ): """ Find all variables in a single csv file 
@@ -293,36 +302,27 @@ def from_csv( allow_map_failure: allow metadata and column unknowns units_map: units map for the metadata row_based_timezone: is the timezone row based - metadata_variable_files: list of files to override the metadata + metadata_variable_file: file to override the metadata variables - primary_variable_files: list of files to override the + primary_variable_file: file to override the primary variables Returns: This class with a collection of profiles and metadata """ - primary_variables = ExtendableVariables( - primary_variable_files or cls.DATA_CLASS.DEFAULT_PRIMARY_VARIABLE_FILES - ) - metadata_variables = ExtendableVariables( - metadata_variable_files or cls.DATA_CLASS.DEFAULT_METADATA_VARIABLE_FILES, - ) # parse multiple files and create an iterable of ProfileData meta_parser = PointSnowExMetadataParser( - fname, timezone, primary_variables, metadata_variables, + timezone, primary_variable_file, metadata_variable_file, header_sep=header_sep, _id=site_id, campaign_name=campaign_name, allow_map_failures=allow_map_failure, - units_map=units_map + units_map=units_map, ) - # Parse the metadata and column info - metadata, columns, columns_map, header_pos = meta_parser.parse() + # read in the actual data - profiles = cls._read_csv( - fname, columns, columns_map, header_pos, metadata, - meta_parser, + profiles, metadata = cls._read_csv( + fname, meta_parser, timezone=timezone, row_based_timezone=row_based_timezone ) - # ignore profiles with the name 'ignore' profiles = [ p for p in profiles if diff --git a/snowex_db/point_metadata.py b/snowex_db/point_metadata.py index 61378c8..d242447 100644 --- a/snowex_db/point_metadata.py +++ b/snowex_db/point_metadata.py @@ -1,17 +1,18 @@ import logging +from typing import Tuple, Union -from insitupy.io.metadata import MetaDataParser +from insitupy.campaigns.snowex.snowex_metadata import SnowExMetaDataParser from insitupy.profiles.metadata import ProfileMetaData LOG = logging.getLogger() -class PointSnowExMetadataParser(MetaDataParser): +class PointSnowExMetadataParser(SnowExMetaDataParser): """ - Extend the parser to update the extended varaibles + Extend the parser to update the extended variables """ - def find_header_info(self, filename=None): + def find_header_info(self, filename): """ Read in all site details file for a pit If the filename has the word site in it then we read everything in the file. Otherwise, we use this @@ -28,7 +29,6 @@ def find_header_info(self, filename=None): **header_pos** - Index of the columns header for skiprows in read_csv """ - filename = filename or self._fname filename = str(filename) with open(filename, encoding='latin') as fp: lines = fp.readlines() @@ -62,7 +62,8 @@ def find_header_info(self, filename=None): return str_data, columns, columns_map, header_pos - def parse(self): + def parse(self, filename: str) -> ( + Tuple)[Union[ProfileMetaData | None], list, dict, int]: """ Parse the file and return a metadata object.
We can override these methods as needed to parse the different @@ -70,12 +71,16 @@ def parse(self): This populates self.rough_obj + Args: + filename: (str) Full path to the file with the header info to parse + Returns: - (None, column list, position of header in file) + Tuple: + metadata object or None, column list, position of header in file """ ( meta_lines, columns, columns_map, header_position - ) = self.find_header_info(self._fname) + ) = self.find_header_info(filename) self._rough_obj = self._preparse_meta(meta_lines) # We do not have header metadata for point files if not self.rough_obj: @@ -99,4 +104,4 @@ def parse(self): flags=self.parse_flags(), observers=self.parse_observers() ) - return metadata, columns, columns_map, header_position \ No newline at end of file + return metadata, columns, columns_map, header_position diff --git a/snowex_db/point_primary_variable_overrides.yaml b/snowex_db/point_primary_variable_overrides.yaml index d3e56a4..8c21c19 100644 --- a/snowex_db/point_primary_variable_overrides.yaml +++ b/snowex_db/point_primary_variable_overrides.yaml @@ -67,6 +67,7 @@ IGNORE: - id - avgvelocity - count + - site match_on_code: true INSTRUMENT: auto_remap: true @@ -224,17 +225,33 @@ WIND_SPEED_10FT: map_from: - wsms_10ft_avg match_on_code: true +CAMPAIGN: + auto_remap: true + code: campaign + description: Name of campaign site + map_from: + - location + match_on_code: true +COMMENTS: + code: comments + description: Comments + match_on_code: true +FLAGS: + code: flags + description: measurement flag + map_from: + - flag + match_on_code: true VERSION_NUMBER: auto_remap: true code: version_number - description: version - map_from: - - version number + description: version match_on_code: true PIT_ID: auto_remap: true code: pit_id description: ID of snow pit map_from: - - pitid + - pit_id + - pitid match_on_code: true diff --git a/snowex_db/profile_data.py b/snowex_db/profile_data.py index 5289e15..b7c0579 100644 --- a/snowex_db/profile_data.py +++ b/snowex_db/profile_data.py @@ -1,47 +1,11 @@ -from pathlib import Path - -import pandas as pd -from insitupy.campaigns.snowex import ( - SnowExProfileData, SnowExProfileDataCollection -) -from insitupy.io.metadata import MetaDataParser -from insitupy.profiles.metadata import ProfileMetaData -from insitupy.variables import MeasurementDescription +from insitupy.campaigns.snowex import SnowExProfileData, SnowExProfileDataCollection from .metadata import ExtendedSnowExMetadataParser class ExtendedSnowexProfileData(SnowExProfileData): META_PARSER = ExtendedSnowExMetadataParser - DEFAULT_METADATA_VARIABLE_FILES = ( - SnowExProfileData.DEFAULT_METADATA_VARIABLE_FILES - ) + [ - Path(__file__).parent.joinpath( - "./metadata_variable_overrides.yaml" - ) - ] - DEFAULT_PRIMARY_VARIABLE_FILES = ( - SnowExProfileData.DEFAULT_PRIMARY_VARIABLE_FILES) + [ - Path(__file__).parent.joinpath( - "./profile_primary_variable_overrides.yaml" - ) - ] - - def __init__( - self, input_df: pd.DataFrame, - metadata: ProfileMetaData, - variable: MeasurementDescription, - meta_parser: MetaDataParser, **kwargs - ): - # Tricky, this needs to happen before super init - self._comments_column = meta_parser.primary_variables.entries[ - "COMMENTS"] - super().__init__(input_df, metadata, variable, meta_parser, **kwargs) - - def shared_column_options(self): - return self._depth_columns + [self._comments_column] class ExtendedSnowExProfileDataCollection(SnowExProfileDataCollection): - META_PARSER = ExtendedSnowExMetadataParser PROFILE_DATA_CLASS = 
ExtendedSnowexProfileData diff --git a/snowex_db/upload/layers.py b/snowex_db/upload/layers.py index 0a37c40..74798fe 100644 --- a/snowex_db/upload/layers.py +++ b/snowex_db/upload/layers.py @@ -9,6 +9,7 @@ import pandas as pd from geoalchemy2 import WKTElement +from insitupy.io.strings import StringManager from insitupy.campaigns.snowex import SnowExProfileData from snowexsql.tables import ( Campaign, DOI, Instrument, LayerData, MeasurementType, Observer, Site @@ -17,7 +18,6 @@ from .batch import BatchBase from ..metadata import SnowExProfileMetadata from ..profile_data import ExtendedSnowExProfileDataCollection -from ..string_management import parse_none from ..utilities import get_logger LOG = logging.getLogger("snowex_db.upload.layers") @@ -49,8 +49,7 @@ def __init__( timezone (str): The timezone used, default is "US/Mountain". kwargs: Additional optional keyword arguments related to the profile. doi (str): Digital Object Identifier - instrument (str): Name of the instrument used - collection. + instrument (str): Name of the instrument used in the collection. header_sep (str): Delimiter for separating values in the header. Default is ','. id (str): Identifier for the profile data file. @@ -94,11 +93,17 @@ def _read(self) -> ExtendedSnowExProfileDataCollection: """ try: return ExtendedSnowExProfileDataCollection.from_csv( - self.filename, + filename=self.filename, timezone=self._timezone, header_sep=self._header_sep, site_id=self._id, - campaign_name=self._campaign_name + campaign_name=self._campaign_name, + metadata_variable_file=Path(__file__).parent.joinpath( + "../metadata_variable_overrides.yaml" + ), + primary_variable_file=Path(__file__).parent.joinpath( + "../profile_primary_variable_overrides.yaml" + ), ) except pd.errors.ParserError as e: LOG.error(e) @@ -117,19 +122,21 @@ def build_data(self, profile: SnowExProfileData) -> gpd.GeoDataFrame: df: Dataframe ready for submission """ - df = profile.df.copy() - if df.empty: + if profile.df is None: LOG.debug("df is empty, returning") - return df + return gpd.GeoDataFrame() + metadata = profile.metadata variable = profile.variable + df = profile.df.copy() + # The type of measurement df['type'] = [variable.code] * len(df) # Manage nans and nones for c in df.columns: - df[c] = df[c].apply(lambda x: parse_none(x)) + df[c] = df[c].apply(lambda x: StringManager.parse_none(x)) df['value'] = df[variable.code].astype(str) if 'units' not in df.columns: diff --git a/snowex_db/upload/points.py b/snowex_db/upload/points.py index de3b612..5656691 100644 --- a/snowex_db/upload/points.py +++ b/snowex_db/upload/points.py @@ -1,9 +1,8 @@ """ Module for classes that upload single files to the database. """ - +from pathlib import Path import logging - import geopandas as gpd import pandas as pd from geoalchemy2 import WKTElement @@ -12,9 +11,12 @@ Campaign, DOI, Instrument, MeasurementType, Observer, PointData, PointObservation ) + +from insitupy.io.strings import StringManager + from .base import BaseUpload from ..point_data import PointDataCollection, SnowExPointData -from ..string_management import parse_none + LOG = logging.getLogger("snowex_db.upload.points") @@ -22,11 +24,6 @@ class DataValidationError(ValueError): pass -# TODO: do we need to make a SnowExPointDataCollection similar to -# SnowExProfileDataCollection, since some files will have more than one point -# measurement per file? 
This is true for GPR, summary swe, etc -# TODO: start with test datasets for simpler examples - class PointDataCSV(BaseUpload): """ @@ -44,12 +41,42 @@ class PointDataCSV(BaseUpload): 'density': 'kg/m^3' } - def __init__(self, session, filename, timezone="US/Mountain", **kwargs): - self.filename = filename + def __init__( + self, session, profile_filename, timezone="US/Mountain", **kwargs + ): + """ + + Args: + session: SQLAlchemy session to use for the upload + profile_filename: Path to the csv file to upload + timezone: Timezone to assume for the data, defaults to "US/Mountain" + **kwargs: + doi + instrument + header_sep + id + campaign_name + derived + instrument_model + comments + observer + name + row_based_timezone + instrument_map + """ + self.filename = profile_filename self._session = session + self._timezone = timezone self._doi = kwargs.get("doi") self._instrument = kwargs.get("instrument") + # a map of measurement type to instrument name + self._instrument_map = kwargs.get("instrument_map", {}) + if self._instrument_map and self._instrument: + raise ValueError( + "Cannot provide both 'instrument' and 'instrument_map'. " + "Please choose one." + ) self._header_sep = kwargs.get("header_sep", ",") self._id = kwargs.get("id") self._campaign_name = kwargs.get("campaign_name") @@ -89,7 +116,10 @@ def _read(self, in_timezone=None): site_id=self._id, campaign_name=self._campaign_name, units_map=self.UNITS_MAP, - row_based_timezone=self._row_based_tz + row_based_timezone=self._row_based_tz, + primary_variable_file=Path(__file__).parent.joinpath( + "../point_primary_variable_overrides.yaml" + ) ) except pd.errors.ParserError as e: LOG.error(e) @@ -121,8 +151,8 @@ def build_data(self, series: SnowExPointData) -> gpd.GeoDataFrame: # Manage nans and nones for c in df.columns: - df[c] = df[c].apply(lambda x: parse_none(x)) - df['value'] = df[variable.code].astype(str) + df[c] = df[c].apply(lambda x: StringManager.parse_none(x)) + df['value'] = df[variable.code].astype(float) if 'units' not in df.columns: unit_str = series.units_map.get(variable.code) @@ -147,9 +177,16 @@ def build_data(self, series: SnowExPointData) -> gpd.GeoDataFrame: if column_name not in columns: df[column_name] = [param] * len(df) + # Anywhere the instrument is None, use the instrument map + # based on the measurement name + if self._instrument_map and 'instrument' in df.columns: + df['instrument'] = df['instrument'].fillna( + df['type'].map(self._instrument_map) + ) + # Map the measurement names or default to original df["instrument"] = df['instrument'].map( - lambda x: self.MEASUREMENT_NAMES.get(x, x) + lambda x: self.MEASUREMENT_NAMES.get(x.lower(), x) ) return df @@ -168,6 +205,7 @@ def submit(self): if not df.empty: # IMPORTANT: Add observations first, so we can use them in the # entries + # TODO: how do these link back? self._add_campaign_observation(df) @@ -179,6 +217,7 @@ def submit(self): # TODO: instrument name logic here? 
d = self._add_entry(row) + # session.bulk_save_objects(objects) does not resolve # foreign keys, DO NOT USE IT self._session.add(d) @@ -190,13 +229,17 @@ def submit(self): ) def _observation_name_from_row(self, row): - value = f"{row['name']}_{row['instrument']}" + name = row.get('name') or row.get('pit_id') + value = f"{name}_{row['instrument']}" + if row.get('instrument_model'): - value += row['instrument_model'] + value += '_' + row['instrument_model'] + # Add the type of measurement # This is necessary because the GPR returns multiple variables if row.get('type'): value += "_" + row['type'] + return value def _get_first_check_unique(self, df, key): @@ -205,10 +248,12 @@ def _get_first_check_unique(self, df, key): it is unique. If not, raise a DataValidationError """ unique_values = df[key].unique() + if len(unique_values) > 1: raise DataValidationError( f"Multiple values for {key} found: {unique_values}" ) + return unique_values[0] def _add_campaign_observation(self, df): @@ -218,73 +263,74 @@ def _add_campaign_observation(self, df): """ df["date"] = pd.to_datetime(df["datetime"]).dt.date - # Group by our observation keys to add into the database - for keys, grouped_df in df.groupby( - ['instrument', 'instrument_model', 'name', 'type', 'date'], - dropna=False - ): - # Process each unique combination of keys (key) and its corresponding group (grouped_df) + + # Group by our observation keys to add records uniquely into the database + base_groups = ['instrument', 'instrument_model', 'name', 'type', 'date'] + if 'pit_id' in df.columns: + base_groups.append('pit_id') + + # Process each unique combination of keys (key) and its corresponding group (grouped_df) + for keys, grouped_df in df.groupby(base_groups, dropna=False): # Add instrument - instrument_name = self._get_first_check_unique(grouped_df, 'instrument') - # Map the instrument name if we have a mapping for it - instrument_name = self.MEASUREMENT_NAMES.get( - instrument_name.lower(), instrument_name - ) instrument = self._check_or_add_object( self._session, Instrument, dict( - name=instrument_name, + name=self._get_first_check_unique(grouped_df, 'instrument'), model=self._get_first_check_unique(grouped_df, 'instrument_model') ) ) # Add measurement type - measurement_type = self._get_first_check_unique( - grouped_df, "type" - ) measurement_obj = self._check_or_add_object( # Add units and 'derived' flag for the measurement self._session, MeasurementType, dict( - name=measurement_type, + name=self._get_first_check_unique(grouped_df, "type"), units=self._get_first_check_unique(grouped_df, "units"), derived=self._derived ) ) - # Check name is unique - self._get_first_check_unique(df, "name") + # Check name is unique because we are adding ONE + # campaign observation here + self._get_first_check_unique(grouped_df, "name") + if 'pit_id' in grouped_df.columns: + self._get_first_check_unique(grouped_df, "pit_id") # Get the measurement name measurement_name = self._observation_name_from_row(grouped_df.iloc[0]) # Add doi - doi_string = self._get_first_check_unique(df, "doi") + doi_string = self._get_first_check_unique(grouped_df, "doi") if doi_string is not None: doi = self._check_or_add_object( self._session, DOI, dict(doi=doi_string) ) else: doi = None - # pass in campaign - campaign_name = self._get_first_check_unique( - df, "campaign" - ) or self._campaign_name + # Add campaign + campaign_name = self._get_first_check_unique(grouped_df, "campaign") \ + or self._campaign_name if campaign_name is None: raise DataValidationError("Campaign 
cannot be None") campaign = self._check_or_add_object( self._session, Campaign, dict(name=campaign_name) ) - # add observer + # Add observer observer_name = self._get_first_check_unique( - df, "observer" + grouped_df, "observer" ) or self._observer observer_name = observer_name or "unknown" observer = self._check_or_add_object( self._session, Observer, dict(name=observer_name) ) + # Construct description string description = None if ["comments"] in grouped_df.columns.values: - description = self._get_first_check_unique( + description = (description or "") + self._get_first_check_unique( grouped_df, "comments" - ), + ) + if ["flags"] in grouped_df.columns.values: + description = (description or "") + self._get_first_check_unique( + grouped_df, "flags" + ) date_obj = self._get_first_check_unique(grouped_df, "date") observation = self._check_or_add_object( @@ -297,7 +343,6 @@ def _add_campaign_observation(self, df): ), object_kwargs=dict( name=measurement_name, - # TODO: we lose out on row-based comments here description=description, date=date_obj, instrument=instrument, @@ -338,8 +383,8 @@ def _add_entry(self, row: dict): datetime=row["datetime"], # Arguments from kwargs geom=row['geometry'], - version_number=row['version_number'], - elevation=row['elevation'], + version_number=row.get('version_number', None), + elevation=row.get('elevation', None), equipment=row['instrument'] ) diff --git a/tests/data/bsu_gpr.csv b/tests/data/bsu_gpr.csv new file mode 100644 index 0000000..a1adad0 --- /dev/null +++ b/tests/data/bsu_gpr.csv @@ -0,0 +1,13 @@ +Date,Time,Longitude,Latitude,ElevationWGS84,Easting,Northing,UTM_Zone,TWT,Depth,SWE +012820,161549.557,-108.190889311605,39.0343743775669,3040.469,743148.428,4324346.715,12,8.3,101.096735522092,275.994087975311 +012820,161549.59,-108.190889588925,39.0343743752416,3040.46,743148.404,4324346.714,12,8.3,101.096735522092,275.994087975311 +012820,161549.623,-108.190889843164,39.0343743723601,3040.451,743148.382,4324346.713,12,8.3,101.096735522092,275.994087975311 +012820,161549.656,-108.19089008622,39.0343743602,3040.442,743148.361,4324346.711,12,8.3,101.096735522092,275.994087975311 +012820,161549.689,-108.190890306195,39.0343743474837,3040.434,743148.342,4324346.709,12,8.3,101.096735522092,275.994087975311 +012820,161549.721,-108.190890514631,39.0343743344892,3040.427,743148.324,4324346.707,12,8.3,101.096735522092,275.994087975311 +012820,161549.754,-108.190890711883,39.0343743122161,3040.419,743148.307,4324346.704,12,8.3,101.096735522092,275.994087975311 +012820,161549.787,-108.190890897238,39.0343742986654,3040.413,743148.291,4324346.702,12,8.3,101.096735522092,275.994087975311 +020420,205415.639,-108.165976850641,39.0171321212163,3078.752,745364.801,4322499.769,12,8.4,102.31476848019,279.319317950918 +020420,205415.672,-108.16597604275,39.017132687161,3078.76,745364.869,4322499.834,12,8.4,102.31476848019,279.319317950918 +020420,205415.705,-108.165975246395,39.0171332533861,3078.768,745364.936,4322499.899,12,8.4,102.31476848019,279.319317950918 +020420,205415.738,-108.165974438503,39.0171338193308,3078.775,745365.004,4322499.964,12,8.4,102.31476848019,279.319317950918 \ No newline at end of file diff --git a/tests/data/pit_summary_points.csv b/tests/data/pit_summary_points.csv new file mode 100644 index 0000000..7be9dbd --- /dev/null +++ b/tests/data/pit_summary_points.csv @@ -0,0 +1,13 @@ +Location,Site,PitID,Date/Local Standard Time,UTM Zone,Easting (m),Northing (m),Latitude (deg),Longitude (deg),Density Mean (kg/m^3),SWE (mm),HS (cm),Flag +American 
River Basin,Caples Lake,CAAMCL_20191220_1300,2019-12-20T13:00,10N,757216,4288787,38.71033054555811,-120.04186927254749,278.0,333.5,120.0,"BDG, MW" +American River Basin,Caples Lake,CAAMCL_20200131_1215,2020-01-31T12:15,10N,757220,4288788,38.71033838194462,-120.0418229560188,329.5,446.5,135.0,MW +American River Basin,Caples Lake,CAAMCL_20200214_1200,2020-02-14T12:00,10N,757218,4288787,38.71032996387011,-120.04184629988718,359.5,442.5,123.0,STLay +American River Basin,Caples Lake,CAAMCL_20200221_1200,2020-02-21T12:00,10N,757217,4288780,38.710267256343286,-120.04186038464364,364.0,424.0,117.0,"MW, STLay" +American River Basin,Caples Lake,CAAMCL_20200228_1130,2020-02-28T11:30,10N,757215,4288778,38.71024983849251,-120.04188409968448,396.5,475.5,120.0,STLay +American River Basin,Caples Lake,CAAMCL_20200306_1145,2020-03-06T11:45,10N,757216,4288779,38.71025854741857,-120.04187224216548,403.5,479.5,119.0,"TDG, MW" +American River Basin,Caples Lake,CAAMCL_20200313_1030,2020-03-13T10:30,10N,757214,4288777,38.71024112956513,-120.04189595720058,435.5,434.5,100.0,"TDG, MW, STCom, STLay" +East River,Forest 12,COER12_20200226_1242,2020-02-26T12:42,13N,328520,4310833,38.929673072842384,-106.97828661618968,267.0,245.5,92.0,BDG +East River,Forest 12,COER12_20200428_1400,2020-04-28T14:00,13N,328526,4310840,38.929737288240055,-106.97821918710338,356.5,257.0,72.0, +East River,Forest 12,COER12_20200428_1415,2020-04-28T14:15,13N,328530,4310840,38.92973807010048,-106.97817306650516,368.0,367.0,100.0, +East River,Forest 12,COER12_20200428_1445,2020-04-28T14:45,13N,328528,4310844,38.92977370350157,-106.97819712779332,356.0,356.5,100.0,STCom +East River,Forest 12,COER12_20200512_1030,2020-05-12T10:30,13N,328519,4310837,38.92970890169526,-106.9782991473662,393.0,196.0,50.0, \ No newline at end of file diff --git a/tests/data/site_details_2021.csv b/tests/data/site_details_2021.csv new file mode 100644 index 0000000..891bccb --- /dev/null +++ b/tests/data/site_details_2021.csv @@ -0,0 +1,27 @@ +# Location,Boise River Basin +# Site,Lower Deer Point - Open +# PitID,IDBRLO_20201125_1215 +# Date/Local Standard Time,2020-11-25T12:15 +# UTM Zone,11N +# Easting (m),570582 +# Northing (m),4843040 +# Latitude (deg),43.73704 +# Longitude (deg),-116.12351 +# Slope (deg),-9999 +# Aspect (deg),-9999 +# Air Temp (deg C),-9999 +# HS (cm),26 +# Observers,"G. Antonioli, B. Minich" +# WISe Serial No,-9999 +# Weather,"overcast, light snowfall of rimed stellars" +# Precip Type,Snow +# Precip Rate,Very Light +# Sky,Broken (>1/2 of sky) +# Wind,Light +# Ground Condition,Moist +# Ground Roughness,Smooth +# Ground Vegetation,Grass | Shrub +# Vegetation Height (cm),10 | -9999 +# Tree Canopy,No Trees +# Comments,Used the 250 cc density cutter. 
+# Flags,STCom diff --git a/tests/helpers.py b/tests/helpers.py index 028d9cb..1a5aa71 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -13,7 +13,7 @@ class WithUploadedFile: def upload_file(self, session, filename): u = self.UploaderClass( - session=session, filename=filename, **self.kwargs + session, filename, **self.kwargs ) u.submit() diff --git a/tests/points/test_depth.py b/tests/points/test_depth.py index ffcfe48..8dec361 100644 --- a/tests/points/test_depth.py +++ b/tests/points/test_depth.py @@ -28,7 +28,7 @@ class TestDepth(PointBaseTesting): @pytest.fixture(scope="class") def uploaded_file(self, session, data_dir): self.upload_file( - filename=str(data_dir.joinpath("depths.csv")), session=session + session, str(data_dir.joinpath("depths.csv")), ) def filter_measurement_type(self, session, measurement_type, query=None): @@ -42,25 +42,69 @@ def filter_measurement_type(self, session, measurement_type, query=None): ).filter(MeasurementType.name == measurement_type) return query + @pytest.mark.usefixtures("uploaded_file") + def test_measurement_type(self, session): + record = self.get_records(session, MeasurementType, "name", "depth") + assert len(record) == 1 + record = record[0] + assert record.units == 'cm' + assert record.derived is False + + @pytest.mark.usefixtures("uploaded_file") + @pytest.mark.parametrize( + "name, model", [ + ("mesa", "Mesa2_1"), + ("magnaprobe", "CRREL_B"), + ("pit ruler", None), + ] + ) + def test_instrument(self, name, model, session): + record = self.get_records(session, Instrument, "name", name) + assert len(record) == 1 + record = record[0] + assert record.model == model + + @pytest.mark.usefixtures("uploaded_file") + @pytest.mark.parametrize( + "name, count", + [ + ("example_point_name_magnaprobe_CRREL_B_depth", 1), + ("example_point_name_mesa_Mesa2_1_depth", 1), + # We have three different dates + ("example_point_name_pit ruler_depth", 3) + ], + ) + def test_campaign_observation(self, name, count, session): + names = self.get_records(session, CampaignObservation, "name", name) + assert len(names) == count + + @pytest.mark.usefixtures("uploaded_file") + @pytest.mark.parametrize( + "date, count", + [ + (date(2020, 1, 28), 1), + (date(2020, 2, 4), 1), + (date(2020, 2, 11), 1), + (date(2020, 1, 30), 1), + (date(2020, 2, 5), 1), + ], + ) + def test_point_observation(self, date, count, session): + record = self.get_records(session, PointObservation, "date", date) + assert len(record) == count + @pytest.mark.parametrize( "table, attribute, expected_value", [ (Campaign, "name", "Grand Mesa"), - (Instrument, "name", "mesa"), - (Instrument, "model", "Mesa2_1"), - (MeasurementType, "name", ['depth']), - (MeasurementType, "units", ['cm']), - (MeasurementType, "derived", [False]), (DOI, "doi", "some_point_doi"), - (CampaignObservation, "name", "example_point_name_M2Mesa2_1_depth"), (PointData, "geom", WKTElement('POINT (-108.13515 39.03045)', srid=4326) - ), - (PointObservation, "date", date(2020, 2, 4)), + ), ] ) def test_metadata(self, table, attribute, expected_value, uploaded_file): self._check_metadata(table, attribute, expected_value) - + @pytest.mark.parametrize( "data_name, attribute_to_check, filter_attribute, filter_value, expected", [ ('depth', 'value', 'value', 94.0, [94]), diff --git a/tests/points/test_gpr_bsu.py b/tests/points/test_gpr_bsu.py new file mode 100644 index 0000000..c41ed43 --- /dev/null +++ b/tests/points/test_gpr_bsu.py @@ -0,0 +1,103 @@ +from datetime import datetime, timezone, date + +import pytest +from geoalchemy2 import 
WKTElement +from snowexsql.tables import PointData, DOI, Campaign, Instrument, \ + MeasurementType, PointObservation +from snowexsql.tables.campaign_observation import CampaignObservation + +from snowex_db.upload.points import PointDataCSV + +from _base import PointBaseTesting + + +class TestGPR(PointBaseTesting): + """ + Test that a GPR point file is uploaded correctly, including multiple + measurement types (two-way travel, depth, SWE) from one file. + """ + kwargs = { + 'timezone': "UTC", + 'doi': "some_gpr_point_doi", + "campaign_name": "Grand Mesa", + "name": "BSU GPR DATA", + "instrument": "gpr" + } + UploaderClass = PointDataCSV + TableClass = PointData + + @pytest.fixture(scope="class") + def uploaded_file(self, session, data_dir): + self.upload_file(session, str(data_dir.joinpath("bsu_gpr.csv"))) + + def filter_measurement_type(self, session, measurement_type, query=None): + if query is None: + query = session.query(self.TableClass) + + query = query.join( + self.TableClass.observation + ).join( + PointObservation.measurement_type + ).filter(MeasurementType.name == measurement_type) + return query + + @pytest.mark.parametrize( + "table, attribute, expected_value", [ + (Campaign, "name", "Grand Mesa"), + (Instrument, "name", "gpr"), + (Instrument, "model", None), + (MeasurementType, "name", ['two_way_travel', 'depth', "swe"]), + (MeasurementType, "units", ['ns', 'cm', 'mm']), + (MeasurementType, "derived", [False, False, False]), + (DOI, "doi", "some_gpr_point_doi"), + (CampaignObservation, "name", "BSU GPR DATA_gpr_two_way_travel"), + (PointData, "geom", + WKTElement('POINT (-108.190889311605 39.0343743775669)', srid=4326) + ), + (PointObservation, "date", date(2020, 1, 28)), + ] + ) + def test_metadata(self, table, attribute, expected_value, uploaded_file): + self._check_metadata(table, attribute, expected_value) + + @pytest.mark.parametrize( + "data_name, attribute_to_check, filter_attribute, filter_value, expected", [ + ('two_way_travel', 'value', 'date', date(2020, 1, 28), [8.3] * 8), + ('depth', 'value', 'date', date(2020, 1, 28), + [101.096735522092, 101.096735522092, 101.096735522092, 101.096735522092, 101.096735522092, 101.096735522092, 101.096735522092, 101.096735522092]), + ('swe', 'value', 'date', date(2020, 1, 28), + [275.994087975311, 275.994087975311, 275.994087975311, 275.994087975311, 275.994087975311, 275.994087975311, 275.994087975311, 275.994087975311]), + ] + ) + def test_value( + self, data_name, attribute_to_check, + filter_attribute, filter_value, expected, uploaded_file + ): + self.check_value( + data_name, attribute_to_check, + filter_attribute, filter_value, expected, + ) + + @pytest.mark.parametrize( + "data_name, expected", [ + ("depth", 12), + ("swe", 12), + ("two_way_travel", 12), + ("density", 0), # no measurements + ] + ) + def test_count(self, data_name, expected, uploaded_file): + n = self.check_count(data_name) + assert n == expected + + @pytest.mark.parametrize( + "data_name, attribute_to_count, expected", [ + ("depth", "value", 2), + ("swe", "value", 2), + ("swe", "units", 1) + ] + ) + def test_unique_count(self, data_name, attribute_to_count, expected, uploaded_file): + self.check_unique_count( + data_name, attribute_to_count, expected + ) diff --git a/tests/points/test_perimiter_depth.py b/tests/points/test_perimiter_depth.py index 870dbf9..7b7c6b1 100644 --- a/tests/points/test_perimiter_depth.py +++ b/tests/points/test_perimiter_depth.py @@ -31,7 +31,7 @@ class TestPerimeterDepth(PointBaseTesting): @pytest.fixture(scope="class") def uploaded_file(self, session, data_dir): 
self.upload_file( - filename=str(data_dir.joinpath("perimeters.csv")), session=session + session, str(data_dir.joinpath("perimeters.csv")) ) @pytest.mark.parametrize( diff --git a/tests/points/test_poll_depth.py b/tests/points/test_pole_depth.py similarity index 58% rename from tests/points/test_poll_depth.py rename to tests/points/test_pole_depth.py index c176b7e..02fe0ad 100644 --- a/tests/points/test_poll_depth.py +++ b/tests/points/test_pole_depth.py @@ -33,20 +33,67 @@ def uploaded_file(self, session, data_dir): filename=str(data_dir.joinpath("pole_depths.csv")), session=session ) + @pytest.mark.usefixtures("uploaded_file") + def test_measurement_type(self, session): + record = self.get_records(session, MeasurementType, "name", "depth") + assert len(record) == 1 + record = record[0] + assert record.units == 'cm' + assert record.derived is False + + @pytest.mark.usefixtures("uploaded_file") + @pytest.mark.parametrize("model", ["W1B", "E9B", "E8A", "E6A"]) + def test_instrument(self, model, session): + record = self.get_records(session, Instrument, "model", model) + assert len(record) == 1 + record = record[0] + assert record.name == "camera" + + @pytest.mark.usefixtures("uploaded_file") + @pytest.mark.parametrize( + "date", + [ + date(2019, 11, 27), + date(2019, 12, 7), + date(2019, 12, 31), + date(2020, 2, 1), + date(2019, 10, 28), + date(2019, 11, 28), + date(2019, 12, 14), + date(2019, 11, 29), + date(2020, 2, 27), + date(2020, 4, 7), + date(2020, 5, 22), + date(2020, 1, 27), + date(2020, 3, 14), + date(2020, 5, 3), + ], + ) + def test_point_observation(self, date, session): + record = self.get_records(session, PointObservation, "date", date) + assert len(record) == 1 + + @pytest.mark.usefixtures("uploaded_file") + @pytest.mark.parametrize( + "name, count", + [ + ("example_pole_point_name_camera_E6A_depth", 4), + ("example_pole_point_name_camera_E8A_depth", 3), + ("example_pole_point_name_camera_E9B_depth", 4), + ("example_pole_point_name_camera_W1B_depth", 3), + ], + ) + def test_campaign_observation(self, name, count, session): + names = self.get_records(session, CampaignObservation, "name", name) + assert len(names) == count + @pytest.mark.parametrize( "table, attribute, expected_value", [ (Campaign, "name", "Grand Mesa"), - (Instrument, "name", "camera"), - (Instrument, "model", "E6A"), - (MeasurementType, "name", ['depth']), - (MeasurementType, "units", ['cm']), - (MeasurementType, "derived", [False]), (DOI, "doi", "some_point_doi_poles"), - (CampaignObservation, "name", "example_pole_point_name_cameraE6A_depth"), (PointData, "geom", WKTElement('POINT (-108.184794 39.008078)', srid=4326) ), - (PointObservation, "date", date(2019, 11, 27)), ] ) def test_metadata(self, table, attribute, expected_value, uploaded_file): diff --git a/tests/points/test_summary_pits.py b/tests/points/test_summary_pits.py new file mode 100644 index 0000000..e9cfda6 --- /dev/null +++ b/tests/points/test_summary_pits.py @@ -0,0 +1,119 @@ +from datetime import datetime, timezone, date + +import pytest +from geoalchemy2 import WKTElement +from snowexsql.db import db_session_with_credentials +from snowexsql.tables import PointData, DOI, Campaign, Instrument, \ + MeasurementType, PointObservation +from snowexsql.tables.campaign_observation import CampaignObservation + +from snowex_db.upload.points import PointDataCSV +from tests.points._base import PointBaseTesting + + +class TestSummaryPits(PointBaseTesting): + """ + Test the summary csvs for a collection of pits + """ + + kwargs = { + 'timezone': 'MST', + 
'doi': "some_point_pit_doi", + "row_based_timezone": True, # row based timezone + "derived": True, + "instrument_map": { + "depth": "manual", + "swe": "manual", + "density": "cutter", + "comments": "unknown" + } + } + UploaderClass = PointDataCSV + TableClass = PointData + + @pytest.fixture(scope="class") + def uploaded_file(self, session, data_dir): + """ + NOTE - this is part of the _modified file that we create + in the upload script, NOT the original file + """ + self.upload_file( + session, + str(data_dir.joinpath("pit_summary_points.csv")) + ) + + def filter_measurement_type(self, session, measurement_type, query=None): + if query is None: + query = session.query(self.TableClass) + + query = query.join( + self.TableClass.observation + ).join( + PointObservation.measurement_type + ).filter(MeasurementType.name == measurement_type) + return query + + @pytest.mark.parametrize( + "table, attribute, expected_value", [ + (Campaign, "name", "American River Basin"), + (Instrument, "name", "cutter"), + (Instrument, "model", None), + (MeasurementType, "name", ['density', 'swe', 'depth']), + (MeasurementType, "units", ['kg/m^3', 'mm', 'cm']), + (MeasurementType, "derived", [True, True, True]), + (DOI, "doi", "some_point_pit_doi"), + (CampaignObservation, "name", "CAAMCL_20191220_1300_cutter_density"), + (PointData, "geom", + WKTElement('POINT (-120.04186927254749 38.71033054555811)', srid=4326) + ), + (PointObservation, "date", date(2019, 12, 20)), + ] + ) + def test_metadata(self, table, attribute, expected_value, uploaded_file): + self._check_metadata(table, attribute, expected_value) + + @pytest.mark.parametrize( + "data_name, attribute_to_check, filter_attribute, filter_value, expected", [ + ('depth', 'value', 'value', 117.0, [117.0]), + ('depth', 'units', 'value', 117.0, ['cm']), + ('depth', 'datetime', 'value', 117.0, [datetime(2020, 2, 21, 20, 00, tzinfo=timezone.utc)]), + ] + ) + def test_value( + self, data_name, attribute_to_check, + filter_attribute, filter_value, expected, uploaded_file + ): + self.check_value( + data_name, attribute_to_check, + filter_attribute, filter_value, expected, + ) + + @pytest.mark.parametrize( + "data_name, expected", [ + ("depth", 12) + ] + ) + def test_count(self, data_name, expected, uploaded_file): + n = self.check_count(data_name) + assert n == expected + + @pytest.mark.parametrize( + "data_name, attribute_to_count, expected", [ + ("depth", "value", 9), + ("depth", "units", 1) + ] + ) + def test_unique_count(self, data_name, attribute_to_count, expected, uploaded_file): + self.check_unique_count( + data_name, attribute_to_count, expected + ) + + def test_unique_types(self, uploaded_file): + """ + Test number of unique measurement types + """ + with db_session_with_credentials() as (engine, session): + records = session.query( + MeasurementType.name + ).distinct().all() + assert len(records) == 3 diff --git a/tests/test_metadata.py b/tests/test_metadata.py index cde2102..c235bf1 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,7 +1,9 @@ """ Test all things from the metadata.py file """ +import datetime from os.path import abspath, dirname, join + import numpy as np import pandas as pd import pytest @@ -23,6 +25,7 @@ } +@pytest.mark.skip class DataHeaderTestBase: depth_is_metadata = True kwargs = {'in_timezone': 'US/Mountain'}