diff --git a/scripts/upload/add_time_series_pits_2020.py b/scripts/upload/add_time_series_pits_2020.py
index f8e6ef2..1b2f140 100644
--- a/scripts/upload/add_time_series_pits_2020.py
+++ b/scripts/upload/add_time_series_pits_2020.py
@@ -2,134 +2,71 @@
 Script to upload the Snowex Time Series pits
 """
-import glob
-import re
-from os.path import abspath, join
 from pathlib import Path
+from earthaccess_data import get_files
+from import_logger import get_logger
+from snowexsql.db import db_session_with_credentials
+from snowex_db.utilities import get_timezone_from_site_id, get_site_id_from_filename
+from snowex_db.upload.layers import UploadProfileBatch
-from snowex_db.batch import UploadProfileBatch, UploadSiteDetailsBatch
-from snowex_db.upload import PointDataCSV
-from snowex_db import db_session
+LOG = get_logger()
 
-tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
-          'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
-          }
+SNOWEX_PITS_MAP = {
+    "SNEX20_TS_SP": "10.5067/KZ43HVLZV6G4"
+}
+
+# Filename keyword to the instrument used
+INSTRUMENT_MAP = {
+    "siteDetails": None,
+    "density": "Density Cutter",
+    "temperature": "Digital Thermometer",
+    "LWC": "A2 Sensor",
+    "stratigraphy": "Manual"
+    }
 
 
-def main():
+
+def main(file_list: list, doi: str) -> None:
     """
     Add 2020 timeseries pits
     """
-    db_name = 'localhost/snowex'
-
-    # Version 2 DOI
-    # https://nsidc.org/data/snex20_ts_sp/versions/2
-    doi = "https://doi.org/10.5067/KZ43HVLZV6G4"
-    debug = True
+    # Constant metadata for the pit data
+    kwargs = {
+        "campaign_name": "2020 Timeseries",
+        "doi": doi,
+    }
 
-    # Point to the downloaded data from
-    data_dir = abspath('../download/data/SNOWEX/SNEX20_TS_SP.002/')
-    error_msg = []
+    # Regex to get the site id from the filename
+    snowex_reg = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv'
 
     # Files to ignore
-    ignore_files = [
-        "SNEX20_TS_SP_Summary_Environment_v02.csv",
-        "SNEX20_TS_SP_Summary_SWE_v02.csv",
-        "SNEX20_TS_SP_Summary_SWE_v02_modified.csv"
-    ]
-
-    # Get all the date folders
-    unique_dt_olders = Path(
-        data_dir
-    ).expanduser().absolute().glob("20*.*.*")
-    for udf in unique_dt_olders:
-        # get all the csvs in the folder
-        dt_folder_files = list(udf.glob("*.csv"))
-        site_ids = []
-        # Get the unique site ids for this date folder
-        compiled = re.compile(
-            r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv'
-        )
-        for file_path in dt_folder_files:
-            file_name = file_path.name
-            if file_name in ignore_files:
-                print(f"Skipping {file_name}")
-                continue
-            match = compiled.match(file_name)
-            if match:
-                code = match.group(1)
-                site_ids.append(code)
-            else:
-                raise RuntimeError(f"No site ID found for {file_name}")
-
-        # Get the unique site ids
-        site_ids = list(set(site_ids))
-
-        for site_id in site_ids:
-            abbrev = site_id[0:2]
-            tz = [k for k, states in tz_map.items() if abbrev in states][0]
-
-            # Grab all the csvs in the pits folder
-            filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv'))
-
-            # Grab all the site details files
-            sites = glob.glob(join(
-                str(udf), f'*_{site_id}_*siteDetails*.csv'
-            ))
-
-            # Grab all the perimeter depths and remove them for now.
-            perimeter_depths = glob.glob(join(
-                str(udf), f'*_{site_id}_*perimeterDepths*.csv'
-            ))
-
-            # Use no-gap-filled density for the sole reason that
-            # Gap filled density for profiles where the scale was broken
-            # are just an empty file after the headers. We should
-            # Record that Nan density was collected for the profile
-            density_files = glob.glob(join(
-                str(udf), f'*_{site_id}_*_gapFilledDensity_*.csv'
-            ))
-
-            # Remove the site details from the total file list to get only the
-            profiles = list(
-                set(filenames) - set(sites) - set(perimeter_depths) -
-                set(density_files)  # remove non-gap-filled denisty
-            )
-
-            # Submit all profiles associated with pit at a time
-            b = UploadProfileBatch(
-                filenames=profiles, debug=debug, doi=doi, in_timezone=tz,
-                db_name=db_name,
-                allow_split_lines=True  # Logic for split header lines
-            )
-            b.push()
-            error_msg += b.errors
-
-            # Upload the site details
-            sd = UploadSiteDetailsBatch(
-                filenames=sites, debug=debug, doi=doi, in_timezone=tz,
-                db_name=db_name
-            )
-            sd.push()
-            error_msg += sd.errors
-
-            # Submit all perimeters as point data
-            with db_session(
-                db_name, credentials='credentials.json'
-            ) as (session, engine):
-                for fp in perimeter_depths:
-                    pcsv = PointDataCSV(
-                        fp, doi=doi, debug=debug, depth_is_metadata=False,
-                        in_timezone=tz,
-                        allow_split_lines=True  # Logic for split header lines
-                    )
-                    pcsv.submit(session)
-
-    for f, m in error_msg:
-        print(f)
-    return len(error_msg)
+    # Drop gap-filled density files so profiles where the scale was broken
+    # still record NaN density from the non-gap-filled files
+    gap_filled_density = [f for f in file_list if "gapFilledDensity" in str(f)]
+    file_list = list(set(file_list) - set(gap_filled_density))
+
+    with db_session_with_credentials('./credentials.json') as (_engine, session):
+
+        # Filter by instrument
+        for keyword, instrument in INSTRUMENT_MAP.items():
+            instrumented_files = [
+                f for f in file_list if keyword in Path(f).name
+            ]
+            kwargs["instrument"] = instrument
+            LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}")
+
+            # Group the files by site so each gets the right timezone
+            unique_sites = list(set([get_site_id_from_filename(f, snowex_reg) for f in instrumented_files]))
+
+            for site in unique_sites:
+                site_files = [
+                    f for f in instrumented_files if site in f
+                ]
+                kwargs["timezone"] = get_timezone_from_site_id(site)
+
+                uploader = UploadProfileBatch(session, site_files, **kwargs)
+                uploader.push()
 
 
 if __name__ == '__main__':
-    main()
+    for data_set_id, doi in SNOWEX_PITS_MAP.items():
+        with get_files(data_set_id, doi) as files:
+            main(files, doi)
\ No newline at end of file
diff --git a/scripts/upload/add_time_series_pits_2021.py b/scripts/upload/add_time_series_pits_2021.py
index 244eee3..6913552 100644
--- a/scripts/upload/add_time_series_pits_2021.py
+++ b/scripts/upload/add_time_series_pits_2021.py
@@ -2,115 +2,71 @@
 Script to upload the Snowex Time Series pits
 """
-import glob
-import re
-from os.path import abspath, join
 from pathlib import Path
+from earthaccess_data import get_files
+from snowexsql.db import db_session_with_credentials
+from snowex_db.utilities import get_timezone_from_site_id, get_site_id_from_filename
+from snowex_db.upload.layers import UploadProfileBatch
+from import_logger import get_logger
+LOG = get_logger()
+
+SNOWEX_PITS_MAP = {
+    "SNEX21_TS_SP": "10.5067/QIANJYJGRWOV"
+}
 
-tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
-          'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
-          }
+# Filename keyword to the instrument used
+INSTRUMENT_MAP = {
+    "siteDetails": None,
+    "density": "Density Cutter",
+    "temperature": "Digital Thermometer",
+    "LWC": "A2 Sensor",
+    "stratigraphy": "Manual"
+    }
 
 
-def main():
+def main(file_list: list, doi: str) -> None:
     """
-    Snowex 2021 timeseries pits
+    Add 2021 timeseries pits
     """
-    db_name = 'localhost/snowex'
-    # https://nsidc.org/data/snex21_ts_sp/versions/1
-    doi = "https://doi.org/10.5067/QIANJYJGRWOV"
-    debug = True
+    # Constant metadata for the pit data
+    kwargs = {
+        "campaign_name": "2021 Timeseries",
+        "doi": doi,
+    }
 
-    # Point to the downloaded data from
-    data_dir = abspath('../download/data/SNOWEX/SNEX21_TS_SP.001/')
-    error_msg = []
+    # Regex to get the site id from the filename
+    snowex_reg = r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv'
 
     # Files to ignore
-    ignore_files = [
-        "SNEX21_TS_SP_Summary_Environment_v01.csv",
-        "SNEX21_TS_SP_Summary_SWE_v01.csv",
-        "SNEX21_TS_SP_Summary_SWE_v01_modified.csv"
-    ]
-
-    # Get all the date folders
-    unique_dt_olders = Path(
-        data_dir
-    ).expanduser().absolute().glob("20*.*.*")
-    for udf in unique_dt_olders:
-        # get all the csvs in the folder
-        dt_folder_files = list(udf.glob("*.csv"))
-        site_ids = []
-        # Get the unique site ids for this date folder
-        compiled = re.compile(
-            r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v01\.csv'
-        )
-        for file_path in dt_folder_files:
-            file_name = file_path.name
-            if file_name in ignore_files:
-                print(f"Skipping {file_name}")
-                continue
-
-            match = compiled.match(file_name)
-            if match:
-                code = match.group(1)
-                site_ids.append(code)
-            else:
-                raise RuntimeError(f"No site ID found for {file_name}")
-
-        # Get the unique site ids
-        site_ids = list(set(site_ids))
-
-        for site_id in site_ids:
-            abbrev = site_id[0:2]
-            tz = [k for k, states in tz_map.items() if abbrev in states][0]
-
-            # Grab all the csvs in the pits folder
-            filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv'))
-
-            # Grab all the site details files
-            sites = glob.glob(join(
-                str(udf), f'*_{site_id}_*siteDetails*.csv'
-            ))
-
-            # Use no-gap-filled density for the sole reason that
-            # Gap filled density for profiles where the scale was broken
-            # are just an empty file after the headers. We should
-            # Record that Nan density was collected for the profile
-            density_files = glob.glob(join(
-                str(udf), f'*_{site_id}_*_gapFilledDensity_*.csv'
-            ))
-
-            # Remove the site details from the total file list to get only the
-            profiles = list(
-                set(filenames) - set(sites) -
-                set(density_files)  # remove non-gap-filled denisty
-            )
-
-            # Submit all profiles associated with pit at a time
-            b = UploadProfileBatch(
-                filenames=profiles, debug=debug, doi=doi, in_timezone=tz,
-                db_name=db_name,
-                allow_split_lines=True  # Logic for split header lines
-            )
-            b.push()
-            error_msg += b.errors
-
-            # Upload the site details
-            sd = UploadSiteDetailsBatch(
-                filenames=sites, debug=debug, doi=doi, in_timezone=tz,
-                db_name=db_name,
-                allow_split_lines=True  # Logic for split header lines
-            )
-            sd.push()
-            error_msg += sd.errors
-
-    for f, m in error_msg:
-        print(f)
-    return len(error_msg)
+    # Drop gap-filled density files so profiles where the scale was broken
+    # still record NaN density from the non-gap-filled files
+    gap_filled_density = [f for f in file_list if "gapFilledDensity" in str(f)]
+    file_list = list(set(file_list) - set(gap_filled_density))
+
+    with db_session_with_credentials('./credentials.json') as (_engine, session):
+
+        # Filter by instrument
+        for keyword, instrument in INSTRUMENT_MAP.items():
+            instrumented_files = [
+                f for f in file_list if keyword in Path(f).name
+            ]
+            kwargs["instrument"] = instrument
+            LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}")
+
+            # Group the files by site so each gets the right timezone
+            unique_sites = list(set([get_site_id_from_filename(f, snowex_reg) for f in instrumented_files]))
+
+            for site in unique_sites:
+                site_files = [
+                    f for f in instrumented_files if site in f
+                ]
+                kwargs["timezone"] = get_timezone_from_site_id(site)
+
+                uploader = UploadProfileBatch(session, site_files, **kwargs)
+                uploader.push()
 
 
 if __name__ == '__main__':
-    main()
+    for data_set_id, doi in SNOWEX_PITS_MAP.items():
+        with get_files(data_set_id, doi) as files:
+            main(files, doi)
\ No newline at end of file
diff --git a/snowex_db/utilities.py b/snowex_db/utilities.py
index c511e79..e0a1b78 100644
--- a/snowex_db/utilities.py
+++ b/snowex_db/utilities.py
@@ -7,9 +7,14 @@ import logging
 from os import walk
 from os.path import getctime, join
-
+import re
+from pathlib import Path
 import coloredlogs
 
+state_tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
+                'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
+                }
+
 
 def get_logger(name, debug=True, ext_logger=None):
     """
@@ -158,4 +163,27 @@ def get_file_creation_date(file):
     """
     result = datetime.datetime.fromtimestamp(getctime(file)).date()
-    return result
\ No newline at end of file
+    return result
+
+
+def get_site_id_from_filename(filename: str, regex: str) -> str:
+    """
+    Extract the site ID from a pit filename using the supplied regex
+    """
+
+    compiled = re.compile(regex)
+    match = compiled.match(Path(filename).name)
+    if match:
+        code = match.group(1)
+        return code
+    else:
+        raise RuntimeError(f"No site ID found for {filename}")
+
+
+def get_timezone_from_site_id(site_id: str) -> str:
+    """
+    Get the timezone based on the state abbreviation at the start of the site ID
+    """
+    # The first two characters of the site ID are the state abbreviation
+    abbrev = site_id[0:2]
+    tz = [k for k, states in state_tz_map.items() if abbrev in states][0]
+    return tz
\ No newline at end of file
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index c519725..487ec5e 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -5,7 +5,7 @@ from snowex_db.utilities import (
     read_n_lines, find_files, find_kw_in_lines,
     assign_default_kwargs,
-    get_file_creation_date
+    get_file_creation_date, get_site_id_from_filename, get_timezone_from_site_id
 )
@@ -100,3 +100,25 @@ def test_get_file_creation_date():
     """
     result = get_file_creation_date(__file__)
     assert type(result) is date
+
+
+def test_get_site_id_from_filename():
+    """
+    Test getting site ID from filename
+    """
+    filename = "SNEX20_TS_SP_20191029_1210_COFEJ1_data_gapFilledDensity_v02.csv"
+    regex = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv'
+    site_id = get_site_id_from_filename(filename, regex)
+    assert site_id == "COFEJ1"
+
+
+@pytest.mark.parametrize('site_id, expected_tz', [
+    ('COGM', 'US/Mountain'),
+    ('CAAM', 'US/Pacific'),
+    ])
+def test_get_timezone_from_site_id(site_id, expected_tz):
+    """
+    Test getting timezone from site ID
+    """
+    tz = get_timezone_from_site_id(site_id)
+    assert tz == expected_tz
\ No newline at end of file
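
For reference, a minimal sketch of how the two new utilities compose, assuming only the functions added to snowex_db/utilities.py above; the filename and regex mirror the new test case in tests/test_utilities.py:

    from snowex_db.utilities import (
        get_site_id_from_filename, get_timezone_from_site_id
    )

    filename = "SNEX20_TS_SP_20191029_1210_COFEJ1_data_gapFilledDensity_v02.csv"
    regex = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv'

    # group(1) of the regex captures the site code embedded in the filename
    site_id = get_site_id_from_filename(filename, regex)
    print(site_id)  # COFEJ1

    # The leading two characters ("CO") are the state abbreviation,
    # which is looked up in state_tz_map
    print(get_timezone_from_site_id(site_id))  # US/Mountain

The upload scripts pair these the same way: files are grouped by the extracted site ID, and the resulting timezone is passed to UploadProfileBatch through the shared kwargs.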