From 976d26c37baeb0d3a13fe5206e8b01f809c824dd Mon Sep 17 00:00:00 2001 From: micah johnson Date: Fri, 28 Nov 2025 08:38:37 -0700 Subject: [PATCH 1/6] Semi working pit uploader for 2020 --- scripts/upload/add_time_series_pits_2020.py | 189 ++++++++------------ 1 file changed, 75 insertions(+), 114 deletions(-) diff --git a/scripts/upload/add_time_series_pits_2020.py b/scripts/upload/add_time_series_pits_2020.py index f8e6ef2..a203b07 100644 --- a/scripts/upload/add_time_series_pits_2020.py +++ b/scripts/upload/add_time_series_pits_2020.py @@ -4,132 +4,93 @@ import glob import re -from os.path import abspath, join from pathlib import Path +from earthaccess_data import get_files +from import_logger import get_logger +from snowexsql.db import db_session_with_credentials -from snowex_db.batch import UploadProfileBatch, UploadSiteDetailsBatch -from snowex_db.upload import PointDataCSV -from snowex_db import db_session +from snowex_db.upload.layers import UploadProfileBatch + +LOG = get_logger() tz_map = {'US/Pacific': ['CA', 'NV', 'WA'], 'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'], } +SNOWEX_PITS_MAP = { + "SNEX20_TS_SP": "10.5067/KZ43HVLZV6G4" +} + +# Filename keyword to the instrument used +INSTRUMENT_MAP = { + "siteDetails": None, + "density": "Density Cutter", + "temperature": "Digital Thermometer", + "LWC": "A2 Sensor", + "stratigraphy": "Manual" + } -def main(): +def get_site_id(filename: str) -> str: + """ + Get the site ID based on the site code in the filename + """ + compiled = re.compile( + r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv' + ) + match = compiled.match(Path(filename).name) + if match: + code = match.group(1) + return code + else: + raise RuntimeError(f"No site ID found for {filename}") + + +def get_timezone(site_id: str) -> str: + """ + Get the timezone based on the site code + """ + abbrev = site_id[0:2] + tz = [k for k, states in tz_map.items() if abbrev in states][0] + return tz + + +def main(file_list: list, doi: str) -> None: """ Add 2020 timeseries pits """ - db_name = 'localhost/snowex' - - # Version 2 DOI - # https://nsidc.org/data/snex20_ts_sp/versions/2 - doi = "https://doi.org/10.5067/KZ43HVLZV6G4" - debug = True - - # Point to the downloaded data from - data_dir = abspath('../download/data/SNOWEX/SNEX20_TS_SP.002/') - error_msg = [] - - # Files to ignore - ignore_files = [ - "SNEX20_TS_SP_Summary_Environment_v02.csv", - "SNEX20_TS_SP_Summary_SWE_v02.csv", - "SNEX20_TS_SP_Summary_SWE_v02_modified.csv" - ] - - # Get all the date folders - unique_dt_olders = Path( - data_dir - ).expanduser().absolute().glob("20*.*.*") - for udf in unique_dt_olders: - # get all the csvs in the folder - dt_folder_files = list(udf.glob("*.csv")) - site_ids = [] - # Get the unique site ids for this date folder - compiled = re.compile( - r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv' - ) - for file_path in dt_folder_files: - file_name = file_path.name - if file_name in ignore_files: - print(f"Skipping {file_name}") - continue - match = compiled.match(file_name) - if match: - code = match.group(1) - site_ids.append(code) - else: - raise RuntimeError(f"No site ID found for {file_name}") - - # Get the unique site ids - site_ids = list(set(site_ids)) - - for site_id in site_ids: - abbrev = site_id[0:2] - tz = [k for k, states in tz_map.items() if abbrev in states][0] - - # Grab all the csvs in the pits folder - filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv')) - - # Grab all the site details files - sites = glob.glob(join( - str(udf), f'*_{site_id}_*siteDetails*.csv' - )) - - # Grab all the perimeter depths and remove them for now. - perimeter_depths = glob.glob(join( - str(udf), f'*_{site_id}_*perimeterDepths*.csv' - )) - - # Use no-gap-filled density for the sole reason that - # Gap filled density for profiles where the scale was broken - # are just an empty file after the headers. We should - # Record that Nan density was collected for the profile - density_files = glob.glob(join( - str(udf), f'*_{site_id}_*_gapFilledDensity_*.csv' - )) - - # Remove the site details from the total file list to get only the - profiles = list( - set(filenames) - set(sites) - set(perimeter_depths) - - set(density_files) # remove non-gap-filled denisty - ) - - # Submit all profiles associated with pit at a time - b = UploadProfileBatch( - filenames=profiles, debug=debug, doi=doi, in_timezone=tz, - db_name=db_name, - allow_split_lines=True # Logic for split header lines - ) - b.push() - error_msg += b.errors - - # Upload the site details - sd = UploadSiteDetailsBatch( - filenames=sites, debug=debug, doi=doi, in_timezone=tz, - db_name=db_name - ) - sd.push() - error_msg += sd.errors - - # Submit all perimeters as point data - with db_session( - db_name, credentials='credentials.json' - ) as (session, engine): - for fp in perimeter_depths: - pcsv = PointDataCSV( - fp, doi=doi, debug=debug, depth_is_metadata=False, - in_timezone=tz, - allow_split_lines=True # Logic for split header lines - ) - pcsv.submit(session) - - for f, m in error_msg: - print(f) - return len(error_msg) + # Constant Metadata for the GPR data + kwargs = { + "campaign_name": "2020 Timeseries", + "doi": doi, + } + + # Files to remove + + + with db_session_with_credentials('./credentials.json') as (_engine, session): + + # Filter by instrument + for keyword, instrument in INSTRUMENT_MAP.items(): + instrumented_files = [ + f for f in file_list if keyword in Path(f).name + ] + kwargs["instrument"] = instrument + + # Filter to sites to manage the timezones + unique_sites = set([get_site_id(f) for f in instrumented_files]) + + for site in unique_sites: + site_files = [ + f for f in instrumented_files if site in f + ] + kwargs["timezone"] = get_timezone(site) + + uploader = UploadProfileBatch(session, site_files, **kwargs) + uploader.push() if __name__ == '__main__': - main() + for data_set_id, doi in SNOWEX_PITS_MAP.items(): + with get_files(data_set_id, doi) as files: + main(files, doi) \ No newline at end of file From 0efeb2dabd2b4ec8da1cbef2c729f66ba9065f2c Mon Sep 17 00:00:00 2001 From: micah johnson Date: Sat, 29 Nov 2025 06:28:27 -0700 Subject: [PATCH 2/6] Working script to uploaded 2020 time series pits. Working on #43 --- scripts/upload/add_time_series_pits_2020.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/scripts/upload/add_time_series_pits_2020.py b/scripts/upload/add_time_series_pits_2020.py index a203b07..5b44991 100644 --- a/scripts/upload/add_time_series_pits_2020.py +++ b/scripts/upload/add_time_series_pits_2020.py @@ -2,7 +2,6 @@ Script to upload the Snowex Time Series pits """ -import glob import re from pathlib import Path from earthaccess_data import get_files @@ -14,10 +13,6 @@ LOG = get_logger() -tz_map = {'US/Pacific': ['CA', 'NV', 'WA'], - 'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'], - } - SNOWEX_PITS_MAP = { "SNEX20_TS_SP": "10.5067/KZ43HVLZV6G4" } @@ -31,6 +26,11 @@ "stratigraphy": "Manual" } +tz_map = {'US/Pacific': ['CA', 'NV', 'WA'], + 'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'], + } + + def get_site_id(filename: str) -> str: """ Get the site ID based on the site code in the filename @@ -65,8 +65,9 @@ def main(file_list: list, doi: str) -> None: "doi": doi, } - # Files to remove - + # Files to ignore + gap_filled_density = [f for f in file_list if "gapDensity" in f] + file_list = list(set(file_list) - set(gap_filled_density)) with db_session_with_credentials('./credentials.json') as (_engine, session): @@ -76,9 +77,10 @@ def main(file_list: list, doi: str) -> None: f for f in file_list if keyword in Path(f).name ] kwargs["instrument"] = instrument + LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}") # Filter to sites to manage the timezones - unique_sites = set([get_site_id(f) for f in instrumented_files]) + unique_sites = list(set([get_site_id(f) for f in instrumented_files])) for site in unique_sites: site_files = [ From 29a3a81b5a4eb3bf51fe129bde1bd9ebcc7e0317 Mon Sep 17 00:00:00 2001 From: micah johnson Date: Sat, 29 Nov 2025 06:40:49 -0700 Subject: [PATCH 3/6] Drying out some upload code --- snowex_db/utilities.py | 32 ++++++++++++++++++++++++++++++-- tests/test_utilities.py | 24 +++++++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/snowex_db/utilities.py b/snowex_db/utilities.py index c511e79..e0a1b78 100644 --- a/snowex_db/utilities.py +++ b/snowex_db/utilities.py @@ -7,9 +7,14 @@ import logging from os import walk from os.path import getctime, join - +import re +from pathlib import Path import coloredlogs +state_tz_map = {'US/Pacific': ['CA', 'NV', 'WA'], + 'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'], + } + def get_logger(name, debug=True, ext_logger=None): """ @@ -158,4 +163,27 @@ def get_file_creation_date(file): """ result = datetime.datetime.fromtimestamp(getctime(file)).date() - return result \ No newline at end of file + return result + + +def get_site_id_from_filename(filename: str, regex: str) -> str: + """ + Get the site ID based on the site code in the filename from the pit files + """ + + compiled = re.compile(regex) + match = compiled.match(Path(filename).name) + if match: + code = match.group(1) + return code + else: + raise RuntimeError(f"No site ID found for {filename}") + + +def get_timezone_from_site_id(site_id: str) -> str: + """ + Get the timezone based on the site id + """ + abbrev = site_id[0:2] + tz = [k for k, states in state_tz_map.items() if abbrev in states][0] + return tz \ No newline at end of file diff --git a/tests/test_utilities.py b/tests/test_utilities.py index c519725..487ec5e 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -5,7 +5,7 @@ from snowex_db.utilities import ( read_n_lines, find_files, find_kw_in_lines, assign_default_kwargs, - get_file_creation_date + get_file_creation_date, get_site_id_from_filename, get_timezone_from_site_id ) @@ -100,3 +100,25 @@ def test_get_file_creation_date(): """ result = get_file_creation_date(__file__) assert type(result) is date + + +def test_get_site_id_from_filename(): + """ + Test getting site ID from filename + """ + filename = "SNEX20_TS_SP_20191029_1210_COFEJ1_data_gapFilledDensity_v02.csv" + regex = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv' + site_id = get_site_id_from_filename(filename, regex) + assert site_id == "COFEJ1" + + +@pytest.mark.parametrize('site_id, expected_tz', [ + ('COGM', 'US/Mountain'), + ('CAAM', 'US/Pacific'), + ]) +def test_get_timezone_from_site_id(site_id, expected_tz): + """ + Test getting timezone from site ID + """ + tz = get_timezone_from_site_id(site_id) + assert tz == expected_tz \ No newline at end of file From f35e3f9a671477a1cd458d931bb78a75a54fc325 Mon Sep 17 00:00:00 2001 From: micah johnson Date: Sat, 29 Nov 2025 06:43:57 -0700 Subject: [PATCH 4/6] updated 2020 to use dried out tools --- scripts/upload/add_time_series_pits_2020.py | 37 ++++----------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/scripts/upload/add_time_series_pits_2020.py b/scripts/upload/add_time_series_pits_2020.py index 5b44991..ad7c0b9 100644 --- a/scripts/upload/add_time_series_pits_2020.py +++ b/scripts/upload/add_time_series_pits_2020.py @@ -7,7 +7,7 @@ from earthaccess_data import get_files from import_logger import get_logger from snowexsql.db import db_session_with_credentials - +from snowex_db.utilities import get_timezone_from_site_id, get_site_id_from_filename from snowex_db.upload.layers import UploadProfileBatch LOG = get_logger() @@ -26,34 +26,6 @@ "stratigraphy": "Manual" } -tz_map = {'US/Pacific': ['CA', 'NV', 'WA'], - 'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'], - } - - -def get_site_id(filename: str) -> str: - """ - Get the site ID based on the site code in the filename - """ - compiled = re.compile( - r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv' - ) - match = compiled.match(Path(filename).name) - if match: - code = match.group(1) - return code - else: - raise RuntimeError(f"No site ID found for {filename}") - - -def get_timezone(site_id: str) -> str: - """ - Get the timezone based on the site code - """ - abbrev = site_id[0:2] - tz = [k for k, states in tz_map.items() if abbrev in states][0] - return tz - def main(file_list: list, doi: str) -> None: """ @@ -65,6 +37,9 @@ def main(file_list: list, doi: str) -> None: "doi": doi, } + # Regex to get site id from filename + snowex_reg = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv' + # Files to ignore gap_filled_density = [f for f in file_list if "gapDensity" in f] file_list = list(set(file_list) - set(gap_filled_density)) @@ -80,13 +55,13 @@ def main(file_list: list, doi: str) -> None: LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}") # Filter to sites to manage the timezones - unique_sites = list(set([get_site_id(f) for f in instrumented_files])) + unique_sites = list(set([get_site_id_from_filename(f, snowex_reg) for f in instrumented_files])) for site in unique_sites: site_files = [ f for f in instrumented_files if site in f ] - kwargs["timezone"] = get_timezone(site) + kwargs["timezone"] = get_timezone_from_site_id(site) uploader = UploadProfileBatch(session, site_files, **kwargs) uploader.push() From b6380ca9594c7a83a556e939521fa37b4b77c2a0 Mon Sep 17 00:00:00 2001 From: micah johnson Date: Sat, 29 Nov 2025 07:05:08 -0700 Subject: [PATCH 5/6] functioning timeseries pits 2021 --- scripts/upload/add_time_series_pits_2020.py | 1 - scripts/upload/add_time_series_pits_2021.py | 150 +++++++------------- 2 files changed, 53 insertions(+), 98 deletions(-) diff --git a/scripts/upload/add_time_series_pits_2020.py b/scripts/upload/add_time_series_pits_2020.py index ad7c0b9..a50b37d 100644 --- a/scripts/upload/add_time_series_pits_2020.py +++ b/scripts/upload/add_time_series_pits_2020.py @@ -2,7 +2,6 @@ Script to upload the Snowex Time Series pits """ -import re from pathlib import Path from earthaccess_data import get_files from import_logger import get_logger diff --git a/scripts/upload/add_time_series_pits_2021.py b/scripts/upload/add_time_series_pits_2021.py index 244eee3..39e647d 100644 --- a/scripts/upload/add_time_series_pits_2021.py +++ b/scripts/upload/add_time_series_pits_2021.py @@ -2,115 +2,71 @@ Script to upload the Snowex Time Series pits """ -import glob -import re -from os.path import abspath, join from pathlib import Path +from earthaccess_data import get_files +from snowexsql.db import db_session_with_credentials +from snowex_db.utilities import get_timezone_from_site_id, get_site_id_from_filename +from snowex_db.upload.layers import UploadProfileBatch +from import_logger import get_logger +LOG = get_logger() +SNOWEX_PITS_MAP = { + "SNEX21_TS_SP": "10.5067/QIANJYJGRWOV" +} -tz_map = {'US/Pacific': ['CA', 'NV', 'WA'], - 'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'], - } +# Filename keyword to the instrument used +INSTRUMENT_MAP = { + "siteDetails": None, + "density": "Density Cutter", + "temperature": "Digital Thermometer", + "LWC": "A2 Sensor", + "stratigraphy": "Manual" + } -def main(): +def main(file_list: list, doi: str) -> None: """ - Snowex 2021 timeseries pits + Add 2021 timeseries pits """ - db_name = 'localhost/snowex' - # https://nsidc.org/data/snex21_ts_sp/versions/1 - doi = "https://doi.org/10.5067/QIANJYJGRWOV" - debug = True + # Constant Metadata for the GPR data + kwargs = { + "campaign_name": "2021 Timeseries", + "doi": doi, + } - # Point to the downloaded data from - data_dir = abspath('../download/data/SNOWEX/SNEX21_TS_SP.001/') - error_msg = [] + # Regex to get site id from filename + snowex_reg = r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv' # Files to ignore - ignore_files = [ - "SNEX21_TS_SP_Summary_Environment_v01.csv", - "SNEX21_TS_SP_Summary_SWE_v01.csv", - "SNEX21_TS_SP_Summary_SWE_v01_modified.csv" - ] - - # Get all the date folders - unique_dt_olders = Path( - data_dir - ).expanduser().absolute().glob("20*.*.*") - for udf in unique_dt_olders: - # get all the csvs in the folder - dt_folder_files = list(udf.glob("*.csv")) - site_ids = [] - # Get the unique site ids for this date folder - compiled = re.compile( - r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v01\.csv' - ) - for file_path in dt_folder_files: - file_name = file_path.name - if file_name in ignore_files: - print(f"Skipping {file_name}") - continue - - match = compiled.match(file_name) - if match: - code = match.group(1) - site_ids.append(code) - else: - raise RuntimeError(f"No site ID found for {file_name}") - - # Get the unique site ids - site_ids = list(set(site_ids)) - - for site_id in site_ids: - abbrev = site_id[0:2] - tz = [k for k, states in tz_map.items() if abbrev in states][0] - - # Grab all the csvs in the pits folder - filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv')) - - # Grab all the site details files - sites = glob.glob(join( - str(udf), f'*_{site_id}_*siteDetails*.csv' - )) - - # Use no-gap-filled density for the sole reason that - # Gap filled density for profiles where the scale was broken - # are just an empty file after the headers. We should - # Record that Nan density was collected for the profile - density_files = glob.glob(join( - str(udf), f'*_{site_id}_*_gapFilledDensity_*.csv' - )) - - # Remove the site details from the total file list to get only the - profiles = list( - set(filenames) - set(sites) - - set(density_files) # remove non-gap-filled denisty - ) - - # Submit all profiles associated with pit at a time - b = UploadProfileBatch( - filenames=profiles, debug=debug, doi=doi, in_timezone=tz, - db_name=db_name, - allow_split_lines=True # Logic for split header lines - ) - b.push() - error_msg += b.errors - - # Upload the site details - sd = UploadSiteDetailsBatch( - filenames=sites, debug=debug, doi=doi, in_timezone=tz, - db_name=db_name, - allow_split_lines=True # Logic for split header lines - ) - sd.push() - error_msg += sd.errors - - for f, m in error_msg: - print(f) - return len(error_msg) + gap_filled_density = [f for f in file_list if "gapDensity" in f] + file_list = list(set(file_list) - set(gap_filled_density)) + + with db_session_with_credentials('./credentials.json') as (_engine, session): + + # Filter by instrument + for keyword, instrument in INSTRUMENT_MAP.items(): + instrumented_files = [ + f for f in file_list if keyword in Path(f).name + ] + kwargs["instrument"] = instrument + LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}") + + # Filter to sites to manage the timezones + unique_sites = list(set([get_site_id_from_filename(f, snowex_reg) for f in instrumented_files])) + + for site in unique_sites: + site_files = [ + f for f in instrumented_files if site in f + ] + kwargs["timezone"] = get_timezone_from_site_id(site) + + uploader = UploadProfileBatch(session, site_files, **kwargs) + uploader.push() if __name__ == '__main__': - main() + for data_set_id, doi in SNOWEX_PITS_MAP.items(): + with get_files(data_set_id, doi) as files: + main(files, doi) \ No newline at end of file From d8cdca46816fbcb9642ee72d3291b0ae36abe118 Mon Sep 17 00:00:00 2001 From: aaarendt Date: Fri, 5 Dec 2025 11:43:06 -0800 Subject: [PATCH 6/6] Fix: accommodate Path or str in file list --- scripts/upload/add_time_series_pits_2020.py | 2 +- scripts/upload/add_time_series_pits_2021.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/upload/add_time_series_pits_2020.py b/scripts/upload/add_time_series_pits_2020.py index a50b37d..1b2f140 100644 --- a/scripts/upload/add_time_series_pits_2020.py +++ b/scripts/upload/add_time_series_pits_2020.py @@ -40,7 +40,7 @@ def main(file_list: list, doi: str) -> None: snowex_reg = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv' # Files to ignore - gap_filled_density = [f for f in file_list if "gapDensity" in f] + gap_filled_density = [f for f in file_list if "gapDensity" in str(f)] file_list = list(set(file_list) - set(gap_filled_density)) with db_session_with_credentials('./credentials.json') as (_engine, session): diff --git a/scripts/upload/add_time_series_pits_2021.py b/scripts/upload/add_time_series_pits_2021.py index 39e647d..6913552 100644 --- a/scripts/upload/add_time_series_pits_2021.py +++ b/scripts/upload/add_time_series_pits_2021.py @@ -40,7 +40,7 @@ def main(file_list: list, doi: str) -> None: snowex_reg = r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv' # Files to ignore - gap_filled_density = [f for f in file_list if "gapDensity" in f] + gap_filled_density = [f for f in file_list if "gapDensity" in str(f)] file_list = list(set(file_list) - set(gap_filled_density)) with db_session_with_credentials('./credentials.json') as (_engine, session):