Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 53 additions & 116 deletions scripts/upload/add_time_series_pits_2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,134 +2,71 @@
Script to upload the Snowex Time Series pits
"""

import glob
import re
from os.path import abspath, join
from pathlib import Path
from earthaccess_data import get_files
from import_logger import get_logger
from snowexsql.db import db_session_with_credentials
from snowex_db.utilities import get_timezone_from_site_id, get_site_id_from_filename
from snowex_db.upload.layers import UploadProfileBatch

from snowex_db.batch import UploadProfileBatch, UploadSiteDetailsBatch
from snowex_db.upload import PointDataCSV
from snowex_db import db_session
LOG = get_logger()


tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
}
SNOWEX_PITS_MAP = {
"SNEX20_TS_SP": "10.5067/KZ43HVLZV6G4"
}

# Map of filename keyword to the instrument used to collect that data
INSTRUMENT_MAP = {
"siteDetails": None,
"density": "Density Cutter",
"temperature": "Digital Thermometer",
"LWC": "A2 Sensor",
"stratigraphy": "Manual"
}

def main():

def main(file_list: list, doi: str) -> None:
"""
Add 2020 timeseries pits
"""
db_name = 'localhost/snowex'

# Version 2 DOI
# https://nsidc.org/data/snex20_ts_sp/versions/2
doi = "https://doi.org/10.5067/KZ43HVLZV6G4"
debug = True
# Constant metadata shared by all pit uploads
kwargs = {
"campaign_name": "2020 Timeseries",
"doi": doi,
}

# Point to the downloaded data from
data_dir = abspath('../download/data/SNOWEX/SNEX20_TS_SP.002/')
error_msg = []
# Regex to get site id from filename
snowex_reg = r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv'

# Files to ignore
ignore_files = [
"SNEX20_TS_SP_Summary_Environment_v02.csv",
"SNEX20_TS_SP_Summary_SWE_v02.csv",
"SNEX20_TS_SP_Summary_SWE_v02_modified.csv"
]

# Get all the date folders
unique_dt_olders = Path(
data_dir
).expanduser().absolute().glob("20*.*.*")
for udf in unique_dt_olders:
# get all the csvs in the folder
dt_folder_files = list(udf.glob("*.csv"))
site_ids = []
# Get the unique site ids for this date folder
compiled = re.compile(
r'SNEX20_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v02\.csv'
)
for file_path in dt_folder_files:
file_name = file_path.name
if file_name in ignore_files:
print(f"Skipping {file_name}")
continue
match = compiled.match(file_name)
if match:
code = match.group(1)
site_ids.append(code)
else:
raise RuntimeError(f"No site ID found for {file_name}")

# Get the unique site ids
site_ids = list(set(site_ids))

for site_id in site_ids:
abbrev = site_id[0:2]
tz = [k for k, states in tz_map.items() if abbrev in states][0]

# Grab all the csvs in the pits folder
filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv'))

# Grab all the site details files
sites = glob.glob(join(
str(udf), f'*_{site_id}_*siteDetails*.csv'
))

# Grab all the perimeter depths and remove them for now.
perimeter_depths = glob.glob(join(
str(udf), f'*_{site_id}_*perimeterDepths*.csv'
))

# Use the non-gap-filled density files because the gap-filled density
# files for profiles where the scale was broken contain only the
# headers (no data). We still want to record that NaN density was
# collected for those profiles.
density_files = glob.glob(join(
str(udf), f'*_{site_id}_*_gapFilledDensity_*.csv'
))

# Remove the site details, perimeter depths, and gap-filled density
# files from the total file list to get only the profiles
profiles = list(
set(filenames) - set(sites) - set(perimeter_depths) -
set(density_files) # remove gap-filled density
)

# Submit all profiles associated with pit at a time
b = UploadProfileBatch(
filenames=profiles, debug=debug, doi=doi, in_timezone=tz,
db_name=db_name,
allow_split_lines=True # Logic for split header lines
)
b.push()
error_msg += b.errors

# Upload the site details
sd = UploadSiteDetailsBatch(
filenames=sites, debug=debug, doi=doi, in_timezone=tz,
db_name=db_name
)
sd.push()
error_msg += sd.errors

# Submit all perimeters as point data
with db_session(
db_name, credentials='credentials.json'
) as (session, engine):
for fp in perimeter_depths:
pcsv = PointDataCSV(
fp, doi=doi, debug=debug, depth_is_metadata=False,
in_timezone=tz,
allow_split_lines=True # Logic for split header lines
)
pcsv.submit(session)

for f, m in error_msg:
print(f)
return len(error_msg)
gap_filled_density = [f for f in file_list if "gapDensity" in str(f)]
file_list = list(set(file_list) - set(gap_filled_density))

with db_session_with_credentials('./credentials.json') as (_engine, session):

# Filter by instrument
for keyword, instrument in INSTRUMENT_MAP.items():
instrumented_files = [
f for f in file_list if keyword in Path(f).name
]
kwargs["instrument"] = instrument
LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}")

# Group files by site so the correct timezone is applied per site
unique_sites = list(set([get_site_id_from_filename(f, snowex_reg) for f in instrumented_files]))

for site in unique_sites:
site_files = [
f for f in instrumented_files if site in f
]
kwargs["timezone"] = get_timezone_from_site_id(site)

uploader = UploadProfileBatch(session, site_files, **kwargs)
uploader.push()


if __name__ == '__main__':
main()
for data_set_id, doi in SNOWEX_PITS_MAP.items():
with get_files(data_set_id, doi) as files:
main(files, doi)
150 changes: 53 additions & 97 deletions scripts/upload/add_time_series_pits_2021.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,115 +2,71 @@
Script to upload the Snowex Time Series pits
"""

import glob
import re
from os.path import abspath, join
from pathlib import Path
from earthaccess_data import get_files
from snowexsql.db import db_session_with_credentials
from snowex_db.utilities import get_timezone_from_site_id, get_site_id_from_filename
from snowex_db.upload.layers import UploadProfileBatch
from import_logger import get_logger

LOG = get_logger()


SNOWEX_PITS_MAP = {
"SNEX21_TS_SP": "10.5067/QIANJYJGRWOV"
}

tz_map = {'US/Pacific': ['CA', 'NV', 'WA'],
'US/Mountain': ['CO', 'ID', 'NM', 'UT', 'MT'],
}

# Map of filename keyword to the instrument used to collect that data
INSTRUMENT_MAP = {
"siteDetails": None,
"density": "Density Cutter",
"temperature": "Digital Thermometer",
"LWC": "A2 Sensor",
"stratigraphy": "Manual"
}

def main():
def main(file_list: list, doi: str) -> None:
"""
Snowex 2021 timeseries pits
Add 2021 timeseries pits
"""
db_name = 'localhost/snowex'
# https://nsidc.org/data/snex21_ts_sp/versions/1
doi = "https://doi.org/10.5067/QIANJYJGRWOV"
debug = True
# Constant metadata shared by all pit uploads
kwargs = {
"campaign_name": "2021 Timeseries",
"doi": doi,
}

# Point to the downloaded data from
data_dir = abspath('../download/data/SNOWEX/SNEX21_TS_SP.001/')
error_msg = []
# Regex to get site id from filename
snowex_reg = r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_.*\.csv'

# Files to ignore
ignore_files = [
"SNEX21_TS_SP_Summary_Environment_v01.csv",
"SNEX21_TS_SP_Summary_SWE_v01.csv",
"SNEX21_TS_SP_Summary_SWE_v01_modified.csv"
]

# Get all the date folders
unique_dt_olders = Path(
data_dir
).expanduser().absolute().glob("20*.*.*")
for udf in unique_dt_olders:
# get all the csvs in the folder
dt_folder_files = list(udf.glob("*.csv"))
site_ids = []
# Get the unique site ids for this date folder
compiled = re.compile(
r'SNEX21_TS_SP_\d{8}_\d{4}_([a-zA-Z0-9]*)_data_.*_v01\.csv'
)
for file_path in dt_folder_files:
file_name = file_path.name
if file_name in ignore_files:
print(f"Skipping {file_name}")
continue

match = compiled.match(file_name)
if match:
code = match.group(1)
site_ids.append(code)
else:
raise RuntimeError(f"No site ID found for {file_name}")

# Get the unique site ids
site_ids = list(set(site_ids))

for site_id in site_ids:
abbrev = site_id[0:2]
tz = [k for k, states in tz_map.items() if abbrev in states][0]

# Grab all the csvs in the pits folder
filenames = glob.glob(join(str(udf), f'*_{site_id}_*.csv'))

# Grab all the site details files
sites = glob.glob(join(
str(udf), f'*_{site_id}_*siteDetails*.csv'
))

# Use the non-gap-filled density files because the gap-filled density
# files for profiles where the scale was broken contain only the
# headers (no data). We still want to record that NaN density was
# collected for those profiles.
density_files = glob.glob(join(
str(udf), f'*_{site_id}_*_gapFilledDensity_*.csv'
))

# Remove the site details and gap-filled density files from the
# total file list to get only the profiles
profiles = list(
set(filenames) - set(sites) -
set(density_files) # remove gap-filled density
)

# Submit all profiles associated with pit at a time
b = UploadProfileBatch(
filenames=profiles, debug=debug, doi=doi, in_timezone=tz,
db_name=db_name,
allow_split_lines=True # Logic for split header lines
)
b.push()
error_msg += b.errors

# Upload the site details
sd = UploadSiteDetailsBatch(
filenames=sites, debug=debug, doi=doi, in_timezone=tz,
db_name=db_name,
allow_split_lines=True # Logic for split header lines
)
sd.push()
error_msg += sd.errors

for f, m in error_msg:
print(f)
return len(error_msg)
gap_filled_density = [f for f in file_list if "gapDensity" in str(f)]
file_list = list(set(file_list) - set(gap_filled_density))

with db_session_with_credentials('./credentials.json') as (_engine, session):

# Filter by instrument
for keyword, instrument in INSTRUMENT_MAP.items():
instrumented_files = [
f for f in file_list if keyword in Path(f).name
]
kwargs["instrument"] = instrument
LOG.info(f"\n\nUploading {len(instrumented_files)} files with keyword: {keyword}")

# Group files by site so the correct timezone is applied per site
unique_sites = list(set([get_site_id_from_filename(f, snowex_reg) for f in instrumented_files]))

for site in unique_sites:
site_files = [
f for f in instrumented_files if site in f
]
kwargs["timezone"] = get_timezone_from_site_id(site)

uploader = UploadProfileBatch(session, site_files, **kwargs)
uploader.push()


if __name__ == '__main__':
main()
for data_set_id, doi in SNOWEX_PITS_MAP.items():
with get_files(data_set_id, doi) as files:
main(files, doi)
Loading