forked from ocean-data-factory-sweden/kso-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathproject_utils.py
More file actions
149 lines (122 loc) · 5.01 KB
/
Copy pathproject_utils.py
File metadata and controls
149 lines (122 loc) · 5.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# base imports
import os
import logging
import pandas as pd
from dataclasses import dataclass
from dataclass_csv import DataclassReader, DataclassWriter
# util imports
import kso_utils.spyfish_utils as spyfish_utils
# Logging
# Runs at import time and configures the ROOT logger (getLogger() with no
# name): basicConfig() installs the default handler if the root logger has
# none, and the level is then set so DEBUG messages and above are emitted.
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
@dataclass
class Project:
    """Record describing one project, i.e. one row of the projects csv.

    Instances are produced by DataclassReader in find_project and are built
    from positional dictionary values in add_project, so the order of the
    fields below is significant and must match the order callers rely on.
    Only Project_name is required; every other field has a default.
    """

    # Name of the project; find_project matches rows on this value
    Project_name: str
    # NOTE(review): presumably the Zooniverse project id (0 when unset) - confirm
    Zooniverse_number: int = 0
    # NOTE(review): presumably the path to the project's database - confirm
    db_path: str = None
    # NOTE(review): server / bucket / key look like remote storage settings;
    # their exact meaning is not visible in this file - confirm with callers
    server: str = None
    bucket: str = None
    key: str = None
    # NOTE(review): presumably folders holding the project's csv files, movies,
    # photos and machine-learning data respectively - confirm
    csv_folder: str = None
    movie_folder: str = None
    photo_folder: str = None
    ml_folder: str = None
def find_project(project_name: str = ""):
    """Find project information using the projects csv and a project name.

    The list of projects is read from the local csv, or from the copy under
    the SNIC storage folder when both the local csv and that folder exist.
    If the local csv is missing, it is downloaded from GitHub and saved to
    the local path before being read.

    :param project_name: name compared against the Project_name of each row
    :return: the first matching Project, or None when no row matches
    """
    # Specify the path to the list of projects
    project_path = "../kso_utils/db_starter/projects_list.csv"
    snic_path = "/cephyr/NOBACKUP/groups/snic2021-6-9/"

    local_list_exists = os.path.exists(project_path)

    # Check path to the list of projects is a csv
    if local_list_exists and not project_path.endswith(".csv"):
        logging.error("A csv file was not selected. Please try again.")
    # Use the list stored under the SNIC folder when that folder is present.
    # NOTE(review): add_project only switches to the SNIC list when the local
    # csv is MISSING; confirm which of the two conditions is intended.
    elif local_list_exists and os.path.exists(snic_path):
        project_path = os.path.join(snic_path, "db_starter/projects_list.csv")
    # If list of projects doesn't exist retrieve it from github
    elif not local_list_exists:
        github_path = "https://github.com/ocean-data-factory-sweden/kso-data-management/blob/main/db_starter/projects_list.csv?raw=true"
        read_file = pd.read_csv(github_path)
        # to_csv does not create missing folders, so make sure the target
        # folder exists before saving the downloaded list
        os.makedirs(os.path.dirname(project_path), exist_ok=True)
        read_file.to_csv(project_path, index=None)

    # newline="" lets the csv machinery handle line endings itself
    with open(project_path, newline="") as project_file:
        reader = DataclassReader(project_file, Project)
        for row in reader:
            if row.Project_name == project_name:
                logging.info(f"{project_name} loaded succesfully")
                return row

    # No row matched: keep returning None, but leave a trace for the user
    logging.warning(f"{project_name} was not found in {project_path}")
    return None
def add_project(project_info: dict = None):
    """Add new project information to the projects csv.

    A single row is appended to the list of projects (no header is written).
    The local csv is used unless it is missing and the SNIC storage folder
    exists, in which case the list under that folder is used instead.

    :param project_info: dictionary describing the project. Its VALUES are
        passed positionally to Project in insertion order, so they must
        follow the order of the Project fields (Project_name first).
    :raises TypeError: if the values cannot be used to build a Project
        (e.g. an empty dictionary, or more values than Project has fields)
    """
    # Avoid a shared mutable default; an omitted argument behaves like {}
    if project_info is None:
        project_info = {}

    project_path = "../kso_utils/db_starter/projects_list.csv"
    snic_path = "/cephyr/NOBACKUP/groups/snic2021-6-9/"

    if not os.path.exists(project_path) and os.path.exists(snic_path):
        project_path = os.path.join(snic_path, "db_starter/projects_list.csv")

    # Build the record before opening the file so that invalid input does not
    # create or touch the csv
    project = [Project(*project_info.values())]

    # newline="" prevents the csv writer from emitting blank rows on platforms
    # that translate line endings
    with open(project_path, "a", newline="") as f:
        w = DataclassWriter(f, project, Project)
        w.write(skip_header=True)
def get_col_names(project: Project, local_csv: str):
    """Return the project-specific column names for a csv of interest.

    The returned dictionary maps the schema's column names to the names used
    in the project's own csv, so the original csv never has to be renamed.
    The table is picked by checking, in this order, whether local_csv
    contains "sites", "movies" or "species".

    :param project: The project object (only its Project_name is read)
    :param local_csv: a string of the name of the local csv of interest
    :return: a dictionary with the names of the columns
    :raises ValueError: if local_csv matches none of the known tables
    """
    name = project.Project_name

    if "sites" in local_csv:
        # Spyfish keeps its own naming, defined in spyfish_utils
        if name == "Spyfish_Aotearoa":
            return spyfish_utils.get_spyfish_col_names("sites")
        # Every other project already uses the schema names
        return {
            column: column
            for column in (
                "siteName",
                "decimalLatitude",
                "decimalLongitude",
                "geodeticDatum",
                "countryCode",
            )
        }

    if "movies" in local_csv:
        # Spyfish keeps its own naming, defined in spyfish_utils
        if name == "Spyfish_Aotearoa":
            return spyfish_utils.get_spyfish_col_names("movies")

        # Default: the csv columns are named exactly like the schema
        movie_columns = {
            column: column
            for column in (
                "filename",
                "created_on",
                "fps",
                "duration",
                "sampling_start",
                "sampling_end",
                "author",
                "site_id",
                "fpath",
            )
        }
        # Koster names three of the columns differently
        if name == "Koster_Seafloor_Obs":
            movie_columns.update(
                sampling_start="SamplingStart",
                sampling_end="SamplingEnd",
                author="Author",
            )
        return movie_columns

    if "species" in local_csv:
        # Species columns are the same for every project
        return {
            column: column
            for column in ("label", "scientificName", "taxonRank", "kingdom")
        }

    raise ValueError("The local csv doesn't have a table match in the schema")