kso-utils/project_utils.py at main · AlexaS17/kso-utils · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# base imports
import os
import logging
import pandas as pd
from dataclasses import dataclass
from dataclass_csv import DataclassReader, DataclassWriter

# util imports
import kso_utils.spyfish_utils as spyfish_utils


# Logging
# Let DEBUG and above through the root logger, and install the default
# handler on it (basicConfig is a no-op when the root logger already has
# handlers). The two calls are independent of each other.
logging.getLogger().setLevel(logging.DEBUG)
logging.basicConfig()


@dataclass
class Project:
    """Settings describing a single project (one row of the projects csv).

    find_project parses csv rows into this class with DataclassReader and
    add_project builds instances from the values of a dictionary and writes
    them with DataclassWriter, so changes to the fields below affect both
    functions.
    """

    # Name of the project. find_project matches rows on it and get_col_names
    # uses it to pick project-specific column names. The only required field.
    Project_name: str
    # The remaining fields are optional and are not read anywhere in this
    # module.
    # NOTE(review): their meanings are only suggested by their names --
    # confirm against the callers / projects_list.csv before relying on them.
    Zooniverse_number: int = 0
    db_path: str = None
    server: str = None
    bucket: str = None
    key: str = None
    csv_folder: str = None
    movie_folder: str = None
    photo_folder: str = None
    ml_folder: str = None


def find_project(project_name: str = ""):
    """Look up a project by name in the csv list of projects.

    The list is read from ``../kso_utils/db_starter/projects_list.csv``
    (relative to the current working directory). When both that file and the
    SNIC folder exist, the list stored under the SNIC folder is read instead.
    When the local file does not exist it is first downloaded from GitHub and
    saved to the local path.

    :param project_name: name compared (exact match) with the Project_name of
        each row of the list
    :return: the first matching row as a Project, or None if no row matches
    """
    # Specify the path to the list of projects
    project_path = "../kso_utils/db_starter/projects_list.csv"
    snic_path = "/cephyr/NOBACKUP/groups/snic2021-6-9/"

    # Check path to the list of projects is a csv
    if os.path.exists(project_path) and not project_path.endswith(".csv"):
        logging.error("A csv file was not selected. Please try again.")

    elif os.path.exists(project_path) and os.path.exists(snic_path):
        project_path = os.path.join(snic_path, "db_starter/projects_list.csv")

    # If list of projects doesn't exist retrieve it from github
    elif not os.path.exists(project_path):
        github_path = "https://github.com/ocean-data-factory-sweden/kso-data-management/blob/main/db_starter/projects_list.csv?raw=true"
        read_file = pd.read_csv(github_path)
        # to_csv does not create missing parent folders, so make sure the
        # destination folder exists before saving the downloaded list
        os.makedirs(os.path.dirname(project_path), exist_ok=True)
        read_file.to_csv(project_path, index=False)

    # newline="" lets the csv machinery handle line endings itself; the local
    # name avoids shadowing the stdlib ``csv`` module
    with open(project_path, newline="") as csv_file:
        reader = DataclassReader(csv_file, Project)
        for row in reader:
            if row.Project_name == project_name:
                logging.info(f"{project_name} loaded succesfully")
                return row

    # No row matched: keep returning None, but leave a trace for the user
    logging.warning(f"{project_name} was not found in {project_path}")
    return None


def add_project(project_info: dict = {}):
    """Append one project to the csv list of projects.

    The row is appended to ``../kso_utils/db_starter/projects_list.csv``
    (relative to the current working directory), or to the list under the
    SNIC folder when the local file does not exist but the SNIC folder does.
    A header row is written first only when the target file is new or empty.

    :param project_info: dictionary with the project information. When every
        key is the name of a Project field the values are assigned by name;
        otherwise the values are passed to Project positionally, in the
        dictionary's order. The default dictionary is never mutated.
    :raises TypeError: if a Project cannot be built from ``project_info``
        (for example when Project_name is missing)
    """
    # Local import: only ``dataclass`` is imported at file level
    from dataclasses import fields

    project_path = "../kso_utils/db_starter/projects_list.csv"
    snic_path = "/cephyr/NOBACKUP/groups/snic2021-6-9/"

    if not os.path.exists(project_path) and os.path.exists(snic_path):
        project_path = os.path.join(snic_path, "db_starter/projects_list.csv")

    # Build the row before touching the file so that invalid input cannot
    # leave a freshly created, empty csv behind.
    field_names = {field.name for field in fields(Project)}
    if set(project_info) <= field_names:
        # Keys name the fields: assign by name so key order does not matter
        project = Project(**project_info)
    else:
        # Unknown keys: keep the original positional behaviour
        project = Project(*project_info.values())

    # A new or empty file needs the header, otherwise it cannot be read back
    needs_header = (
        not os.path.exists(project_path) or os.path.getsize(project_path) == 0
    )

    # newline="" stops text mode from translating the row terminator that the
    # csv writer emits (which would produce blank rows on Windows)
    with open(project_path, "a", newline="") as f:
        w = DataclassWriter(f, [project], Project)
        w.write(skip_header=not needs_header)


def get_col_names(project: Project, local_csv: str):
    """Return the column-name mapping for one of the project's csv files.

    The table is chosen by substring match on ``local_csv``, testing "sites",
    then "movies", then "species"; the first match wins. The keys of the
    returned dictionary are the same for every project handled here, while the
    values can differ per project, so the original csv columns never need to
    be renamed. For the "Spyfish_Aotearoa" project the sites and movies
    mappings are delegated to spyfish_utils.get_spyfish_col_names.

    :param project: The project object (only its Project_name is read)
    :param local_csv: a string of the name of the local csv of interest
    :return: a dictionary with the names of the columns
    :raises ValueError: if ``local_csv`` matches none of the tables
    """
    name = project.Project_name

    # --- sites table ---
    if "sites" in local_csv:
        if name == "Spyfish_Aotearoa":
            return spyfish_utils.get_spyfish_col_names("sites")

        site_columns = (
            "siteName",
            "decimalLatitude",
            "decimalLongitude",
            "geodeticDatum",
            "countryCode",
        )
        return {column: column for column in site_columns}

    # --- movies table ---
    if "movies" in local_csv:
        if name == "Spyfish_Aotearoa":
            return spyfish_utils.get_spyfish_col_names("movies")

        movie_columns = (
            "filename",
            "created_on",
            "fps",
            "duration",
            "sampling_start",
            "sampling_end",
            "author",
            "site_id",
            "fpath",
        )
        mapping = {column: column for column in movie_columns}

        # Koster csv files use their own names for three of the columns
        if name == "Koster_Seafloor_Obs":
            mapping.update(
                sampling_start="SamplingStart",
                sampling_end="SamplingEnd",
                author="Author",
            )
        return mapping

    # --- species table (same for every project) ---
    if "species" in local_csv:
        species_columns = ("label", "scientificName", "taxonRank", "kingdom")
        return {column: column for column in species_columns}

    raise ValueError("The local csv doesn't have a table match in the schema")