Skip to content

Gsoc 25 shivam code #114

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions GOSC-25_SHIVAM_CODES/NetCDF_DataReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
NetCDF_DataReader.py
Purpose: Load and manipulate netCDF data (e.g., CBOFS) using xarray for ocean model analysis.
Explore data attributes, variables, and perform basic data manipulation.

"""
#%%%%

# Import necessary libraries
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt

#%%%

class NetCDFDataReader:
    """Load a netCDF dataset with xarray and extract labelled subsets.

    Parameters
    ----------
    file_path : str
        Path or OPeNDAP URL of the netCDF file to read.
    """

    def __init__(self, file_path):
        # The dataset is loaded lazily by load_data(), not in the constructor.
        self.file_path = file_path
        self.dataset = None

    def load_data(self):
        """Open the netCDF file at ``self.file_path``.

        Returns
        -------
        xarray.Dataset or None
            The opened dataset, or None on failure (the error is printed
            rather than raised, matching this module's best-effort style).
        """
        try:
            self.dataset = xr.open_dataset(self.file_path, decode_times=True)
            print("Dataset loaded successfully.")
            return self.dataset
        except Exception as e:
            print(f"Error loading dataset: {e}")
            return None

    def get_subset(self, **kwargs):
        """Slice the loaded dataset along one or more dimensions by label.

        Keyword Args:
            features: list of dimension names to slice on.
            columns: slice bounds — either a flat ``[start, stop]`` pair
                (single feature, the original calling convention) or a
                list of ``(start, stop)`` pairs, one per feature.

        Returns
        -------
        xarray.Dataset or None
            The sliced dataset, or None if no dataset is loaded or the
            selection fails.
        """
        if self.dataset is None:
            print("Error: Dataset not loaded.")
            return None
        try:
            features = list(kwargs.get('features'))
            bounds = list(kwargs.get('columns'))
            # Backward compatibility: a flat [start, stop] list means a
            # single range for the first (only) feature.
            if bounds and not isinstance(bounds[0], (list, tuple)):
                bounds = [tuple(bounds[:2])]
            # BUG FIX: the original used only features[0], silently
            # ignoring every other requested dimension.  Build one
            # indexer dict so all requested dimensions are sliced.
            indexers = {dim: slice(lo, hi) for dim, (lo, hi) in zip(features, bounds)}
            subset = self.dataset.sel(**indexers)
            print("Subset extracted successfully.")
            return subset
        except Exception as e:
            print(f"Error extracting subset: {e}")
            return None


# %%

# Demonstration: read a remote CTD profile hosted on an OPeNDAP server.
# NOTE(review): requires network access; data will be None if the fetch fails.
file_loc = 'https://opendap1.nodc.no/opendap/physics/point/cruise/nansen_legacy-single_profile/NMDC_Nansen-Legacy_PR_CT_58US_2021708/CTD_station_P1_NLEG01-1_-_Nansen_Legacy_Cruise_-_2021_Joint_Cruise_2-1.nc'
# create an instance of the NetCDFDataReader class
ncd = NetCDFDataReader(file_loc)
data =ncd.load_data() # load the data (prints an error and returns None on failure)
print(data)
# %%
# exploring the dimensions, variables and attributes of the data

""" Dimensions"""
print(data.dims) # print the dimensions of the dataset
# observed output: ({'PRES': 320}), i.e. one profile sampled at 320 pressure levels

"""Exploring the Temp vars"""
print(data['TEMP']) # print the TEMP variable (metadata plus values)
print(data['TEMP'].to_dataframe()) # same TEMP data rendered as a pandas DataFrame

"""Attributes"""
print(data.attrs) # print the global attributes of the dataset
# output is a dictionary of dataset-level metadata attributes
# %%
# %%
129 changes: 129 additions & 0 deletions GOSC-25_SHIVAM_CODES/Plotting_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""
Plotting_data.py
Purpose: Plot the combined data created in TimeSeries_Data_Exploration.py
Explore multiple plotting functions for gridded data

"""

#%%# Import necessary libraries
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy
import cartopy
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
import cmocean


# %%


# Loading the combined data produced by TimeSeries_Data_Exploration.py
data = xr.load_dataset(r'Example Data/combined_data.nc') # load the combined data


# %%

combined_data = data.copy() # work on a copy so the loaded dataset stays untouched
# Set up the map projection (PlateCarree: a simple lat/lon projection)
projection = ccrs.PlateCarree()

# Create a figure with one map per time step
# NOTE(review): nrows=3 hard-codes exactly 3 time steps in the file -- confirm
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15),
                         subplot_kw={'projection': projection})

# Plot temperature at one vertical level for each time step
for t in range(len(combined_data.ocean_time)):
    ax = axes[t]

    # Select temperature at this time step and vertical level
    # NOTE(review): s_rho=10 is a mid-column level, not the surface the
    # variable name and titles suggest -- confirm intent
    surface_temp = combined_data.temp.isel(ocean_time=t,s_rho=10)

    print(surface_temp.shape)
    print(surface_temp.to_dataframe())
    # Draw the base map
    ax.coastlines(resolution='110m')
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.OCEAN)
    ax.gridlines(draw_labels=True)

    # Plot the temperature field on the curvilinear rho grid
    p = ax.pcolormesh(combined_data.lon_rho, combined_data.lat_rho, surface_temp,
                      transform=ccrs.PlateCarree(), # data coordinates are lat/lon
                      cmap=cmocean.cm.thermal, # cmocean's thermal colormap
                      vmin=surface_temp.min(), vmax=surface_temp.max())

    # Add a colorbar
    plt.colorbar(p, ax=ax, label='Temperature (°C)')

    # Set title
    ax.set_title(f'Surface Temperature at {combined_data.ocean_time[t].values}')

plt.tight_layout()
plt.show()
# %%

import numpy as np
from scipy.ndimage import gaussian_filter1d

# Spatial mean over the whole grid at vertical level s_rho=19
# NOTE(review): assumes s_rho=19 is the surface layer -- confirm
surface_temp = combined_data.temp.isel(s_rho=19).mean(dim=['eta_rho', 'xi_rho'])

# Pull the values out of xarray into plain numpy arrays for filtering
temp_values = surface_temp.values
time_values = combined_data.ocean_time.values

# Smooth the temperature series with a 1-D Gaussian filter
smoothed_temp = gaussian_filter1d(temp_values, sigma=1) # larger sigma -> stronger smoothing

# Plot the original and smoothed time series together for comparison
plt.figure(figsize=(8, 4))
plt.plot(time_values, temp_values, marker='o', label='Original', color='blue')
plt.plot(time_values, smoothed_temp, label='Smoothed (Gaussian)', color='red')
plt.xlabel('Time')
plt.ylabel('Temperature (°C)')
plt.title('Spatially Averaged Surface Temperature (Smoothed)')
plt.grid(True)
plt.legend()
plt.show()
# %%

# Build a tidy DataFrame for plotting: one row per rho grid point, with its
# lon/lat and the temperature at the first time step and bottom s_rho level.
df = combined_data.temp.isel(ocean_time=0, s_rho=0).to_dataframe().reset_index()
print(df['lat_rho'].min(),df['lat_rho'].max()) # latitude extent of the grid
print(df['lon_rho'].min(),df['lon_rho'].max()) # longitude extent of the grid
# %%

# plot to highlight regions of data in the US MAP

# initializing the figure
plt.figure(figsize=(10, 6))
m1= plt.axes(projection=ccrs.PlateCarree())

# adding features to the map
m1.add_feature(cfeature.LAND)
m1.add_feature(cfeature.OCEAN)
m1.gridlines(draw_labels=True)
m1.add_feature(cfeature.COASTLINE)
m1.add_feature(cfeature.BORDERS, linestyle=':')
m1.add_feature(cfeature.LAKES, color='lightblue')
m1.add_feature(cfeature.RIVERS, color='blue')
m1.add_feature(cfeature.STATES, linestyle=':')

# restrict the view to the continental US
m1.set_extent([-130,-60,20,35])

# PERF FIX: one vectorized scatter call instead of the original Python loop
# that issued a separate m1.plot() draw call per grid point.
m1.scatter(df['lon_rho'], df['lat_rho'], color='red', marker='o',
           transform=ccrs.PlateCarree())
m1.stock_img()



# %%
70 changes: 70 additions & 0 deletions GOSC-25_SHIVAM_CODES/TimeSeries_Data_Exploration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
TimeSeries_Data_Exploration.py
Purpose: Load and explore the data of different time steps to get a flavour of timeseries data
Handle data from multiple time periods, plot it, and save the combined result

"""

#%%%%

# Import necessary libraries
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import os

#%%%
# LOADING DATA

loc = "../Example Data" # folder the CBOFS field files are loaded from
# sorted() makes the order deterministic: os.listdir() returns names in
# arbitrary order, and the chronological t00z/t06z/t12z order matters below.
files = sorted(os.listdir(loc)) # list all files in the folder
print(files) # print the list of files
"""
we have 3 files for the same day but executed at gap of 6 hrs
these files are :
1. nos.cbofs.fields.f001.20231127.t00z.nc
2. nos.cbofs.fields.f001.20231127.t06z.nc
3. nos.cbofs.fields.f001.20231127.t12z.nc
"""

# BUG FIX: join the folder path onto each file name.  The original passed
# the bare names from os.listdir() to xr.open_dataset, which only worked
# if the current working directory happened to be the data folder.
data1 = xr.open_dataset(os.path.join(loc, files[0]), engine='netcdf4')
data2 = xr.open_dataset(os.path.join(loc, files[1]), engine='netcdf4')
data3 = xr.open_dataset(os.path.join(loc, files[2]), engine='netcdf4')


# %%

"""
We would append temp data for all time-steps and create a combined data

Then we would slice data to create timeseries data for a specific location

Then we would try to plot the same

"""

datasets = [data1, data2, data3]

# Combine along the ocean_time dimension
combined_data = xr.concat(datasets, dim='ocean_time')

# Check the combined dataset
print(combined_data)


""" PLOTTING TEMP AT A SPECIFIC LOCATION """
# NOTE(review): assumes s_rho=19 is the surface layer -- confirm
specific_point_temp = combined_data.temp.isel(eta_rho=100, xi_rho=100, s_rho=19)

# Plot
plt.figure(figsize=(8, 4))
plt.plot(combined_data.ocean_time, specific_point_temp, marker='o', label='Temp at (eta_rho=100, xi_rho=100, s_rho=19)')
plt.xlabel('Time')
plt.ylabel('Temperature (°C)')
plt.title('Temperature Time Series at (eta_rho=100, xi_rho=100, s_rho=19 Surface)')
plt.grid(True)
plt.legend()
plt.show()

# NOTE(review): this saves into the CWD, but Plotting_data.py reads from
# 'Example Data/combined_data.nc' -- confirm the two paths line up.
combined_data.to_netcdf('combined_data.nc') # save the combined data

#%%

#%%
68 changes: 68 additions & 0 deletions GOSC-25_SHIVAM_CODES/Validation_module.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Validation_module.py
Purpose: Compute the RMSE between two different netCDF files.
Given two model output files six hours apart (one treated as predicted, one as observed), provide a module for computing RMSE.

"""

#%%%%

# Import necessary libraries
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import os

# %%

# creating a class to preprocess data and evaluation code using RMSE after slicing entered by user
class Eval():
    """Compare a predicted and an observed dataset on the 'temp' variable.

    Parameters
    ----------
    data1 : dataset treated as the prediction
    data2 : dataset treated as the observation
    """

    def __init__(self, data1, data2):
        self.pred = data1
        self.actual = data2

    def rmse(self, filter):
        """Root-mean-square error of 'temp' over a user-sliced region.

        Args:
            filter: dict mapping dimension name -> [start, stop] bounds,
                e.g. {"s_rho": [-0.975, -0.875]}.

        Returns the RMSE as a float, or None if either dataset is missing.
        """
        if self.pred is None or self.actual is None:
            print("Error: Dataset not loaded.")
            return None

        # BUG FIX: the original looped over filter.keys() and re-sliced the
        # *full* dataset on each pass, so only the last key's bounds ever
        # took effect.  Build a single indexer dict and slice once instead.
        indexers = {dim: slice(bounds[0], bounds[1]) for dim, bounds in filter.items()}
        pred_subset = self.pred['temp'].sel(**indexers)
        actual_subset = self.actual['temp'].sel(**indexers)

        # Flatten both subsets to plain 'temp' columns with matching row order
        pred_vals = pred_subset.to_dataframe().reset_index()['temp']
        actual_vals = actual_subset.to_dataframe().reset_index()['temp']

        # Drop NaNs pairwise so every prediction is compared against its own
        # observation (a single mask guarantees the indexes stay aligned).
        valid = pred_vals.notna() & actual_vals.notna()
        diff = pred_vals[valid] - actual_vals[valid]

        # Calculate RMSE
        return np.sqrt((diff ** 2).mean())

    # Placeholder for a user-supplied metric; not implemented yet.
    def Metric(self, filter):
        pass




# %%

# Load the two model runs: treat the t00z run as the prediction and the
# t06z run (six hours later) as the observation.
pred = xr.open_dataset("../Example Data/nos.cbofs.fields.f001.20231127.t00z.nc") # load predicted data for now
actual = xr.open_dataset("../Example Data/nos.cbofs.fields.f001.20231127.t06z.nc") # load actual data for now

#%%
# FIX: renamed 'eval' -> 'evaluator' and 'filter' -> 'bounds'; the original
# names shadowed the Python builtins eval() and filter().
evaluator = Eval(pred, actual)
bounds = {"s_rho": [-0.975, -0.875]}
rmse = evaluator.rmse(bounds)
print("RMSE extracted successfully")
print(f"RMSE is :{rmse}")
# %%
Loading