import os
import glob
import datetime as dt

import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import cfgrib

import cartopy
from cartopy import config
import cartopy.feature as cfeature
import cartopy.crs as ccrs
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, bootstrap
import scipy.stats as st


def generating_dates_ecmwf(year):
    """
    The ECMWF S2S dataset is initialized twice a week, on Mondays and Thursdays.
    To download from ECMWF you therefore need the dates of all Mondays and
    Thursdays. This function returns all Mondays and Thursdays of a given year,
    sorted chronologically.
    """
    mondays = pd.date_range(start=str(year), end=str(year + 1), freq='W-MON').strftime('%Y-%m-%d').to_series()
    thursdays = pd.date_range(start=str(year), end=str(year + 1), freq='W-THU').strftime('%Y-%m-%d').to_series()
    dates = pd.concat([mondays, thursdays], axis=0).sort_values().to_list()
    return dates


def generating_hindcast_dates(date):
    """
    Build the hindcast dates (2020 and 2021) matching a real-time forecast date,
    for downloading the hindcasts used in the evaluation process.

    We take the ECMWF model version CY47R3, which began in late 2021, so that
    all hindcasts originate from the same model version (the next version was
    implemented in mid-2023).

    Args:
        date: date of the real-time forecast (in our case from 2022), as
            'YYYY-MM-DD'.

    Returns:
        The hindcast dates for 2020 and 2021 corresponding to the real-time
        forecast, joined with '/'.
    """
    date = date.split('-')
    years = ['2020', '2021']
    hindcast_dates = [year + '-' + date[1] + '-' + date[2] for year in years]
    return '/'.join(hindcast_dates)
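# Illustrative sketch (not part of the original pipeline): a quick sanity check
# of the two date helpers above. It touches no files; the year 2022 and the
# helper name _example_forecast_dates are assumptions made for this example.
def _example_forecast_dates():
    dates_2022 = generating_dates_ecmwf(2022)
    print(dates_2022[:4])                             # first Mondays/Thursdays of 2022
    print(generating_hindcast_dates(dates_2022[0]))   # e.g. '2020-01-03/2021-01-03'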
def processing_hindcasts(path: str, temp: bool, ensemble_mean: bool = False):
    """
    Process hindcasts after downloading them from ECMWF.

    Each hindcast is related to a forecast: the hindcasts are used (in ECMWF)
    to initialize a forecast. In our case we took the forecasts from 2022 and
    downloaded the corresponding hindcasts for 2020 and 2021. The model version
    is CY47R3 from ECMWF, which began in late 2021 (still with 51 ensemble
    members).

    Args:
        path: path to the reforecast files.
        temp: True to load temperature; False to load the winds (U10/V10) and
            total precipitation.
        ensemble_mean: whether or not to compute the ensemble mean over the
            perturbed hindcast members.

    Returns:
        xr.Dataset concatenated over all hindcast dates between 2020 and 2021.
    """
    os.chdir(path)
    # Selecting temperature or winds/total-precipitation files
    files = glob.glob('reforecast_temp_*', recursive=True) if temp else glob.glob('reforecast_winds_tp_*', recursive=True)
    # Removing the .idx index files (created by cfgrib), keeping only .grib
    files = [file for file in files if file.split('.')[-1] == 'grib']
    data = [xr.open_dataset(file) for file in files]
    data = xr.concat(data, dim='time').sortby('time')
    return data.mean('number') if ensemble_mean else data


def open_data_files_graphcast(files: list, variable: str, condition: bool):
    """
    Args:
        files: list of file names to process.
        variable: variable to consider, one of ['u10', 'v10', 't2m', 'tp'].
        condition: if True, limit the number of opened files; mainly used for
            testing the script. Set to False to get the full results.

    Returns:
        xr.Dataset: files concatenated and sorted by date.
    """
    dates = []
    for name in files:
        str_date = name.split('_')[1].removesuffix('.nc')
        dates.append([dt.datetime.strptime(str_date, "%Y%m%d").date(), str_date])
    df = pd.DataFrame(dates, columns=['dates', 'files'])
    df = df.sort_values(by='dates').reset_index(drop=True)
    # Some predictions were erased by accident: keep only the remaining dates
    df.index = pd.DatetimeIndex(df['dates'])
    proper_dates = pd.date_range(start='2020-01-01', end='2021-06-09', freq='D')
    df = df[df.index.isin(proper_dates)]
    if condition:
        df = df.iloc[0:100]
    data = []
    for name in df['files']:
        if variable == 'tp':
            # Total precipitation is accumulated over the forecast steps
            datum = xr.open_dataset(variable + '_' + name + '.nc')
            datum_time = datum.isel(time=1).time
            datum = datum.sum('time', keep_attrs=True).isel(surface=0)
            datum['time'] = datum_time
            data.append(datum)
        else:
            data.append(xr.open_dataset(variable + '_' + name + '.nc'))
    data = xr.concat(data, dim='time').fillna(0)
    return data
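# Illustrative sketch of the ensemble-mean reduction used in
# processing_hindcasts(): averaging over the perturbed-member dimension
# 'number' with xarray. The toy dataset is synthetic (random values) and only
# demonstrates the reduction; real reforecast files carry the same coordinate.
def _example_ensemble_mean():
    toy = xr.Dataset(
        {"t2m": (("number", "latitude", "longitude"), np.random.rand(51, 4, 4))},
        coords={"number": np.arange(51)},
    )
    ens_mean = toy.mean("number")   # the member dimension is collapsed
    print(ens_mean["t2m"].shape)    # -> (4, 4)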
def opening_data_files_pangu(months: int, year: int, considered_variable: str):
    """
    Open Pangu-Weather forecasts.

    Args:
        months: int, how many months to process (e.g. 1, 2 or 10). Pangu
            forecasts are available from January 2020 to December 2021.
        year: int, which year, 2020 or 2021.
        considered_variable: str.

    Returns:
        predicted_surface: xr.Dataset, predicted surface variables.
        original_frames_indices: np.array, indices selecting the observations
            matching the predicted fields.
        dates_frames: np.array containing the dates.
    """
    print("-----> Importing Pangu Weather's forecasts for {}".format(year))
    # South Pacific box including French Polynesia and the Fiji islands
    min_lon = 180
    min_lat = 0
    max_lon = 240
    max_lat = -35
    predicted_surface = []
    original_frames_indices = []
    dates_frames = []
    for i in range(months):
        month = i + 1
        # Pressure-level files (13 levels) exist as well but are not used here
        predicted_surface.append(xr.open_dataset("/home/vsansi01/pangu_forecasts/predicted_surface_level_{}_{}_30_days_ahead.nc".format(month, year)))
        original_frames_indices.append(np.load("/home/vsansi01/pangu_forecasts/observed_indices_{}_{}_30_days_ahead.npy".format(month, year)))
        dates_frames.append(np.load("/home/vsansi01/pangu_forecasts/dates_{}_{}_30_days_ahead.npy".format(month, year), allow_pickle=True))
    predicted_surface = xr.concat(predicted_surface, dim='time')
    original_frames_indices = np.concatenate(original_frames_indices, axis=0)
    dates_frames = np.concatenate(dates_frames, axis=0)
    return predicted_surface, original_frames_indices, dates_frames


def weekly_mean_operation(data, considered_variable=None, graphcast=False, climatology=False):
    """
    Args:
        data: xr.Dataset or np.ndarray of forecasts to apply the weekly mean
            operation to. Initial dimensions are
            (time, forecast step, lat, lon, variables).
            For Pangu and S2S the forecast steps are daily; for GraphCast
            Operational the forecasts are 6-hourly (4 steps/day). Also used to
            process the observations from the ERA5 reanalysis.
        graphcast: boolean telling whether GraphCast outputs (6-hourly time
            step) are being processed.
        climatology: boolean telling whether a daily climatology (no leading
            time axis) is being processed.

    Returns:
        weekly_data: np.array with final dimensions
            (time, weeks, lat, lon, variables), where the weeks axis holds the
            weekly means of the forecasts.
    """
    if isinstance(data, xr.Dataset):
        # Pangu: mean over the first, second and third weeks of daily steps
        data_first = data.isel(step=range(0, 7)).sel(variables=considered_variable).mean('step').to_array().values
        data_second = data.isel(step=range(7, 14)).sel(variables=considered_variable).mean('step').to_array().values
        data_third = data.isel(step=range(14, 21)).sel(variables=considered_variable).mean('step').to_array().values
        weekly_data = np.concatenate([data_first, data_second, data_third], axis=0)
    elif isinstance(data, np.ndarray) and graphcast:
        # GraphCast: 6-hourly steps, so one week spans 28 steps
        data_first = np.mean(data[:, range(0, 28), ...], axis=1, keepdims=True)
        data_second = np.mean(data[:, range(28, 56), ...], axis=1, keepdims=True)
        data_third = np.mean(data[:, range(56, 84), ...], axis=1, keepdims=True)
        weekly_data = np.concatenate([data_first, data_second, data_third], axis=1)
    elif isinstance(data, np.ndarray) and climatology:
        # Daily climatology: no leading time axis, weekly means over axis 0
        data_first = np.mean(data[range(0, 7), ...], axis=0, keepdims=True)
        data_second = np.mean(data[range(7, 14), ...], axis=0, keepdims=True)
        data_third = np.mean(data[range(14, 21), ...], axis=0, keepdims=True)
        data_fourth = np.mean(data[range(21, 28), ...], axis=0, keepdims=True)
        weekly_data = np.concatenate([data_first, data_second, data_third, data_fourth], axis=0)
    elif isinstance(data, np.ndarray):
        # ERA5 observations and S2S: daily steps, four weeks
        data_first = np.mean(data[:, range(0, 7), ...], axis=1, keepdims=True)
        data_second = np.mean(data[:, range(7, 14), ...], axis=1, keepdims=True)
        data_third = np.mean(data[:, range(14, 21), ...], axis=1, keepdims=True)
        data_fourth = np.mean(data[:, range(21, 28), ...], axis=1, keepdims=True)
        weekly_data = np.concatenate([data_first, data_second, data_third, data_fourth], axis=1)
    return weekly_data
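# Illustrative sketch of weekly_mean_operation() on synthetic arrays of shape
# (time, step, lat, lon); the sizes are examples, not the real grids. With 28
# daily steps the plain-ndarray branch returns four weekly means along axis 1.
def _example_weekly_mean():
    daily = np.random.rand(2, 28, 4, 4)        # 2 start dates, 28 daily steps
    weekly = weekly_mean_operation(daily)
    print(weekly.shape)                        # -> (2, 4, 4, 4)
    # The 6-hourly GraphCast layout (84 steps = 3 weeks x 28 steps) instead
    # goes through the graphcast branch and yields 3 weekly means:
    six_hourly = np.random.rand(2, 84, 4, 4)
    print(weekly_mean_operation(six_hourly, graphcast=True).shape)   # (2, 3, 4, 4)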
def RegriddingData(ds_in, ds_out, data=None):
    """
    Regrid data onto the grid of the template ds_out.

    Args:
        ds_in: xr.Dataset, template of the initial data that needs to be
            regridded.
        ds_out: xr.Dataset at 1.5°, taken as the target template for all the
            data to be regridded to.
        data: optional. If None, the data to regrid is ds_in itself; if passed,
            it must be an xr.Dataset or np.ndarray that needs regridding.

    Returns:
        data: xr.Dataset or np.array regridded to 1.5° with the bilinear
            method.
    """
    regridder = xe.Regridder(ds_in, ds_out, "bilinear", periodic=True)
    ds_in = data if data is not None else ds_in
    return regridder(ds_in)


def ChoosingWeek(data: np.ndarray, week: int, shape: tuple):
    """
    Args:
        data: np.ndarray with dimensions (time, week, lat, lon).
        week: int, which weekly forecast to analyse.
        shape: tuple for reshaping the final data.

    Returns:
        data: np.ndarray with dimensions (time, lat, lon); the week dimension
            is selected and removed.
    """
    return data[:, week, ...].reshape(shape)


def calling_climatology(considered_variable):
    """
    Load the daily climatology computed over 1979 to 2019 for the variables
    U10, V10, T2M and TP_6H.
    """
    VariablesDict = {'u10': '10m_u_component_of_wind',
                     'v10': '10m_v_component_of_wind',
                     't2m': '2m_temperature',
                     'tp': 'total_precipitation'}
    return xr.open_dataset('/home/vsansi01/climatology.nc')[VariablesDict[considered_variable]]


def removing_climatology(considered_variable, forecasts, dates, week, template):
    """
    Remove the climatology from the considered forecasts.

    Args:
        considered_variable: str.
        forecasts: xr.Dataset of daily forecasts.
        dates: dates of the forecasts.
        week: int, considered week.
        template: template for regridding to the 1.5° S2S resolution.

    Returns:
        Detrended data as a np.ndarray.
    """
    DailyClimatology = calling_climatology(considered_variable)
    DailyClimatology = RegriddingData(DailyClimatology, template)
    detrended_data = []
    for i, time in enumerate(dates.values):
        dayofyear = pd.to_datetime(time).dayofyear
        forecast = forecasts[i]
        # The slice must cover the 28 days used by the four weekly means
        DailyClim = DailyClimatology.sel(dayofyear=slice(dayofyear, dayofyear + 29)).values
        DailyClim = weekly_mean_operation(DailyClim, climatology=True)
        DailyClim = DailyClim[week]
        detrended_data.append(forecast - DailyClim)
    return np.stack(detrended_data, axis=0)


def processing_climatology(climatology: xr.Dataset, dates: list, ds_out: xr.Dataset, week: int):
    """
    Process the climatology so as to make comparisons with observations and
    forecasts possible. Firstly, regrid the climatology to 1.5°. Secondly,
    match the climatology dates with the observation/forecast dates. Thirdly,
    since the forecasts and observations are weekly, apply the weekly mean
    operation to the corresponding daily climatology.

    Args:
        climatology: daily climatology, xr.Dataset with shape (366, lat, lon)
            at 1°.
        dates: list of real-time dates of the observations.
        ds_out: template for regridding to 1.5°.
        week: int, which week to compare (e.g. 2 or 3 weeks ahead).

    Returns:
        Processed climatology, np.ndarray with shape (time, lat, lon) at 1.5°,
        together with the latitudes and longitudes of the regridded grid.
    """
    climatology = RegriddingData(climatology, ds_out)
    latitudes = climatology.latitude.values
    longitudes = climatology.longitude.values
    # The slice must cover the 28 days used by the four weekly means below
    days_ahead = 29
    # Convert dates to day of year, as the climatology is indexed by dayofyear
    dates_DayOfYear = [pd.to_datetime(date).timetuple().tm_yday for date in dates]
    # Build the weekly climatology matching each weekly forecast
    WeeklyClimato = [np.expand_dims(climatology.sel(dayofyear=slice(day, day + days_ahead)).values, axis=0)
                     for day in dates_DayOfYear]
    WeeklyClimato = np.concatenate(WeeklyClimato, axis=0)
    WeeklyClimato = weekly_mean_operation(WeeklyClimato)
    WeeklyClimato = WeeklyClimato[:, week, ...]
    return WeeklyClimato, latitudes, longitudes
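# Illustrative sketch of ChoosingWeek(): it indexes the weekly axis and
# reshapes. The shapes below are synthetic examples, not the real grids.
def _example_choosing_week():
    weekly = np.random.rand(10, 4, 24, 41)                  # (time, week, lat, lon)
    week3 = ChoosingWeek(weekly, week=2, shape=(10, 24, 41))
    print(week3.shape)                                      # -> (10, 24, 41)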
def compute_anomalies(data, WeeklyClimato):
    """
    Compute observed or forecasted anomalies.

    Args:
        data: observations or forecasts, np.ndarray with shape
            (time, lat, lon).
        WeeklyClimato: np.ndarray with shape (time, lat, lon).

    Returns:
        The computed anomalies, with shape (time, lat, lon).
    """
    return data - WeeklyClimato


def anomaly_correlation_coefficient(observations, forecasts, dates: list, ds_out: xr.Dataset,
                                    considered_variable: str, week: int, nom: str):
    """
    Calculate the Anomaly Correlation Coefficient (ACC) between observed and
    forecasted values.

    Args:
        observations: np.ndarray with shape (time, lat, lon).
        forecasts: np.ndarray with shape (time, lat, lon).
        dates: list of real-time dates of the observations.
        ds_out: template for regridding to 1.5°.
        considered_variable: str.
        week: int, which week to compare (e.g. 2 or 3 weeks ahead).
        nom: str, used to save the file under a specific name.

    Returns:
        The mean and standard deviation over time of the Anomaly Correlation
        Coefficient.
    """
    climatology = calling_climatology(considered_variable)
    WeeklyClimato, latitudes, longitudes = processing_climatology(climatology, dates, ds_out, week)
    observed_anomalies = compute_anomalies(observations, WeeklyClimato)
    forecasted_anomalies = compute_anomalies(forecasts, WeeklyClimato)
    # Latitude weights: grid cells shrink with cos(latitude)
    CosPhi = np.array([[np.cos(lat * np.pi / 180) for lon in longitudes] for lat in latitudes])
    # Weight the observed/forecasted anomalies with sqrt(cos(phi)) so their
    # product is weighted by cos(phi)
    observed_anomalies = np.sqrt(CosPhi) * observed_anomalies
    forecasted_anomalies = np.sqrt(CosPhi) * forecasted_anomalies
    # Correlation coefficient of the two anomaly patterns for each timestep;
    # the mean and std over the time axis are computed at the return
    R = [pearsonr(np.asarray(observed_anomaly).reshape(-1), np.asarray(forecasted_anomaly).reshape(-1)).statistic
         for observed_anomaly, forecasted_anomaly in zip(observed_anomalies, forecasted_anomalies)]
    np.save('/home/vsansi01/ACC_R_values/ACC_{}_{}_week{}.npy'.format(nom, considered_variable, week + 1), np.array(R))
    return np.mean(R), np.std(R)
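# Illustrative sketch of the sqrt(cos(latitude)) weighting used in
# anomaly_correlation_coefficient(): scaling both anomaly fields by
# sqrt(cos(phi)) makes each grid point contribute to the pattern correlation
# roughly in proportion to its area. The 1.5° grid below is an example
# matching the South Pacific box above, not loaded from data.
def _example_latitude_weights():
    latitudes = np.arange(0.0, -36.0, -1.5)               # 0° to -34.5° southward
    longitudes = np.arange(180.0, 240.0, 1.5)
    CosPhi = np.array([[np.cos(lat * np.pi / 180) for lon in longitudes] for lat in latitudes])
    print(np.sqrt(CosPhi)[:, 0])                          # weights along latitude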