import os
import glob
import datetime as dt

import numpy as np
import pandas as pd
import xarray as xr
import xesmf as xe
import cfgrib

import cartopy
from cartopy import config
import cartopy.feature as cfeature
import cartopy.crs as ccrs
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, bootstrap
import scipy.stats as st


def generating_dates_ecmwf(year):
    """
    The ECMWF S2S dataset is initialized twice a week, on Mondays and Thursdays.
    To download from ECMWF you therefore need the dates of all Mondays and
    Thursdays. This function returns all Mondays and Thursdays of a given year,
    sorted chronologically.
    """
    mondays = pd.date_range(start=str(year), end=str(year + 1), freq='W-MON').strftime('%Y-%m-%d').to_series()
    thursdays = pd.date_range(start=str(year), end=str(year + 1), freq='W-THU').strftime('%Y-%m-%d').to_series()
    dates = pd.concat([mondays, thursdays], axis=0).sort_values().to_list()
    return dates


def generating_hindcast_dates(date):
    """
    Build the hindcast dates (2020 and 2021) matching a real-time forecast date,
    for downloading the hindcasts used in the evaluation process.

    We take the ECMWF model version CY47R3, which began in late 2021, so that
    all hindcasts originate from the same model version (the next version was
    implemented in mid-2023).

    Args:
        date: date of the real-time forecast (in our case from 2022), as
            'YYYY-MM-DD'.

    Returns:
        The hindcast dates for 2020 and 2021 corresponding to the real-time
        forecast, joined with '/'.
    """
    date = date.split('-')
    years = ['2020', '2021']
    hindcast_dates = [year + '-' + date[1] + '-' + date[2] for year in years]
    return '/'.join(hindcast_dates)
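# Illustrative sketch (not part of the original pipeline): a quick sanity check
# of the two date helpers above. It touches no files; the year 2022 and the
# helper name _example_forecast_dates are assumptions made for this example.
def _example_forecast_dates():
    dates_2022 = generating_dates_ecmwf(2022)
    print(dates_2022[:4])                             # first Mondays/Thursdays of 2022
    print(generating_hindcast_dates(dates_2022[0]))   # e.g. '2020-01-03/2021-01-03'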
def processing_hindcasts(path: str, temp: bool, ensemble_mean: bool = False):
    """
    Process hindcasts after downloading them from ECMWF.

    Each hindcast is related to a forecast: the hindcasts are used (in ECMWF)
    to initialize a forecast. In our case we took the forecasts from 2022 and
    downloaded the corresponding hindcasts for 2020 and 2021. The model version
    is CY47R3 from ECMWF, which began in late 2021 (still with 51 ensemble
    members).

    Args:
        path: path to the reforecast files.
        temp: True to load temperature; False to load the winds (U10/V10) and
            total precipitation.
        ensemble_mean: whether or not to compute the ensemble mean over the
            perturbed hindcast members.

    Returns:
        xr.Dataset concatenated over all hindcast dates between 2020 and 2021.
    """
    os.chdir(path)
    # Selecting temperature or winds/total-precipitation files
    files = glob.glob('reforecast_temp_*', recursive=True) if temp else glob.glob('reforecast_winds_tp_*', recursive=True)
    # Removing the .idx index files (created by cfgrib), keeping only .grib
    files = [file for file in files if file.split('.')[-1] == 'grib']
    data = [xr.open_dataset(file) for file in files]
    data = xr.concat(data, dim='time').sortby('time')
    return data.mean('number') if ensemble_mean else data


def open_data_files_graphcast(files: list, variable: str, condition: bool):
    """
    Args:
        files: list of file names to process.
        variable: variable to consider, one of ['u10', 'v10', 't2m', 'tp'].
        condition: if True, limit the number of opened files; mainly used for
            testing the script. Set to False to get the full results.

    Returns:
        xr.Dataset: files concatenated and sorted by date.
    """
    dates = []
    for name in files:
        str_date = name.split('_')[1].removesuffix('.nc')
        dates.append([dt.datetime.strptime(str_date, "%Y%m%d").date(), str_date])
    df = pd.DataFrame(dates, columns=['dates', 'files'])
    df = df.sort_values(by='dates').reset_index(drop=True)
    # Some predictions were erased by accident: keep only the remaining dates
    df.index = pd.DatetimeIndex(df['dates'])
    proper_dates = pd.date_range(start='2020-01-01', end='2021-06-09', freq='D')
    df = df[df.index.isin(proper_dates)]
    if condition:
        df = df.iloc[0:100]
    data = []
    for name in df['files']:
        if variable == 'tp':
            # Total precipitation is accumulated over the forecast steps
            datum = xr.open_dataset(variable + '_' + name + '.nc')
            datum_time = datum.isel(time=1).time
            datum = datum.sum('time', keep_attrs=True).isel(surface=0)
            datum['time'] = datum_time
            data.append(datum)
        else:
            data.append(xr.open_dataset(variable + '_' + name + '.nc'))
    data = xr.concat(data, dim='time').fillna(0)
    return data
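# Illustrative sketch of the ensemble-mean reduction used in
# processing_hindcasts(): averaging over the perturbed-member dimension
# 'number' with xarray. The toy dataset is synthetic (random values) and only
# demonstrates the reduction; real reforecast files carry the same coordinate.
def _example_ensemble_mean():
    toy = xr.Dataset(
        {"t2m": (("number", "latitude", "longitude"), np.random.rand(51, 4, 4))},
        coords={"number": np.arange(51)},
    )
    ens_mean = toy.mean("number")   # the member dimension is collapsed
    print(ens_mean["t2m"].shape)    # -> (4, 4)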
def opening_data_files_pangu(months: int, year: int, considered_variable: str):
    """
    Open Pangu-Weather forecasts.

    Args:
        months: int, how many months to process (e.g. 1, 2 or 10). Pangu
            forecasts are available from January 2020 to December 2021.
        year: int, which year, 2020 or 2021.
        considered_variable: str.

    Returns:
        predicted_surface: xr.Dataset, predicted surface variables.
        original_frames_indices: np.array, indices selecting the observations
            matching the predicted fields.
        dates_frames: np.array containing the dates.
    """
    print("-----> Importing Pangu Weather's forecasts for {}".format(year))
    # South Pacific box including French Polynesia and the Fiji islands
    min_lon = 180
    min_lat = 0
    max_lon = 240
    max_lat = -35
    predicted_surface = []
    original_frames_indices = []
    dates_frames = []
    for i in range(months):
        month = i + 1
        # Pressure-level files (13 levels) exist as well but are not used here
        predicted_surface.append(xr.open_dataset("/home/vsansi01/pangu_forecasts/predicted_surface_level_{}_{}_30_days_ahead.nc".format(month, year)))
        original_frames_indices.append(np.load("/home/vsansi01/pangu_forecasts/observed_indices_{}_{}_30_days_ahead.npy".format(month, year)))
        dates_frames.append(np.load("/home/vsansi01/pangu_forecasts/dates_{}_{}_30_days_ahead.npy".format(month, year), allow_pickle=True))
    predicted_surface = xr.concat(predicted_surface, dim='time')
    original_frames_indices = np.concatenate(original_frames_indices, axis=0)
    dates_frames = np.concatenate(dates_frames, axis=0)
    return predicted_surface, original_frames_indices, dates_frames


def weekly_mean_operation(data, considered_variable=None, graphcast=False, climatology=False):
    """
    Args:
        data: xr.Dataset or np.ndarray of forecasts to apply the weekly mean
            operation to. Initial dimensions are
            (time, forecast step, lat, lon, variables).
            For Pangu and S2S the forecast steps are daily; for GraphCast
            Operational the forecasts are 6-hourly (4 steps/day). Also used to
            process the observations from the ERA5 reanalysis.
        graphcast: boolean telling whether GraphCast outputs (6-hourly time
            step) are being processed.
        climatology: boolean telling whether a daily climatology (no leading
            time axis) is being processed.

    Returns:
        weekly_data: np.array with final dimensions
            (time, weeks, lat, lon, variables), where the weeks axis holds the
            weekly means of the forecasts.
    """
    if isinstance(data, xr.Dataset):
        # Pangu: mean over the first, second and third weeks of daily steps
        data_first = data.isel(step=range(0, 7)).sel(variables=considered_variable).mean('step').to_array().values
        data_second = data.isel(step=range(7, 14)).sel(variables=considered_variable).mean('step').to_array().values
        data_third = data.isel(step=range(14, 21)).sel(variables=considered_variable).mean('step').to_array().values
        weekly_data = np.concatenate([data_first, data_second, data_third], axis=0)
    elif isinstance(data, np.ndarray) and graphcast:
        # GraphCast: 6-hourly steps, so one week spans 28 steps
        data_first = np.mean(data[:, range(0, 28), ...], axis=1, keepdims=True)
        data_second = np.mean(data[:, range(28, 56), ...], axis=1, keepdims=True)
        data_third = np.mean(data[:, range(56, 84), ...], axis=1, keepdims=True)
        weekly_data = np.concatenate([data_first, data_second, data_third], axis=1)
    elif isinstance(data, np.ndarray) and climatology:
        # Daily climatology: no leading time axis, weekly means over axis 0
        data_first = np.mean(data[range(0, 7), ...], axis=0, keepdims=True)
        data_second = np.mean(data[range(7, 14), ...], axis=0, keepdims=True)
        data_third = np.mean(data[range(14, 21), ...], axis=0, keepdims=True)
        data_fourth = np.mean(data[range(21, 28), ...], axis=0, keepdims=True)
        weekly_data = np.concatenate([data_first, data_second, data_third, data_fourth], axis=0)
    elif isinstance(data, np.ndarray):
        # ERA5 observations and S2S: daily steps, four weeks
        data_first = np.mean(data[:, range(0, 7), ...], axis=1, keepdims=True)
        data_second = np.mean(data[:, range(7, 14), ...], axis=1, keepdims=True)
        data_third = np.mean(data[:, range(14, 21), ...], axis=1, keepdims=True)
        data_fourth = np.mean(data[:, range(21, 28), ...], axis=1, keepdims=True)
        weekly_data = np.concatenate([data_first, data_second, data_third, data_fourth], axis=1)
    return weekly_data
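# Illustrative sketch of weekly_mean_operation() on synthetic arrays of shape
# (time, step, lat, lon); the sizes are examples, not the real grids. With 28
# daily steps the plain-ndarray branch returns four weekly means along axis 1.
def _example_weekly_mean():
    daily = np.random.rand(2, 28, 4, 4)        # 2 start dates, 28 daily steps
    weekly = weekly_mean_operation(daily)
    print(weekly.shape)                        # -> (2, 4, 4, 4)
    # The 6-hourly GraphCast layout (84 steps = 3 weeks x 28 steps) instead
    # goes through the graphcast branch and yields 3 weekly means:
    six_hourly = np.random.rand(2, 84, 4, 4)
    print(weekly_mean_operation(six_hourly, graphcast=True).shape)   # (2, 3, 4, 4)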
def RegriddingData(ds_in, ds_out, data=None):
    """
    Regrid data onto the grid of the template ds_out.

    Args:
        ds_in: xr.Dataset, template of the initial data that needs to be
            regridded.
        ds_out: xr.Dataset at 1.5°, taken as the target template for all the
            data to be regridded to.
        data: optional. If None, the data to regrid is ds_in itself; if passed,
            it must be an xr.Dataset or np.ndarray that needs regridding.

    Returns:
        data: xr.Dataset or np.array regridded to 1.5° with the bilinear
            method.
    """
    regridder = xe.Regridder(ds_in, ds_out, "bilinear", periodic=True)
    ds_in = data if data is not None else ds_in
    return regridder(ds_in)


def ChoosingWeek(data: np.ndarray, week: int, shape: tuple):
    """
    Args:
        data: np.ndarray with dimensions (time, week, lat, lon).
        week: int, which weekly forecast to analyse.
        shape: tuple for reshaping the final data.

    Returns:
        data: np.ndarray with dimensions (time, lat, lon); the week dimension
            is selected and removed.
    """
    return data[:, week, ...].reshape(shape)


def calling_climatology(considered_variable):
    """
    Load the daily climatology computed over 1979 to 2019 for the variables
    U10, V10, T2M and TP_6H.
    """
    VariablesDict = {'u10': '10m_u_component_of_wind',
                     'v10': '10m_v_component_of_wind',
                     't2m': '2m_temperature',
                     'tp': 'total_precipitation'}
    return xr.open_dataset('/home/vsansi01/climatology.nc')[VariablesDict[considered_variable]]


def removing_climatology(considered_variable, forecasts, dates, week, template):
    """
    Remove the climatology from the considered forecasts.

    Args:
        considered_variable: str.
        forecasts: xr.Dataset of daily forecasts.
        dates: dates of the forecasts.
        week: int, considered week.
        template: template for regridding to the 1.5° S2S resolution.

    Returns:
        Detrended data as a np.ndarray.
    """
    DailyClimatology = calling_climatology(considered_variable)
    DailyClimatology = RegriddingData(DailyClimatology, template)
    detrended_data = []
    for i, time in enumerate(dates.values):
        dayofyear = pd.to_datetime(time).dayofyear
        forecast = forecasts[i]
        # The slice must cover the 28 days used by the four weekly means
        DailyClim = DailyClimatology.sel(dayofyear=slice(dayofyear, dayofyear + 29)).values
        DailyClim = weekly_mean_operation(DailyClim, climatology=True)
        DailyClim = DailyClim[week]
        detrended_data.append(forecast - DailyClim)
    return np.stack(detrended_data, axis=0)


def processing_climatology(climatology: xr.Dataset, dates: list, ds_out: xr.Dataset, week: int):
    """
    Process the climatology so as to make comparisons with observations and
    forecasts possible. Firstly, regrid the climatology to 1.5°. Secondly,
    match the climatology dates with the observation/forecast dates. Thirdly,
    since the forecasts and observations are weekly, apply the weekly mean
    operation to the corresponding daily climatology.

    Args:
        climatology: daily climatology, xr.Dataset with shape (366, lat, lon)
            at 1°.
        dates: list of real-time dates of the observations.
        ds_out: template for regridding to 1.5°.
        week: int, which week to compare (e.g. 2 or 3 weeks ahead).

    Returns:
        Processed climatology, np.ndarray with shape (time, lat, lon) at 1.5°,
        together with the latitudes and longitudes of the regridded grid.
    """
    climatology = RegriddingData(climatology, ds_out)
    latitudes = climatology.latitude.values
    longitudes = climatology.longitude.values
    # The slice must cover the 28 days used by the four weekly means below
    days_ahead = 29
    # Convert dates to day of year, as the climatology is indexed by dayofyear
    dates_DayOfYear = [pd.to_datetime(date).timetuple().tm_yday for date in dates]
    # Build the weekly climatology matching each weekly forecast
    WeeklyClimato = [np.expand_dims(climatology.sel(dayofyear=slice(day, day + days_ahead)).values, axis=0)
                     for day in dates_DayOfYear]
    WeeklyClimato = np.concatenate(WeeklyClimato, axis=0)
    WeeklyClimato = weekly_mean_operation(WeeklyClimato)
    WeeklyClimato = WeeklyClimato[:, week, ...]
    return WeeklyClimato, latitudes, longitudes
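# Illustrative sketch of ChoosingWeek(): it indexes the weekly axis and
# reshapes. The shapes below are synthetic examples, not the real grids.
def _example_choosing_week():
    weekly = np.random.rand(10, 4, 24, 41)                  # (time, week, lat, lon)
    week3 = ChoosingWeek(weekly, week=2, shape=(10, 24, 41))
    print(week3.shape)                                      # -> (10, 24, 41)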
def compute_anomalies(data, WeeklyClimato):
    """
    Compute observed or forecasted anomalies.

    Args:
        data: observations or forecasts, np.ndarray with shape
            (time, lat, lon).
        WeeklyClimato: np.ndarray with shape (time, lat, lon).

    Returns:
        The computed anomalies, with shape (time, lat, lon).
    """
    return data - WeeklyClimato


def anomaly_correlation_coefficient(observations, forecasts, dates: list, ds_out: xr.Dataset,
                                    considered_variable: str, week: int, nom: str):
    """
    Calculate the Anomaly Correlation Coefficient (ACC) between observed and
    forecasted values.

    Args:
        observations: np.ndarray with shape (time, lat, lon).
        forecasts: np.ndarray with shape (time, lat, lon).
        dates: list of real-time dates of the observations.
        ds_out: template for regridding to 1.5°.
        considered_variable: str.
        week: int, which week to compare (e.g. 2 or 3 weeks ahead).
        nom: str, used to save the file under a specific name.

    Returns:
        The mean and standard deviation over time of the Anomaly Correlation
        Coefficient.
    """
    climatology = calling_climatology(considered_variable)
    WeeklyClimato, latitudes, longitudes = processing_climatology(climatology, dates, ds_out, week)
    observed_anomalies = compute_anomalies(observations, WeeklyClimato)
    forecasted_anomalies = compute_anomalies(forecasts, WeeklyClimato)
    # Latitude weights: grid cells shrink with cos(latitude)
    CosPhi = np.array([[np.cos(lat * np.pi / 180) for lon in longitudes] for lat in latitudes])
    # Weight the observed/forecasted anomalies with sqrt(cos(phi)) so their
    # product is weighted by cos(phi)
    observed_anomalies = np.sqrt(CosPhi) * observed_anomalies
    forecasted_anomalies = np.sqrt(CosPhi) * forecasted_anomalies
    # Correlation coefficient of the two anomaly patterns for each timestep;
    # the mean and std over the time axis are computed at the return
    R = [pearsonr(np.asarray(observed_anomaly).reshape(-1), np.asarray(forecasted_anomaly).reshape(-1)).statistic
         for observed_anomaly, forecasted_anomaly in zip(observed_anomalies, forecasted_anomalies)]
    np.save('/home/vsansi01/ACC_R_values/ACC_{}_{}_week{}.npy'.format(nom, considered_variable, week + 1), np.array(R))
    return np.mean(R), np.std(R)
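# Illustrative sketch of the sqrt(cos(latitude)) weighting used in
# anomaly_correlation_coefficient(): scaling both anomaly fields by
# sqrt(cos(phi)) makes each grid point contribute to the pattern correlation
# roughly in proportion to its area. The 1.5° grid below is an example
# matching the South Pacific box above, not loaded from data.
def _example_latitude_weights():
    latitudes = np.arange(0.0, -36.0, -1.5)               # 0° to -34.5° southward
    longitudes = np.arange(180.0, 240.0, 1.5)
    CosPhi = np.array([[np.cos(lat * np.pi / 180) for lon in longitudes] for lat in latitudes])
    print(np.sqrt(CosPhi)[:, 0])                          # weights along latitude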