Uses sampling with replacement.
This Notebook closely follows NCL's example.
NCL (6.4.0) currently has four bootstrap statistic functions and one bootstrap utility function:
bootstrap_correl
bootstrap_diff
bootstrap_estimate ; utility
bootstrap_regcoef
bootstrap_stat
Some NCL bootstrap functions allow for subsampling (n<N) and for sequential blocks (sequences) of values. More involved sampling strategies may require custom code. A user could extract one of the above core functions from the contributed.ncl library and modify it as needed.
Here we will implement replacements for those NCL functions.
The examples covered are:
The functions that are included here are:
Functionality from Python and its libraries that is used:
Always check the [NCL data list](https://www.ncl.ucar.edu/Applications/Data/) to see if the example data is available.
#
# Note: most import statements have been moved up here to make it easier to run particular examples (out of order)
#
import numpy as np # The foundation of array-based computation with python.
import pandas as pd # pandas is a very popular analysis package.
import xarray as xr # Used for netCDF I/O and labeled array operations (like pandas, but for N-d arrays)
from scipy import stats # SciPy is full of useful functions for scientific computing
import requests # Used to make HTTP requests - used below to download the example data.
import matplotlib as mpl # For plotting; use mpl to get access to `colors` module
import matplotlib.pyplot as plt # For plotting functions
import cartopy.crs as ccrs # For map projections
These distributions of sampling indices illustrate properties of resampling with replacement from a uniform distribution using generate_sample_indices. Clearly, a reasonably large N is needed to 'reasonably' sample all combinations.
NCL's resampling_1.ncl
example is slightly confusing, at least partly because NCL has some idiosyncratic defaults.
The NS array is not just the sample size; it is also used to specify the interval from which integers are drawn. I think of this as drawing indices into a sample that has NS entries. This can be a little confusing because, in the loop, NCL generates arrays of NS indices drawn from the range [0, NS).
The NCL function generate_sample_indices uses random_uniform. In our implementation, we will use the modern Numpy infrastructure: the Generator's `integers` method draws integer random numbers from a discrete uniform distribution. Another advantage here is that the Generator also has a `choice` method, allowing us to replicate NCL's functionality with no effort.
The NCL example produces 3 plots that are just realizations of the same process. That shows you don't get the same answer every time, but it isn't really necessary; our example will make one instance of the 8-panel plot.
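Before wrapping it in a function, here is a minimal sketch (using a throwaway generator, not the seeded one created below) of the two equivalent ways Numpy offers to draw indices with replacement:
# two equivalent ways to draw 10 indices from [0, 25) with replacement
demo_rng = np.random.default_rng()
idx_a = demo_rng.integers(low=0, high=25, size=10)   # discrete uniform integers
idx_b = demo_rng.choice(25, size=10, replace=True)   # same distribution via choice()
print(idx_a, idx_b)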
def generate_sample_indices(N, method=True, rng=None, minval=None, maxval=None):
    """Return a length-N array of integers derived by sampling, with or without replacement, from np.arange(N) (or np.arange(minval, maxval)).
    method determines whether to sample with replacement (True) or not (False).
    If rng is provided, use the `choice` method on that random number generator.
    If rng is None, use Numpy's `default_rng` to instantiate a fresh random number generator.
    minval and maxval can be used to specify the range of values (i.e. indices) to use.
    This is useful when the sample size is different from the total data size,
    for example, if you want to select 250 values from an array that has 10_000_000 elements.
    """
    if rng is None:
        rng = np.random.default_rng()  # Numpy's default Generator
    if minval is not None:
        mn = minval
    else:
        mn = 0
    if maxval is not None:
        mx = maxval
    else:
        mx = N
    if (minval is not None) or (maxval is not None):
        arr = np.arange(mn, mx)
    else:
        arr = N  # rng.choice treats an integer like np.arange(N)
    return rng.choice(arr, size=N, replace=method)
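For example, to draw a sub-sample of 250 indices (with replacement) into a hypothetical array of 10,000,000 elements, the case mentioned in the docstring, the call would look like:
# hypothetical usage: 250 indices into a 10,000,000-element array
sub_idx = generate_sample_indices(250, method=True, minval=0, maxval=10_000_000)
print(sub_idx.shape, sub_idx.min(), sub_idx.max())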
# Original NCL code is often provided in comments.
# Some lines are translated directly into Python syntax, but often the code is rewritten in more idiomatic Python.
#
# ;***********************************************************
# ; resampling_1.ncl
# ;
# ; Concepts illustrated:
# ; - Generating random number seeds based on a 'clock'
# ; - Generate 'resampling with replacement' indices
# ; Use: generate_sample_indices( N, 1 )
# ; - Use histogram to display distribution
# ; - create a panel plot
# ;***********************************************************
# ;--- Specify sample sizes
# ;***********************************************************
NS = np.array([25, 100, 500, 1000, 5000, 10000, 100000, 1000000])
# NP = len(NS) # NCL --> not needed in Python version
# plot = new (NP, "graphic", "No_FillValue") #
#***********************************************************
#--- Generate and use random number seeds for initialization
#***********************************************************
# NCL defaults to fixed seeds for its random number generator,
# but that is not the case for Numpy, so re-seeding is not strictly necessary.
# Numpy has fairly advanced random number generation features (https://numpy.org/doc/stable/reference/random/index.html).
# NCL:
# rand1 = toint(systemfunc(" date +%s")) # --> NCL uses the system's date command
# Python:
# Method 1 (subprocess to use OS date just like NCL)
# import subprocess
# cmd = subprocess.run(['date', '+%s'], capture_output=True)
# rand1 = int(cmd.stdout)
# If you don't want to use the OS (directly) to get the date, you could use Python:
# Method 2 (datetime):
from datetime import datetime
curr_time = datetime.now()
rand1 = int(curr_time.strftime('%s'))
# Note: Method 1 and Method 2 produce the same result.
rand2 = int((54321 * rand1) % 2147483398)
# NCL : random_setallseed(rand1, rand2) # the ncl RNG uses two integers for seeding
# Python/Numpy:
# We can use a sequence (of any length) to seed.
# If no seed is given, "fresh, unpredictable entropy will be pulled from the OS" <-- Means we didn't need to worry about trying to make up a new seed.
rng = np.random.default_rng(seed=[rand1, rand2]) # rng = random number generator
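As a quick aside (not in the NCL example), two generators built from the same seed sequence produce identical streams, which is the property you want when testing:
# identically seeded generators give identical draws (useful for reproducible tests)
rng_a = np.random.default_rng(seed=[rand1, rand2])
rng_b = np.random.default_rng(seed=[rand1, rand2])
print(np.array_equal(rng_a.integers(0, 100, size=5), rng_b.integers(0, 100, size=5)))  # True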
# NCL Code from example
#***********************************************************
#--- Plot a histogram of index distributions for each sample size
#
#***********************************************************
# wks = gsn_open_wks ("png","resampling") ; send graphics to PNG file
# gsn_define_colormap(wks,"default") ; 30 distinct colors
# resh = True
# resh@gsnDraw = False
# resh@gsnFrame = False
# resh@tmXBLabelStride = 1
# resh@tiMainOffsetYF = -0.020 ; move tiMain down into plot
# ;resh@gsnHistogramBinWidth = ?? ; for 'tuning' ... if desired
#***********************************************************
#--- Loop
#***********************************************************
# do plotnum=0,2
# do np=0,NP-1
# iw := generate_sample_indices( NS(np), 1 )
# iw@long_name = "N="+NS(np)
# printMinMax(iw, 0)
# resh@tiMainString = iw@long_name
# delete(iw@long_name) ; do not want on plot
# if (np.eq.0 .or. np.eq.4) then ; reduce clutter
# resh@tiYAxisString = "Frequency"
# else
# resh@tiYAxisString = " " ; No title on Y axis.
# end if
# plot(np) = gsn_histogram(wks, iw ,resh)
# end do
# resP = True
# resP@gsnMaximize = True
# resP@gsnPanelMainString = "Distribution: Uniform Random Indices w/ Replacement"
# resP@gsnPanelLeft = 0.01 ; make room on left and right side of paneled
# resP@gsnPanelRight = 0.99 ; plots for text and tickmarks
# gsn_panel(wks,plot,(/2,4/),resP)
# end do
# Python using Numpy and Matplotlib
fig, ax = plt.subplots(figsize=(12,6), ncols=4, nrows=2, sharey=False, constrained_layout=True)
aa = ax.ravel() # makes a view of ax that can be indexed with one number.
iw = dict() # hold the results all together // This is optional, but could be useful if there are more steps than just making the histogram.
histograms = dict()
h_edges = dict()
for i, ns in enumerate(NS):
    print(f"i = {i}, ns = {ns}")
    iw[ns] = generate_sample_indices(ns, method=True, rng=rng)
    histograms[ns], h_edges[ns] = np.histogram(iw[ns])  # keep histogram counts and bin edges.
    # center each bar on its bin (np.histogram returns uniform-width bins here)
    aa[i].bar(h_edges[ns][0:-1] + 0.5*np.diff(h_edges[ns]), histograms[ns], width=0.8*np.diff(h_edges[ns])[1])
    aa[i].set_title(f"N = {ns}")
plt.setp(ax[:, 0], ylabel="FREQUENCY")  # put labels on left side axes.
i = 0, ns = 25 i = 1, ns = 100 i = 2, ns = 500 i = 3, ns = 1000 i = 4, ns = 5000 i = 5, ns = 10000 i = 6, ns = 100000 i = 7, ns = 1000000
[Text(0, 0.5, 'FREQUENCY'), Text(0, 0.5, 'FREQUENCY')]
A simple example of using sampling with replacement to derive a bootstrapped mean and 95% confidence limits. Here x is a one-dimensional sample of size N; the NCL code is:
nBoot = 10000
xBoot = new (nBoot, typeof(x))
do ns=0,nBoot-1 ; generate multiple estimates
iw = generate_sample_indices(N,1) ; indices with replacement
xBoot(ns) = dim_avg_n( x(iw), 0 ) ; compute average
end do
xAvgBoot = dim_avg_n(xBoot,0) ; Averages of bootstrapped samples
xStdBoot = dim_stddev_n(xBoot,0) ; Std Dev " " "
xStdErrBoot = xStdBoot/nBoot ; Std. Error of bootstrapped estimates
ia = dim_pqsort_n(xBoot, 2, 0) ; sort bootstrap means into ascending order
n025 = round(0.025*(nBoot-1),3) ; indices for sorted array
n500 = round(0.500*(nBoot-1),3)
n975 = round(0.975*(nBoot-1),3)
xBoot_025= xBoot(n025) ; 2.5% level
xBoot_500= xBoot(n500) ; 50.0% level (median)
xBoot_975= xBoot(n975) ; 97.5% level
Note: since 'x(N)' is rank one, the following functions could have been used in place of the dim_*_n functions: avg, stddev and qsort. Of course, the arguments would have to change accordingly.
# the python version
# pre-work -- NCL example didn't actually provide the data x or the size N,
# let's assume it is some normally distributed
# variable with mean of 10.0 and standard deviation 2.0
mu, sigma = 10.0, 2.0 # mean and standard deviation
N = 499
x = rng.normal(mu, sigma, N) # make the size of the data 499 just to be different from the sampling size
nBoot = 10_000 # <-- underscores just for visual purposes || This is the number of resamples
xBoot = np.zeros(nBoot) # allocate an array (semi-optional, depending on how you want to do it)
for ns in range(nBoot):  # generate multiple estimates
    iw = generate_sample_indices(N, method=1, rng=rng)  # indices with replacement
    xBoot[ns] = np.mean(x[iw])  # compute average -- if it were a multidimensional array, use the axis kwarg.
xAvgBoot = np.mean(xBoot) # Averages of re-samples
xStdBoot = np.std(xBoot) # Std Dev " "
xStdErrBoot = xStdBoot/nBoot # Std. Error of bootstrapped estimates
quants = np.quantile(xBoot, [0.025, 0.5, 0.975])
xBoot_025, xBoot_500, xBoot_975 = quants
print(f"The estimated mean is {xBoot_500} with 95% confidence interval that it is between {xBoot_025} and {xBoot_975}")
The estimated mean is 9.98351058116047 with 95% confidence interval that it is between 9.806645600599495 and 10.1543088012326
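Since x was generated from a known distribution, a quick sanity check (not part of the NCL example) is to compare the spread of the bootstrapped means with the conventional standard error of the mean:
# the standard deviation of the bootstrapped means should be close to sigma/sqrt(N)
print(f"bootstrap std of the means : {xStdBoot:.4f}")
print(f"conventional std error     : {x.std(ddof=1)/np.sqrt(N):.4f}")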
These illustrate various properties of the mean (opt@stat=0) using different bootstrap sampling parameters.
def bootstrap_stat(z, stat, nBoot, nDim, sample_size=None, sample_method=1, rng=None, **kwargs):
    """Bootstrap estimates of a user-specified statistic derived from a variable.
    The python translation of NCL's function can do things a bit differently.
    z : a numeric array (in NCL limited to up to four dimensions)
    stat: can be an integer or a string, with this mapping matching NCL:
        = 0 : 'mean' ('average')
        = 1 : 'standard deviation'
        = 2 : 'variance'
        = 3 : 'median'
        = 4 : 'min'
        = 5 : 'max'
        = 6 : 'sum'
        = 7 : 'correlation'
    nBoot : An integer specifying the number of bootstrap data samples to be generated.
    nDim : The dimension (i.e. axis, an integer) of z on which to calculate the statistic. NCL allows multiple, but here just one for now.
    kwargs:
        - sample_size : specifies the size of the resampled array to be used for the bootstrapped statistics (a value <= 1 is treated as a fraction of the data size)
        - sample_method : specifies the sampling method to use : 1 or True => with replacement , 0 or False => without
        - rng : specified random number generator (maybe with a specified seed)
        - **kwargs -- anything else, which could get passed along.
    Return:
        A tuple of zBoot, zBootAvg, zBootStd
        zBoot - The result of applying stat to each of the nBoot resamples of z.
        zBootAvg - The average of zBoot. If zBoot is just a vector, this is a scalar, but generally it is dimensioned one less than zBoot.
        zBootStd - Like zBootAvg, but the standard deviation.
    Note that zBoot is not ordered like in NCL, so the user might have to sort it to get confidence intervals.
    """
    stat_mapping = {0: 'mean', 1: 'standard deviation', 2: 'variance', 3: 'median', 4: 'min', 5: 'max', 6: 'sum', 7: 'correlation'}
    stat_funcs = {'mean': np.mean, 'standard deviation': np.std, 'variance': np.var, 'median': np.median, 'min': np.min, 'max': np.max, 'sum': np.sum, 'correlation': bscorr}
    if rng is None:
        rng = np.random.default_rng()  # Numpy's default Generator
    if sample_size is None:
        n = z.shape[nDim]  # defaults to sampling size same as the size of the data in that dimension.
    elif sample_size <= 1:
        # specifies a fraction of the sample size:
        n = int(z.shape[nDim] * sample_size)
    else:
        # assume user has specified a particular size
        n = sample_size
    if isinstance(stat, int):
        stat_spec = stat_mapping[stat]
    elif isinstance(stat, str):
        stat_spec = stat
    else:
        stat_spec = None
    if stat_spec is not None:
        func = stat_funcs[stat_spec]
    else:
        func = stat  # if user wants to pass in a function directly.
    # generate n samples and calculate the stat
    # since we want to allow ndarrays and only work on one dimension,
    # we need to initialize the correct shape
    zBootInit = func(z, axis=nDim)
    print(f'zBootInit shape : {zBootInit.shape} // Type : {type(zBootInit)} // is an ndarray: {isinstance(zBootInit, np.ndarray)}')
    if not isinstance(zBootInit, np.ndarray):
        zBootShape = nBoot
    else:
        zBootShape = tuple([nBoot] + list(zBootInit.shape))
    zBoot = np.zeros(zBootShape)
    print(f"Shape of zBoot: {zBoot.shape} using np.zeros({zBootShape}). Apply stat to axis {nDim}.")
    for i in range(nBoot):
        # generate random indices:
        iw = generate_sample_indices(n, method=sample_method, rng=rng, minval=0, maxval=z.shape[nDim])
        sample = np.take(z, iw, axis=nDim)
        zBoot[i, ...] = func(sample, axis=nDim)  # IS nDim CORRECT for multidimensional array?
    zBootAvg = np.mean(zBoot, axis=0)
    zBootStd = np.std(zBoot, axis=0)
    return zBoot, zBootAvg, zBootStd
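Before applying it to the example data, a quick hedged check with a small synthetic 2-D array (test values of my own choosing) illustrates the shapes produced when the statistic is computed along one axis:
# synthetic check: 200 'times' x 3 'locations', bootstrap the mean along axis 0
ztest = np.random.default_rng(42).normal(size=(200, 3))
zb, zbAvg, zbStd = bootstrap_stat(ztest, 'mean', 50, 0)
print(zb.shape, zbAvg.shape, zbStd.shape)   # expect (50, 3) (3,) (3,)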
def bscorr(x, y, *args, **kwargs):
    """This is probably not a good way to write this function.
    Try to use xr.corr() to do correlations along a given dimension.
    Otherwise, try to use stats.pearsonr().
    """
    if isinstance(x, xr.DataArray) and isinstance(y, xr.DataArray):
        if ("dim" in kwargs) and (isinstance(kwargs['dim'], str)):
            usedim = kwargs['dim']
        elif ("dim" in kwargs) and (isinstance(kwargs['dim'], int)):
            usedim = x.dims[kwargs['dim']]
        elif ("dim" in kwargs) and (isinstance(kwargs['dim'], list)):
            assert len(kwargs['dim']) == 2
            usedim = x.dims[kwargs['dim'][0]]
            if x.dims[kwargs['dim'][0]] != y.dims[kwargs['dim'][1]]:
                print("Very confused because the dims kwarg points to differently named dimensions.")
        else:
            usedim = None
        return xr.corr(x, y, dim=usedim)
    else:
        return stats.pearsonr(x, y)[0]  # only return the r value
def bootstrap_correl(x, y, nBoot, nDim, **kwargs):
    """Bootstrap estimates of sample cross correlations (i.e., Pearson's correlation coefficient) between two variables.
    This follows `bootstrap_stat` closely, but now we specify the function being used with two inputs.
    """
    if ('rng' in kwargs) and (kwargs['rng'] is not None):
        rng = kwargs['rng']
    else:
        rng = np.random.default_rng()
    if ('sample_size' in kwargs) and (kwargs['sample_size'] is not None):
        sample_size = kwargs['sample_size']
    else:
        sample_size = x.shape[nDim]
    for i in range(nBoot):
        indices = generate_sample_indices(sample_size, method=1, rng=rng, minval=0, maxval=x.shape[nDim])
        xsample = np.take(x, indices, axis=nDim)
        ysample = np.take(y, indices, axis=nDim)
        r = bscorr(xsample, ysample, dim=nDim)
        # print(f"size of indices: {len(indices)}, xsample: {xsample.shape}, ysample: {ysample.shape}, r: {r.shape}")
        if i == 0:
            if isinstance(r, float):
                rBoot = np.zeros(nBoot)
            else:
                rBootShape = tuple([nBoot] + list(r.shape))
                rBoot = np.zeros(rBootShape)
        rBoot[i, ...] = r
    rBootAvg = np.mean(rBoot, axis=0)
    rBootStd = np.std(rBoot, axis=0)
    return rBoot, rBootAvg, rBootStd
# To replace dim_stat4_n, we can use scipy's stats library.
# I think axis here can only be an int or None (use all dimensions).
def dim_stat4_n(data, axis=None, moments=4):
    """Return moments of a data distribution.
    If moments <= 4, use scipy's `describe` function.
    If moments > 4:
        For moment 1, use the mean.
        For moments >= 2, use scipy's `moment` function, which returns central moments
        (so the 2nd moment is the variance).
    """
    if moments <= 4:
        results = stats.describe(data, axis=axis, nan_policy='omit')
        result = results[2:moments+2]
    else:
        result = []
        for m in range(1, moments+1):
            print(f'moment {m}')
            if m == 1:
                result.append(np.mean(data, axis=axis))
            else:
                result.append(stats.moment(data, moment=m, axis=axis, nan_policy='omit'))
    return result
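A quick hedged check of both code paths with synthetic data (values made up here):
# first four moments via scipy's describe, then the mean plus central moments 2..6
dtest = np.random.default_rng(0).normal(size=1000)
print(dim_stat4_n(dtest, axis=0, moments=4))   # (mean, variance, skewness, kurtosis)
print(dim_stat4_n(dtest, axis=0, moments=6))   # [mean, 2nd..6th central moments]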
# Bonus -- NCL assumes the user has downloaded the data files, but what if we just grab them from their URLs.
#***************************************************************
#--- Read data file and extract LSAT
# Law School Data: Efron & Tibshirani (1993)
# An Introduction to the Bootstrap. Chapman and Hall
#***************************************************************
N = 15 # 82; 15
# fili = "law_school_"+N+".txt" # law_school_82.txt or law_school_15.txt
# LSAT = asciiread(diri+fili,(/N,3/), "float") ; 3 columns
LSAT = pd.read_csv(f"http://ncl.ucar.edu/Applications/Data/asc/law_school_{N}.txt", delim_whitespace=True)
# Also works: pd.read_csv(f"http://ncl.ucar.edu/Applications/Data/asc/law_school_{N}.txt", delimiter=r"\s+")
# Pandas is not always as automagical as this, but for very simple formatting, works like a charm.
x = LSAT['LSAT']
#***************************************************************
#--- Set bootstrap parameters and any desired options
#***************************************************************
stat = 0 # 0=mean; 1=std. dev; 2=variance; ... (matching the mapping in bootstrap_stat)
n = 15 # sub-sample size; default is n=N
nBoot = 1000 # bootstrap replications
opt = dict() # NCL allows any object to have arbitrary attributes,
opt['state'] = False # e.g., opt=False; opt@myatt = "Attribute"
# In python we could do that by defining a class or dataclass
# but for simple things, it is usually easier to make them dicts
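# For illustration only (a hypothetical alternative, not used below): types.SimpleNamespace
# gives attribute-style access much like NCL's opt@attribute, if a dict feels awkward.
from types import SimpleNamespace
opt_ns = SimpleNamespace(state=False, sample_size=n)   # opt_ns.sample_size -> 15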
# I'm not sure why this logic is included in the example, but
# we can certainly do something similar in python
if ('n' in locals()) and isinstance(n, int) and (n <= N):
    opt['state'] = True
    opt['sample_size'] = n  # default N
# the example sets seeds here ... as already mentioned, that does not seem necessary unless you need to do testing.
#***************************************************************
#--- bootstrap_stat: extract information from returned 'list' variable: Bootstrap
#***************************************************************
xBoot, xBootAvg, xBootStd = bootstrap_stat(x, stat, nBoot, 0, **opt) # opt holds the kwargs
print(f"xBoot shape: {xBoot.shape}")
print(f"Average of the bootstrapped statistic is {xBootAvg} with standard deviation {xBootStd}")
# xBootLow = bootstrap_estimate(xBoot, 0.025, False) # 2.5% lower confidence bound
# xBootMed = bootstrap_estimate(xBoot, 0.500, False) # 50.0% median of bootstrapped estimates
# xBootHi = bootstrap_estimate(xBoot, 0.975, False) # 97.5% upper confidence bound
# xBoot4 = dim_stat4_n(xBoot, 0)
# xBootSkew = xBoot4(2) # skewness of bootstrapped distribution
# xBootKurt = xBoot4(3) # Kurtosis " "
# I am not sure why we need `bootstrap_estimate`
# Just use quantiles of xBoot
# xBootQuantiles = np.quantile(xBoot, [0.025, 0.5, 0.975])
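# If a drop-in for NCL's bootstrap_estimate is wanted anyway, a minimal sketch could be
# (argument names assumed here; NCL's version also takes an options argument):
def bootstrap_estimate(zBoot, p, opt=None):
    """Return the value of the bootstrapped statistic at probability level p (0-1)."""
    return np.quantile(np.asarray(zBoot), p)
# e.g., bootstrap_estimate(xBoot, 0.025) is the same as np.quantile(xBoot, 0.025)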
xBootLow, xBootMed, xBootHi = np.quantile(xBoot, [0.025, 0.5, 0.975])
xBoot4 = dim_stat4_n(xBoot, axis=0, moments=4) # tuple(mean, variance, skewness, kurtosis)
# print(xBoot4)
xBootSkew = xBoot4[2]
xBootKurt = xBoot4[3]
moms = ["Mean", "Variance", "Skewness", "Kurtosis"]
[print(f"{moms[i]}: {a}") for i, a in enumerate(xBoot4)]
zBootInit shape : () // Type : <class 'numpy.float64'> // is an ndarray: False Shape of zBoot: (1000,) using np.zeros(1000). Apply stat to axis 0. xBoot shape: (1000,) Average of the bootstrapped statistic is 599.965 with standard deviation 10.75248258155607 Mean: 599.965 Variance: 115.73161327994664 Skewness: 0.05485091531248447 Kurtosis: -0.04868504006729557
[None, None, None, None]
# Pandas approach
# Pandas has many convenient tools, but is often of limited use for geoscience
# because it is built with the conceptual model of "tabular" data (i.e. like a spreadsheet)
# while geoscientific data sets are often multidimensional.
# In this case, the simple Dataframe that we have is easy to work with in pandas.
# select samples:
samples = [x.sample(n, replace=True) for i in range(nBoot)]
pdBoot = [s.mean() for s in samples]
pdMean = np.array(pdBoot).mean()
pdStd = np.array(pdBoot).std()
print(f"pandas version of xBootAvg, xBootStd: {pdMean}, {pdStd}")
pandas version of xBootAvg, xBootStd: 599.9359333333333, 10.87546177389979
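The confidence limits are just as easy with pandas (a quick addition, not part of the NCL example):
# quantiles of the pandas bootstrap distribution, analogous to np.quantile above
print(pd.Series(pdBoot).quantile([0.025, 0.5, 0.975]))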
#***************************************************************
#--- "normal" (conventional) estimates for full sample
# get t-value for desired 'p' and degrees of freedom
#***************************************************************
xStat4 = dim_stat4_n(x, axis=0, moments=4) # 1st 4 moments of original sample
print(xStat4)
# # explicitly extract for clarity
xAvg = xStat4[0] # original sample mean
xStd = np.sqrt(xStat4[1]) # " sample std dev
xSkew = xStat4[2] # skewness; departure from symmetry
xKurt = xStat4[3] # kurtosis; relative to a normal distribution
xMed = np.median(x) # median of original sample
print(xMed)
# estimate confidence limits
df = len(x)-1 # degrees of freedom (full sample size)
p = 0.975 # match default bootstrap 'p' (0.025, 0.975)
tval = stats.t.ppf(p, df, loc=0, scale=1) # IS this the same? --> cdft_t(p,df) # 2-sided
print(tval)
xLow = xAvg-tval*xStd # normal low: 2.5%
xHi = xAvg+tval*xStd # normal hi ; 97.5%
(600.2666666666667, 1746.780952380952, 0.422905832172061, -1.329122210490465) 580.0 2.1447866879169273
I won't include the NCL code for the plots, but it is found here. They make three versions using the two different data sets, but the plotting part is all the same.
For the "legend", I decided to use a multiline string. Those are triple-quoted strings in python. These ones are f-strings to put the variables into the strings for evaluation (this is called literal string interopolation). There is some trial and error in placing text like this, but I decided to put the text at the same height in each axes by getting the vertical axis extent and positioning the text at a specified fraction within that range.
One place where Matplotlib does not do a great job by default is text sizes. Here I show how to make the text bigger than the default using fontsize
and how to make the title bold using fontweight
. I also changed the tick label font size using tick_params
, which is pretty handy.
fig2, ax2 = plt.subplots(figsize=(12,6), ncols=2, constrained_layout=True)
fsiz = 16
ax2[0].hist(x, color='green')
ax2[0].set_xlabel("Sample LSAT", fontsize=fsiz)
ax2[0].set_ylabel("Frequency", fontsize=fsiz)
ax2[0].set_title(f"Original Sample: N = {N}", fontsize=16, fontweight='bold')
textSample = f"""Mean = {xAvg:5.2f}
Std = {xStd:5.2f}
Skew = {xSkew:5.2f}
Kurt = {xKurt:5.2f}
xLow = {xLow:5.2f}
xMed = {xMed:5.2f}
xHi = {xHi:5.2f} """
# place stats at a specified vertical position relative to yaxis
ylim = ax2[0].get_ylim()
ax2[0].text(620, 0.6*(ylim[1]-ylim[0]), textSample, fontsize=12)
# 2 ways to change tick label font size; the second way works better
# because the labels aren't converted to floats, so don't have to worry about formatting
# xticks = ax2[0].get_xticks()
# ax2[0].set_xticklabels(xticks, fontsize=fsiz-2)
ax2[0].tick_params(axis='both', labelsize=fsiz-2)
ax2[1].hist(xBoot, color='blue')
ax2[1].set_xlabel("Bootstrapped LSAT Means", fontsize=fsiz)
# ax2[1].set_ylabel("Frequency")
ax2[1].set_title(f"nBoot={nBoot}: N = {N}, n={n}", fontsize=fsiz, fontweight='bold')
textSample2 = f"""Mean = {xBootAvg:5.2f}
Std = {xBootStd:5.2f}
Skew = {xBootSkew:5.2f}
Kurt = {xBootKurt:5.2f}
xLow = {xBootLow:5.2f}
xMed = {xBootMed:5.2f}
xHi = {xBootHi:5.2f} """
ylim = ax2[1].get_ylim()
ax2[1].text(620, 0.6*(ylim[1]-ylim[0]), textSample2, fontsize=12)
ax2[1].tick_params(axis='both', labelsize=fsiz-2)
pass # suppress jupyter printing some extraneous output from last command // NOT NEEDED.
Bootstrap January monthly mean temperatures (degF; left histogram pair) and May monthly precipitation totals (inches; right histogram pair) for Boulder, CO for the period 1897-2014. Estimate various properties of the mean. The total sample size for each month is N=118. Here, 30-year sub-samples (n=30) are generated nBoot=1000 times.
The data used for this example is on the NCL website, but this isn't an OpenDAP server, so we need to download the file. Just for completeness, we will do that right here. There are a TON of ways to do it, but we will use requests
which is one of the most popular python libraries in the world.
# import requests
# import xarray as xr
data_url = "https://www.ncl.ucar.edu/Applications/Data/cdf/Boulder.precip.1897-2014.nc"
r = requests.get(data_url)
with open("boulder_precip.nc", 'wb') as f:
f.write(r.content)
# now open that file:
ds2 = xr.open_dataset('boulder_precip.nc')
# Follow the NCL example
# - specify a long name for the selected variable and month
# - specify a month [0-11]
month = {i:mstr for i, mstr in enumerate(["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"])}
print(month)
# - only get the data for that month by striding through the data
mon = 4 # => May
varname = 'precip'
precip = ds2[varname][mon::12]
precip.attrs['long_name'] = f"Boulder {varname} ({precip.attrs['units']}): {month[mon]}"
precip
{0: 'Jan', 1: 'Feb', 2: 'Mar', 3: 'Apr', 4: 'May', 5: 'Jun', 6: 'Jul', 7: 'Aug', 8: 'Sep', 9: 'Oct', 10: 'Nov', 11: 'Dec'}
<xarray.DataArray 'precip' (time: 118)> array([2.3 , 3.76, 0.55, 1.84, 1.62, 2.32, 2.02, 5.35, 4.78, 3.37, 5.29, 3.45, 1.85, 3.38, 0.9 , 2.82, 1.85, 3.57, 3.83, 3.77, 6.17, 2.03, 0.88, 1.61, 1.5 , 1.02, 3.47, 2.77, 1.61, 3.22, 1.72, 5.48, 1.5 , 2.17, 3.67, 1.1 , 3.94, 3.11, 7.04, 2.75, 2.31, 4.38, 1.74, 2.57, 2.07, 1.8 , 4.85, 1.77, 3.86, 3.55, 4.35, 1.24, 3.69, 3.49, 1.93, 4.45, 2.69, 1.33, 2.49, 3.67, 9.27, 4.17, 3.73, 3.79, 3.7 , 1.99, 1.37, 2.27, 1.42, 0.8 , 4.46, 2.27, 8.66, 1.17, 2. , 1.22, 4.88, 0.04, 4.33, 2.14, 0.93, 8.03, 6.67, 2.48, 4.47, 4.65, 5.54, 0.45, 1.37, 2.62, 2.12, 3.7 , 2.68, 1.73, 2.9 , 1.7 , 1.73, 1.35, 9.59, 4.63, 2.19, 1.82, 1.84, 1.6 , 3.62, 3.2 , 2.62, 1.28, 1.91, 1.14, 1.79, 4.21, 3.08, 2.71, 5.16, 1.78, 2.66, 4.43], dtype=float32) Coordinates: * time (time) int32 189705 189805 189905 190005 ... 201205 201305 201405 Attributes: long_name: Boulder precip (inches): May units: inches
# the next thing in the example is to print some info; we already see the data above, but we can also use scipy.stats.describe
# to get a little more:
precip_stats = stats.describe(precip)
print(precip_stats)
DescribeResult(nobs=118, minmax=(0.04, 9.59), mean=2.9972034, variance=3.183749, skewness=1.3595125675201416, kurtosis=2.3749956154263065)
#***************************************************************
#--- Set parameters and any desired options
#***************************************************************
N = precip_stats.nobs # number of years; could also use len(precip) or precip.shape[0]
# n = N # 30 # sub-sample size; default n=N; n=30 means 30 years
n = 30
df = n-1
nBoot = 1000 # number of resamples with replacement
# as in the NCL example, use default options and the mean (stat=0) for the statistic:
precip_BootStrap = bootstrap_stat(precip, 'mean', nBoot, 0)
# print(precip_BootStrap)
zBootInit shape : () // Type : <class 'xarray.core.dataarray.DataArray'> // is an ndarray: False Shape of zBoot: (1000,) using np.zeros(1000). Apply stat to axis 0.
At this point there does not seem to be anything different about the precipitation example than the LSAT one. If I were doing analysis that repeated code like in these examples, I would write a function to reduce the work. Let's write a function that will take our data and our bootstrapped distribution and make the plot we made before.
This function is a bit clumsy, but gets the job done. I'd usually try to separate the computation into one function and the plotting into another. If we were going to be doing this operation frequently, it might be worth making a class that would hold all the statistical data, making it easier to let the plot know what the sample size is etc.
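For example, a small dataclass along these lines (a sketch only, with my own naming; it is not used in the rest of this notebook) could carry the sample and the bootstrap results together, so a plotting routine would need just one argument:
from dataclasses import dataclass

@dataclass
class BootResult:
    """Hypothetical container sketch: original sample, bootstrap draws, and sub-sample size."""
    data: np.ndarray        # the original sample
    boot: np.ndarray        # the bootstrapped statistic, length nBoot
    sample_size: int        # sub-sample size n used for each resample

    @property
    def quantiles(self):
        # 2.5%, 50%, and 97.5% levels of the bootstrapped statistic
        return np.quantile(self.boot, [0.025, 0.5, 0.975])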
def plot_boot_sample(odata, bsdata, n=None):
    # conventional stats
    xAvg, xVariance, xSkew, xKurt = dim_stat4_n(odata, axis=0, moments=4)
    xStd = np.sqrt(xVariance)
    xMed = np.median(odata)
    df = len(odata) - 1  # degrees of freedom (full sample size)
    p = 0.975  # match default bootstrap 'p' (0.025, 0.975)
    tval = stats.t.ppf(p, df, loc=0, scale=1)  # IS this the same? --> cdft_t(p,df) # 2-sided
    xLow = xAvg - tval*xStd  # normal low: 2.5%
    xHi = xAvg + tval*xStd   # normal hi ; 97.5%
    # bootstrapped stats
    xBoot, xBootAvg, xBootStd = bsdata  # unpack the tuple of data
    xBootLow, xBootMed, xBootHi = np.quantile(xBoot, [0.025, 0.5, 0.975])
    xBootMean, xBootVariance, xBootSkew, xBootKurt = dim_stat4_n(xBoot, axis=0, moments=4)  # tuple(mean, variance, skewness, kurtosis)
    if n is not None:
        samplesize = n
    else:
        samplesize = len(odata)  # default assumed to be N
    figX, axX = plt.subplots(figsize=(12, 6), ncols=2, constrained_layout=True)
    fsiz = 16
    axX[0].hist(odata, color='green')
    axX[0].set_xlabel(" - ", fontsize=fsiz)
    axX[0].set_ylabel("Frequency", fontsize=fsiz)
    axX[0].set_title(f"Original Sample: N = {len(odata)}", fontsize=16, fontweight='bold')
    textSample = f"""Mean = {xAvg:5.2f}\nStd = {xStd:5.2f}\nSkew = {xSkew:5.2f}\nKurt = {xKurt:5.2f}\nxLow = {xLow:5.2f}\nxMed = {xMed:5.2f}\nxHi = {xHi:5.2f}"""
    # place stats at a specified vertical position relative to yaxis
    ylim = axX[0].get_ylim()
    xlim = axX[0].get_xlim()
    print(xlim)
    axX[0].text(xlim[0] + (0.05*(xlim[1]-xlim[0])), 0.6*(ylim[1]-ylim[0]), textSample, fontsize=12)
    axX[0].tick_params(axis='both', labelsize=fsiz-2)
    axX[1].hist(xBoot, color='blue')
    axX[1].set_xlabel("Bootstrapped", fontsize=fsiz)
    axX[1].set_title(f"nBoot={len(xBoot)}: N = {len(odata)}, n={samplesize}", fontsize=fsiz, fontweight='bold')  # use len(xBoot) rather than relying on the global nBoot
    textSample2 = f"""Mean = {xBootAvg:5.2f}\nStd = {xBootStd:5.2f}\nSkew = {xBootSkew:5.2f}\nKurt = {xBootKurt:5.2f}\nxLow = {xBootLow:5.2f}\nxMed = {xBootMed:5.2f}\nxHi = {xBootHi:5.2f} """
    ylim = axX[1].get_ylim()
    xlim = axX[1].get_xlim()
    axX[1].text(xlim[0] + (0.05*(xlim[1]-xlim[0])), 0.6*(ylim[1]-ylim[0]), textSample2, fontsize=12)
    axX[1].tick_params(axis='both', labelsize=fsiz-2)
    return figX, axX
fig_P, ax_P = plot_boot_sample(precip, precip_BootStrap, n=n)
ax_P[0].set_xlabel(precip.attrs["long_name"]) # customize the labels
(-0.43750001639127734, 10.067500160634518)
Text(0.5, 0, 'Boulder precip (inches): May')
These estimate the correlation coefficient between the 82-school LSAT and GPA using classical statistics and via the bootstrap method.
LSAT82 = pd.read_csv("http://ncl.ucar.edu/Applications/Data/asc/law_school_82.txt", delim_whitespace=True)
# Now that we are doing the same things again, let's make a function that provides all the conventional stats:
from collections import namedtuple
StatColl = namedtuple('StatColl',['Avg','Std','Med','Low','Hi','Skew','Kurt'])
def get_conventional_stats(data):
    xAvg, xVariance, xSkew, xKurt = dim_stat4_n(data, axis=0, moments=4)
    xStd = np.sqrt(xVariance)
    xMed = np.median(data)
    df = len(data) - 1  # degrees of freedom (full sample size)
    p = 0.975  # match default bootstrap 'p' (0.025, 0.975)
    tval = stats.t.ppf(p, df, loc=0, scale=1)  # IS this the same? --> cdft_t(p,df) # 2-sided
    xLow = xAvg - tval*xStd  # normal low: 2.5%
    xHi = xAvg + tval*xStd   # normal hi ; 97.5%
    return StatColl(xAvg, xStd, xMed, xLow, xHi, xSkew, xKurt)
lsat82_stats = get_conventional_stats(LSAT82['LSAT'])
gpa82_stats = get_conventional_stats(LSAT82['GPA'])
#*************************************************
#--- Compute sample correlation; 'r' distribution is skewed
#--- Fisher z-transform; assumes bivariate normal distribution; really n>=10
#--- "conventional" statistical estimates; get t-value for desired 'p'
#*************************************************
r = np.corrcoef(LSAT82['LSAT'], LSAT82['GPA'])[0,1] # sample cross-correlation coefficient
df = len(LSAT82['LSAT'])-2 # degrees of freedom
z = 0.5*np.log((1+r)/(1-r)) # Fisher z-transform
zse = 1.0/np.sqrt(len(LSAT82['LSAT'])-3) # standard error of z statistic; use the full LSAT82 sample size here, not the leftover sub-sample n
ptst = 0.975
zval = stats.t.ppf(ptst, df, loc=0, scale=1) # Still not sure this is exactly the same as NCL's calculation.
zLow = z-zval*zse # z-normal low: 2.5%
zHi = z+zval*zse # z-normal hi ; 97.5%
# transform z back to 'r' space
rLow = np.tanh(zLow) # =(exp(2*zlow)-1)/(exp(2*zlow)+1)
rHi = np.tanh(zHi) # =(exp(2*zhi )-1)/(exp(2*zhi )+1)
rse = np.sqrt(1-r**2)/np.sqrt(df) # standard error of r statistic
t = r*np.sqrt(df/(1-r**2)) # 'classic' t-value for Pearson x-correlation
p = stats.t.sf(t, df) # probability ("survival function")
#*************************************************
#--- bootstrap
#*************************************************
nBoot = 1000
rBoot, rBootAvg, rBootStd = bootstrap_correl(LSAT82['LSAT'], LSAT82['GPA'], nBoot, 0)
rBootLow, rBootMed, rBootHi = np.quantile(rBoot, [0.025, 0.5, 0.975])
rBootStats = get_conventional_stats(rBoot)
# Plots
# first one is just the two series:
fig3, ax3 = plt.subplots(figsize=(8,4))
ax3.plot(LSAT82.index, LSAT82['LSAT'], label='LSAT', color='blue')
ax3.set_ylabel("LSAT")
ax3b = ax3.twinx() # instantiate a second axes that shares the same x-axis
ax3b.plot(LSAT82.index, LSAT82['GPA'], label='GPA', color='red')
ax3b.set_ylabel("GPA")
ax3.set_xlabel("School Index")
fig3.legend()
<matplotlib.legend.Legend at 0x187b00550>
# Our plot function is not quite generic enough to do this plot, so we repeat ourselves.
fig4, ax4 = plt.subplots(figsize=(12,6), ncols=2, constrained_layout=True)
ax4[0].hist(LSAT82['LSAT'])
ax4[1].hist(LSAT82['GPA'])
# add the stats, but try to use our tuple
tx = ""
for i,fld in enumerate(lsat82_stats):
tx += f"{lsat82_stats._fields[i]}={getattr(lsat82_stats,lsat82_stats._fields[i]):5.2f}\n"
ylim = ax4[0].get_ylim()
xlim = ax4[0].get_xlim()
ax4[0].text(xlim[0]+(0.05*(xlim[1]-xlim[0])), 0.6*(ylim[1]-ylim[0]), tx, fontsize=12)
ax4[0].tick_params(axis='both', labelsize=fsiz-2)
ax4[0].set_title(f"LSAT: N = {len(LSAT82['LSAT'])}")
tx = ""
for i,fld in enumerate(gpa82_stats):
tx += f"{gpa82_stats._fields[i]}={getattr(gpa82_stats,gpa82_stats._fields[i]):5.2f}\n"
ylim = ax4[1].get_ylim()
xlim = ax4[1].get_xlim()
ax4[1].text(xlim[0]+(0.05*(xlim[1]-xlim[0])), 0.6*(ylim[1]-ylim[0]), tx, fontsize=12)
ax4[1].tick_params(axis='both', labelsize=fsiz-2)
ax4[1].set_title(f"GPA: N = {len(LSAT82['GPA'])}")
Text(0.5, 1.0, 'GPA: N = 82')
fig5, ax5 = plt.subplots()
ax5.hist(rBoot)
tx = f"r={r:0.3f}\nStdErr={zse:0.3f}\nrLow={rLow:0.3f}\nrHi={rHi:0.3f}\n\nrBootAvg={rBootAvg:0.3f}\nrBootLow={rBootLow:0.3f}\nrBootMed={rBootMed:0.3f}\nrBootHi={rBootHi:0.3f}"
ax5.text(0.6, 100, tx)
ax5.set_xlabel("Bootstrapped cross-correlations")
ax5.set_ylabel("Frequency")
Text(0, 0.5, 'Frequency')
# replicate two NCL functions
def clmMonTLL(da):
    return da.groupby('time.month').mean(dim='time')

def calcMonAnomTLL(da, climo):
    return da.groupby('time.month') - climo
%%time
data_url = "https://www.ncl.ucar.edu/Applications/Data/cdf/SOI.nc"
req = requests.get(data_url)
with open("SOI.nc", 'wb') as f:
f.write(req.content)
# now open that file:
ds3 = xr.open_dataset('SOI.nc')
## SET START & END YEARS HERE:
startYear = 1948
endYear = 1986
# soi = ds3['SOI_SIGNAL'][984:1752] # 194801 - 201112 <-- Just following the example, +1 to get December
# Change to allow specifying start and end years:
soi = ds3['SOI_SIGNAL'].sel(time=slice(f"{startYear}-01",f"{endYear}-12"))
# print(soi)
# Note that xarray has already decoded the time coordinate into datetime64[ns] objects.
print(f"ymStrt={soi['time'][0].dt.strftime('%Y/%m').item()} ymLast={soi['time'][-1].dt.strftime('%Y/%m').item()}")
nSOI = len(soi)
#***********************************************************
#--- Calculate some basic statistics for the SOI
# These statistics are not used .... just background statistics
#***********************************************************
# re-use our function:
soiStats = get_conventional_stats(soi)
# access as soiStats.Avg for example,
# or getattr(soiStats, "Std")
print(soiStats)
# NCL's next step is to load another dataset and get the monthly climatology
# NOTE: I couldn't find that file on the NCL site,
# but NOAA hosts a file with the same name on their OpenDAP server,
# so we will use that.
# ;***********************************************************
# ;--- Read variable
# ; Calculate monthly climatology; Calculate monthly anomalies
# ;***********************************************************
# diri = "./"
# fili = "air.mon.mean.nc"
# pthi = diri+fili
# f = addfile(pthi, "r")
# printVarSummary(f->air)
# y = short2flt(f->air(0:767,0,:,:)) ; [time | 774] x [lat | 73] x [lon | 144]
# yClm = clmMonTLL(y) ; [time | 12] x [lat | 73] x [lon | 144]
# yAnom = calcMonAnomTLL(y, yClm) ; [time | 774] x [lat | 73] x [lon | 144]
# dimy = dimsizes(y)
# ntim = dimy(0) ; 768: 194801-201112
# if (ntim.ne.nsoi) then
# print("time mismatch: ntim="+ntim+" nsoi="+nsoi)
# exit
# end if
# https://psl.noaa.gov/repository/entry/show?entryid=synth%3Ae570c8f9-ec09-4e89-93b4-babd5651e7a9%3AL25jZXAucmVhbmFseXNpcy5kZXJpdmVkL3N1cmZhY2UvYWlyLm1vbi5tZWFuLm5j
ds5 = xr.open_dataset("https://psl.noaa.gov/repository/opendap/synth:e570c8f9-ec09-4e89-93b4-babd5651e7a9:L25jZXAucmVhbmFseXNpcy5kZXJpdmVkL3N1cmZhY2UvYWlyLm1vbi5tZWFuLm5j/entry.das")
# NCL converts from short to float, but either xarray does that for us (likely) or the file is different.
# Note there appear to be many more years here, so we will subset to match SOI before doing calculations.
y = ds5['air'].sel(time=slice(f"{startYear}-01",f"{endYear}-12"))
yClm = clmMonTLL(y) # [month | 12] x [lat | 73] x [lon | 144]
yAnom = calcMonAnomTLL(y, yClm) # [time | 468] x [lat | 73] x [lon | 144] for 1948-1986
print(yClm.shape)
print(yAnom.shape)
ymStrt=1948/01 ymLast=1986/12 StatColl(Avg=-0.08030657, Std=1.7189989, Med=-0.011237949, Low=-3.458236996457064, Hi=3.2976238619536993, Skew=-0.3417573869228363, Kurt=1.2288340102030464) (12, 73, 144) (468, 73, 144) CPU times: user 315 ms, sys: 185 ms, total: 500 ms Wall time: 5.52 s
%%time
#***********************************************************
#--- Compute conventional sample correlation('r') at each grid point
# Assumes bivariate normal distribution
# 'r' distribution is skewed, hence, use Fisher z-transform
#***********************************************************
r = xr.corr(soi, yAnom, dim='time')
r.attrs['long_name'] = "correlation: xr.corr(soi, yAnom, dim='time')"
# print(r)
# r = escorc_n(soi,yAnom,0,0) ; [73] x [144]
# r@long_name = "correlation: escorc"
df = len(soi)-2
rse = np.sqrt(1-r**2)/np.sqrt(df) # [73] x [144]
# rse@long_name = "correlation: escorc: std. error"
rse.attrs['long_name'] = "correlation: xr.corr: std. error"
z = 0.5*np.log((1+r)/(1-r)) # [73] x [144]
z.attrs['long_name'] = "Fisher z-transform"
zse = 1.0/np.sqrt(len(soi)-3) # [1]
# Oops, zse is just a plain scalar (not a DataArray), so there are no attrs to set.
# zse.attrs['long_name'] = "standard error of z statistic"
# NCL then copies metadata (i.e., coordinates) to these, but we've still got them.
CPU times: user 71.4 ms, sys: 22.7 ms, total: 94.1 ms Wall time: 93 ms
%%time
#***********************************************************
#--- Conventional (normal) estimates; use z-transformed values
#--- Get t-value for specified 'ptst' : Add meta data
#***********************************************************
ptst = 0.975 # match default bootstrap 'p' (0.025, 0.975)
zval = stats.t.ppf(ptst, df, loc=0, scale=1)
zLow = z-zval*zse # z-normal low: 2.5%
zHi = z+zval*zse # z-normal hi ; 97.5%
# transform z back to 'r' space
rLow = np.tanh(zLow) # =(exp(2*zlow)-1)/(exp(2*zlow)+1)
rHi = np.tanh(zHi) # =(exp(2*zhi )-1)/(exp(2*zhi )+1)
rLow.attrs['long_name'] = "Correlation: 2.5%"
rHi.attrs['long_name'] = "Correlation: 97.5%"
tval = r*np.sqrt(df/(1-r**2)) # conventional t-value
psoi = stats.t.sf(tval, df) # probability
psoi = xr.DataArray(psoi, dims=rLow.dims, coords=rLow.coords) # Copy MetaData
psoi.attrs['long_name'] = "Conventional Probability"
CPU times: user 5.34 ms, sys: 1.06 ms, total: 6.4 ms Wall time: 5.42 ms
%%time
#***********************************************************
#--- BOOTSTRAP
# Extract from returned 'list' variable; Add meta data
#***********************************************************
Nsamples = len(soi) # convenience
sample_size = len(soi) # no subsampling
nBoot = 500 # test with nBoot=10
rBoot, rBootAvg, rBootStd = bootstrap_correl(soi, yAnom, nBoot, yAnom.dims.index("time"))
qnt = np.quantile(rBoot, [0.025, 0.5, 0.975], axis=0)
rBootLow = qnt[0,:,:]
rBootMed = qnt[1,:,:]
rBootHi = qnt[2,:,:]
CPU times: user 31.4 s, sys: 4.87 s, total: 36.2 s Wall time: 36.3 s
%%time
# Now the plots
# first one is just the original SOI data
fig6, ax6 = plt.subplots(figsize=(6,6))
fsiz = 16
ax6.hist(soi, color='green')
ax6.set_xlabel("SOI signal", fontsize=fsiz)
ax6.set_ylabel("Frequency", fontsize=fsiz)
ax6.set_title(f"Original Sample: N = {len(soi)}", fontsize=16, fontweight='bold')
tx = ""
for i,fld in enumerate(soiStats):
tx += f"{soiStats._fields[i]}={getattr(soiStats,soiStats._fields[i]):5.2f}\n"
# place stats at a specified vertical position relative to yaxis
ylim = ax6.get_ylim()
xlim = ax6.get_xlim()
ax6.text(xlim[0]+(0.25*(xlim[1]-xlim[0])), 0.6*(ylim[1]-ylim[0]), tx, fontsize=12)
CPU times: user 37.2 ms, sys: 5.05 ms, total: 42.3 ms Wall time: 38.3 ms
Text(-4.790351486206054, 88.83, 'Avg=-0.08\nStd= 1.72\nMed=-0.01\nLow=-3.46\nHi= 3.30\nSkew=-0.34\nKurt= 1.23\n')
%time
# The second plot is a set of maps.
# So we'll get Cartopy to help.
# import cartopy.crs as ccrs
# import matplotlib as mpl
# import colorcet as cc # Provides colormaps
clrnorm = mpl.colors.TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
clrmap = "RdBu" # cc.cm.bwy # "PiYG"
clevels = np.linspace(-1, 1, 21)
fig7, ax7 = plt.subplots(figsize=(12,12), ncols=2, nrows=3, subplot_kw={"projection":ccrs.PlateCarree(central_longitude=180.)}, sharex=True, sharey=True, constrained_layout=True)
aa = ax7.ravel()
lons, lats = np.meshgrid(ds5['lon'], ds5['lat'])
imgs = []
imgs.append(aa[0].contourf(lons, lats, rLow, levels=clevels, transform=ccrs.PlateCarree(), cmap=clrmap, norm=clrnorm))
aa[0].set_title("rLow")
imgs.append(aa[1].contourf(lons, lats, rBootLow, levels=clevels, transform=ccrs.PlateCarree(), cmap=clrmap, norm=clrnorm))
aa[1].set_title("rBootLow")
imgs.append(aa[2].contourf(lons, lats, r, levels=clevels, transform=ccrs.PlateCarree(), cmap=clrmap, norm=clrnorm))
aa[2].set_title("r")
imgs.append(aa[3].contourf(lons, lats, rBootMed, levels=clevels, transform=ccrs.PlateCarree(), cmap=clrmap, norm=clrnorm))
aa[3].set_title("rBootMed")
imgs.append(aa[4].contourf(lons, lats, rHi, levels=clevels, transform=ccrs.PlateCarree(), cmap=clrmap, norm=clrnorm))
aa[4].set_title("rHi")
imgs.append(aa[5].contourf(lons, lats, rBootHi, levels=clevels, transform=ccrs.PlateCarree(), cmap=clrmap, norm=clrnorm))
aa[5].set_title("rBootHi")
[a.coastlines() for a in aa]
[a.set_xticks(np.arange(-180,180+60,60)) for a in aa]
[a.set_yticks(np.arange(-90,90+30,30)) for a in aa]
# [fig7.colorbar(imgs[i], ax=a) for i, a in enumerate(aa)]
fig7.colorbar(imgs[-1], ax=aa, orientation='horizontal', shrink=0.45)
CPU times: user 2 µs, sys: 0 ns, total: 2 µs Wall time: 2.86 µs
<matplotlib.colorbar.Colorbar at 0x18d9db3d0>
This picture does not match NCL's. What's going on?
To investigate, I re-ran the NCL example. First, the file we used from NOAA is definitely different from the original. In the original example, the format was short
, and in the new file it is float
. Also, the new file is [time, lat, lon]
while the NCL example had another dimension (lev
?).
When I re-ran the NCL script with the versions of the files used here, the difference is harder to discern. One issue with the visual inspection is that the colormap used for NCL is very strange (precip4_diff_19lev
). The colors change inconsistently through the colormapping, with adjacent levels sometimes being very similar and sometimes very different. To correct for this, I switched the NCL colors to MPL_RdBu
and changed the colors above to RdBu
. Visually inspecting, the two versions seem qualitatively the same, though it looks like some wiggles and contours still differ.
The slight differences are not surprising, especially in the details of capturing contour shapes.
Additional checks on some of the statistics show consistent results when using the same samples. Again, note that the NCL script seems to use 1948-2011, but the figures that it shows are for 1948-1986. That makes a big difference.
Timing-wise, since I re-ran the NCL version, it is worth noting it took 81 s to run on a 2016 MacBook Pro.
Adding up the simple %%time
cell magics in the cells above, the Python version comes in at roughly half that, most of it in the bootstrap_correl cell.
There could be some things that aren't fair about this comparison, so I don't take a near 2x difference seriously. But it is worth noting that the Python timing includes getting the files from the internet (though when I timed it, they might have been cached, which could make a difference). For the NCL example, I separately downloaded the file from NOAA.
My NCL version is 6.6.2.
#
# Bonus -- how to calculate sample moments -- This is from a sidetrack when I thought NCL and python were disagreeing.
#
# MEAN
soi_np_mean = np.mean(soi)
soi_mean = np.sum(soi) / len(soi)
print(f"Two versions of the mean: {soi_np_mean.item()}, {soi_mean.item()} --> they are about the same: {np.isclose(soi_mean,soi_np_mean)}")
# the standard deviation (sqrt(VARIANCE)):
soi_np_std = np.std(soi)
soi_std = np.sqrt(np.sum((soi - soi_mean)**2)/len(soi))
print(f"Two versions of the std: {soi_np_std.item()}, {soi_std.item()} --> they are about the same: {np.isclose(soi_std,soi_np_std)}")
# SKEWNESS
soi_skew = ((1/len(soi))*np.sum((soi - soi_mean)**3)) / ((1/(len(soi)))*(np.sum((soi - soi_mean)**2)))**(3/2)
## note that if we use N-1 version in the denominator, that causes isclose to fail for the comparison.
# no numpy version => scipy.stats
soi_sp_skew = stats.skew(soi)
print(f"Two versions of the skew: {soi_sp_skew}, {soi_skew.item()} --> they are about the same: {np.isclose(soi_skew,soi_sp_skew)}")
# KURTOSIS
d = soi - soi_mean
top = np.mean(d**4)
bot = (np.mean(d**2))**2
soi_kurt = (top/bot) - 3
# no numpy version => scipy.stats
soi_sp_kurt = stats.kurtosis(soi)
soi_sp_kurt_pearson = stats.kurtosis(soi, fisher=False)
print(f"Kurtosis {soi_kurt.item()}, {soi_sp_kurt}, {soi_sp_kurt_pearson}")
Two versions of the mean: -0.08030656725168228, -0.0803065666785607 --> they are about the same: True Two versions of the std: 1.7171614170074463, 1.7171614006605183 --> they are about the same: True Two versions of the skew: -0.3417573869228363, -0.34175732336081166 --> they are about the same: True Kurtosis 1.2288346020708394, 1.2288340102030464, 4.228834010203046
These estimate the linear regression coefficient between the LSAT and GPA. Additional information based upon the original sample is provided via regline_stats.
The NCL example uses bootstrap_regcoef
. That function is a bootstrapping wrapper for regCoef_n
.
There are so many approaches to linear regression that it is difficult to decide how to try to reproduce NCL's functionality. See [Real Python](https://realpython.com/linear-regression-in-python/) for a good overview. If statsmodels
is installed, it appears to be a comprehensive replacement for regCoef, regline, and regline_stats for the 1-dimensional case. Since regline_stats actually only provides the 1-d case, this is likely the most complete approach. For this example, I didn't want to add an additional dependency, so we will use SciPy again, and it will be adequate for all the needs here.
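For reference, here is a minimal, hedged sketch of what the statsmodels route might look like; it is guarded with a try/except so the notebook still runs without statsmodels installed.
# Hypothetical statsmodels sketch (an extra dependency, so it is only attempted here):
try:
    import statsmodels.api as sm
    X = sm.add_constant(LSAT82['LSAT'])   # design matrix with an intercept column
    ols = sm.OLS(LSAT82['GPA'], X).fit()
    print(ols.params)      # intercept and slope (the regression coefficient)
    print(ols.conf_int())  # 95% confidence limits, similar to regline_stats output
except ImportError:
    print("statsmodels is not installed; skipping this sketch")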
# reproduce NCL's `regline_stats`
# For 1-dimensional case at least:
from scipy.linalg import lstsq
def regCoef(a, b):
    # least-squares slope of b on a (1-d sketch); the column of ones supplies the intercept
    p, res, rnk, s = lstsq(np.column_stack([np.ones_like(a), a]), b)
    return p[1]
# Let's make sure we have the data:
LSAT15 = pd.read_csv("http://ncl.ucar.edu/Applications/Data/asc/law_school_15.txt", delim_whitespace=True)
LSAT82 = pd.read_csv("http://ncl.ucar.edu/Applications/Data/asc/law_school_82.txt", delim_whitespace=True)
nBoot = 5000
xdata, ydata = LSAT82['LSAT'], LSAT82['GPA']
rc = stats.linregress(xdata, ydata)
# rc is an object of type scipy.stats._stats_mstats_common.LinregressResult
# and for our purposes, it acts like a named tuple:
print(rc)
# Note: the regression coefficient is the same as the slope for a simple linear regression
LinregressResult(slope=0.003741868745134814, intercept=0.8989289433797469, rvalue=0.7599978555038979, pvalue=1.2402903310112808e-16, stderr=0.0003577625024658637, intercept_stderr=0.21421814196716416)
def bootstrap_regcoef(x, y, nBoot, nDim=0, **kwargs):
    """Bootstrap estimates of the linear regression coefficient.
    This is a lot of repeated code, and we could probably implement a bootstrap class that could reduce the code.
    Implementation Note: I am not implementing the N-d version, so nDim really does not do anything here.
    """
    if nDim is None:
        nDim = 0
    elif nDim != 0:
        raise NotImplementedError("Sorry, bootstrap_regcoef does not understand arrays yet. Please provide 1-dimensional arrays instead.")
    if ('rng' in kwargs) and (kwargs['rng'] is not None):
        rng = kwargs['rng']
    else:
        rng = np.random.default_rng()
    if ('sample_size' in kwargs) and (kwargs['sample_size'] is not None):
        sample_size = kwargs['sample_size']
    else:
        sample_size = x.shape[nDim]
    print(f"[bootstrap_regcoef] sample size will be {sample_size}")
    for i in range(nBoot):
        indices = generate_sample_indices(sample_size, method=1, rng=rng, minval=0, maxval=x.shape[nDim])
        xsample = np.take(x, indices, axis=nDim)
        ysample = np.take(y, indices, axis=nDim)
        regression_result = stats.linregress(xsample, ysample)
        regression_coef = regression_result.slope
        # print(f"size of indices: {len(indices)}, xsample: {xsample.shape}, ysample: {ysample.shape}")
        if i == 0:
            if isinstance(regression_coef, float):
                rBoot = np.zeros(nBoot)
            else:
                # implementation: this should not happen until we support arrays
                rBootShape = tuple([nBoot] + list(regression_coef.shape))
                rBoot = np.zeros(rBootShape)
        rBoot[i, ...] = regression_coef
    rBootAvg = np.mean(rBoot, axis=0)
    rBootStd = np.std(rBoot, axis=0)
    return rBoot, rBootAvg, rBootStd
rcBoot, rcBootAvg, rcBootStd = bootstrap_regcoef(xdata, ydata, nBoot)
[bootstrap_regcoef] sample size will be 82
rcBootLow, rcBootMed, rcBootHi = np.quantile(rcBoot, [0.025, 0.5, 0.975])
# the second example uses the LSAT82 data, but makes samples of size 15:
n = 15
rcBoot_b, rcBootAvg_b, rcBootStd_b = bootstrap_regcoef(xdata, ydata, nBoot, **{"sample_size":n})
rcBootLow_b, rcBootMed_b, rcBootHi_b = np.quantile(rcBoot_b, [0.025, 0.5, 0.975])
[bootstrap_regcoef] sample size will be 15
# the third example uses LSAT15
# Now we can see how little code it takes to do the whole calculation with another dataset:
# call it xdata2 to use in plot; use _c for panel c in plots
xdata2, ydata2 = LSAT15['LSAT'], LSAT15['GPA']
rc_c = stats.linregress(xdata2, ydata2)
rcBoot_c, rcBootAvg_c, rcBootStd_c = bootstrap_regcoef(xdata2, ydata2, nBoot)
rcBootLow_c, rcBootMed_c, rcBootHi_c = np.quantile(rcBoot_c, [0.025, 0.5, 0.975])
[bootstrap_regcoef] sample size will be 15
fig8, ax8 = plt.subplots(figsize=(18,6), ncols=3, sharex=True)
fsiz = 16
# note: do the histograms first so that text placement works with the shared axes.
ax8[0].hist(rcBoot, color='gold')
ax8[1].hist(rcBoot_b, color='gold')
ax8[2].hist(rcBoot_c, color='gold')
ax8[0].set_xlabel("regression coefficient", fontsize=fsiz)
ax8[0].set_ylabel("Frequency", fontsize=fsiz)
ax8[0].set_title(f"Boot regcoef: nBoot={nBoot}, N = n = {len(xdata)}", fontsize=16, fontweight='bold')
tx = ""
for i,fld in enumerate(rc):
tx += f"{rc._fields[i]}={getattr(rc,rc._fields[i]):5.4f}\n"
tx += "\n"
tx += f"rcAvg = {rcBootAvg:5.4f}\nrcStd={rcBootAvg:5.4f}\nrcLow={rcBootLow:5.4f}\nrcMed={rcBootMed:5.4f}\nrcHi={rcBootHi:5.4f}"
ylim = ax8[0].get_ylim()
xlim = ax8[0].get_xlim()
ax8[0].text(xlim[0]+(0.05*(xlim[1]-xlim[0])), 0.4*(ylim[1]-ylim[0]), tx, fontsize=12)
ax8[1].set_xlabel("regression coefficient", fontsize=fsiz)
ax8[1].set_ylabel("Frequency", fontsize=fsiz)
ax8[1].set_title(f"Boot regcoef: nBoot={nBoot}, N = {len(xdata)}, n = {n}", fontsize=14, fontweight='bold')
tx = ""
for i,fld in enumerate(rc):
tx += f"{rc._fields[i]}={getattr(rc,rc._fields[i]):5.4f}\n"
tx += "\n"
tx += f"rcAvg = {rcBootAvg_b:5.4f}\nrcStd={rcBootAvg_b:5.4f}\nrcLow={rcBootLow_b:5.4f}\nrcMed={rcBootMed_b:5.4f}\nrcHi={rcBootHi_b:5.4f}"
ylim = ax8[1].get_ylim()
xlim = ax8[1].get_xlim()
ax8[1].text(xlim[0]+(0.05*(xlim[1]-xlim[0])), 0.4*(ylim[1]-ylim[0]), tx, fontsize=12)
ax8[2].set_xlabel("regression coefficient", fontsize=fsiz)
ax8[2].set_ylabel("Frequency", fontsize=fsiz)
ax8[2].set_title(f"Boot regcoef: nBoot={nBoot}, N = {len(xdata2)}, n = {len(xdata2)}", fontsize=14, fontweight='bold')
tx = ""
for i,fld in enumerate(rc):
tx += f"{rc_c._fields[i]}={getattr(rc_c,rc_c._fields[i]):5.4f}\n"
tx += "\n"
tx += f"rcAvg = {rcBootAvg_c:5.4f}\nrcStd={rcBootAvg_c:5.4f}\nrcLow={rcBootLow_c:5.4f}\nrcMed={rcBootMed_c:5.4f}\nrcHi={rcBootHi_c:5.4f}"
ylim = ax8[2].get_ylim()
xlim = ax8[2].get_xlim()
ax8[2].text(xlim[0]+(0.05*(xlim[1]-xlim[0])), 0.4*(ylim[1]-ylim[0]), tx, fontsize=12)
Text(-0.001880947811934286, 749.7, 'slope=0.0045\nintercept=0.3794\nrvalue=0.7764\npvalue=0.0007\nstderr=0.0010\n\nrcAvg = 0.0045\nrcStd=0.0045\nrcLow=0.0027\nrcMed=0.0046\nrcHi=0.0059')