# read the data
# NOTE(review): the arguments (3, 4) presumably select a table/chapter in the
# course data set — confirm against read_data_fx.
from read_data_fx import read_tb_dataTA
data=read_tb_dataTA(3,4)
X=data[:,0]  # column 0: size of minimum deposit (predictor)
Y=data[:,1]  # column 1: number of new accounts (response)
import numpy as np
import pandas as pd

# Tabulate the raw data: one row per branch with predictor and response.
n = len(X)
df = pd.DataFrame({'Branch': np.arange(1, n + 1),
                   'Size of minimum deposit': X[:],
                   'Number of new Accounts': Y[:]})
# DataFrame.reindex_axis was deprecated and then removed from pandas;
# reindex(columns=...) is the supported way to fix the column order.
df = df.reindex(columns=['Branch',
                         'Size of minimum deposit',
                         'Number of new Accounts'])
# set_index returns a new frame — assign it so the index actually sticks.
df = df.set_index('Branch')
def lin_reg(X,Y):
    """Least-squares fit of the straight line Y = b0 + b1*X.

    Returns a dict with the intercept ``b0``, slope ``b1``, fitted
    values ``Yhat``, residuals ``e_i``, error sum of squares ``sse``
    and regression sum of squares ``ssr``.
    """
    x_mean = np.mean(X)
    y_mean = np.mean(Y)
    dx = X - x_mean
    dy = Y - y_mean
    # normal-equation solution for slope and intercept
    b1 = np.sum(dx * dy) / np.sum(dx ** 2)
    b0 = y_mean - b1 * x_mean
    fitted = b0 + b1 * X
    resid = Y - fitted
    return {'Yhat': fitted,
            'b0': b0,
            'b1': b1,
            'e_i': resid,
            'sse': np.sum(resid ** 2),
            'ssr': np.sum((fitted - y_mean) ** 2)}
def disp_lin_eq(X,Y):
    """Render the fitted regression equation \\hat{Y} = b0 + b1*X as LaTeX."""
    fit = lin_reg(X, Y)
    intercept = format(fit['b0'], '.2f')
    slope = format(fit['b1'], '.2f')
    from IPython.display import display, Math, Latex
    display(Math(r'\hat{Y}=' + str(intercept) + '+' + str(slope) + r'X'))

disp_lin_eq(X,Y)
# Scatter plot of the data with the fitted least-squares line overlaid.
import matplotlib.pyplot as plt
%matplotlib inline
linreg=lin_reg(X,Y)
b0=linreg['b0'];b1=linreg['b1'];
plt.figure(figsize=(10,8))
plt.plot(X,Y,'ro',ms=20)  # observations as red dots
plt.plot(X,b0+b1*X,'b-',lw=5);  # fitted line Yhat = b0 + b1*X
plt.xlim(55,205);
plt.ylim(10,175);
plt.xlabel('Size of minimum deposit',fontsize=15);
plt.ylabel('Number of new Accounts',fontsize=15);
Notation:
$j$ indexes the $j$th unique level of $X$
$c$ is the total number of unique levels of $X$
$i$ indexes the repeated values/replicates for a particular level of $X$
$n$ is the total number of data points
$n_j$ is the number of repeated values/replicates at a particular level $j$ of $X$
# Group the replicate responses Y_ij by each unique level of X.
X1 = np.unique(X)  # the c distinct levels X_j, sorted
# Responses belonging to each level, cast to int32 as in the source data.
Yij = [np.int32(Y[X == xj]) for xj in X1]
# Mean response bar{Y}_j at each level.
barYj = [np.mean(y) for y in Yij]
# Raw string avoids the invalid '\o' escape in '$\overline{Y_j}$'.
df = pd.DataFrame({'$j$': np.arange(1, len(X1) + 1),
                   '$X_j$': X1[:],
                   '$Y_{ij}$': Yij[:],
                   r'$\overline{Y_j}$': barYj[:]})
# set_index returns a new frame — assign it so the index actually sticks.
df = df.set_index('$j$')
Full Model:
$Y_{ij}=\mu_j+\epsilon_{ij}$
The pure error sum of squares $SSPE$ is equal to the full-model error sum of squares $SSE_F$,
where $SSE_F=\sum_j\sum_i(Y_{ij}-\overline{Y_j})^2$, i.e.,
$$SSPE=\sum_j\sum_i(Y_{ij}-\overline{Y_j})^2 \tag{1}$$
Degrees of freedom for the full model: $df_F=n-c$.
The pure error mean square is given by
$$MSSPE=\frac{SSPE}{n-c} \tag{2}$$
# calculate msspe
# Pure error sum of squares: SSPE = sum_j sum_i (Y_ij - bar{Y}_j)^2, eq. (1).
# Squared deviations of each replicate from its level mean, per level j.
yijminusbarYj = [(yij - ybar) ** 2 for yij, ybar in zip(Yij, barYj)]
sspe = sum(np.sum(dev) for dev in yijminusbarYj)
n = len(X)   # total number of observations
c = len(X1)  # number of distinct X levels
# Pure error mean square, eq. (2).
msspe = sspe / (n - c)
Reduced model:
$Y_{ij}=\mu_j+\epsilon_{ij}$ but with the constraint $\mu_j=\beta_0+\beta_1X_j$
$H_0: E\{Y\}=\beta_0+\beta_1X$
$H_a: E\{Y\} \neq \beta_0+\beta_1X$
$SSE_R$ is equal to $SSE$, i.e., $SSE_R=\sum_j\sum_i(Y_{ij}-\hat{Y}_{ij})^2$
$$SSE=\sum_j\sum_i(Y_{ij}-\hat{Y}_{ij})^2 \tag{3}$$
Degrees of freedom for the reduced model: $df_R=n-2$.
The lack-of-fit sum of squares is
$$SSLF=SSE-SSPE \tag{4}$$
An equivalent formula for $SSLF$ is
$$SSLF=\sum_j\sum_i(\overline{Y}_{j}-\hat{Y}_{ij})^2 \tag{5}$$
Degrees of freedom associated with $SSLF$: $c-2$.
The lack-of-fit mean square is
$$MSSLF=\frac{SSLF}{c-2} \tag{6}$$
# calculate msslf
# Lack-of-fit F test: compare the lack-of-fit mean square against the
# pure-error mean square.
sse = lin_reg(X, Y)['sse']  # SSE of the reduced (straight-line) model
sslf = sse - sspe           # lack-of-fit sum of squares, eq. (4)
mslf = sslf / (c - 2.0)     # lack-of-fit mean square, eq. (6)
from scipy import stats
alpha = 0.01
Fstar = mslf / msspe        # observed test statistic F*
# Critical value F(1 - alpha; c-2, n-c).
F = stats.f.ppf(1 - alpha, c - 2, n - c)
# fancy display of the relationships
Fstar0 = format(Fstar, '.3f')
F0 = format(F, '.3f')
from IPython.display import display, Math, Latex
display(Math(r'F^*=' + str(Fstar0)))
display(Math(r'F=' + str(F0)))
We observe $F^* \gt F$, therefore we conclude $H_a$. That is, the regression function is not linear.
$p\text{-value} = P\{F \gt F^*\}$
# p-value of the lack-of-fit test: P{F > F*} under H0, F ~ F(c-2, n-c).
pval = 1.0 - stats.f.cdf(Fstar, c - 2, n - c)
pval0 = format(pval, '.3f')
display(Math(r'p-value=' + pval0))