Given that the error terms have approximately constant variance, one can attempt to transform the $X$ variable.
CAUTION: Do not transform the $Y$ variable in this case.
# Import all the functions and packages we need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
def lin_reg(X, Y):
    """Fit a simple linear regression of Y on X by ordinary least squares.

    Parameters
    ----------
    X, Y : 1-D numpy arrays of equal length (predictor and response).

    Returns
    -------
    dict with keys:
        'b0'  : intercept estimate
        'b1'  : slope estimate
        'e_i' : residuals, Y - Yhat
        'sse' : error (residual) sum of squares
        'ssr' : regression sum of squares
    """
    barX = np.mean(X)
    barY = np.mean(Y)
    XminusbarX = X - barX
    YminusbarY = Y - barY
    # Least-squares slope: b1 = S_xy / S_xx (np.sum, not builtin sum, on arrays)
    b1 = np.sum(XminusbarX * YminusbarY) / np.sum(XminusbarX ** 2)
    b0 = barY - b1 * barX
    Yhat = b0 + b1 * X
    e_i = Y - Yhat
    sse = np.sum(e_i ** 2)            # residual sum of squares
    ssr = np.sum((Yhat - barY) ** 2)  # regression sum of squares
    return {'b0': b0, 'b1': b1, 'e_i': e_i, 'sse': sse, 'ssr': ssr}
def normal_prob_plot(e_i):
    """Return expected residuals under normality for a normal probability plot.

    For each residual, the expected value is sqrt(MSE) * z((k - 0.375)/(n + 0.25)),
    where k is the residual's rank — Blom-type plotting positions.

    Parameters
    ----------
    e_i : 1-D array of residuals from a simple linear regression
          (hence n - 2 degrees of freedom in the MSE).

    Returns
    -------
    1-D array of expected residuals, in the same order as e_i.
    """
    from scipy import stats
    n = len(e_i)
    sse = np.sum(e_i ** 2)
    # n - 2 df: two parameters (b0, b1) were estimated from the data
    mse_sqrt = np.sqrt(sse / (n - 2.0))
    kr = stats.rankdata(e_i)
    # (dead pre-allocation of a zeros array removed — ppf builds the array directly)
    return mse_sqrt * stats.norm.ppf((kr - 0.375) / (n + 0.25))
# Read the data (TA example 3.7: days of training vs. performance score)
# and display the table
from read_data_fx import read_tb_dataTA
data = read_tb_dataTA(3, 7)
X = data[:, 0]
Y = data[:, 1]
n = len(X)
df = pd.DataFrame({'Sales Trainee': np.arange(1, n + 1),
                   'Days of Training ($X_i$)': X[:],
                   'Performance Score ($Y_i$)': Y[:]})
# NOTE: DataFrame.reindex_axis was removed from pandas; reindex(columns=...)
# is the supported replacement and preserves the intended column order.
df = df.reindex(columns=['Sales Trainee', 'Days of Training ($X_i$)', 'Performance Score ($Y_i$)'])
df.set_index('Sales Trainee')  # displayed table (result intentionally not reassigned)
plt.figure(figsize=(10, 8))

# (a) raw scatter plot of Y against X
plt.subplot(2, 2, 1)
plt.plot(X, Y, 'ro', ms=10)
plt.xlim(0.1, 2.6)
plt.ylim(40, 130)
plt.title('(a) Scatter plot', fontsize=15)
plt.xlabel('Days of Training', fontsize=15)
plt.ylabel('Performance Score', fontsize=15)

# (b) scatter plot after the square-root transformation of X
plt.subplot(2, 2, 2)
Xdash = np.sqrt(X)
plt.plot(Xdash, Y, 'ro', ms=10)
plt.xlim(0.6, 1.6)
plt.ylim(40, 130)
# raw strings keep the LaTeX backslashes intact ('\s' is an invalid escape)
plt.title(r'(b) Scatter plot against $\sqrt{X}$', fontsize=15)
plt.xlabel(r'$\sqrt{X}$', fontsize=15)
plt.ylabel('Performance Score', fontsize=15)

# (c) residuals from regressing Y on sqrt(X)
plt.subplot(2, 2, 3)
e_i = lin_reg(Xdash, Y)['e_i']
plt.plot(Xdash, e_i, 'bo', ms=10)
plt.xlim(0.6, 1.6)
plt.ylim(-11, 11)
plt.title(r'(c) Residual plot against $\sqrt{X}$', fontsize=15)
plt.xlabel(r'$\sqrt{X}$', fontsize=15)
plt.ylabel('Residual', fontsize=15)

# (d) normal probability plot of the residuals
plt.subplot(2, 2, 4)
e_ik = normal_prob_plot(e_i)
plt.plot(e_ik, e_i, 'ko', ms=10)
plt.xlim(-10, 10)
plt.ylim(-10, 10)
plt.title('(d) Normal probability plot', fontsize=15)
plt.xlabel('Expected', fontsize=15)
plt.ylabel('Residual', fontsize=15)
plt.tight_layout()

# Render the fitted equation: Yhat = b0 + b1 * X'
linreg = lin_reg(Xdash, Y)
b0 = linreg['b0']
b1 = linreg['b1']
b0d = format(b0, '.2f')
# '+.2f' always emits a sign, so the rendered equation stays correct
# whether the slope is positive or negative
b1d = format(b1, '+.2f')
from IPython.display import display, Math, Latex
display(Math(r'\hat{Y}=' + str(b0d) + str(b1d) + r'X^\prime'))
print('Where')  # print() works in both Python 2 and 3
display(Math(r'X^\prime=\sqrt{X}'))
To remedy departures of the error terms from normality and from constant variance, we need to transform the $Y$ variable.
# Read the data (TA example 3.8: age vs. plasma level) and display the table
from read_data_fx import read_tb_dataTA
data = read_tb_dataTA(3, 8)
X = data[:, 0]
Y = data[:, 1]
n = len(X)
df = pd.DataFrame({'Child': np.arange(1, n + 1),
                   'Age ($X_i$)': X[:],
                   'Plasma Level ($Y_i$)': Y[:]})
# NOTE: DataFrame.reindex_axis was removed from pandas; reindex(columns=...)
# is the supported replacement and preserves the intended column order.
df = df.reindex(columns=['Child', 'Age ($X_i$)', 'Plasma Level ($Y_i$)'])
df.set_index('Child')  # displayed table (result intentionally not reassigned)
plt.figure(figsize=(10, 8))

# (a) raw scatter plot of Y against X
plt.subplot(2, 2, 1)
plt.plot(X, Y, 'ro', ms=10)
plt.xlim(-0.5, 5)
plt.ylim(0, 22)
plt.title('(a) Scatter plot', fontsize=15)
plt.xlabel('Age', fontsize=15)
plt.ylabel('Plasma Level', fontsize=15)

# (b) scatter plot after the log10 transformation of Y
plt.subplot(2, 2, 2)
Ydash = np.log10(Y)
plt.plot(X, Ydash, 'ro', ms=10)
plt.xlim(-0.5, 5)
plt.ylim(0.5, 1.7)
# raw strings keep the LaTeX backslashes intact in titles/labels
plt.title(r'(b) Scatter plot with $Y^\prime=\log_{10}Y$', fontsize=15)
plt.xlabel('Age', fontsize=15)
plt.ylabel(r'$\log_{10}Y$', fontsize=15)

# (c) residuals from regressing log10(Y) on X
plt.subplot(2, 2, 3)
e_i = lin_reg(X, Ydash)['e_i']
plt.plot(X, e_i, 'bo', ms=10)
plt.xlim(-0.5, 5)
plt.ylim(-0.2, 0.2)
plt.plot((-0.5, 5), (0, 0), 'k-')  # zero-residual reference line
plt.title('(c) Residual plot against $X$', fontsize=15)
plt.xlabel('Age', fontsize=15)
plt.ylabel('Residual', fontsize=15)

# (d) normal probability plot of the residuals
plt.subplot(2, 2, 4)
e_ik = normal_prob_plot(e_i)
plt.plot(e_ik, e_i, 'ko', ms=10)
plt.ylim(-0.2, 0.2)
plt.xlim(-0.15, 0.15)
plt.title('(d) Normal probability plot', fontsize=15)
plt.xlabel('Expected', fontsize=15)
plt.ylabel('Residual', fontsize=15)
plt.tight_layout()

# Render the fitted equation: Yhat' = b0 + b1 * X
linreg = lin_reg(X, Ydash)
b0 = linreg['b0']
b1 = linreg['b1']
b0d = format(b0, '.2f')
# '+.2f' always emits a sign; the original concatenated the slope with no
# sign at all and silently relied on b1 being negative for this data set
b1d = format(b1, '+.2f')
from IPython.display import display, Math, Latex
display(Math(r'\hat{Y}^\prime=' + str(b0d) + str(b1d) + r'X'))
print('Where')  # print() works in both Python 2 and 3
display(Math(r'Y^\prime=\log_{10}Y'))
The Box–Cox transformation procedure automatically identifies a suitable transformation from the family of power transformations, which is of the form:
\begin{equation} Y^\prime=Y^\lambda \tag{1} \end{equation}where parameter $\lambda$ to be determined from the data. Family of transformations:
$\lambda=2 ~~~~~~~~~~ Y^\prime=Y^2$
$\lambda=.5 ~~~~~~~~~~ Y^\prime=\sqrt{Y}$
$\lambda=0 ~~~~~~~~~~ Y^\prime=\log_{10}Y$ (by definition)
$\lambda=-.5 ~~~~~~~~~~ Y^\prime=\frac{1}{\sqrt{Y}}$
$\lambda=-1.0 ~~~~~~~~~~ Y^\prime= \frac{1}{Y} $
Then the normal error regression model, with the response variable given by one of the above transformations, becomes
\begin{equation} Y_i^\lambda=\beta_0+\beta_1X+\epsilon_i \tag{2} \end{equation}Now we have to determine one more parameter $\lambda$. Box-Cox procedure uses the method of maximum likelihood to estimate all the parameters in (2). To remove SSE's dependency on $\lambda$, Box-Cox method involves standardizing the observation in the following way:
$$ W_i = \left\{ \begin{array}{rl} &K_1(Y_i^\lambda-1) &\mbox{$\lambda \neq 0$} \\ &K_2(\ln Y_i) & \mbox{$\lambda = 0$} \end{array} \right. $$Where:
$K_2=\bigg( \prod_{i=1}^n Y_i \bigg)^{\frac{1}{n}} $
$K_1=\frac{1}{\lambda K_{2}^{\lambda-1}}$
Note that $K_2$ is the geometric mean of the $Y_i$ observations. Next, the SSE is obtained by regressing $W_i$ on the predictor variable $X$. It can be shown that the maximum likelihood estimate $\hat{\lambda}$ is the value of $\lambda$ for which the SSE is minimized.
# Box-Cox search: compute the SSE of the standardized regression for each
# candidate lambda in [-2, 2] and locate the minimizing lambda.
from scipy import stats
K2 = stats.gmean(Y)  # geometric mean of the Y observations
lamb = np.arange(-2, 2.1, 0.1)
sse0 = np.zeros_like(lamb)
for i, lambt in enumerate(lamb):
    # tolerance guard: arange produces "almost zero" floats near lambda = 0
    if abs(lambt) > 0.0000001:
        K1 = 1.0 / (lambt * K2 ** (lambt - 1.0))
        W1 = K1 * (Y ** lambt - 1.0)
    else:
        # lambda == 0 case: standardized natural-log transform
        W1 = K2 * np.log(Y)
    sse0[i] = lin_reg(X, W1)['sse']
plt.xlim(-2.0, 2.0)
plt.ylim(20.20, 70.0)
plt.plot(lamb, sse0, 'k-o')
# horizontal guide lines bracketing the flat region of the SSE curve
plt.plot((-2.0, 2.0), (30, 30), 'g-')
plt.plot((-2.0, 2.0), (33.5, 33.5), 'g-')
# find the minimum once, then plot and print it
imin = np.argmin(sse0)
print('minimum value of sse occurs at \\lambda =', lamb[imin],
      'and sse =', format(sse0[imin], '.2f'))
plt.plot(lamb[imin], sse0[imin], 'ro')
# print the values of sse around the minimum for different lambda
pd.options.display.float_format = '{:,.1f}'.format
df = pd.DataFrame({r'$\lambda$': lamb[10:30], 'sse': sse0[10:30]})
# reindex_axis was removed from pandas; reindex(columns=...) is the replacement
df.reindex(columns=[r'$\lambda$', 'sse'])
df.set_index(r'$\lambda$')
Though the minimum of the SSE occurs at $\lambda=-0.5$, looking at the values of the SSE for different $\lambda$, our choice of $\lambda=0$, i.e., $Y^\prime=\log_{10}Y$, is not unreasonable, since the SSE values are fairly stable over the range $-1.0$ to $0.0$. The Box–Cox procedure is ordinarily used only as a guide for selecting a transformation, so overly precise results are not needed.