Solutions for mid-term 1

Problem 1(a)

In [1]:
# read the data 
from read_data_fx import read_tb_data
data=read_tb_data(1,28)
# Percentage of individuals with high school diploma is X
X=data[:,1]
# Crime rate is Y
Y=data[:,0]; 
In [2]:
# import the packages required 
import numpy as np
import scipy as sc
import statsmodels.api as sm
import matplotlib.pyplot as plt 
In [3]:
# Fit a line to the data 
barX=np.mean(X); barY=np.mean(Y)
XminusbarX=X-barX; YminusbarY=Y-barY
b1=sum(XminusbarX*YminusbarY)/sum(XminusbarX**2)
b0=barY-b1*barX
Yhat=b0+b1*X
e_i=Y-Yhat # Residuals 
print 'b0=%4.4f' %b0,'b1=%4.4f' %b1 
b0=20517.5999 b1=-170.5752
In [4]:
# Plot the line and crosscheck all the properties

sum_of_residuals = np.sum(e_i) 
sum_of_squares_of_residuals = np.sum(e_i**2) 
print 'sum of residuals = %4.4f' % sum_of_residuals, ' ---property 1'
print 'sum of squares of residuals %4.4f' %sum_of_squares_of_residuals, '  ---property 2'
print 'sum of Y_i %4.4f' % np.sum(Y), '  ---property 3'
print 'sum of Yhat %4.4f' % np.sum(Yhat), '  ---property 3'
print 'sum of X_ie_i %4.4f' % np.sum(X*e_i), '  ---property 4'
print 'sum of Yhat e_i %4.4f' % np.sum(Yhat*e_i), '   ---property 5'


#make matplotlib inline 
%matplotlib inline 
plt.figure(figsize=(20,10)) 
# scatter plot
plt.scatter(X,Y,c='k',s=220,marker='d')
plt.xlabel('Percentage of individuals with high school diploma',fontsize=25)
# ylabel of the scatter plot
plt.ylabel('Crime rate',fontsize=25)
# title of the scatter plot
plt.title('Education Vs. Crime',fontsize=25) 
# regression line
# point barX and barY
plt.plot(barX,barY,marker='*',markersize=70,color='b')
#regression line
plt.plot(X,Yhat,'g-',linewidth=3)
print 'Blue point represents the point xbar and ybar -- property 6'
sum of residuals = -0.0000  ---property 1
sum of squares of residuals 455273165.2925   ---property 2
sum of Y_i 597341.0000   ---property 3
sum of Yhat 597341.0000   ---property 3
sum of X_ie_i -0.0000   ---property 4
sum of Yhat e_i -0.0000    ---property 5
Blue point represents the point xbar and ybar -- property 6

Test for Linear relationship

To test whether or not there is a linear relationship we will use the following decision rule.

$H_0: \beta_1=0$ (No linear relationship)

$H_a; \beta_1 \neq 0$ (Linear relationship)

If $|t^*| \leq t(1-\alpha/2;n-2)$ then conclude $H_0$

If $|t^*| \gt t(1-\alpha/2;n-2)$ then conclude $H_a$

where $t^*=\frac{b_1}{s\{b_1\}}$

In [5]:
# We have already calculated b1, but we need to calculate sb1 
n=len(X) 
MSE=sum_of_squares_of_residuals/(n-2)
ssb1=np.sqrt(MSE/sum(XminusbarX**2))
In [6]:
# Packages for importing t-statistics 
from scipy import stats
# calculating t value; we are given \alpha =0.01 
t1=stats.t.ppf(0.995,n-2)
tstar=b1/ssb1 
print '|tstar|= %4.4f'% abs(tstar) ,'and', 't=%4.4f' % t1
|tstar|= 4.1029 and t=2.6371

We observe that $|t^*| \gt t(1-\alpha/2;n-2)$, therefore we conclude $H_a$

Two sided p-value

Two sided p-value is given by $2P\{t(n-2) \gt t^*\}$

In [7]:
pval=1.0-stats.t.cdf(abs(tstar),n-2); pval=2.0*pval; print 'Two sided p-value = %4.4f' % pval 
Two sided p-value = 0.0001

p-value $<\alpha$ therefore we can directly conclude $H_a$ i.e., there is a linear relationship between crime rate and education level

Problem 1(b)

$1-\alpha$ confidence interval for $\beta_1$ is given by:

$b_1 \pm t(1-\alpha/2;n-2)s\{b_1\}$.

Where $t(1-\alpha/2;n-2)$ denotes $(1-\alpha/2)100$ percentile of the $t$ distribution. We will set $\alpha=0.01$ i.e., we will attempt to find $99\%$ confidence interval.

In [8]:
tssb=t1*ssb1

print 'confidence interval for beta_1 is %4.4f' % b1, '+/-', '%4.4f' % tssb ,'i.e.,'
llb0=b1-tssb; upl0=b1+tssb;lra=tssb/b1

print '%4.4f <= beta_1'% llb0,  '<= %4.4f' % upl0
confidence interval for beta_1 is -170.5752 +/- 109.6366 i.e.,
-280.2118 <= beta_1 <= -60.9386

Therefore, we can infer that $99\%$ confidence interval of $\beta_1$ to be:

$-280.2118 \leq \beta_1 \leq -60.9386$

Problem 2(a)

In [9]:
X0=np.arange(5,30,5)
k=10
Y1=np.zeros(5*k)
X1=np.zeros(5*k)

labels=[];

l=0
for i in range(0,5):
    for j in range(0,k): 
        epsilon_1=np.random.normal(0,1,1)
        Y1[l] =0.2+0.95*X0[i]+epsilon_1[0]
        X1[l]=X0[i]
        l=l+1
     
    
%matplotlib inline 

plt.figure(figsize=(15,7)) 
plt.subplot(1,2,1)
plt.scatter(X1,Y1,c='r',s=80)
plt.xlabel(r'$X$',fontsize=25)
plt.ylabel(r'$Y$',fontsize=25)    
plt.xlim(3,28)    
    
    
        
k=25; Y2=np.zeros((5,k))
epsilon_1=np.random.normal(0,1,k)
labels=range(5,30,5);


xd=[]
for i in range(0,5): 
    
    Y2[i,:]=0.2+0.95*X0[i]+epsilon_1[:]
    
    xd.append(Y2[i,:].tolist())
    
    
    
plt.subplot(1,2,2)
plt.boxplot(xd,labels=labels,sym='ro',whis=1.5);
plt.xlabel(r'$X$',fontsize=25)
plt.ylabel(r'$Y$',fontsize=25)


plt.tight_layout();    

Problem 2 (b)

Maximum likelihood estimate of the variance is given by

$\hat\sigma=\frac{\sum(Y_i-\hat{Y_i})^2}{n}$

We are given by $\hat{Y}_i=0.25+ 0.9X_i$, and $Y_i=\beta_0+\beta_1X_i+ \epsilon_i $. Where $\epsilon_i$ are independent and are distributed as $N(0,1)$. We are given $\beta_0=0.2$, $\beta_1=0.95$, $i=1,...,5$ and in these $5$ trials $X_i=\{5,10,15,20,25\}$. We are also given that for every trial $i$ there are $k=10$ observations of $Y_i$

In [10]:
b0=0.25;b1=0.9
Yhat=b0+b1*X1

l=0;k=10
for i in range(0,5):
    for j in range(0,k): 
        epsilon_1=np.random.normal(0,1,1)
        Y1[l] =0.2+0.95*X0[i]+epsilon_1[0]
        X1[l]=X0[i]
        l=l+1



e_i=Y1-Yhat
n=len(X1)

masigma=sum(e_i**2)/n
print masigma
1.4906964476