#####################################################################
#####################################################################
###
###      Practice num. 3: MODEL-BASED SMALL AREA ESTIMATORS
###                  August 2009
###

### Instructions: If the Workspace created in Practice num. 2 is saved 
## in the working directory, then the R code from this practice can be 
## executed directly and it should work. Otherwise, copy all Practice 2
## in the R console to execute all R commands there and continue with Practice 3.

#####################################################################
# Notation: D will denote the number of areas and d=1,...,D will be the area index
#####################################################################

##############################################################
### 1. Model-based estimator obtained from a Fay-Herriot model
##############################################################

attach(data)

# 1.1. Read population sizes of each categorical explanatory variable

PopnSizes<-read.table("PopnSizeProv.txt",header=TRUE)
attach(PopnSizes)

PopnSizeNat<-read.table("PopnSizeProvByNat.txt",header=TRUE)
PopnSizeAge<-read.table("PopnSizeProvByAge.txt",header=TRUE)
PopnSizeEdu<-read.table("PopnSizeProvByEdu.txt",header=TRUE)
PopnSizeSit<-read.table("PopnSizeProvBySit.txt",header=TRUE)

Ndnat<-as.matrix(PopnSizeNat[,2:3])
Ndage<-as.matrix(PopnSizeAge[,2:6])
Ndedu<-as.matrix(PopnSizeEdu[,2:5])
Ndsit<-as.matrix(PopnSizeSit[,2:5])

Pdnat<-Ndnat/Nd
Pdage<-Ndage/Nd
Pdedu<-Ndedu/Nd
Pdsit<-Ndsit/Nd

# 1.2. Analyse the distribution of direct estimators

hist(poord.dir) # Not so far from normal
hist(gapd.dir)  # Not so far from normal

# 1.3. Fit the Fay-Herriot model for the poverty incidence
# We use the FH method using the Fisher-scoring algorithm

# 1.3.1. Construct matrices, vectors and constants involved in the FH model.

X<-cbind(rep(1,D),Pdnat[,1],Pdage[,2:5],Pdedu[,c(1,3)])  # Design matrix
Xt<-t(X)  	# We will need the transpose
p<-dim(X)[2]  	# Number of auxiliary variables
y<-poord.dir  	# Model response      
Dvec<-varpoord  # Vector with variances of direct estimators
m<-D  # Number of areas (provinces)

# 1.3.2. Fisher-scoring algorithm 

Aest.FH<-NULL  	# Initialization of vector with estimates of random effects variance in each iteration
Aest.FH[1]<-min(varpoord)  # Seed for the algorithm

k<-0  		# Initial value for iteration index
diff<-1  	# Initial value for diff (relative difference in two subsequent iterations)
while ((diff>0.0001)&(k<1000)) # Continues while diff is larger than the given tolerance and num. iterations smaller than limit
{
k<-k+1
Vi<-diag(1/(Aest.FH[k]+Dvec))  # Inverse of var-cov matrix of responses
XVi<-Xt%*%Vi
Q<-solve(XVi%*%X)   # Inverse
betaaux<-Q%*%XVi%*%y  
resaux<-y-X%*%betaaux
s<-t(resaux)%*%Vi%*%resaux-(m-p)  # Function that should be equal to zero in optimum A
F<-sum(diag(Vi)) # - Expectation of derivative of s at true value of A

Aest.FH[k+1]<-Aest.FH[k]+s/F   # Updating equation of Fisher-scoring algorithm
diff<-abs((Aest.FH[k+1]-Aest.FH[k])/Aest.FH[k])  # Calculation of diff

} # End of while

A.FH=Aest.FH[k+1]  # Final FH estimator of A
print(Aest.FH)  # Display the values of A in each iteration

# 1.3.3. Compute the coefficients'estimator

Vi<-diag(1/(A.FH+Dvec))  # Inverse of var-cov using the FH estimator
Q<-solve(Xt%*%Vi%*%X)
beta.FH<-Q%*%Xt%*%Vi%*%y  # Estimated beta

# 1.4. Compute the EBLUP estimators of province poverty incidences

res.FH<-y-X%*%beta.FH
thetaEB.FH<-X%*%beta.FH+A.FH*Vi%*%res.FH

# Final estimators

poord.FH<-thetaEB.FH
data.frame(province=provlab,prop.dir=poord.dir*100,prop.FH=poord.FH*100)

# 1.5. Do the same for the poverty gap

y<-gapd.dir
Dvec<-vargapd

Aest.FH<-NULL
Aest.FH[1]<-min(vargapd)

k<-0
diff<-1
while ((diff>0.0001)&(k<1000))
{
k<-k+1
Vi<-diag(1/(Aest.FH[k]+Dvec))
XVi<-Xt%*%Vi
Q<-solve(XVi%*%X)
betaaux<-Q%*%XVi%*%y
resaux<-y-X%*%betaaux
s<-t(resaux)%*%Vi%*%resaux-(m-p)
F<-sum(diag(Vi))

Aest.FH[k+1]<-Aest.FH[k]+s/F
diff<-abs((Aest.FH[k+1]-Aest.FH[k])/Aest.FH[k])

} # End of while

A.FH=Aest.FH[k+1]
print(Aest.FH)

# Compute the coefficients'estimator

Vi<-diag(1/(A.FH+Dvec))
Q<-solve(Xt%*%Vi%*%X)
beta.FH<-Q%*%Xt%*%Vi%*%y

# Compute the EBLUP of the province poverty gaps

res.FH<-y-X%*%beta.FH
thetaEB.FH<-X%*%beta.FH+A.FH*Vi%*%res.FH
gapd.FH<-thetaEB.FH
data.frame(province=provlab,gap.dir=gapd.dir*100,gap.FH=gapd.FH*100)

# 1.6. Analize how much strength are we borrowing from other areas

gamma.gap<-A.FH/(A.FH+Dvec)
gamma.gap
summary(gamma.gap)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.3478  0.7605  0.8531  0.8276  0.9219  0.9702 

########################################################################
## 2. EB METHOD UNDER A NESTED-ERROR MODEL
########################################################################

# 2.1. Transform of variable norminc to make it closer to normal 

hist(norminc)
m<-abs(min(norminc))+1 # Constant to be added to norminc to make it positive
norminct<-norminc+m
ys<-log(norminct)
hist(ys,xlim=c(6,12))

# 2.2. Fit the nested-error model to sample data by REML method

# The function lme from the library nlme fits mixed models

p<-10
library(nlme)
fit.EB<-lme(ys~age2+age3+age4+age5+nat1+educ1+educ3+sitemp1+sitemp2,random=~1|as.factor(prov),method="REML")
summary(fit.EB)

# 2.3. Save the results of the fitting method

Xs<-model.matrix(fit.EB)      # Design matrix
betaest<-fixed.effects(fit.EB)# Vector of model coefficients (size p)
upred<-random.effects(fit.EB) # Predicted random effects: Watch out! It is not a vector, it is a matrix with 1 column
sigmae2est<-fit.EB$sigma^2    # Estimated error variance
sigmau2est<-as.numeric(VarCorr(fit.EB)[1,1]) # VarCorr(fit2) is the estimated cov. matrix of the model random components

# 2.4. Select the provinces with largest CVs of direct estimators: Here we will compute EB estimators only for these provinces 

povinc15<-c(42,5,34,44,40,21,25,19,16,2)
provlab[povinc15]
# [1] Soria       Avila       Palencia    Teruel      Segovia    
# [6] Huelva      Lerida      Guadalajara Cuenca      Albacete 

cvpoord[povinc15]

# 2.5. Read non-sample values of auxiliary variables from files for selected provinces

Xrd1<-as.matrix(read.table("X_Soria.txt",header=TRUE))
Xrd2<-as.matrix(read.table("X_Avila.txt",header=TRUE))
Xrd3<-as.matrix(read.table("X_Palencia.txt",header=TRUE))
Xrd4<-as.matrix(read.table("X_Teruel.txt",header=TRUE))
Xrd5<-as.matrix(read.table("X_Segovia.txt",header=TRUE))
Xrd6<-as.matrix(read.table("X_Huelva.txt",header=TRUE))
Xrd7<-as.matrix(read.table("X_Lerida.txt",header=TRUE))
Xrd8<-as.matrix(read.table("X_Guadalajara.txt",header=TRUE))
Xrd9<-as.matrix(read.table("X_Cuenca.txt",header=TRUE))
Xrd10<-as.matrix(read.table("X_Albacete.txt",header=TRUE))

# 2.6. Construct design matrix with non-sample values

rd<-rep(0,D)
rd[povinc15[1]]<-dim(Xrd1)[1]
rd[povinc15[2]]<-dim(Xrd2)[1]
rd[povinc15[3]]<-dim(Xrd3)[1]
rd[povinc15[4]]<-dim(Xrd4)[1]
rd[povinc15[5]]<-dim(Xrd5)[1]
rd[povinc15[6]]<-dim(Xrd6)[1]
rd[povinc15[7]]<-dim(Xrd7)[1]
rd[povinc15[8]]<-dim(Xrd8)[1]
rd[povinc15[9]]<-dim(Xrd9)[1]
rd[povinc15[10]]<-dim(Xrd10)[1]

Xrdtot<-list(Xrd1,Xrd2,Xrd3,Xrd4,Xrd5,Xrd6,Xrd7,Xrd8,Xrd9,Xrd10)

# 2.7. EB method: Generate non-sample values and calculate EB estimators of poverty proportions and gaps

L<-50  # Number of Monte Carlo simulations for approximation of EB estimator   

# Matrices with poverty incidences and gaps for the L simulations in the EB method
povinc<-matrix(0,nr=D,nc=L)   
povgap<-matrix(0,nr=D,nc=L)

# Vectors with final EB estimators
povinc.EB<-rep(0,D)
povgap.EB<-rep(0,D)

# Time counter initialization
time1<-Sys.time()
time1

for (i in 1:10){    # Cycle for province index

print(i)
d<-povinc15[i]
Xrd<-Xrdtot[[i]]

# Obtain sample values for selected province

ysd<-ys[prov==d]

# Compute conditional means for non-sample units
mudpred<-Xrd%*%matrix(betaest,nr=p,nc=1)+upred[d,1]

# The conditional distribution of (non-sample data|sample data) in the EB method 
# can be written as a new nested-error model with different random effects variance.
# We calculate this random effects variance (called sigmav2)

gammad<-sigmau2est/(sigmau2est+sigmae2est/nd[d])
sigmav2<-sigmau2est*(1-gammad)

for (ell in 1:L){   ### Start of Monte Carlo simulations for EB method

  # Generate random effect for province d 
  vd<-rnorm(1,0,sqrt(sigmav2))
  
  # Generate model random errors for all non-sample units in province d
  ed<-rnorm(rd[d],0,sqrt(sigmae2est))

  # Compute non-sample values 
  yrdpred<-mudpred+vd+ed
  
  # Merge non-sample and sample values (full population or census)
  ydnew<-c(ysd,yrdpred)
  
  # Compute province poverty measures with population values 

  Ednew<-exp(ydnew)-m
  povinc[d,ell]<-mean(Ednew<z)
  povgap[d,ell]<-mean((Ednew<z)*(z-Ednew)/z)

}  # End of simulations for EB method

# EB predictors of poverty measures (averages over the L Monte Carlo simulations)

povinc.EB[d]<-mean(povinc[d,])
povgap.EB[d]<-mean(povgap[d,])

} # End of cycle for province index

time2 <- Sys.time()   # Current time 
print(difftime(time2,time1,units="mins")) # Print computation time

respovinc15<-data.frame(povinc15,100*poord.dir[povinc15],100*poord.ps[povinc15],100*poord.c[povinc15],100*poord.FH[povinc15],100*povinc.EB[povinc15])
respovgap15<-data.frame(povinc15,100*gapd.dir[povinc15],100*gapd.ps[povinc15],100*gapd.c[povinc15],100*gapd.FH[povinc15],100*povgap.EB[povinc15])

write.table(respovinc15,"Estpovinc.txt",row.names=FALSE)
write.table(respovgap15,"Estpovgap.txt",row.names=FALSE)

#####################################################################
### 3. Parametric Bootstrap for the calculation of the MSE of the EBP
#####################################################################

# 3.1. Select the number of MC and Bootstrap replicates 

B<-50   # Number of bootstrap replicates
LB<-50   # Number of Monte Carlo simulations for EB method, for each bootstrap sample

# 3.2. Initialize Vectors with the bootstrap MSEs and CVs of EB estimators

MSEpropsum.B<-rep(0,D)
MSEgapsum.B<-rep(0,D)
MSEpovincEB.B<-rep(0,D)
MSEpovgapEB.B<-rep(0,D)
CVpovincEB.B<-rep(0,D) 
CVpovgapEB.B<-rep(0,D) 

time1<-Sys.time()
time1

for (b in 1:B){   ### Start of bootstrap cycle

print(cbind("Bootstrap iteration",b))

# We will generate a bootstrap population by generating sample + nonsample elements
# and calculate the true poverty measures for this population.

ys.B<-NULL   # Boostrap sample vector

# Initialize vectors with true poverty measures
truepovinc.B<-rep(0,D) 
truepovgap.B<-rep(0,D)

# Initialize vectors with estimated poverty measures
povinc.B<-matrix(0,nr=D,nc=LB)
povgap.B<-matrix(0,nr=D,nc=LB)
povinc.EB.B<-rep(0,D)
povgap.EB.B<-rep(0,D)

ud.B<-rep(0,D)  # Bootstrap random effects
nd.cum<-rep(0,D) # Vector with cummulated sample sizes of provinces

# 3.4. Generate sample elements (for all areas)

for (d in 1:D){

    # Take sample elements

    Xsd<-Xs[prov==d,]
    
    # Calculate the cummulated sample sizes

    if(d==1) {nd.cum[d]<-nd[1]} else {nd.cum[d]<-nd.cum[d-1]+nd[d]}

    # Generate sample values of y from the fitted model

    esd.B<-rnorm(nd[d],0,sqrt(sigmae2est))
    ud.B[d]<-rnorm(1,0,sqrt(sigmau2est))
    musd.B<-Xsd%*%matrix(betaest,nr=p,nc=1)
    ysd.B<-musd.B+ud.B[d]+esd.B
    ys.B<-c(ys.B,ysd.B)
}

# 3.5. Take the bootstrap sample data and fit nested-error model to it

fit.B<-lme(ys.B~age2+age3+age4+age5+nat1+educ1+educ3+sitemp1+sitemp2,random=~1|as.factor(prov),method="REML")
Xs<-model.matrix(fit.B)      
betaest.B<-fixed.effects(fit.B)
upred.B<-random.effects(fit.B)
sigmae2est.B<-fit.B$sigma^2    
sigmau2est.B<-as.numeric(VarCorr(fit.B)[1,1])

# 3.6. Generate non-sample values only for selected provinces

for (i in 1:10){

print(i)
d<-povinc15[i]
Xrd<-Xrdtot[[i]]

erd.B<-rnorm(rd[d],0,sqrt(sigmae2est))
murd.B<-Xrd%*%matrix(betaest,nr=p,nc=1)
yrd.B<-murd.B+ud.B[d]+erd.B
Erd.B<-exp(yrd.B)-m

# 3.7. Merge sample and non-sample elements for selected province

ysd.B<-ys.B[(nd.cum[d-1]+1):(nd.cum[d])]
Esd.B<-exp(ysd.B)-m
Ed.B<-c(Erd.B,Esd.B)
    
# 3.8. Calculate true domain poverty incidence and gap for selected province

truepovinc.B[d]<-mean(Ed.B<z)
truepovgap.B[d]<-mean((Ed.B<z)*(z-Ed.B)/z)

# 3.9. Compute EB predictors for each bootstrap sample

# Non-sample units
mudpred.B<-Xrd%*%matrix(betaest,nr=p,nc=1)+upred.B[d,1]

gammad.B<-sigmau2est.B/(sigmau2est.B+sigmae2est.B/nd[d])
sigmav2.B<-sigmau2est.B*(1-gammad.B)

for (ellb in 1:LB){   ### Monte Carlo approximation for EB method, for each bootstrap sample 

  vd.B<-rnorm(1,0,sqrt(sigmav2.B))
  ed.B<-rnorm(rd[d],0,sqrt(sigmae2est.B))

  yrdpred.B<-mudpred.B+vd.B+ed.B
  ydnew.B<-c(ysd.B,yrdpred.B)
  Ednew.B<-exp(ydnew.B)-m

  # EB predictors of poverty measures for each simulation

  povinc.B[d,ellb]<-mean(Ednew.B<z)
  povgap.B[d,ellb]<-mean((Ednew.B<z)*(z-Ednew.B)/z)

}  # End of generations

# EB predictors of poverty measures

povinc.EB.B[d]<-mean(povinc.B[d,])
povgap.EB.B[d]<-mean(povgap.B[d,])

# Cummulated squared errors for Bootstrap MSE

MSEpropsum.B[d]<-MSEpropsum.B[d]+(povinc.EB.B[d]-truepovinc.B[d])^2
MSEgapsum.B[d]<-MSEgapsum.B[d]+(povgap.EB.B[d]-truepovgap.B[d])^2

} # En of cycle for selected area

}  # End of the bootstrap cycle

time2 <- Sys.time()
print(difftime(time2,time1,units="mins"))

# 3.10. Bootstrap MSEs and CVs

for (i in 1:10){
  
  d<-povinc15[i]
  
  # Bootstrap MSS of EB estimators
  MSEpovincEB.B[d]<-MSEpropsum.B[d]/B
  MSEpovgapEB.B[d]<-MSEgapsum.B[d]/B

  # Bootstrap CVs of EB estimators
  CVpovincEB.B[d]<-100*sqrt(MSEpovincEB.B[d])/povinc.EB[d]
  CVpovgapEB.B[d]<-100*sqrt(MSEpovgapEB.B[d])/povgap.EB[d]
}

# 3.11. Save results

Respovinc<-data.frame(povinc15,nd[povinc15],counts[povinc15],100*poord.dir[povinc15],100*povinc.EB[povinc15],cvpoord[povinc15],CVpovincEB.B[povinc15],cvpoord[povinc15]/CVpovincEB.B[povinc15])
Respovgap<-data.frame(povinc15,nd[povinc15],counts[povinc15],100*gapd.dir[povinc15],100*povgap.EB[povinc15],cvgapd[povinc15],CVpovgapEB.B[povinc15],cvgapd[povinc15]/CVpovgapEB.B[povinc15])

write.table(Respovinc,"ResultspovincL50B50.xls",row.names=FALSE,sep="\t")
write.table(Respovgap,"ResultspovgapL50B50.xls",row.names=FALSE,sep="\t")


#######################################################
# Function for computing poverty proportions and gaps
# for given poverty line
#######################################################

povme<-function(md,povline,wel){

  D<-length(md)
  prop<-rep(0,D)
  gap<-rep(0,D)
  summd<-0

  for (d in 1:D){

    weld<-wel[(summd+1):(summd+md[d])]
    prop[d]<-mean(weld<povline)
    gap[d]<-mean((weld<povline)*(povline-weld)/povline)

    summd<-summd+md[d]

    res<-data.frame(prop,gap)
    res
  }

}   # End of function