##################################################################### ##################################################################### ### ### Practice num. 3: MODEL-BASED SMALL AREA ESTIMATORS ### August 2009 ### ### Instructions: If the Workspace created in Practice num. 2 is saved ## in the working directory, then the R code from this practice can be ## executed directly and it should work. Otherwise, copy all Practice 2 ## in the R console to execute all R commands there and continue with Practice 3. ##################################################################### # Notation: D will denote the number of areas and d=1,...,D will be the area index ##################################################################### ############################################################## ### 1. Model-based estimator obtained from a Fay-Herriot model ############################################################## attach(data) # 1.1. Read population sizes of each categorical explanatory variable PopnSizes<-read.table("PopnSizeProv.txt",header=TRUE) attach(PopnSizes) PopnSizeNat<-read.table("PopnSizeProvByNat.txt",header=TRUE) PopnSizeAge<-read.table("PopnSizeProvByAge.txt",header=TRUE) PopnSizeEdu<-read.table("PopnSizeProvByEdu.txt",header=TRUE) PopnSizeSit<-read.table("PopnSizeProvBySit.txt",header=TRUE) Ndnat<-as.matrix(PopnSizeNat[,2:3]) Ndage<-as.matrix(PopnSizeAge[,2:6]) Ndedu<-as.matrix(PopnSizeEdu[,2:5]) Ndsit<-as.matrix(PopnSizeSit[,2:5]) Pdnat<-Ndnat/Nd Pdage<-Ndage/Nd Pdedu<-Ndedu/Nd Pdsit<-Ndsit/Nd # 1.2. Analyse the distribution of direct estimators hist(poord.dir) # Not so far from normal hist(gapd.dir) # Not so far from normal # 1.3. Fit the Fay-Herriot model for the poverty incidence # We use the FH method using the Fisher-scoring algorithm # 1.3.1. Construct matrices, vectors and constants involved in the FH model. X<-cbind(rep(1,D),Pdnat[,1],Pdage[,2:5],Pdedu[,c(1,3)]) # Design matrix Xt<-t(X) # We will need the transpose p<-dim(X)[2] # Number of auxiliary variables y<-poord.dir # Model response Dvec<-varpoord # Vector with variances of direct estimators m<-D # Number of areas (provinces) # 1.3.2. Fisher-scoring algorithm Aest.FH<-NULL # Initialization of vector with estimates of random effects variance in each iteration Aest.FH[1]<-min(varpoord) # Seed for the algorithm k<-0 # Initial value for iteration index diff<-1 # Initial value for diff (relative difference in two subsequent iterations) while ((diff>0.0001)&(k<1000)) # Continues while diff is larger than the given tolerance and num. iterations smaller than limit { k<-k+1 Vi<-diag(1/(Aest.FH[k]+Dvec)) # Inverse of var-cov matrix of responses XVi<-Xt%*%Vi Q<-solve(XVi%*%X) # Inverse betaaux<-Q%*%XVi%*%y resaux<-y-X%*%betaaux s<-t(resaux)%*%Vi%*%resaux-(m-p) # Function that should be equal to zero in optimum A F<-sum(diag(Vi)) # - Expectation of derivative of s at true value of A Aest.FH[k+1]<-Aest.FH[k]+s/F # Updating equation of Fisher-scoring algorithm diff<-abs((Aest.FH[k+1]-Aest.FH[k])/Aest.FH[k]) # Calculation of diff } # End of while A.FH=Aest.FH[k+1] # Final FH estimator of A print(Aest.FH) # Display the values of A in each iteration # 1.3.3. Compute the coefficients'estimator Vi<-diag(1/(A.FH+Dvec)) # Inverse of var-cov using the FH estimator Q<-solve(Xt%*%Vi%*%X) beta.FH<-Q%*%Xt%*%Vi%*%y # Estimated beta # 1.4. Compute the EBLUP estimators of province poverty incidences res.FH<-y-X%*%beta.FH thetaEB.FH<-X%*%beta.FH+A.FH*Vi%*%res.FH # Final estimators poord.FH<-thetaEB.FH data.frame(province=provlab,prop.dir=poord.dir*100,prop.FH=poord.FH*100) # 1.5. Do the same for the poverty gap y<-gapd.dir Dvec<-vargapd Aest.FH<-NULL Aest.FH[1]<-min(vargapd) k<-0 diff<-1 while ((diff>0.0001)&(k<1000)) { k<-k+1 Vi<-diag(1/(Aest.FH[k]+Dvec)) XVi<-Xt%*%Vi Q<-solve(XVi%*%X) betaaux<-Q%*%XVi%*%y resaux<-y-X%*%betaaux s<-t(resaux)%*%Vi%*%resaux-(m-p) F<-sum(diag(Vi)) Aest.FH[k+1]<-Aest.FH[k]+s/F diff<-abs((Aest.FH[k+1]-Aest.FH[k])/Aest.FH[k]) } # End of while A.FH=Aest.FH[k+1] print(Aest.FH) # Compute the coefficients'estimator Vi<-diag(1/(A.FH+Dvec)) Q<-solve(Xt%*%Vi%*%X) beta.FH<-Q%*%Xt%*%Vi%*%y # Compute the EBLUP of the province poverty gaps res.FH<-y-X%*%beta.FH thetaEB.FH<-X%*%beta.FH+A.FH*Vi%*%res.FH gapd.FH<-thetaEB.FH data.frame(province=provlab,gap.dir=gapd.dir*100,gap.FH=gapd.FH*100) # 1.6. Analize how much strength are we borrowing from other areas gamma.gap<-A.FH/(A.FH+Dvec) gamma.gap summary(gamma.gap) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.3478 0.7605 0.8531 0.8276 0.9219 0.9702 ######################################################################## ## 2. EB METHOD UNDER A NESTED-ERROR MODEL ######################################################################## # 2.1. Transform of variable norminc to make it closer to normal hist(norminc) m<-abs(min(norminc))+1 # Constant to be added to norminc to make it positive norminct<-norminc+m ys<-log(norminct) hist(ys,xlim=c(6,12)) # 2.2. Fit the nested-error model to sample data by REML method # The function lme from the library nlme fits mixed models p<-10 library(nlme) fit.EB<-lme(ys~age2+age3+age4+age5+nat1+educ1+educ3+sitemp1+sitemp2,random=~1|as.factor(prov),method="REML") summary(fit.EB) # 2.3. Save the results of the fitting method Xs<-model.matrix(fit.EB) # Design matrix betaest<-fixed.effects(fit.EB)# Vector of model coefficients (size p) upred<-random.effects(fit.EB) # Predicted random effects: Watch out! It is not a vector, it is a matrix with 1 column sigmae2est<-fit.EB$sigma^2 # Estimated error variance sigmau2est<-as.numeric(VarCorr(fit.EB)[1,1]) # VarCorr(fit2) is the estimated cov. matrix of the model random components # 2.4. Select the provinces with largest CVs of direct estimators: Here we will compute EB estimators only for these provinces povinc15<-c(42,5,34,44,40,21,25,19,16,2) provlab[povinc15] # [1] Soria Avila Palencia Teruel Segovia # [6] Huelva Lerida Guadalajara Cuenca Albacete cvpoord[povinc15] # 2.5. Read non-sample values of auxiliary variables from files for selected provinces Xrd1<-as.matrix(read.table("X_Soria.txt",header=TRUE)) Xrd2<-as.matrix(read.table("X_Avila.txt",header=TRUE)) Xrd3<-as.matrix(read.table("X_Palencia.txt",header=TRUE)) Xrd4<-as.matrix(read.table("X_Teruel.txt",header=TRUE)) Xrd5<-as.matrix(read.table("X_Segovia.txt",header=TRUE)) Xrd6<-as.matrix(read.table("X_Huelva.txt",header=TRUE)) Xrd7<-as.matrix(read.table("X_Lerida.txt",header=TRUE)) Xrd8<-as.matrix(read.table("X_Guadalajara.txt",header=TRUE)) Xrd9<-as.matrix(read.table("X_Cuenca.txt",header=TRUE)) Xrd10<-as.matrix(read.table("X_Albacete.txt",header=TRUE)) # 2.6. Construct design matrix with non-sample values rd<-rep(0,D) rd[povinc15[1]]<-dim(Xrd1)[1] rd[povinc15[2]]<-dim(Xrd2)[1] rd[povinc15[3]]<-dim(Xrd3)[1] rd[povinc15[4]]<-dim(Xrd4)[1] rd[povinc15[5]]<-dim(Xrd5)[1] rd[povinc15[6]]<-dim(Xrd6)[1] rd[povinc15[7]]<-dim(Xrd7)[1] rd[povinc15[8]]<-dim(Xrd8)[1] rd[povinc15[9]]<-dim(Xrd9)[1] rd[povinc15[10]]<-dim(Xrd10)[1] Xrdtot<-list(Xrd1,Xrd2,Xrd3,Xrd4,Xrd5,Xrd6,Xrd7,Xrd8,Xrd9,Xrd10) # 2.7. EB method: Generate non-sample values and calculate EB estimators of poverty proportions and gaps L<-50 # Number of Monte Carlo simulations for approximation of EB estimator # Matrices with poverty incidences and gaps for the L simulations in the EB method povinc<-matrix(0,nr=D,nc=L) povgap<-matrix(0,nr=D,nc=L) # Vectors with final EB estimators povinc.EB<-rep(0,D) povgap.EB<-rep(0,D) # Time counter initialization time1<-Sys.time() time1 for (i in 1:10){ # Cycle for province index print(i) d<-povinc15[i] Xrd<-Xrdtot[[i]] # Obtain sample values for selected province ysd<-ys[prov==d] # Compute conditional means for non-sample units mudpred<-Xrd%*%matrix(betaest,nr=p,nc=1)+upred[d,1] # The conditional distribution of (non-sample data|sample data) in the EB method # can be written as a new nested-error model with different random effects variance. # We calculate this random effects variance (called sigmav2) gammad<-sigmau2est/(sigmau2est+sigmae2est/nd[d]) sigmav2<-sigmau2est*(1-gammad) for (ell in 1:L){ ### Start of Monte Carlo simulations for EB method # Generate random effect for province d vd<-rnorm(1,0,sqrt(sigmav2)) # Generate model random errors for all non-sample units in province d ed<-rnorm(rd[d],0,sqrt(sigmae2est)) # Compute non-sample values yrdpred<-mudpred+vd+ed # Merge non-sample and sample values (full population or census) ydnew<-c(ysd,yrdpred) # Compute province poverty measures with population values Ednew<-exp(ydnew)-m povinc[d,ell]<-mean(Ednew