#####################################################################
#####################################################################
###
###      Practice num. 2: TRADITIONAL SMALL AREA ESTIMATORS
###                  August 2009                           
###

### Instructions: To follow this practice, first go to the website: 

## http://www.uc3m.es/portal/page/portal/dpto_estadistica/home/members/isabel_molina_peralta

## and, by clicking on "Course material", download:

## - The Spanish data set: "silc0106_ISI09.txt"

## - The description of the data set: "Description_silc0106_ISI09.doc"

## - The population sizes of provinces: 
## "PopnSizeProv.txt",
## "PopnSizeProvByAge.txt", 
## "PopnSizeProvByEdu.txt",
## "PopnSizeProvByNat.txt",
## "PopnSizeProvBySit.txt".

## - The values of auxiliary variables for a group of provinces:
## "X_Albacete.txt"
## "X_Avila.txt"
## "X_Cuenca.txt"
## "X_Guadalajara.txt"
## "X_Huelva.txt"
## "X_Lerida.txt"
## "X_Palencia.txt"
## "X_Segovia.txt"
## "X_Soria.txt"
## "X_Teruel.txt"

#####################################################################
# Notation: D will denote the number of areas and d=1,...,D will be the area index
#####################################################################

### 1. READING DATA AND OBTAINING SAMPLE AND POPULATION SIZES

# 1.1. Read the data file silc0106_ISI09.txt

# Load the survey data; the first line of the file holds the column names.
data <- read.table("silc0106_ISI09.txt", header = TRUE)
head(data, 10)
# Before attach() the columns are only reachable through the data frame:
# a bare `provl` at this point stops with "object 'provl' not found"
# (unless an object of that name already exists in the workspace).
data$provl
# attach() puts the columns of `data` on the search path, so that later
# sections can refer to them by bare name (e.g. prov).
attach(data)
provl

# 1.2. Sample size = 1st. dimension of data file

# Show both dimensions of the data file, then keep the number of rows
# as the overall sample size n.
dim(data)
n <- nrow(data)
n
#[1] 34389

# 1.3. Number of provinces (areas or domains) in the data file

# Distinct province codes present in the sample, and how many there are.
unique(prov)
D <- length(unique(prov))

# 1.4. Province sample sizes

# nd[d] = number of sampled units whose province code equals d.
# NOTE: this relies on the provinces being coded 1, ..., D.
# seq_len(D) (rather than 1:D) keeps the iteration empty when D is 0.
nd <- vapply(seq_len(D), function(d) sum(prov == d), numeric(1))

# 1.5. Read the population sizes of the provinces 

# Province population sizes. attach() exposes the columns of the file by
# bare name; Nd and provlab (presumably columns of this file) are used below.
PopnSizes <- read.table(file = "PopnSizeProv.txt", header = TRUE)
attach(PopnSizes)
Nd
provlab

# Per province: label, sample size, population size and number of sampled
# units per 10,000 population units.
data.frame(provlab, nd, Nd, 10000 * nd / Nd)

# 1.6. Popn. size

# Total population size, followed by the overall sample size and the
# overall number of sampled units per 10,000 population units.
N <- sum(Nd)
N
n
10000 * n / N
#[1] 43586848
#[1] 34389
#[1] 7.889765

### 2. DESCRIPTION OF INCOME VARIABLE AND CONSTRUCTION OF POVERTY VARIABLES

# Poverty line

# Poverty line, fixed beforehand as 0.6*median(true norminc).
z <- 6557.143

# 2.1. Distribution of normalized income

summary(norminc)
# Histogram of normalized income with the poverty line added as a vertical
# line (colour 2 of the current palette, line width 2).
hist(norminc)
abline(v = z, lwd = 2, col = 2)

# Task: Do a boxplot of norminc and include the poverty line

# 2.2. Construct the variable "poor": Indicator of people under the poverty line

# poor[i] = 1 when the normalized income of unit i is strictly below z,
# and 0 otherwise (units with a missing income stay at 0).
poor <- numeric(n)
poor[which(norminc < z)] <- 1
summary(as.factor(poor))

# 2.3. Construct the variable "gap": relative distance to the poverty line
# for people under the poverty line, and zero otherwise

# Shortfall (z - norminc) truncated at zero, relative to the poverty line.
gap <- pmax(z - norminc, 0) / z

# 2.4. Direct estimators of poverty incidence and poverty gap for Spain
      
# Weighted sample totals of the poverty indicator and of the gap, divided
# by the population size N and expressed as percentages.
poor.dir <- 100 * sum(weight * poor) / N
gap.dir <- 100 * sum(weight * gap) / N
poor.dir
gap.dir
#[1] 24.79721
#[1] 8.508456

# 2.5. Calculate province poverty counts

# counts[d] = number of sampled units flagged as poor in province d
# (provinces are assumed to be coded 1, ..., D).
# seq_len(D) (rather than 1:D) keeps the iteration empty when D is 0.
counts <- vapply(seq_len(D), function(d) sum(poor[prov == d]), numeric(1))

summary(counts)

##################################################################
### 3. DIRECT ESTIMATORS OF PROVINCE POVERTY PROPORTIONS AND GAPS
##################################################################

# 3.1. Calculate direct estimators of province poverty proportions and gaps

# Direct estimator of the province mean of y, for every province d = 1..D:
# weighted sample total of y within the province divided by the province
# population size Nd[d]. The province mask is computed once per province.
direct_by_province <- function(y) {
  vapply(
    seq_len(D),
    function(d) {
      in_d <- prov == d
      sum(y[in_d] * weight[in_d]) / Nd[d]
    },
    numeric(1)
  )
}

poord.dir <- direct_by_province(poor)
gapd.dir <- direct_by_province(gap)

# Estimates are shown as percentages.
data.frame(province = provlab, prop = 100 * poord.dir, gap = 100 * gapd.dir)

# 3.2. Estimated variances of direct estimators

# Estimated variance of the direct estimator of y in every province d = 1..D:
#   sum over the province sample of w * (w - 1) * y^2, divided by Nd[d]^2,
# where w are the sampling weights of the units in the province.
var_by_province <- function(y) {
  vapply(
    seq_len(D),
    function(d) {
      in_d <- prov == d
      w <- weight[in_d]
      sum(w * (w - 1) * (y[in_d]^2)) / (Nd[d]^2)
    },
    numeric(1)
  )
}

varpoord <- var_by_province(poor)
vargapd <- var_by_province(gap)

# 3.3. Coefficients of variation

# Coefficient of variation in percent: 100 * (estimated standard error) /
# (direct estimate), computed for all provinces at once.
cvpoord <- 100 * sqrt(varpoord) / poord.dir
cvgapd <- 100 * sqrt(vargapd) / gapd.dir

# Province-by-province listing ...
cv.table <- data.frame(province = provlab, cvprop = cvpoord, cvgap = cvgapd)
cv.table

# ... and the distribution of both sets of CVs across provinces.
summary(data.frame(cvpoord, cvgapd))

########################################################################
### 4. Post-stratified synthetic estimators with nationality as post-strata
########################################################################

# Sample counts per nationality category, and the number of categories J.
summary(as.factor(nat))
J <- length(unique(nat))

# 4.1. Read population sizes for each nationality from a file

# Nj will hold the population size of each nationality j; it is filled in 4.2.
Nj <- numeric(J)
PopnSizeNat <- read.table(file = "PopnSizeProvByNat.txt", header = TRUE)
# Ndj: population sizes by nationality in each province, taken from columns
# 2 and 3 of the file.
# NOTE(review): picking exactly two columns assumes J == 2 -- confirm.
Ndj <- as.matrix(PopnSizeNat[, 2:3])

# 4.2. Calculate direct estimators for each nationality

# Population size of each nationality j: total of column j of Ndj.
# (Nationalities are assumed to be coded 1, ..., J.)
Nj <- vapply(seq_len(J), function(j) sum(Ndj[, j]), numeric(1))

# Direct estimator of the mean of y within every nationality j = 1..J:
# weighted sample total of y over units with nat == j, divided by Nj[j].
direct_by_nat <- function(y) {
  vapply(
    seq_len(J),
    function(j) {
      in_j <- nat == j
      sum(y[in_j] * weight[in_j]) / Nj[j]
    },
    numeric(1)
  )
}

poorj.dir <- direct_by_nat(poor)
gapj.dir <- direct_by_nat(gap)

data.frame(Nj, poorj.dir, gapj.dir)

# 4.3. Calculate post-stratified synthetic estimators

# Post-stratified synthetic estimator for province d:
#   sum_j Ndj[d, j] * (direct estimate for nationality j) / Nd[d].
# The numeric vectors are used directly as column vectors in %*%, so the
# number of post-strata is not hard-coded and a mismatch between the number
# of columns of Ndj and the number of post-strata raises a "non-conformable"
# error instead of being silently recycled or truncated.
poord.ps <- as.vector(Ndj %*% poorj.dir) / Nd
gapd.ps <- as.vector(Ndj %*% gapj.dir) / Nd

# Direct and post-stratified synthetic estimates side by side, in percent.
data.frame(
  prop.dir = 100 * poord.dir,
  prop.ps = 100 * poord.ps,
  gap.dir = 100 * gapd.dir,
  gap.ps = 100 * gapd.ps
)

# Task: Calculate post-stratified synthetic estimators with gen, age or educ as post-strata.

# Two categorical variables can be combined, such as gen and age:

# Interaction factor with one level per (age, gen) combination; the distinct
# combinations observed in the sample are then listed.
agegen <- as.factor(age):as.factor(gen)
unique(agegen)

# Task (optional): Calculate post-stratified synthetic estimators with agegen as post-strata

########################################################################
### 5. COMPOSITE ESTIMATORS: Sample-size dependent estimator
########################################################################

# 5.1. Calculate estimated province popn. sizes Ndhat and weights phid for SSD estimator

# Ndhat[d]: estimated population size of province d, i.e. the sum of the
# sampling weights of the units sampled in that province.
Ndhat <- vapply(seq_len(D), function(d) sum(weight[prov == d]), numeric(1))

# delta scales the population size that Ndhat is compared with.
delta <- 1

# Weight given to the direct estimator in the SSD estimator:
#   phid[d] = Ndhat[d] / (delta * Nd[d])  if Ndhat[d] < delta * Nd[d],
#   phid[d] = 1                           otherwise,
# which is the ratio capped at 1 (for positive delta * Nd).
phid <- pmin(Ndhat / (delta * Nd), 1)

summary(phid)

# 5.2. Calculate SSD estimator

# SSD composite estimator: convex combination that puts weight phid on the
# direct estimator and 1 - phid on the post-stratified synthetic estimator.
ssd_combine <- function(direct, synthetic) {
  phid * direct + (1 - phid) * synthetic
}

poord.c <- ssd_combine(poord.dir, poord.ps)
gapd.c <- ssd_combine(gapd.dir, gapd.ps)

# 5.3. Analyze how much strength we are borrowing

# phid equal to 1 means the province estimate is purely direct; the smaller
# phid is, the more weight goes to the synthetic estimator.
summary(phid)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.4846  0.8800  0.9779  0.9224  1.0000  1.0000 

# Composite estimates by province, in percent.
data.frame(province = provlab, prop.c = 100 * poord.c, gap.c = 100 * gapd.c)

# Task: Use a different post-stratified synthetic estimator to derive composite estimators
# Task: Compare the values of the different estimators as you want (displaying data.frames, making graphics, summaries, ...)

### Before closing R, Save Workspace with a name such as ISI09.RData.
### We will need it for Practice num 3.