Preambule

The census data are in 3 files in the raw_data/census folder of the DropBox Ecomore2 folder:

Here we summarise these 3 data sets per village and merge them into a data frame of 485 villages and 260 variables.

Packages

> required <- c("dplyr",    # manipulating data frames
+               "haven",    # importing .sav files
+               "labelled", # manipulating labelled data
+               "magrittr", # pipe operators
+               "purrr",    # functional programming tools
+               "tidyr"     # tidying data functions (spread and gather)
+               )

Installing these packages when not already installed:

> to_install <- setdiff(required, rownames(installed.packages()))
> if (length(to_install)) install.packages(to_install)

Loading the packages for interactive use at the command line:

> invisible(lapply(required, library, character.only = TRUE))

Utilitary functions

The function below reads a .sav file and converts the output to a proper R data frame. In particular, the names of the variables are built from the metadata information and the attributes are removed.

> read_sav_data <- function(file) {
+   require(haven)    # read_sav
+   require(labelled) # remove_attributes
+   require(magrittr) # %>% 
+   data <- read_sav(file)
+   data <- to_factor(data)
+   data$EXPORTFILE <- NULL
+   data$FIELDFILE <- NULL
+   names(data) <- data %>%
+     sapply(function(x) attr(x, "label")) %>%
+     unlist() %>%
+     unname() %>%
+     gsub(" +", "_", .)
+   remove_attributes(data, c("label", "format.spss", "display_width"))
+ }

The following function replaces values of a vector above a threshold by NAs:

> replace_by_na <- function(x, th) {
+   x[x > th] <- NA
+   x
+ }

The following function converts the NaN values of a vector to NAs:

> nan2na <- function(x) {
+   x[is.nan(x)] <- NA
+   x
+ }

The following function fixes factor variables:

> fix_factors <- function(x) {
+   x <- as.character(x)
+   x[x == "0"] <- NA
+   x[x == "9"] <- NA
+   x[x == "Not stated"] <- NA
+   x[x == "Other"] <- NA
+   x[x == "Other or Not stated"] <- NA
+   x
+ }

The following function transform all the categorical variable of a data frame df into variables (as many as there are variables) of proportions of these categories, per village. It is used by the make_quali() function.

> make_tables <- function(df) {
+   code <- paste0(df$District_ID, "-", df$Village_ID)
+   code_unique <- unique(code)
+   tmp <- df %>%
+     split(code) %>% 
+     lapply(function(x) lapply(select_if(x, is.character), table))
+   first_slot <- tmp[[1]]
+   lapply(seq_along(first_slot), function(x) {
+       reduce(lapply(tmp, `[[`, x), bind_rows) %>% 
+         data.frame(code_unique, stringsAsFactors = FALSE) %>% 
+         mutate_if(is.integer, coalesce, 0L) %>%   
+         gather("key", "value", -code_unique) %>% 
+         group_by(code_unique) %>% 
+         mutate(percentage = value / sum(value)) %>% 
+         ungroup() %>% 
+         select(-value) %>% 
+         spread(key, percentage) %>% 
+         separate(code_unique, c("District_ID", "Village_ID")) %>% 
+         mutate_at(vars(ends_with("_ID")), as.integer)
+     }) %>% 
+     setNames(names(first_slot))
+ }

This function fixes some names after a merging:

> change_names <- function(df) {
+   df %>% 
+     names() %>% 
+     sub("\\.x\\.x$", "_floor", .) %>% 
+     sub("\\.y\\.y$", "_cooking", .) %>% 
+     sub("\\.x$", "_roof", .) %>% 
+     sub("\\.y$", "_wall", .) %>% 
+     sub("^Bamboo$", "Bamboo_floor", .) %>% 
+     setNames(df, .)
+ }

The two previous functions are used into the following function that aggregates the categorical variables of a data frame, by village:

> make_quali <- function(df) {
+   df %>%
+     make_tables() %>% 
+     reduce(full_join, c("District_ID", "Village_ID")) %>% 
+     change_names()
+ }

The following function aggregates the quantitative variables (with the mean) of a data frame, by village:

> make_quanti <- function(df) {
+   df %>% 
+     select_if(~ ! is.character(.)) %>%
+     group_by(District_ID, Village_ID) %>% 
+     summarise_all(mean, na.rm = TRUE)
+ }

Villages data

Generating villages data:

> villages <- "../../raw_data/Lao Statistics Bureau/PHC2015 01 province.xlsx" %>% 
+   readxl::read_excel() %>% 
+   mutate(District_ID = as.integer(sub("^01", "", `district ID`)),
+          District_ID = ifelse(District_ID > 10, 3, District_ID),
+          Village_ID = as.integer(`village ID`),
+          District_Name = sub(" *District *$", "", `District Name`), 
+          Village_Name = sub("^B. *", "", `Village Name`)) %>% 
+   select(-ID, -`Pro ID`, -province, -`district ID`, -`village ID`, -`Village Name(lo)`,
+          -`District Name(lo)`, -`District Name`, -`Village Name`) %>% 
+   mutate_if(is.numeric, as.integer) %>%
+   mutate_at(c("electricity", "watersuply", "market", "heathcenter"), ~ . == "Yes") %>% 
+   select(District_ID, Village_ID, District_Name, Village_Name, everything(), -electricity)

Households data

Generating households data:

> households <- "../../raw_data/Lao Statistics Bureau/PHC2015_Household_Record.sav" %>% 
+   read_sav_data() %>% 
+   select(-Province_ID, -Enumeration_area_1, -Household_number, -Bookserial, -EA2,
+          -Book_number, -Building_number, -Form_number_within_household,
+          -If_hhld_continues_on_next_sheet, -Q61X_Males_4_positions, -Village_type,
+          -Q62X_Females_4_positions, -Q63X_Total_persons_4_positions, -Home_sheet_number) %>% 
+   mutate_if(is.factor, fix_factors) %>% 
+   mutate(Q36._Death_last_12_month = Q36._Death_last_12_month == "Yes",
+          Q40._Moved_in_HH = Q40._Moved_in_HH == "Yes",
+          Q44._Moved_out_of_household = Q44._Moved_out_of_household == "Yes",
+          Q54._Area_occupied = as.integer(na_if(Q54._Area_occupied, "Not stated")),
+          Q55._Number_of_room = as.integer(sub(" Rooms*", "", ifelse(grepl("Room", Q55._Number_of_room), Q55._Number_of_room, NA))),
+          Q61._Males = replace_by_na(as.integer(Q61._Males), 10),
+          Q62._Females = replace_by_na(as.integer(Q62._Females), 10),
+          Q63_Total_persons = replace_by_na(as.integer(Q63_Total_persons), 10),
+          area_per_person = Q54._Area_occupied / Q63_Total_persons,
+          rooms_per_person = Q55._Number_of_room / Q63_Total_persons) %>% 
+   mutate_at(vars(starts_with("Q60")), `==`, "Yes") %>% 
+   (function(x) lapply(c(make_quali, make_quanti), function(f) f(x))) %>% 
+   reduce(full_join, c("District_ID", "Village_ID")) %>% 
+   rename(collective_households = Collective,
+          private_households = Private) %>% 
+   mutate_all(nan2na)

Persons data

Generating persons data:

> persons <- "../../raw_data/Lao Statistics Bureau/PHC2015_Person_Record_Province1.sav" %>% 
+   read_sav_data() %>% 
+   select(District_ID, Village_ID, Household_type, Q4._Sex, Q5._Age, Q6._Marital_Status,
+          Q10A._Born_this_district, Q18._Can_read_and_write, Q19._Attended_school,
+          Q20._Highest_education, Q21._Main_subject_of_study, Q22A._Place_living_2005,
+          Q24._Main_activity_12_month, Q32M_Age_first_birth) %>% 
+   mutate_if(is.factor, fix_factors) %>% 
+   mutate(Q5._Age = paste0("a", Q5._Age),
+          Q18._Can_read_and_write = sub("Zero", "Not at all", Q18._Can_read_and_write),
+          Q32M_Age_first_birth = as.integer(sub(" *[Yy]ears *", "", Q32M_Age_first_birth))) %>% 
+   (function(x) lapply(c(make_quali, make_quanti), function(f) f(x))) %>% 
+   reduce(full_join, c("District_ID", "Village_ID")) %>% 
+   rename(collective_persons = Collective,
+          private_persons = Private) %>% 
+   mutate_all(nan2na)

Merging and writing to disk

Checking that we can merge:

> intersect(names(persons), names(households))
[1] "District_ID" "Village_ID" 
> intersect(names(persons), names(villages))
[1] "District_ID" "Village_ID" 
> intersect(names(households), names(villages))
[1] "District_ID" "Village_ID" 

Merging, tuning and writing to disk:

> if (!dir.exists("data")) dir.create("data")
> list(villages, households, persons) %>% 
+   reduce(full_join, c("District_ID", "Village_ID")) %>% 
+   rename(water_supply = watersuply,
+          Bottled.or.canned.water = Botled.or.canned.water,
+          health_center = heathcenter,
+          On.premises = On.permises,
+          Journalism.and.information = Jounalism.and.information,
+          Mathematics.and.statistics = Mathematics.and.statatistics) %>% 
+   select(District_ID:Well.borehole.unprotected, On.premises, Less.than.200.meters,
+          X200.to.499.meters, X500.to.999.meters, X1000.meters.or.more, Bucket:a1,
+          paste0("a", 2:98), a99:Never, No.grade, paste0("Grade.", 1:6),
+          paste0("Lower.secondary.", 1:4), paste0("Upper.secondary.", 1:3),
+          Vocation.education.first.level, Vocation.education.middle.level,
+          Vocation.education.high.level, Graduate.degree.holder,
+          Post.graduate.masters.degree, Higher.than.post.graduate,
+          Agriculture..forestry.and.fishing:Q32M_Age_first_birth) %>%
+   write.csv("data/census.csv", FALSE, row.names = FALSE)