The census data are in 3 files in the raw_data/census
folder of the DropBox Ecomore2 folder:
PHC2015 01 province.xlsx
with village identifiers and names in Lao and English, and 11 variables for 480 villages;PHC2015_Household_Record.sav
with 168,949 households and 33 questions;PHC2015_Person_Record_Province1.sav
with 820,940 individuals and 61 questions.Here we summarise these 3 data sets per village and merge them into a data frame of 485 villages and 260 variables.
> required <- c("dplyr", # manipulating data frames
+ "haven", # importing .sav files
+ "labelled", # manipulating labelled data
+ "magrittr", # pipe operators
+ "purrr", # functional programming tools
+ "tidyr" # tidying data functions (spread and gather)
+ )
Installing these packages when not already installed:
> to_install <- setdiff(required, rownames(installed.packages()))
> if (length(to_install)) install.packages(to_install)
Loading the packages for interactive use at the command line:
> invisible(lapply(required, library, character.only = TRUE))
The function below reads a .sav
file and converts the output to a proper R data frame. In particular, the names of the variables are built from the metadata information and the attributes are removed.
> read_sav_data <- function(file) {
+ require(haven) # read_sav
+ require(labelled) # remove_attributes
+ require(magrittr) # %>%
+ data <- read_sav(file)
+ data <- to_factor(data)
+ data$EXPORTFILE <- NULL
+ data$FIELDFILE <- NULL
+ names(data) <- data %>%
+ sapply(function(x) attr(x, "label")) %>%
+ unlist() %>%
+ unname() %>%
+ gsub(" +", "_", .)
+ remove_attributes(data, c("label", "format.spss", "display_width"))
+ }
The following function replaces values of a vector above a threshold by NA
s:
> replace_by_na <- function(x, th) {
+ x[x > th] <- NA
+ x
+ }
The following function converts the NaN
values of a vector to NA
s:
> nan2na <- function(x) {
+ x[is.nan(x)] <- NA
+ x
+ }
The following function fixes factor variables:
> fix_factors <- function(x) {
+ x <- as.character(x)
+ x[x == "0"] <- NA
+ x[x == "9"] <- NA
+ x[x == "Not stated"] <- NA
+ x[x == "Other"] <- NA
+ x[x == "Other or Not stated"] <- NA
+ x
+ }
The following function transform all the categorical variable of a data frame df
into variables (as many as there are variables) of proportions of these categories, per village. It is used by the make_quali()
function.
> make_tables <- function(df) {
+ code <- paste0(df$District_ID, "-", df$Village_ID)
+ code_unique <- unique(code)
+ tmp <- df %>%
+ split(code) %>%
+ lapply(function(x) lapply(select_if(x, is.character), table))
+ first_slot <- tmp[[1]]
+ lapply(seq_along(first_slot), function(x) {
+ reduce(lapply(tmp, `[[`, x), bind_rows) %>%
+ data.frame(code_unique, stringsAsFactors = FALSE) %>%
+ mutate_if(is.integer, coalesce, 0L) %>%
+ gather("key", "value", -code_unique) %>%
+ group_by(code_unique) %>%
+ mutate(percentage = value / sum(value)) %>%
+ ungroup() %>%
+ select(-value) %>%
+ spread(key, percentage) %>%
+ separate(code_unique, c("District_ID", "Village_ID")) %>%
+ mutate_at(vars(ends_with("_ID")), as.integer)
+ }) %>%
+ setNames(names(first_slot))
+ }
This function fixes some names after a merging:
> change_names <- function(df) {
+ df %>%
+ names() %>%
+ sub("\\.x\\.x$", "_floor", .) %>%
+ sub("\\.y\\.y$", "_cooking", .) %>%
+ sub("\\.x$", "_roof", .) %>%
+ sub("\\.y$", "_wall", .) %>%
+ sub("^Bamboo$", "Bamboo_floor", .) %>%
+ setNames(df, .)
+ }
The two previous functions are used into the following function that aggregates the categorical variables of a data frame, by village:
> make_quali <- function(df) {
+ df %>%
+ make_tables() %>%
+ reduce(full_join, c("District_ID", "Village_ID")) %>%
+ change_names()
+ }
The following function aggregates the quantitative variables (with the mean) of a data frame, by village:
> make_quanti <- function(df) {
+ df %>%
+ select_if(~ ! is.character(.)) %>%
+ group_by(District_ID, Village_ID) %>%
+ summarise_all(mean, na.rm = TRUE)
+ }
Generating villages data:
> villages <- "../../raw_data/Lao Statistics Bureau/PHC2015 01 province.xlsx" %>%
+ readxl::read_excel() %>%
+ mutate(District_ID = as.integer(sub("^01", "", `district ID`)),
+ District_ID = ifelse(District_ID > 10, 3, District_ID),
+ Village_ID = as.integer(`village ID`),
+ District_Name = sub(" *District *$", "", `District Name`),
+ Village_Name = sub("^B. *", "", `Village Name`)) %>%
+ select(-ID, -`Pro ID`, -province, -`district ID`, -`village ID`, -`Village Name(lo)`,
+ -`District Name(lo)`, -`District Name`, -`Village Name`) %>%
+ mutate_if(is.numeric, as.integer) %>%
+ mutate_at(c("electricity", "watersuply", "market", "heathcenter"), ~ . == "Yes") %>%
+ select(District_ID, Village_ID, District_Name, Village_Name, everything(), -electricity)
Generating households data:
> households <- "../../raw_data/Lao Statistics Bureau/PHC2015_Household_Record.sav" %>%
+ read_sav_data() %>%
+ select(-Province_ID, -Enumeration_area_1, -Household_number, -Bookserial, -EA2,
+ -Book_number, -Building_number, -Form_number_within_household,
+ -If_hhld_continues_on_next_sheet, -Q61X_Males_4_positions, -Village_type,
+ -Q62X_Females_4_positions, -Q63X_Total_persons_4_positions, -Home_sheet_number) %>%
+ mutate_if(is.factor, fix_factors) %>%
+ mutate(Q36._Death_last_12_month = Q36._Death_last_12_month == "Yes",
+ Q40._Moved_in_HH = Q40._Moved_in_HH == "Yes",
+ Q44._Moved_out_of_household = Q44._Moved_out_of_household == "Yes",
+ Q54._Area_occupied = as.integer(na_if(Q54._Area_occupied, "Not stated")),
+ Q55._Number_of_room = as.integer(sub(" Rooms*", "", ifelse(grepl("Room", Q55._Number_of_room), Q55._Number_of_room, NA))),
+ Q61._Males = replace_by_na(as.integer(Q61._Males), 10),
+ Q62._Females = replace_by_na(as.integer(Q62._Females), 10),
+ Q63_Total_persons = replace_by_na(as.integer(Q63_Total_persons), 10),
+ area_per_person = Q54._Area_occupied / Q63_Total_persons,
+ rooms_per_person = Q55._Number_of_room / Q63_Total_persons) %>%
+ mutate_at(vars(starts_with("Q60")), `==`, "Yes") %>%
+ (function(x) lapply(c(make_quali, make_quanti), function(f) f(x))) %>%
+ reduce(full_join, c("District_ID", "Village_ID")) %>%
+ rename(collective_households = Collective,
+ private_households = Private) %>%
+ mutate_all(nan2na)
Generating persons data:
> persons <- "../../raw_data/Lao Statistics Bureau/PHC2015_Person_Record_Province1.sav" %>%
+ read_sav_data() %>%
+ select(District_ID, Village_ID, Household_type, Q4._Sex, Q5._Age, Q6._Marital_Status,
+ Q10A._Born_this_district, Q18._Can_read_and_write, Q19._Attended_school,
+ Q20._Highest_education, Q21._Main_subject_of_study, Q22A._Place_living_2005,
+ Q24._Main_activity_12_month, Q32M_Age_first_birth) %>%
+ mutate_if(is.factor, fix_factors) %>%
+ mutate(Q5._Age = paste0("a", Q5._Age),
+ Q18._Can_read_and_write = sub("Zero", "Not at all", Q18._Can_read_and_write),
+ Q32M_Age_first_birth = as.integer(sub(" *[Yy]ears *", "", Q32M_Age_first_birth))) %>%
+ (function(x) lapply(c(make_quali, make_quanti), function(f) f(x))) %>%
+ reduce(full_join, c("District_ID", "Village_ID")) %>%
+ rename(collective_persons = Collective,
+ private_persons = Private) %>%
+ mutate_all(nan2na)
Checking that we can merge:
> intersect(names(persons), names(households))
[1] "District_ID" "Village_ID"
> intersect(names(persons), names(villages))
[1] "District_ID" "Village_ID"
> intersect(names(households), names(villages))
[1] "District_ID" "Village_ID"
Merging, tuning and writing to disk:
> if (!dir.exists("data")) dir.create("data")
> list(villages, households, persons) %>%
+ reduce(full_join, c("District_ID", "Village_ID")) %>%
+ rename(water_supply = watersuply,
+ Bottled.or.canned.water = Botled.or.canned.water,
+ health_center = heathcenter,
+ On.premises = On.permises,
+ Journalism.and.information = Jounalism.and.information,
+ Mathematics.and.statistics = Mathematics.and.statatistics) %>%
+ select(District_ID:Well.borehole.unprotected, On.premises, Less.than.200.meters,
+ X200.to.499.meters, X500.to.999.meters, X1000.meters.or.more, Bucket:a1,
+ paste0("a", 2:98), a99:Never, No.grade, paste0("Grade.", 1:6),
+ paste0("Lower.secondary.", 1:4), paste0("Upper.secondary.", 1:3),
+ Vocation.education.first.level, Vocation.education.middle.level,
+ Vocation.education.high.level, Graduate.degree.holder,
+ Post.graduate.masters.degree, Higher.than.post.graduate,
+ Agriculture..forestry.and.fishing:Q32M_Age_first_birth) %>%
+ write.csv("data/census.csv", FALSE, row.names = FALSE)