Packages that we need from CRAN:
> cran <- c("dplyr", # data frames manipulation
+ "geosphere", # spherical trigonometry
+ "magrittr", # pipe operators
+ "measurements", # tools for units measurement
+ "purrr", # functional programming tools
+ "sp", # classes and methods for spatial data
+ "readxl", # to read excel files
+ "spatstat" # spatial point pattern analysis
+ )
Installing these packages when not already installed:
> to_install <- setdiff(cran, rownames(installed.packages()))
> if (length(to_install)) install.packages(to_install)
Loading the packages for interactive use at the command line:
> invisible(lapply(cran, library, character.only = TRUE))
The following function convert DMS coordinates into decimal degrees coordiantes:
> dms2dd <- function(x) {
+ require(magrittr) # %>%
+ require(measurements) # conv_unit
+ x %>%
+ sub("°", " ", .) %>%
+ sub("'", " ", .) %>%
+ sub("\\.*''[EN]", "", .) %>%
+ conv_unit("deg_min_sec", "dec_deg")
+ }
The following function identifies in a vector of coordinates the ones that are in DMS and then uses the above function to convert them into decimal degrees coordinates:
> dms2dd2 = function(x) {
+ sel <- which(grepl("[EN]", x))
+ x[sel] <- sapply(x[sel], dms2dd)
+ as.numeric(x)
+ }
The following function reads and reformat GPS coordinates data collected from one of the 2 GPS devices (the old and the new one) as well as from the WhatsApp app:
> read_gps <- function(file, tab, tabs) {
+ require(dplyr) # %>%, transmute
+ require(readxl) # read_excel
+ read_excel(file, grep(tab, tabs, TRUE, value = TRUE)) %>%
+ transmute(id = as.integer(`N° patient`),
+ longitude = dms2dd2(sub("/",".", Longitude)),
+ latitude = dms2dd2(Latitude))
+ }
The following function returns the ID that have more than GPS coordinate record:
> check_duplicates <- function(df) {
+ as.numeric(names(which(table(df[["id"]]) > 1)))
+ }
The following function takes a data frame with 2 columns, longitude and latitude and two rows (2 points) and tests whether these 2 points have exactly the same coordinates:
> are_same_points <- function(x) {
+ require(purrr) # invoke
+ invoke(identical, x$longitude) & invoke(identical, x$latitude)
+ }
The following function takes a data frame of coordinates and ID, and the ID that has more than one entry in the data frame, and returns the distance in km between these two entries:
> dist_btw_2_houses <- function(df, i) {
+ require(dplyr) # %>%, filter, select
+ require(geosphere) # distHarversine
+ tmp <- df %>%
+ filter(id == i) %>%
+ select(-id, -source) %>%
+ as.matrix()
+ distHaversine(tmp[1, ], tmp[2, ]) / 1000
+ }
Reading the GPS coordinates from the 4 sources:
> file <- "../../raw_data/GPS/all main data_2018-09-27.xls"
> tabs <- excel_sheets(file)
> oldmachine <- read_gps(file, "old", tabs)
> newmachine <- read_gps(file, "new", tabs)
> smartphoneapp <- read_gps(file, "app", tabs)
> iplserver <- read_excel(file, grep("server", tabs, TRUE, value = TRUE)) %>%
+ transmute(id = as.integer(Reference),
+ longitude = Longitude,
+ latitude = Latitude)
New names:
* `` -> ...8
> lapply(list(iplserver, smartphoneapp, oldmachine, newmachine), check_duplicates) %>%
+ setNames(c("iplserver", "smartphoneapp", "oldmachine", "newmachine"))
$iplserver
[1] 7681
$smartphoneapp
[1] 4374 5925 6060 6640
$oldmachine
numeric(0)
$newmachine
numeric(0)
Note that patients 7681
and 6060
live in 2 different houses at the same time (see issue #1), hence the reason for their duplication. Patients 4374
and 6640
sent the GPS data twice (see issue #1), so we can delete the first coordinates for the 2 of them. Patient 5925
first reported the house only and the coordinates were retrieved from Google Maps; then he sent the GPS coordinates (see issue #1). So we can delete the first coordinates for this one too:
> to_delete <- sapply(c(4374, 5925, 6640),
+ function(x) which(smartphoneapp$id == x)[1])
> smartphoneapp <- smartphoneapp[-to_delete, ]
> gps <- bind_rows(server = iplserver,
+ whatsapp = smartphoneapp,
+ old_gps = oldmachine,
+ new_gps = newmachine, .id = "source")
Looking for GPS coordinates that have been taken by more than one mean:
> duplicates <- check_duplicates(gps)
Here we remove the duplicates that are exactly the same points:
> duplicates <- gps %>%
+ filter(id %in% duplicates) %>%
+ split(.$id) %>%
+ sapply(are_same_points) %>%
+ `!`() %>%
+ which() %>%
+ names() %>%
+ as.numeric()
Now we look to what sources these duplicates are:
> duplicates <- list(iplserver, smartphoneapp, oldmachine, newmachine) %>%
+ sapply(function(x) is.element(duplicates, x$id)) %>%
+ as.data.frame() %>%
+ setNames(c("server", "whatsapp", "oldgps", "newgps")) %>%
+ mutate(id = duplicates) %>%
+ select(id, everything())
> duplicates
id server whatsapp oldgps newgps
1 4445 FALSE TRUE FALSE TRUE
2 4536 FALSE TRUE FALSE TRUE
3 4582 FALSE TRUE FALSE TRUE
4 4600 FALSE TRUE FALSE TRUE
5 4632 FALSE TRUE FALSE TRUE
6 4831 FALSE TRUE FALSE TRUE
7 4840 FALSE TRUE FALSE TRUE
8 4897 FALSE TRUE FALSE TRUE
9 5018 FALSE TRUE FALSE TRUE
10 5019 FALSE TRUE FALSE TRUE
11 5033 FALSE TRUE FALSE TRUE
12 5122 FALSE TRUE FALSE TRUE
13 5217 FALSE TRUE FALSE TRUE
14 5218 FALSE TRUE FALSE TRUE
15 5251 FALSE TRUE FALSE TRUE
16 5252 FALSE TRUE FALSE TRUE
17 5254 FALSE TRUE FALSE TRUE
18 6060 FALSE TRUE FALSE FALSE
19 6323 FALSE TRUE FALSE TRUE
20 7413 FALSE TRUE FALSE TRUE
21 7436 FALSE TRUE FALSE TRUE
22 7681 TRUE FALSE FALSE FALSE
23 8101 TRUE FALSE FALSE TRUE
So, we have 20 houses that were checked both with WhatsApp and the server:
> duplicates %>%
+ filter(whatsapp, newgps) %>%
+ nrow()
[1] 20
One house that was checked both with the server and by the GPS:
> duplicates %>%
+ filter(server, newgps) %>%
+ nrow()
[1] 1
And the 2 patients that live in 2 houses (see above):
> names(which(rowSums(duplicates) < 2))
NULL
Patients 4445, 4536, 4582, 4600, 4632, 4831, 4840, 4897, 5018, 5019, 5033, 5122, 5217, 5218, 5251, 5252, 5254, 6323, 7413 and 7436 had their coordinates reported both on-site from GPS device and through the smartphone app in order to assess the accuracy of the latter (see issue #3). Thus we can remove these ID from the smartphone data. But, before that, we can look at the distances between the two means of data collection. Let’s first get the ID of these duplicates:
> dpl <- duplicates %>%
+ filter(whatsapp, newgps) %>%
+ select(id) %>%
+ unlist() %>%
+ unname()
The following function makes a matrix of coordinates as required by the function distHaversine
:
> make_coord_mat <- function(df, idsel, sourcesel) {
+ df %>%
+ filter(id %in% idsel, source == sourcesel) %>%
+ arrange(id) %>% # to make sure that the coordinates are in the same order
+ select(longitude, latitude) %>% # in the two matrices
+ as.matrix()
+ }
> dpl_nw <- make_coord_mat(gps, dpl, "new_gps")
> dpl_sp <- make_coord_mat(gps, dpl, "whatsapp")
> sort(distHaversine(dpl_nw, dpl_sp)) / 1000
[1] 0.009745362 0.010946436 0.011268729 0.030684530 0.031444077 0.032671816 0.043119851 0.050385576 0.102365185 0.116229296
[11] 0.133134439 0.158631137 0.264623458 0.288712412 0.317148231 0.704071464 1.345591159 6.081439467 49.873441728 50.627912191
Let’s remove all the whatsapp data that are also collected by GPS, as well as the server datum that is also collected by GPS:
> whatsapp_dplcts <- duplicates %>%
+ filter(whatsapp, newgps) %>%
+ select(id) %>%
+ unlist()
> gps %<>% filter(!(id %in% whatsapp_dplcts & source == "whatsapp"))
Let’s do the same with the server datum. For ID 8101, the coordinates were collected on-site with GPS device by mistake, forgetting that the patient had aleady sent the data through the server (see issue #1). Let’s first look at the distance:
> ip <- make_coord_mat(gps, "8101", "server")
> nm <- make_coord_mat(gps, "8101", "new_gps")
> distHaversine(ip, nm) / 1000
[1] 0.03922978
And then we remove the server datum:
> server_dplcts <- duplicates %>%
+ filter(server, newgps) %>%
+ select(id) %>%
+ unlist()
> gps %<>% filter(!(id %in% server_dplcts & source == "server"))
We can check that the only 2 duplicated IDs and the ones that live in two different houses:
> gps %>%
+ select(id) %>%
+ table() %>%
+ `>`(1) %>%
+ which() %>%
+ names()
[1] "6060" "7681"
We can fancy looking at the distances, in km, between these two houses:
> dist_btw_2_houses(gps, 6060)
[1] 7.052577
> dist_btw_2_houses(gps, 7681)
[1] 9.753693
Let’s check whether there are missing values in the coordinates:
> filter(gps, is.na(longitude) | is.na(latitude))
Registered S3 method overwritten by 'cli':
method from
print.boxx spatstat
# A tibble: 2 x 4
source id longitude latitude
<chr> <int> <dbl> <dbl>
1 whatsapp 6983 NA NA
2 new_gps NA NA NA
Let’s remove the points with missing values in the coordinates:
> gps %<>% na.exclude()
Let’s get the polygons of the provinces of Lao PDR:
> provinces <- readRDS("../../cleaned_data/gadm/gadm36_LAO_1_sp.rds")
Let’s extract the province of Vientiane Capital:
> vientiane <- subset(provinces, grepl("Vientiane", provinces$VARNAME_1))
Let’s identify the GPS points that are not inside the polygon of the province of Vientiane Capital:
> not_in_vc <- gps %>%
+ select(-id, -source) %>%
+ SpatialPoints(CRS(proj4string(vientiane))) %>%
+ over(vientiane) %>%
+ `[`(, 1) %>%
+ is.na() %>%
+ which()
> gps[not_in_vc, ]
# A tibble: 6 x 4
source id longitude latitude
<chr> <int> <dbl> <dbl>
1 whatsapp 5553 103. 18.3
2 whatsapp 6019 102. 18.0
3 whatsapp 7004 0.926 40.4
4 whatsapp 7136 103. 21.5
5 new_gps 4100 103. 18.2
6 new_gps 4354 103. 18.2
ID 7004
is obviously a mistake, let’s remove it too:
> out_of_vc <- filter(gps[not_in_vc, ], latitude < 100, longitude > 100)
And let’s see where these points left are on the map:
> plot(provinces, col = "grey")
> with(out_of_vc, points(longitude, latitude, col = "red"))
7136
is from Phôngsali province! The other ones are actually very close to Vientiane capital:
> plot(vientiane, col = "grey")
> with(out_of_vc, points(longitude, latitude, col = "red"))
> if (!dir.exists("data")) dir.create("data")
> write.csv(select(gps, id, source, longitude, latitude), "data/gps.csv", FALSE, row.names = FALSE)