Code
library(tidyverse)
library(readxl) # part of tidyverse - reading excel file
library(here) # file paths
library(janitor)
library(fs) # to use dir_tree()
library(ggplot2)
library(scales)
library(gt)
The Global Pandemic and Epidemic-Prone Disease Outbreaks dataset includes information on over 3,000 pandemic- and epidemic-prone disease outbreaks associated with more than 90 different infectious diseases, occurring globally from January 1996 to the present.
The aim of this analysis is to assess what proportion of the global outbreaks captured in this dataset from the Disease Outbreak News (DON) were due to water-borne diseases (e.g. cholera, diarrhoeal diseases), i.e. were ‘WASH-related’, and where in the world these outbreaks typically occurred.
This dataset was downloaded from (Disease Outbreaks Data Project 2024).
Full details on its curation are published by (Torres Munguía et al. 2022).
The following section describes the steps taken to load and prepare data for analysis in R (R Core Team 2019).
As the data was obtained from a freely available online source, no specific details are provided here on the exact data curation. This can be read directly in the published article.
library(tidyverse)
library(readxl) # part of tidyverse - reading excel file
library(here) # file paths
library(janitor)
library(fs) # to use dir_tree()
library(ggplot2)
library(scales)
library(gt)# load raw global outbreak data
# Earlier version of this step, reading from an absolute path:
# outbreaks_raw <- read_excel("/cloud/project/data/raw/disease_outbreaks_HDX.xlsx")
# Outbreak records from the raw Excel workbook; the path is built with
# here::here() rather than hard-coded.
raw_data <- here::here("data/raw/disease_outbreaks_HDX.xlsx") |>
  read_excel()

# Transmission route list per icd103n (table generated by AI). The file is
# semicolon-delimited; comment = "" means no character is treated as a
# comment marker, and show_col_types = FALSE silences the column-type message.
# The same table is bound to two names: disease_data_tbl first, then
# transmission_route.
disease_data_tbl <- read_delim(
  here::here("data/raw/transmission_route.csv"),
  delim = ";",
  comment = "",
  show_col_types = FALSE
)
transmission_route <- disease_data_tbl

# prepare data for analysis
# Tidy the raw outbreak table before it is joined to the transmission lookup.
pre_processed_data <- raw_data |>
  # standardise the column names (janitor)
  clean_names() |>
  # discard the first row, i.e. the one directly after the column titles
  slice(-1L) |>
  # only DONs are of interest, so rows sourced from the coronavirus dashboard
  # are removed; because of how filter() treats NA, rows where do_ns is
  # missing are removed by this comparison as well
  filter(do_ns != "Coronavirus dashboard")
# Add a logical wash_related flag to the transmission route table so it can be
# joined to the main dataset: TRUE where the `WASH-Related?` column contains
# "Yes", FALSE where it does not, NA where the value is missing.
processed_transmission_route <- mutate(
  transmission_route,
  wash_related = str_detect(`WASH-Related?`, pattern = "Yes")
)
# join onto main dataset
# Each outbreak row is matched on its icd103n value against the Disease column
# of the transmission route table and gains that table's columns (including
# wash_related). Outbreak rows without a match are kept, with NA in the added
# columns.
# The lookup is expected to hold at most one row per Disease. Declaring the
# join as many-to-one makes it stop with an error if a Disease value appears
# more than once, instead of silently duplicating outbreak rows and inflating
# every count computed from processed_data.
processed_data <- pre_processed_data |>
  left_join(
    processed_transmission_route,
    by = c("icd103n" = "Disease"),
    relationship = "many-to-one"
  )
# Save the processed data into the processed-data folder. Careful: this
# overwrites any file already present at this path.
processed_data |>
  write_csv(here::here("data/processed/my-processed-data.csv"))
Figure 1 shows the number of outbreaks over time, split by whether WASH-related
# Figure 1: number of outbreaks per year, split by WASH vs non-WASH.
processed_data |>
  # rows with no WASH classification (wash_related is NA) are left out of
  # the chart
  filter(!is.na(wash_related)) |>
  # list TRUE before FALSE so the WASH-related group comes first
  mutate(wash_related = factor(wash_related, levels = c(TRUE, FALSE))) |>
  group_by(year, wash_related) |>
  # .groups = "drop" returns an ungrouped table directly, so no separate
  # ungroup() is needed and no regrouping message is printed
  summarise(count = n(), .groups = "drop") |>
  ggplot() +
  geom_col(aes(x = year, y = count, fill = wash_related)) +
  theme_classic() +
  theme(
    # move legend to the bottom of graph
    legend.position = "bottom",
    # rotate the x-axis labels by 45 degrees; hjust = 1 aligns the right edge
    # of the text with the tick mark
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    title = "Outbreaks over time, split by whether WASH-related",
    subtitle = "As reported in WHO's Disease Outbreak News (DONs)",
    x = "Year",
    y = "Number of outbreaks",
    fill = "WASH-related" # legend title
  )
Figure 2 shows the number of WASH-related outbreaks over time, split by WHO region.
# Figure 2: number of WASH-related outbreaks per year, split by WHO region.
processed_data |>
  filter(wash_related == TRUE) |> # keep only WASH related outbreaks
  group_by(year, who_region) |>
  # .groups = "drop" returns an ungrouped table directly, so no separate
  # ungroup() is needed and no regrouping message is printed
  summarise(count = n(), .groups = "drop") |>
  # outbreaks with no WHO region recorded are left out of the chart
  filter(!is.na(who_region)) |>
  ggplot() +
  geom_col(aes(x = year, y = count, fill = who_region)) +
  theme_classic() +
  theme(
    # move legend to the bottom of graph
    legend.position = "bottom",
    # rotate the x-axis labels by 45 degrees; hjust = 1 aligns the right edge
    # of the text with the tick mark
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  labs(
    # title = "Outbreaks over time, split by WHO region",
    # subtitle = "As reported in WHO's Disease Outbreak News (DONs)",
    x = "Year",
    y = "Number of outbreaks",
    fill = "WHO region" # legend title
  )
Figure 3 shows the number of WASH-related outbreaks by disease and WHO region.
# Figure 3: the most common WASH-related outbreaks, by disease and WHO region.
processed_data |>
  filter(wash_related == TRUE) |> # keep only WASH related outbreaks
  group_by(who_region, disease) |>
  summarise(count = n(), .groups = "drop") |>
  # outbreaks with no WHO region recorded are left out of the chart
  filter(!is.na(who_region)) |>
  ggplot() +
  geom_col(aes(x = who_region, y = count, fill = disease)) +
  scale_fill_discrete(
    # wrap long disease names in the legend at 30 characters
    labels = scales::label_wrap(30)
  ) +
  scale_x_discrete(
    # wrap the region names on the x-axis at 20 characters
    labels = scales::label_wrap(20)
  ) +
  theme_classic() +
  labs(
    # title = "WASH-related outbreaks*, split by WHO region",
    # subtitle = "*As reported in WHO's Disease Outbreak News (DONs)",
    x = "WHO Region",
    y = "Number of outbreaks",
    fill = "Disease" # legend title
  ) +
  guides(
    # lay the legend out in 3 columns, filled row by row, with the title
    # placed above the keys
    fill = guide_legend(
      ncol = 3,
      byrow = TRUE,
      title.position = "top",
      direction = "horizontal"
    )
  ) +
  theme(
    legend.position = "bottom", # move legend to the bottom of graph
    legend.text = element_text(size = 9)
  )
Table 1 provides a summary of the 10 most common combinations of WASH-related disease and WHO region among the outbreaks reported in this dataset between 1996 and 2025, in descending order of the number of outbreaks reported.
# Table 1: the ten largest WASH-related outbreak counts per WHO region and
# disease, rendered with gt.
processed_data |>
  # WASH-related outbreaks that have a WHO region recorded
  filter(wash_related, !is.na(who_region)) |>
  # one row per region/disease pair, with the number of outbreaks in `count`
  count(who_region, disease, name = "count") |>
  # largest counts first, then keep the first ten rows
  arrange(desc(count)) |>
  slice_head(n = 10) |>
  gt()
| who_region | disease | count |
|---|---|---|
| African Region | Cholera | 155 |
| African Region | Acute poliomyelitis, unspecified | 82 |
| Eastern Mediterranean Region | Cholera | 40 |
| Region of the Americas | Cholera | 23 |
| Eastern Mediterranean Region | Acute poliomyelitis, unspecified | 18 |
| European Region | Acute hepatitis A | 15 |
| South-East Asia Region | Cholera | 12 |
| Western Pacific Region | Cholera | 12 |
| European Region | Infections due to other Salmonella | 11 |
| European Region | Acute poliomyelitis, unspecified | 9 |