# load libraries
library(dplyr)
library(ggplot2)
library(ggbeeswarm)
library(ggrepel)
library(ggtext)
# set default theme
theme_set(theme_minimal())
# set url
<- paste('https://raw.githubusercontent.com',
.repoURL 'rfordatascience',
'tidytuesday',
'master',
'data',
'2023',
'2023-01-24',
sep = "/")
# download data
<- readr::read_csv(paste(.repoURL, 'survivalists.csv', sep = "/"))
survivalists <- readr::read_csv(paste(.repoURL, 'loadouts.csv', sep = "/"))
loadouts <- readr::read_csv(paste(.repoURL, 'episodes.csv', sep = "/"))
episodes <- readr::read_csv(paste(.repoURL, 'seasons.csv', sep = "/")) seasons
Continuing previous work on Tidy Tuesday (Mock 2022), data from alone
package is analysed. The data is about Alone series which shows participants are left to survive on their own in a remote area. For this week, I attempted to replicate the graphs by the author (Dan Oehm) and another graph by Tanya Saphiro. Reproducing others’ plot provides me with guidance on how to use the tidyverse
package. I only cheated when I wanted to understand the type of geom_
used in the latter graph, and everything else was my original efforts. Besides, I also provided my original work in the third visualisation.
Preparation
In preparation, several relevant libraries are loaded into R. For the graphics, I used the configuration from theme_minimal
as the basis. All datasets then downloaded into the system for analysis and visualisation.
Visualisation 1: remainder proportion of genders
The first graph was inspired by the author of the alone
package describing how fast the number of survivalists, given their gender, depleted throughout the time represented by the proportion. The data was prepared by counting the total number of participants for each day. I used a type of purrr::map2
functions which is convenient to perform a function by providing two lists of parameters. Basically, I sum the TRUE
values that represents whether participants’ number of days lasted are more than a particular day number. This was previously filtered according to the gender.
<- tibble(day = rep(0:100, each = 2),
data_p1 sex = rep(c("Male", "Female"), 101)) |>
mutate(survivor = purrr::map2_dbl(day, sex, function(d, g) {
sum(survivalists$days_lasted[survivalists$gender == g] >= d)
|>
})) group_by(sex) |>
mutate(survivor = survivor / first(survivor))
The plot is a line plot which is produced using geom_line
function. From the visualisation, it can be seen that female could last longer.
# generate visualisation
ggplot(data_p1, aes(x = day, y = survivor, colour = sex)) +
# add graphical elements
geom_line(size = 1.2) +
# manually configure graphical elements
scale_y_continuous(breaks = 0:5 / 5) +
scale_colour_manual(values = c("#765631", "#4234A9")) +
# add plot labels
labs(title = "Survival curves",
subtitle = paste("there is some evidence that, on average, women tend",
"to survive longer than men"),
y = "Proportion Remaining",
x = "Days Lasted",
colour = "Gender") +
# configure the theme
theme(
plot.title = element_text(face = "bold"),
plot.title.position = "plot",
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Visualisation 2:
The next visualisation is from a tweet by Tanya Saphiro. To help with the making of the graph, I created two dataframes for the average numbers of survival period and the name of the winners for each gender. I also specified a variable to store the mean of all participants survival days. For representational purposes, the reason players tapped out of the game in the variable reason_category
is modified in accord with the original graph.
# create table for average survival of both male and female
<- group_by(survivalists, gender) |>
survivalists_gender_summary summarise(days_lasted = mean(days_lasted)) |>
mutate(label = paste0(gender, " avg\n", round(days_lasted, digits = 1), " Days"))
# create table for participants who could last the longest
<- group_by(survivalists, gender) |>
survivalists_days_lasted_max arrange(desc(days_lasted)) |>
summarise(name = first(name) |> stringr::str_replace(" ", "\n"),
days_lasted = first(days_lasted))
# calculate the average period of survival
<- mean(survivalists$days_lasted)
days_lasted_average
# modify reason category for better description
<- survivalists |>
data_p2 mutate(status = case_when(
== "Family / personal" ~ "Out - Personal",
reason_category == "Medical / health" ~ "Out - Medical",
reason_category == "Loss of inventory" ~ "Out - Loss of inventory",
reason_category is.na(reason_category) ~ "Winner",
TRUE ~ ""
))
The main graphical element is provided through geom_beeswarm
function. This is similar to a dotplot function but this use different values for the coordinate in the y-axis making the data spreaded in a certain way. The colours are replicated from the original graph. Based on the visualisation, female participants are less than the male counterparts, but the group tend to outlast the other.
# generate visualisation to illustrate the resignation timings of survivalists
ggplot(data_p2, aes(x = days_lasted, y = gender)) +
# add graphical elements
geom_beeswarm(aes(colour = status), size = 4, alpha = .85, cex = 4) +
geom_point(data = survivalists_gender_summary, size = 4, shape = 22,
fill = "black", colour = "white") +
geom_text_repel(data = survivalists_gender_summary, aes(label = label),
size = 3, nudge_y = .35, nudge_x = -4, point.padding = 2,
segment.size = 0.65, segment.curvature = 0.1, segment.ncp = 3,
segment.angle = 30) +
geom_text(data = survivalists_days_lasted_max, aes(label = name),
size = 2, colour = "#444444", nudge_y = -.15) +
geom_vline(xintercept = days_lasted_average, linetype = "twodash") +
annotate("text", label = paste("avg:", round(days_lasted_average, 0), "days"),
x = 37, y = 0.25, size = 2.5, angle = 90, hjust = 0) +
# manually configure graphical elements
scale_color_manual(values = c("#2B4162", "#C3423F", "#9AADBF", "#FBB13C")) +
scale_y_discrete(labels = c(
paste0("**Female**<br/>(n=", sum(survivalists$gender == "Female"), ")"),
paste0("**Male**<br/>(n=", sum(survivalists$gender == "Male"), ")")
+
)) # add plot labels
labs(
title = "On average, female survivalists outlast their male counterparts",
subtitle = paste(
"Analysis of survivalists competing on the US reality TV series",
"**Alone**, across all seasons (1-9). <br/> Comparsion of days lasted",
"by gender. Although the show has yet to crown a female winner, <br/>on ",
"average as a group, female survivalists last 13 days longer than male",
"competitors."),
x = "Days Lasted",
y = NULL,
colour = "Survivalist Status:",
caption = "Source: {alone package}"
+
) # configure the theme
theme(
plot.title = element_markdown(face = "bold"),
plot.title.position = "plot",
plot.subtitle = element_markdown(),
plot.caption = element_markdown(hjust = 1),
plot.caption.position = "plot",
axis.text.y = element_markdown(hjust = 0),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "top",
)
Visualisation 3: original work
Lastly, I explored which items are correlated with the how many days participants survive. As there are some loaded items that are positively correlated to survival but only brought by a small number of survivalists, I put a focus on more frequent items. This was done by limiting only to the stuff brought by more than 5 persons as presented in the code below. I also interested which items are in the top three.
# to answer what could be the most valuable items
<- full_join(survivalists,
data_p3
loadouts, by = c("season", "name")) |>
group_by(item) |>
summarise(n = n(), days_lasted = mean(days_lasted)) |>
arrange(desc(days_lasted)) |>
filter(n >= 5) |>
mutate(top3 = c(rep(TRUE, 3), rep(FALSE, n() - 3)),
item = paste0("**", item, "**<br/>(n=", n, ")"),
item = forcats::fct_reorder(item, days_lasted))
I used a simple bar chart where the values are sorted based on frequency. The x-axis demonstrates the average surviving days of the participants. The top three items are tapping wire, frying pan, and paracord.
# create visualisations
ggplot(data_p3, aes(y = item, x = days_lasted, fill = top3)) +
# add graphical elements
geom_col(width = .5) +
geom_vline(xintercept = days_lasted_average, linetype = "twodash") +
annotate("text", label = paste("avg:", round(days_lasted_average, 0), "days"),
x = 39.5, y = 1, size = 2.5, hjust = 0) +
geom_text(aes(label = round(days_lasted, 1)),
size = 3, hjust = 0, nudge_x = .5) +
# manually configure graphical elements
scale_fill_manual(values = c("#876546", "#402F20")) +
# add plot labels
labs(
title = "Certain items frequently brought by survivalists who last longer",
subtitle = paste(
"For all common loadout items (n > 5), survivalists who carried tapping",
"wire, frying pan, or paracord <br/>tend to last longer."
), caption = "Source: {alone package}",
x = "Days lasted on average",
y = NULL,
parse = TRUE
+
) # configure the theme
theme(
plot.title = element_markdown(face = "bold"),
plot.title.position = "plot",
plot.subtitle = element_markdown(),
plot.background = element_rect(fill = "#CCE8E3", color = FALSE),
plot.margin = unit(c(5, 5, 5, 5), units = "mm"),
panel.background = element_rect(fill = "#CCE8E3", color = FALSE),
panel.grid = element_blank(),
axis.title.x = element_markdown(colour = "#333333", size = 9.5, face = "bold"),
axis.text.y = element_markdown(size = 7),
legend.position = "none",
)
Wrap up
TidyTuesday could provide an opportunity for R users to practice their skills in using tidyverse
package, especially in manipulating and visualising data. For this week, I reproduced two graphs and made an additional graph.