# load libraries
library(dplyr)
library(ggplot2)
library(patchwork)
library(ggtext)
library(showtext)
# set default theme
theme_set(theme_minimal(15))
# set default font
font_add_google("Montserrat", "montserrat")
showtext_auto()
# set url
<- paste('https://raw.githubusercontent.com',
.repoURL 'rfordatascience',
'tidytuesday',
'master',
'data',
'2023',
'2023-02-14',
sep = "/")
# download data
<- readr::read_csv(paste(.repoURL, 'age_gaps.csv', sep = "/")) age_gaps
Week 7 of #TidyTuesday
2023 brings the Holywood Age Gaps
dataset, containing data about actors who play as a couple in a holywood movie. The data can be accessed at the TidyTuesday Github repository or from the Holywood Age Gap website. This is my first complete original analysis and visualisation of #TidyTuesday
datasets as, previously, I put a focus on recreating fellow R users works. Specifically, this week, I combined multiple graphics, as subplots, into one where each provides different information, enriching the amount of information contained in the plot.
You can access the source code of this week analysis here. See also my other #TidyTuesday
projects in my personal website.
Environment setup
Several libraries are essential for this project: dplyr
for data manipulation, ggplot2
as the backbone of the data visualisation, patchwork
to combine several plots, ggtext
providing richer functionalities for plot text elements, and showtext
for custom fonts.
Besides, I created theme_custom
function to set behaviour of ggplot graphics generation with a set of predefined parameters, such as font colours or the absence of grid lines.
<- function(){
theme_custom theme(
plot.title.position = "plot",
plot.title = element_markdown(colour = "white"),
plot.background = element_rect(colour = NA, fill = "#040F0F"),
axis.text.y = element_markdown(colour = "white", lineheight = .7),
axis.text.x = element_markdown(colour = "white"),
panel.grid = element_blank(),
legend.title = element_text(face = "bold", colour = "white"),
legend.text = element_text(colour = "white")
) }
Data preparation
There was no special treatment for the first two subplots. However, the way I created the last subplot requires me to perform pivot_longer
to the original data. This resulted in each observation (row) represented every actor/actress, instead of movies. In addition, I also leveraged some helper variables containing the average values of actor ages, actress ages, and all ages.
# Transform the data into its longer form
<- age_gaps |>
couple_ref_tbl mutate(ID = paste(movie_name, couple_number),
ID = forcats::fct_reorder(ID, age_difference)) |>
select(ID, movie_name, release_year, director, age_difference)
<- bind_rows(
age_gaps_long |>
age_gaps mutate(ID = paste(movie_name, couple_number)) |>
select(ID, contains("1")) |>
rename("name" = actor_1_name,
"gender" = character_1_gender,
"birthdate" = actor_1_birthdate,
"age" = actor_1_age),
|>
age_gaps mutate(ID = paste(movie_name, couple_number)) |>
select(ID, contains("2")) |>
rename("name" = actor_2_name,
"gender" = character_2_gender,
"birthdate" = actor_2_birthdate,
"age" = actor_2_age)
|>
) left_join(couple_ref_tbl, by = "ID")
# Define helper vars
<- mean(filter(age_gaps_long, gender == "man")$age)
avg_age_man <- mean(filter(age_gaps_long, gender == "woman")$age)
avg_age_woman <- mean(age_gaps$age_difference) avg_age_diff
Visualisation
Subplot 1: Age Gaps throughout Time
The subplot is a type of scatterplot which draws points referencing to the age gap between two actors playing as a couple in a given year. A smoothed line in pink colour is the trend of the gaps. This implies that the age differences between actors are decreasing overtime.
|>
age_gaps ggplot(aes(x = release_year, y = age_difference)) +
geom_jitter(alpha = 0.5, size = 2.5, colour = "#8AA29E") +
geom_smooth(se = FALSE, colour = "#FF9E99", linewidth = 2, method = lm) +
scale_x_continuous(breaks = seq(1935, 2025, 15)) +
scale_y_continuous(breaks = seq(0, 50, 5)) +
labs(title = "Decreasing trend in age gaps throughout time",
x = NULL, y = NULL) +
theme_custom()
`geom_smooth()` using formula = 'y ~ x'
Subplot 2: Directors who like to cast actors with huge age gap
Each director might have different preferences in choosing the cast. I was curious that perhaps there are eliminated directors with less than 5 movies (based on the dataset, not necessarily throughout their actual career). From the data, it can be inferred that Alfred Hitchcock among the directors who are more likely to cast two actors with a big age gap as a couple in their movie.
|>
age_gaps group_by(director) |>
summarise(age_diff_avg = mean(age_difference),
n = n()) |>
filter(n >= 5) |>
mutate(director = paste0("**", director, "**"),
director = forcats::fct_reorder(director, age_diff_avg),
label = scales::number(age_diff_avg, accuracy = 1e-1)) |>
arrange(desc(age_diff_avg)) |>
head(10) |>
ggplot(aes(x = age_diff_avg, y = director)) +
geom_col(width = 0.8, fill = "#8AA29E") +
geom_vline(xintercept = avg_age_diff, colour = "#FF9E99", linewidth = 1.5) +
geom_text(aes(label = label, x = age_diff_avg + 0.3),
hjust = 0, colour = "white", size = 5) +
geom_richtext(aes(x = avg_age_diff + 0.5, y = 12,
label = paste0("Avg age difference<br>",
round(avg_age_diff, digits = 1),
" y/o")),
hjust = 0, vjust = 1.2, size = 5, label.colour = NA,
colour = "white", fill = NA, lineheight = .7) +
labs(title = "Directors (>= 5 movies) who cast two actors with huge age gaps",
x = NULL, y = NULL) +
theme_custom()
Warning in geom_richtext(aes(x = avg_age_diff + 0.5, y = 12, label = paste0("Avg age difference<br>", : All aesthetics have length 1, but the data has 10 rows.
ℹ Did you mean to use `annotate()`?
Subplot 3: Movies with big age difference between cast
The final subplot is about age difference between two actors playing as a couple in a movie. Harold Maude (1971) is the movie with the biggest age gap, according to the dataset.
|>
age_gaps_long mutate(movie = paste0("**", movie_name, "**<br>(", release_year, ")"),
movie = forcats::fct_reorder(movie, age_difference)) |>
arrange(desc(age_difference)) |>
head(10 * 2) |>
ggplot(aes(x = age, y = movie, group = ID)) +
geom_vline(xintercept = avg_age_man, linewidth = 3, alpha = 0.6,
colour = "#E5B19E") +
geom_vline(xintercept = avg_age_woman, linewidth = 3, alpha = 0.6,
colour = "#FFE699") +
geom_line(linewidth = 1.2, colour = "#FF9E99") +
geom_point(aes(colour = gender, shape = gender), size = 5, fill = "white",
stroke = 3) +
geom_text(aes(label = age), size = 5, fontface = "bold") +
geom_richtext(aes(x = avg_age_man + 0.5,
y = 11.5,
label = paste0("Avg Man<br>",
round(avg_age_man),
" y/o")),
hjust = 0, vjust = 1.2, size = 5, label.colour = NA,
colour = "white", fill = NA, lineheight = .7) +
geom_richtext(aes(x = avg_age_woman + 0.5,
y = 11.5,
label = paste0("Avg Woman<br>",
round(avg_age_woman),
" y/o")),
hjust = 0, vjust = 1.2, size = 5, label.colour = NA,
colour = "white", fill = NA, lineheight = .7) +
scale_colour_manual(values = c("#E5B19E", "#FFE699")) +
scale_shape_manual(values = c(21, 22)) +
scale_x_continuous(limits = c(0, NA), breaks = seq(0, 80, 10)) +
labs(title = "Movies with the biggest age gaps between couple casts",
colour = "Gender",
shape = "Gender",
x = NULL, y = NULL) +
theme_custom()
Compiling
The three subplots were combined to make one plot. Here is the result. I does not show the code to create the plot for simplification o this post, so you might want to see the full code here.
`geom_smooth()` using formula = 'y ~ x'
Warning in geom_richtext(aes(x = avg_age_diff + 0.5, y = 12, label = paste0("Avg age difference<br>", : All aesthetics have length 1, but the data has 10 rows.
ℹ Did you mean to use `annotate()`?
Wrap up
The week 7 dataset of #TidyTuesday
2023 provides me the opportunity to work on my own. Based on this experiment, I learned that to make a detailed visualisation while keeping in mind the relevant information for the plot is an extraordinary task.