#Load dependencies
library(tidyverse)
library(tidytuesdayR)
Water Quality at Sydney Beaches
Load Data
# Load data
<- tidytuesdayR::tt_load(2025, week = 20)
tuesdata <- tuesdata$water_quality
water_quality <- tuesdata$weather
weather rm(tuesdata)
#View(water_quality)
#View(weather)
Cleaning
The variable I am most interested in is enterococci level since conductivity has a lot of nulls.
# Get summary stats
summary(water_quality)
region council swim_site date
Length:123530 Length:123530 Length:123530 Min. :1991-09-19
Class :character Class :character Class :character 1st Qu.:1999-09-28
Mode :character Mode :character Mode :character Median :2007-08-03
Mean :2008-03-08
3rd Qu.:2016-08-26
Max. :2025-04-28
time enterococci_cfu_100ml water_temperature_c conductivity_ms_cm
Length:123530 Min. : 0.0 Min. : 0.00 Min. : 0
Class1:hms 1st Qu.: 0.0 1st Qu.: 17.00 1st Qu.: 52600
Class2:difftime Median : 4.0 Median : 20.00 Median : 53800
Mode :numeric Mean : 116.8 Mean : 19.82 Mean : 51560
3rd Qu.: 19.0 3rd Qu.: 22.00 3rd Qu.: 54300
Max. :1100000.0 Max. :1040.00 Max. :710400
NA's :307 NA's :75039 NA's :78536
latitude longitude
Min. :-34.07 Min. :150.2
1st Qu.:-33.91 1st Qu.:151.2
Median :-33.83 Median :151.3
Mean :-33.84 Mean :151.2
3rd Qu.:-33.77 3rd Qu.:151.3
Max. :-33.60 Max. :151.3
summary(weather)
date max_temp_C min_temp_C precipitation_mm
Min. :1991-01-01 Min. : 9.8 Min. : 0.90 Min. : 0.000
1st Qu.:1999-08-01 1st Qu.:17.7 1st Qu.:10.70 1st Qu.: 0.000
Median :2008-02-29 Median :21.0 Median :14.40 Median : 0.100
Mean :2008-02-29 Mean :21.2 Mean :14.31 Mean : 2.038
3rd Qu.:2016-09-28 3rd Qu.:24.2 3rd Qu.:18.00 3rd Qu.: 1.200
Max. :2025-04-29 Max. :41.8 Max. :27.00 Max. :134.900
latitude longitude
Min. :-33.85 Min. :151.2
1st Qu.:-33.85 1st Qu.:151.2
Median :-33.85 Median :151.2
Mean :-33.85 Mean :151.2
3rd Qu.:-33.85 3rd Qu.:151.2
Max. :-33.85 Max. :151.2
# Value counts
table(water_quality$region)
Northern Sydney Southern Sydney Sydney City Sydney Harbour Western Sydney
43430 18770 18606 41770 954
table(water_quality$council)
Blue Mountains City Council
656
City of Canada Bay Council
2663
Hawkesbury City Council
247
Inner West Council
1805
Lane Cove Council
4594
Mosman Municipal Council
6779
North Sydney Council
1538
Northern Beaches Council
54569
Penrith City Council
51
Randwick City Council
12137
Sutherland Shire Council
18770
The City of Sydney
1426
The Council of the Municipality of Hunters Hill
2350
Waverley Council
6469
Willoughby City Council
1553
Woollahra Municipal Council
7923
table(water_quality$swim_site)
Avalon Beach Balmoral Baths
2064 1552
Bilarong Reserve Bilgola Beach
643 2080
Boat Harbour Bondi Beach
2495 2128
Bronte Beach Bungan Beach
2200 2080
Cabarita Beach Callan Park Seawall
1405 257
Camp Cove Chinamans Beach
398 1258
Chiswick Baths Clifton Gardens
1258 1499
Clontarf Pool Clovelly Beach
1554 2151
Collaroy Beach Coogee Beach
2078 2207
Darling Harbour Davidson Reserve
1426 1492
Dawn Fraser Pool Dee Why Beach
1548 2077
Edwards Beach Elouera Beach
1502 2508
Fairlight Beach Forty Baskets Pool
1378 1503
Freshwater Beach Gordons Bay (East)
2058 698
Greenhills Beach Greenwich Baths
2485 1558
Gurney Crescent Baths Hayes Street Beach
1373 1538
Henley Baths (Kelly Street Baths) Little Bay Beach
803 1333
Little Manly Cove Little Sirius Cove
1501 968
Long Reef Beach Malabar Beach
2048 2142
Manly Cove Maroubra Beach
1551 2135
Megalong Creek Mona Vale Beach
172 2063
Murray Rose Pool Narrabeen Lagoon (Birdwood Park)
1500 1152
Newport Beach Nielsen Park
2081 1501
North Cronulla Beach North Curl Curl Beach
2500 2146
North Narrabeen Beach North Steyne Beach
2076 2146
Northbridge Baths Oak Park Beach
1553 1886
Palm Beach Parsley Bay
2065 1501
Penrith Beach Queenscliff Beach
51 2212
Rose Bay Beach Sangrado Baths
1522 787
Shelly Beach (Manly) Shelly Beach (Sutherland)
2142 1884
South Cronulla Beach South Curl Curl Beach
2505 2059
South Maroubra Beach South Maroubra Rockpool
722 749
South Steyne Beach Tamarama Beach
2186 2141
Tambourine Bay Turimetta Beach
1499 1815
Wanda Beach Warriewood Beach
2507 2077
Watsons Bay Wentworth Falls Lake - Beach
1501 183
Wentworth Falls Lake - Jetty Whale Beach
191 2082
Windsor Beach Woodford Bay
124 1537
Woolwich Baths Yarramundi Reserve
1547 123
Yosemite Creek - Minnehaha Falls
110
# Set locations as factors
<- water_quality |>
water_quality mutate(
region = as.factor(region),
swim_site = as.factor(swim_site),
council = as.factor(council)
)
# Sort levels
<- sort(unique(water_quality$region))
region_levels $region <- factor(water_quality$region, levels = region_levels) water_quality
The variable I am most interested in is enterococci level since conductivity has a lot of nulls.
Outliers
# High enterococci
head(water_quality[water_quality$enterococci_cfu_100ml > 500, ], 10)
# High water temp
head(water_quality[water_quality$water_temperature_c > 100, ], 10)
These last few code lines indicate that there are quite a few NAs in the dataset.
# Remove NAs in enterococci
<- water_quality |>
water_quality_clean filter(!is.na(enterococci_cfu_100ml))
water_quality_clean
The summary table does not indicate there were any unusual outliers in the weather dataset.
Joining the weather dataset to the clean water quality dataset.
# Join datasets
<- left_join(water_quality_clean, weather, by = "date")
df
# Get column names
#colnames(df)
<- as.Date("2015-01-01")
start_date <- as.Date("2024-12-31")
end_date
# Rename and Remove duplicate columns
# Remove outliers on water temperature (under 0 is freezing, over 100 is boiling)
# Limit data to last 10 years
<- df |>
df select("region", "council", "swim_site", "date", "time", "enterococci_cfu_100ml", "water_temperature_c", "conductivity_ms_cm", "latitude.x","longitude.x", "max_temp_C", "min_temp_C", "precipitation_mm") |>
rename(latitude = latitude.x, longitude = longitude.x) |>
filter(between(water_temperature_c, 0, 100),
between(date, start_date, end_date))
head(df)
Explore Data
# Boxplot by region of enterococci
ggplot(data = df, aes(x = region, y = enterococci_cfu_100ml)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.5) +
labs(title = "Enterococci")
# Boxplot by region of water temp
ggplot(data = df, aes(x = region, y = water_temperature_c)) +
geom_boxplot() +
labs(title = "Water Temperature")
The median water temp for Sydney harbor and Western Sydney was slightly higher across all observations. Western Sydney also had a wider range.
Let’s look at some scatterplots
# Scatterplot of water_temperature and enterococci
ggplot(data = df, aes(x = water_temperature_c, y = enterococci_cfu_100ml)) +
geom_point() +
labs(title = "Water Temperature vs Enterococci Level")
# Scatterplot of precipitation and enterococci
ggplot(data = df, aes(x = precipitation_mm, y = enterococci_cfu_100ml)) +
geom_point() +
labs(title = "Precipitation vs Enterococci Level")
# Scatterplot of conductivity and enterococci
ggplot(data = df, aes(x = conductivity_ms_cm, y = enterococci_cfu_100ml)) +
geom_point() +
labs(title = "Conductivity vs Enterococci Level")
Warning: Removed 330 rows containing missing values or values outside the scale range
(`geom_point()`).
Surprised that precipitation level does not have a strong relationship with enterococci level.
# Time series plot
ggplot(df, aes(x = date, y = enterococci_cfu_100ml, color = region))+
geom_line()
# Add floor date columns
<- df |>
df mutate(month = floor_date(date, "month"),
year = floor_date(date, "year"),
quarter = floor_date(date, "quarter"))
# Time series by month
<- df |>
mean_enterococci_by_month group_by(region, month) |>
summarize(avg_enterococci = mean(enterococci_cfu_100ml), .groups = "drop") |>
ungroup()
mean_enterococci_by_month
# Time series by quarter
<- df |>
mean_enterococci_by_quarter group_by(region, quarter) |>
summarize(avg_enterococci = mean(enterococci_cfu_100ml), .groups = "drop") |>
ungroup()
mean_enterococci_by_quarter
# Time series by year
<- df |>
mean_enterococci_by_year group_by(region, year) |>
summarize(avg_enterococci = mean(enterococci_cfu_100ml), .groups = "drop") |>
ungroup()
mean_enterococci_by_year
# Plot time series by year
ggplot(mean_enterococci_by_year, aes(x = year, y = avg_enterococci, color = region))+
geom_line()
For the most part, enterococci levels are stable, although Western Sydney has been increasing. As expected, Western Sydney and Sydney Harbor have the highest average levels as these areas do not have clear access to the Pacific Ocean, causing water to stagnate after heavy rains.
Modeling
# Linear model
<- lm(df$enterococci_cfu_100ml ~ df$water_temperature_c + df$conductivity_ms_cm + df$max_temp_C + df$precipitation_mm)
model1 summary(model1)
Call:
lm(formula = df$enterococci_cfu_100ml ~ df$water_temperature_c +
df$conductivity_ms_cm + df$max_temp_C + df$precipitation_mm)
Residuals:
Min 1Q Median 3Q Max
-517 -33 -20 -9 38877
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.900e+02 2.004e+01 19.462 < 2e-16 ***
df$water_temperature_c 2.365e+00 1.091e+00 2.168 0.030195 *
df$conductivity_ms_cm -6.763e-03 2.173e-04 -31.119 < 2e-16 ***
df$max_temp_C -2.346e+00 6.330e-01 -3.706 0.000211 ***
df$precipitation_mm 4.129e+00 3.611e-01 11.433 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 408.9 on 33890 degrees of freedom
(330 observations deleted due to missingness)
Multiple R-squared: 0.03379, Adjusted R-squared: 0.03367
F-statistic: 296.3 on 4 and 33890 DF, p-value: < 2.2e-16
Model does a very poor job of explaining the variation in enterococci levels.
I’m going to further refine my df to look just at the variable of interest and keep the region columns. I would like to see which sites had the most readings above 70, which is the threshold for dangerous levels of enteroccoci in the water.
# Remove unused columns
<- df |> select(region, swim_site, date, enterococci_cfu_100ml)
df2 summary(df2)
region swim_site date
Northern Sydney:12538 Malabar Beach : 584 Min. :2015-01-05
Southern Sydney: 4544 Coogee Beach : 581 1st Qu.:2017-07-11
Sydney City : 6318 South Maroubra Beach: 580 Median :2019-12-23
Sydney Harbour :10160 Mona Vale Beach : 578 Mean :2020-01-16
Western Sydney : 665 Bronte Beach : 576 3rd Qu.:2022-07-29
South Cronulla Beach: 576 Max. :2024-12-31
(Other) :30750
enterococci_cfu_100ml
Min. : 0.00
1st Qu.: 0.00
Median : 3.00
Mean : 46.89
3rd Qu.: 14.00
Max. :39000.00
# Compute % of readings that were above 70 and remove beaches with <120 readings
<- 120
min_readings
<- df2 |>
rankings group_by(region, swim_site) |>
summarize(
readings_over_70 = sum(enterococci_cfu_100ml > 70, na.rm = TRUE),
readings = n(),
pct = round(sum(enterococci_cfu_100ml > 70, na.rm = TRUE) / n(), 4),
.groups = "drop"
|>
) filter(readings >= min_readings) |>
mutate(rank = rank(pct, ties.method = "min")) |>
arrange(desc(rank)) |>
ungroup()
rankings
# Define "best" and "worst" groups (top/bottom 10)
<- rankings |> filter(rank >= max(rank) - 10 & rank <= max(rank))
worst <- rankings |> filter(rank >= 1 & rank <= 10) best
Visualization
This plot uses patchwork to put two lollipop plots side by side. The theme section below controls many of the finer details of the plot.
library(patchwork)
library(scales)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
library(showtext)
Loading required package: sysfonts
Loading required package: showtextdb
# Load Lato font
font_add_google("Lato", "lato")
showtext_auto()
# Set x axis limits
<- 0
xlim_min <- 0.45
xlim_max
# Left Plot (Bottom 10)
<- ggplot(worst, aes(x = pct, y = reorder(swim_site, pct), color = region)) +
plot_worst geom_segment(aes(x = 0, xend = pct, y = reorder(swim_site, pct), yend = reorder(swim_site, pct)), linewidth = 1.5) +
geom_point(size = 10) +
geom_text(aes(x = pct, y = reorder(swim_site, pct), label = percent(pct, accuracy = 1)),
color = "white", fontface = "bold", size = 3.5, vjust = 0.5, hjust = 0.5) +
geom_text(aes(label = paste0(readings_over_70, " / ", readings)),
hjust = -0.4, vjust = 0.5, size = 3.5, color = "black") +
labs(
title = "Bottom 10",
x = NULL,
y = "Swim Site",
color = NULL
+
) scale_x_continuous(labels = percent_format(accuracy = 1), limits = c(xlim_min, xlim_max)) + theme_minimal()
# Right Plot (Top 10)
<- ggplot(best, aes(x = pct, y = reorder(swim_site, -pct), color = region)) +
plot_best geom_segment(aes(x = 0, xend = pct, y = reorder(swim_site, -pct), yend = reorder(swim_site, pct)), linewidth = 1.5) +
geom_point(size = 10) +
geom_text(aes(x = pct, y = reorder(swim_site, pct), label = percent(pct, accuracy = 1)),
color = "white", fontface = "bold", size = 3.5, vjust = 0.5, hjust = 0.5) +
geom_text(aes(label = paste0(readings_over_70, " / ", readings)),
hjust = -0.4, vjust = 0.5, size = 3.5, color = "black") +
labs(
title = "Top 10",
x = NULL,
y = NULL,
color = NULL
+
) scale_x_continuous(labels = percent_format(accuracy = 1), limits = c(xlim_min, xlim_max)) + theme_minimal() +
theme(legend.position = "none")
# Combine plots with patchwork
+ plot_best +
plot_worst plot_layout(guides = "collect") &
theme(text = element_text(family = "lato"),
plot.caption = element_text(hjust = 0),
plot.title = element_text(size = 16, face = "bold"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "top",
legend.justification = "left",
legend.box.just = "left",
legend.margin = margin(t = 10, b = 10),
axis.text.y = element_text(size = 11)
& plot_annotation(
) title = "Water Quality at Sydney Beaches Over the Past 10 Years (2015-2024)",
subtitle = "Water contamination is determined by assessing Enterococci bacteria levels, which are measured in colony forming units (CFUs) per 100 ml of\nwater. 70 CFUs is considered the threshold for unsafe bacteria levels. The percentages below represent how many observations over the past 10 \nyears were over 70 by location. Measurements were taken weekly for most beaches.\n",
caption = "\n\nChart produced by Steven Villalon for Tidy Tuesday exercise on May 20, 2025"
)
# Filter data
#water_quality[water_quality$swim_site == "Shelly Beach (Sutherland)", ]