Water Quality at Sydney Beaches

Lollipop Plot
R
Author

Steven Villalon

Published

May 20, 2025

#Load dependencies
library(tidyverse)
library(tidytuesdayR)

Load Data

# Load data
tuesdata <- tidytuesdayR::tt_load(2025, week = 20)
water_quality <- tuesdata$water_quality
weather <- tuesdata$weather
rm(tuesdata)

#View(water_quality)
#View(weather)

Cleaning

The variable I am most interested in is enterococci level since conductivity has a lot of nulls.

# Get summary stats
summary(water_quality)
    region            council           swim_site              date           
 Length:123530      Length:123530      Length:123530      Min.   :1991-09-19  
 Class :character   Class :character   Class :character   1st Qu.:1999-09-28  
 Mode  :character   Mode  :character   Mode  :character   Median :2007-08-03  
                                                          Mean   :2008-03-08  
                                                          3rd Qu.:2016-08-26  
                                                          Max.   :2025-04-28  
                                                                              
     time          enterococci_cfu_100ml water_temperature_c conductivity_ms_cm
 Length:123530     Min.   :      0.0     Min.   :   0.00     Min.   :     0    
 Class1:hms        1st Qu.:      0.0     1st Qu.:  17.00     1st Qu.: 52600    
 Class2:difftime   Median :      4.0     Median :  20.00     Median : 53800    
 Mode  :numeric    Mean   :    116.8     Mean   :  19.82     Mean   : 51560    
                   3rd Qu.:     19.0     3rd Qu.:  22.00     3rd Qu.: 54300    
                   Max.   :1100000.0     Max.   :1040.00     Max.   :710400    
                   NA's   :307           NA's   :75039       NA's   :78536     
    latitude        longitude    
 Min.   :-34.07   Min.   :150.2  
 1st Qu.:-33.91   1st Qu.:151.2  
 Median :-33.83   Median :151.3  
 Mean   :-33.84   Mean   :151.2  
 3rd Qu.:-33.77   3rd Qu.:151.3  
 Max.   :-33.60   Max.   :151.3  
                                 
summary(weather)
      date              max_temp_C     min_temp_C    precipitation_mm 
 Min.   :1991-01-01   Min.   : 9.8   Min.   : 0.90   Min.   :  0.000  
 1st Qu.:1999-08-01   1st Qu.:17.7   1st Qu.:10.70   1st Qu.:  0.000  
 Median :2008-02-29   Median :21.0   Median :14.40   Median :  0.100  
 Mean   :2008-02-29   Mean   :21.2   Mean   :14.31   Mean   :  2.038  
 3rd Qu.:2016-09-28   3rd Qu.:24.2   3rd Qu.:18.00   3rd Qu.:  1.200  
 Max.   :2025-04-29   Max.   :41.8   Max.   :27.00   Max.   :134.900  
    latitude        longitude    
 Min.   :-33.85   Min.   :151.2  
 1st Qu.:-33.85   1st Qu.:151.2  
 Median :-33.85   Median :151.2  
 Mean   :-33.85   Mean   :151.2  
 3rd Qu.:-33.85   3rd Qu.:151.2  
 Max.   :-33.85   Max.   :151.2  
# Value counts
table(water_quality$region)

Northern Sydney Southern Sydney     Sydney City  Sydney Harbour  Western Sydney 
          43430           18770           18606           41770             954 
table(water_quality$council)

                    Blue Mountains City Council 
                                            656 
                     City of Canada Bay Council 
                                           2663 
                        Hawkesbury City Council 
                                            247 
                             Inner West Council 
                                           1805 
                              Lane Cove Council 
                                           4594 
                       Mosman Municipal Council 
                                           6779 
                           North Sydney Council 
                                           1538 
                       Northern Beaches Council 
                                          54569 
                           Penrith City Council 
                                             51 
                          Randwick City Council 
                                          12137 
                       Sutherland Shire Council 
                                          18770 
                             The City of Sydney 
                                           1426 
The Council of the Municipality of Hunters Hill 
                                           2350 
                               Waverley Council 
                                           6469 
                        Willoughby City Council 
                                           1553 
                    Woollahra Municipal Council 
                                           7923 
table(water_quality$swim_site)

                     Avalon Beach                    Balmoral Baths 
                             2064                              1552 
                 Bilarong Reserve                     Bilgola Beach 
                              643                              2080 
                     Boat Harbour                       Bondi Beach 
                             2495                              2128 
                     Bronte Beach                      Bungan Beach 
                             2200                              2080 
                   Cabarita Beach               Callan Park Seawall 
                             1405                               257 
                        Camp Cove                   Chinamans Beach 
                              398                              1258 
                   Chiswick Baths                   Clifton Gardens 
                             1258                              1499 
                    Clontarf Pool                    Clovelly Beach 
                             1554                              2151 
                   Collaroy Beach                      Coogee Beach 
                             2078                              2207 
                  Darling Harbour                  Davidson Reserve 
                             1426                              1492 
                 Dawn Fraser Pool                     Dee Why Beach 
                             1548                              2077 
                    Edwards Beach                     Elouera Beach 
                             1502                              2508 
                  Fairlight Beach                Forty Baskets Pool 
                             1378                              1503 
                 Freshwater Beach                Gordons Bay (East) 
                             2058                               698 
                 Greenhills Beach                   Greenwich Baths 
                             2485                              1558 
            Gurney Crescent Baths                Hayes Street Beach 
                             1373                              1538 
Henley Baths (Kelly Street Baths)                  Little Bay Beach 
                              803                              1333 
                Little Manly Cove                Little Sirius Cove 
                             1501                               968 
                  Long Reef Beach                     Malabar Beach 
                             2048                              2142 
                       Manly Cove                    Maroubra Beach 
                             1551                              2135 
                   Megalong Creek                   Mona Vale Beach 
                              172                              2063 
                 Murray Rose Pool  Narrabeen Lagoon (Birdwood Park) 
                             1500                              1152 
                    Newport Beach                      Nielsen Park 
                             2081                              1501 
             North Cronulla Beach             North Curl Curl Beach 
                             2500                              2146 
            North Narrabeen Beach                North Steyne Beach 
                             2076                              2146 
                Northbridge Baths                    Oak Park Beach 
                             1553                              1886 
                       Palm Beach                       Parsley Bay 
                             2065                              1501 
                    Penrith Beach                 Queenscliff Beach 
                               51                              2212 
                   Rose Bay Beach                    Sangrado Baths 
                             1522                               787 
             Shelly Beach (Manly)         Shelly Beach (Sutherland) 
                             2142                              1884 
             South Cronulla Beach             South Curl Curl Beach 
                             2505                              2059 
             South Maroubra Beach           South Maroubra Rockpool 
                              722                               749 
               South Steyne Beach                    Tamarama Beach 
                             2186                              2141 
                   Tambourine Bay                   Turimetta Beach 
                             1499                              1815 
                      Wanda Beach                  Warriewood Beach 
                             2507                              2077 
                      Watsons Bay      Wentworth Falls Lake - Beach 
                             1501                               183 
     Wentworth Falls Lake - Jetty                       Whale Beach 
                              191                              2082 
                    Windsor Beach                      Woodford Bay 
                              124                              1537 
                   Woolwich Baths                Yarramundi Reserve 
                             1547                               123 
 Yosemite Creek - Minnehaha Falls 
                              110 
# Set locations as factors
water_quality <- water_quality  |> 
  mutate(
    region = as.factor(region),
    swim_site = as.factor(swim_site),
    council = as.factor(council)
  )

# Sort levels
region_levels <- sort(unique(water_quality$region))
water_quality$region <- factor(water_quality$region, levels = region_levels)

The variable I am most interested in is enterococci level since conductivity has a lot of nulls.

Outliers

# High enterococci
head(water_quality[water_quality$enterococci_cfu_100ml > 500, ], 10)
# High water temp
head(water_quality[water_quality$water_temperature_c > 100, ], 10)

These last few code lines indicate that there are quite a few NAs in the dataset.

# Remove NAs in enterococci
water_quality_clean <- water_quality |> 
  filter(!is.na(enterococci_cfu_100ml))
water_quality_clean

The summary table does not indicate there were any unusual outliers in the weather dataset.

Joining the weather dataset to the clean water quality dataset.

# Join datasets
df <- left_join(water_quality_clean, weather, by = "date")

# Get column names
#colnames(df)

start_date <- as.Date("2015-01-01")
end_date <- as.Date("2024-12-31") 

# Rename and Remove duplicate columns
# Remove outliers on water temperature (under 0 is freezing, over 100 is boiling)
# Limit data to last 10 years
df <- df |> 
  select("region", "council", "swim_site", "date", "time", "enterococci_cfu_100ml", "water_temperature_c", "conductivity_ms_cm", "latitude.x","longitude.x", "max_temp_C", "min_temp_C", "precipitation_mm") |> 
  rename(latitude = latitude.x, longitude = longitude.x) |> 
  filter(between(water_temperature_c, 0, 100),
         between(date, start_date, end_date))

head(df)

Explore Data

# Boxplot by region of enterococci
ggplot(data = df, aes(x = region, y = enterococci_cfu_100ml)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  labs(title = "Enterococci")

# Boxplot by region of water temp
ggplot(data = df, aes(x = region, y = water_temperature_c)) +
  geom_boxplot() +
  labs(title = "Water Temperature")

The median water temp for Sydney harbor and Western Sydney was slightly higher across all observations. Western Sydney also had a wider range.

Let’s look at some scatterplots

# Scatterplot of water_temperature and enterococci
ggplot(data = df, aes(x = water_temperature_c, y = enterococci_cfu_100ml)) +
  geom_point() +
  labs(title = "Water Temperature vs Enterococci Level")

# Scatterplot of precipitation and enterococci
ggplot(data = df, aes(x = precipitation_mm, y = enterococci_cfu_100ml)) +
  geom_point() +
  labs(title = "Precipitation vs Enterococci Level")

# Scatterplot of conductivity and enterococci
ggplot(data = df, aes(x = conductivity_ms_cm, y = enterococci_cfu_100ml)) +
  geom_point() +
  labs(title = "Conductivity vs Enterococci Level")
Warning: Removed 330 rows containing missing values or values outside the scale range
(`geom_point()`).

Surprised that precipitation level does not have a strong relationship with enterococci level.

# Time series plot
ggplot(df, aes(x = date, y = enterococci_cfu_100ml, color = region))+
  geom_line()

# Add floor date columns
df <- df |> 
  mutate(month = floor_date(date, "month"),
         year = floor_date(date, "year"),
         quarter = floor_date(date, "quarter"))
# Time series by month
mean_enterococci_by_month <- df |> 
  group_by(region, month) |> 
  summarize(avg_enterococci = mean(enterococci_cfu_100ml), .groups = "drop") |> 
  ungroup()
mean_enterococci_by_month
# Time series by quarter
mean_enterococci_by_quarter <- df |> 
  group_by(region, quarter) |> 
  summarize(avg_enterococci = mean(enterococci_cfu_100ml), .groups = "drop") |> 
  ungroup()
mean_enterococci_by_quarter
# Time series by year
mean_enterococci_by_year <- df |> 
  group_by(region, year) |> 
  summarize(avg_enterococci = mean(enterococci_cfu_100ml), .groups = "drop") |> 
  ungroup()
mean_enterococci_by_year
# Plot time series by year
ggplot(mean_enterococci_by_year, aes(x = year, y = avg_enterococci, color = region))+
  geom_line()

For the most part, enterococci levels are stable, although Western Sydney has been increasing. As expected, Western Sydney and Sydney Harbor have the highest average levels as these areas do not have clear access to the Pacific Ocean, causing water to stagnate after heavy rains.

Modeling

# Linear model
model1 <- lm(df$enterococci_cfu_100ml ~ df$water_temperature_c + df$conductivity_ms_cm + df$max_temp_C + df$precipitation_mm)
summary(model1)

Call:
lm(formula = df$enterococci_cfu_100ml ~ df$water_temperature_c + 
    df$conductivity_ms_cm + df$max_temp_C + df$precipitation_mm)

Residuals:
   Min     1Q Median     3Q    Max 
  -517    -33    -20     -9  38877 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             3.900e+02  2.004e+01  19.462  < 2e-16 ***
df$water_temperature_c  2.365e+00  1.091e+00   2.168 0.030195 *  
df$conductivity_ms_cm  -6.763e-03  2.173e-04 -31.119  < 2e-16 ***
df$max_temp_C          -2.346e+00  6.330e-01  -3.706 0.000211 ***
df$precipitation_mm     4.129e+00  3.611e-01  11.433  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 408.9 on 33890 degrees of freedom
  (330 observations deleted due to missingness)
Multiple R-squared:  0.03379,   Adjusted R-squared:  0.03367 
F-statistic: 296.3 on 4 and 33890 DF,  p-value: < 2.2e-16

Model does a very poor job of explaining the variation in enterococci levels.

I’m going to further refine my df to look just at the variable of interest and keep the region columns. I would like to see which sites had the most readings above 70, which is the threshold for dangerous levels of enteroccoci in the water.

# Remove unused columns
df2 <- df |> select(region, swim_site, date, enterococci_cfu_100ml)
summary(df2)
             region                     swim_site          date           
 Northern Sydney:12538   Malabar Beach       :  584   Min.   :2015-01-05  
 Southern Sydney: 4544   Coogee Beach        :  581   1st Qu.:2017-07-11  
 Sydney City    : 6318   South Maroubra Beach:  580   Median :2019-12-23  
 Sydney Harbour :10160   Mona Vale Beach     :  578   Mean   :2020-01-16  
 Western Sydney :  665   Bronte Beach        :  576   3rd Qu.:2022-07-29  
                         South Cronulla Beach:  576   Max.   :2024-12-31  
                         (Other)             :30750                       
 enterococci_cfu_100ml
 Min.   :    0.00     
 1st Qu.:    0.00     
 Median :    3.00     
 Mean   :   46.89     
 3rd Qu.:   14.00     
 Max.   :39000.00     
                      
# Compute % of readings that were above 70 and remove beaches with <120 readings
min_readings <- 120

rankings <- df2 |> 
  group_by(region, swim_site) |> 
  summarize(
    readings_over_70 = sum(enterococci_cfu_100ml > 70, na.rm = TRUE),
    readings = n(),
    pct = round(sum(enterococci_cfu_100ml > 70, na.rm = TRUE) / n(), 4),
    .groups = "drop"
  ) |> 
  filter(readings >= min_readings) |> 
  mutate(rank = rank(pct, ties.method = "min")) |> 
  arrange(desc(rank)) |> 
  ungroup()

rankings
# Define "best" and "worst" groups (top/bottom 10)
worst <- rankings |> filter(rank >= max(rank) - 10 & rank <= max(rank))
best  <- rankings |> filter(rank >= 1 & rank <= 10)

Visualization

This plot uses patchwork to put two lollipop plots side by side. The theme section below controls many of the finer details of the plot.

library(patchwork)
library(scales)

Attaching package: 'scales'
The following object is masked from 'package:purrr':

    discard
The following object is masked from 'package:readr':

    col_factor
library(showtext)
Loading required package: sysfonts
Loading required package: showtextdb
# Load Lato font
font_add_google("Lato", "lato")
showtext_auto()

# Set x axis limits
xlim_min <- 0
xlim_max <- 0.45


# Left Plot (Bottom 10)
plot_worst <- ggplot(worst, aes(x = pct, y = reorder(swim_site, pct), color = region)) +
  geom_segment(aes(x = 0, xend = pct, y = reorder(swim_site, pct), yend = reorder(swim_site, pct)), linewidth = 1.5) +
  geom_point(size = 10) +
  geom_text(aes(x = pct, y = reorder(swim_site, pct), label = percent(pct, accuracy = 1)),
            color = "white", fontface = "bold", size = 3.5, vjust = 0.5, hjust = 0.5) +
  geom_text(aes(label = paste0(readings_over_70, " / ", readings)),
            hjust = -0.4, vjust = 0.5, size = 3.5, color = "black") +
  labs(
    title = "Bottom 10",
    x = NULL,
    y = "Swim Site",
    color = NULL
  ) +
  scale_x_continuous(labels = percent_format(accuracy = 1), limits = c(xlim_min, xlim_max)) + theme_minimal()


# Right Plot (Top 10)
plot_best <- ggplot(best, aes(x = pct, y = reorder(swim_site, -pct), color = region)) +
  geom_segment(aes(x = 0, xend = pct, y = reorder(swim_site, -pct), yend = reorder(swim_site, pct)), linewidth = 1.5) +
  geom_point(size = 10) +
  geom_text(aes(x = pct, y = reorder(swim_site, pct), label = percent(pct, accuracy = 1)),
            color = "white", fontface = "bold", size = 3.5, vjust = 0.5, hjust = 0.5) +
  geom_text(aes(label = paste0(readings_over_70, " / ", readings)),
            hjust = -0.4, vjust = 0.5, size = 3.5, color = "black") +
  labs(
    title = "Top 10",
    x = NULL,
    y = NULL,
    color = NULL
  ) +
  scale_x_continuous(labels = percent_format(accuracy = 1), limits = c(xlim_min, xlim_max)) + theme_minimal() +
  theme(legend.position = "none")


# Combine plots with patchwork
plot_worst + plot_best +
  plot_layout(guides = "collect") &
  theme(text = element_text(family = "lato"),
        plot.caption = element_text(hjust = 0),
        plot.title = element_text(size = 16, face = "bold"),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        legend.position = "top",
        legend.justification = "left",
        legend.box.just = "left",
        legend.margin = margin(t = 10, b = 10),
        axis.text.y = element_text(size = 11)
) & plot_annotation(
  title = "Water Quality at Sydney Beaches Over the Past 10 Years (2015-2024)",
  subtitle = "Water contamination is determined by assessing Enterococci bacteria levels, which are measured in colony forming units (CFUs) per 100 ml of\nwater. 70 CFUs is considered the threshold for unsafe bacteria levels. The percentages below represent how many observations over the past 10 \nyears were over 70 by location. Measurements were taken weekly for most beaches.\n",
  caption = "\n\nChart produced by Steven Villalon for Tidy Tuesday exercise on May 20, 2025"
)

# Filter data
#water_quality[water_quality$swim_site == "Shelly Beach (Sutherland)", ]
Back to top