# ── STEP 0: Install packages (only needed once on a new computer) ──
install.packages("tidyverse")   # installs readr, tibble, dplyr, tidyr, ggplot2 and more
install.packages("broom")       # tidy regression summaries

# ─────────────────────────────────────────────────────────
# IUU FISHING LOSS ESTIMATION — SESSION 1
# Using tibble() from the tidyverse
# ─────────────────────────────────────────────────────────

library(tidyverse)
library(broom)

rm(list=ls())

fish_data <- tibble(
  year            = rep(2018:2022, each = 4),
  province        = rep(c("Province A", "Province A", "Province B", "Province B"), 5),
  vessel_type     = rep(c("Small-scale", "Commercial"), 10),
  
  # Reported catch (metric tonnes) — what was officially declared
  reported_catch  = c(1200, 4500, 980,  3200,
                      1150, 4800, 1020, 3400,
                      1050, 4200, 890,  2900,
                      1300, 5100, 1100, 3600,
                      1400, 5400, 1200, 3900),
  
  # Estimated actual catch — from independent survey or model
  actual_catch    = c(1680, 6750, 1274, 5120,
                      1725, 7200, 1428, 5440,
                      1575, 6720, 1246, 4640,
                      1950, 7650, 1540, 5760,
                      2100, 8100, 1680, 6240),
  
  # Market price (USD per metric tonne)
  price_per_tonne = c(1200, 950,  1100, 900,
                      1250, 970,  1150, 920,
                      1300, 1000, 1200, 950,
                      1350, 1020, 1250, 970,
                      1400, 1050, 1300, 1000),
  
  # Patrol days per year per province
  patrol_days     = c(45, 45, 30, 30,
                      50, 50, 28, 28,
                      20, 20, 18, 18,
                      35, 35, 32, 32,
                      55, 55, 40, 40)
)

# Print the tibble — notice the compact, clean display with column types shown
fish_data

# read_csv() from the readr package (loaded with tidyverse)
fish_data <- read_csv("iuu_fishing_data.csv")

# read_csv() will print a "Column specification" message — this tells you
# what data type it assigned to each column. Always check this!

# Preview the tibble
fish_data          # prints neatly with column types
glimpse(fish_data)  # shows each column on its own line — great for wide tables

# Fix column types if needed, e.g., year should be numeric not character:
fish_data <- read_csv("iuu_fishing_data.csv",
                      col_types = cols(year = col_double()))

# glimpse() is the tidyverse alternative to str() — much more readable
glimpse(fish_data)

# How many rows and columns?
dim(fish_data)
nrow(fish_data)
ncol(fish_data)

# Column names
names(fish_data)

# First and last rows
slice_head(fish_data, n = 6)   # tidyverse version of head()
slice_tail(fish_data, n = 6)   # last 6 rows

# Quick summary (base R — still very useful)
summary(fish_data)

# select() — choose columns by name (no $ or bracket notation needed)
fish_data |>
  select(year, province, vessel_type, reported_catch)

# filter() — keep only rows matching a condition
fish_data |>
  filter(province == "Province A")

# Chain: filter AND select together
fish_data |>
  filter(vessel_type == "Commercial") |>
  select(year, province, reported_catch, actual_catch)

# filter with two conditions (& means AND, | means OR)
fish_data |>
  filter(province == "Province A", year >= 2020)

# Unique values of a column
fish_data |>
  distinct(province)

fish_data |>
  count(province, vessel_type)   # count rows per group (like table())

# Overall descriptive statistics for reported catch
fish_data |>
  summarise(
    mean_catch   = mean(reported_catch),
    median_catch = median(reported_catch),
    sd_catch     = sd(reported_catch),
    min_catch    = min(reported_catch),
    max_catch    = max(reported_catch)
  )

# Stats grouped by province
fish_data |>
  group_by(province) |>
  summarise(
    total_reported = sum(reported_catch),
    mean_reported  = mean(reported_catch),
    n_records      = n()            # n() counts rows in each group
  )

# Stats grouped by province AND vessel type
fish_data |>
  group_by(province, vessel_type) |>
  summarise(
    total_reported = sum(reported_catch),
    total_actual   = sum(actual_catch),
    .groups        = "drop"      # good practice: drop grouping after summarise
  )

# Bar chart: total reported catch by province, filled by vessel type
fish_data |>
  group_by(province, vessel_type) |>
  summarise(total_catch = sum(reported_catch), .groups = "drop") |>
  ggplot(aes(x = province, y = total_catch, fill = vessel_type)) +
  geom_col(position = "dodge") +               # side-by-side bars
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(
    title = "Total Reported Catch by Province (2018–2022)",
    x     = "Province",
    y     = "Catch (metric tonnes)",
    fill  = "Vessel Type"
  ) +
  theme_minimal()  

# Histogram: distribution of reported catch
ggplot(fish_data, aes(x = reported_catch)) +
  geom_histogram(bins = 8, fill = "steelblue", colour = "white") +
  labs(title = "Distribution of Reported Catch",
       x     = "Catch (metric tonnes)",
       y     = "Count") +
  theme_minimal()

# Box plot: distribution of reported catch
ggplot(fish_data, aes(x = province, y = reported_catch, fill = vessel_type)) +
  geom_boxplot(color = "black", outlier.shape = 16, width = 0.5) +
  scale_fill_manual(values = c("Small-scale" = "pink", 
                               "Commercial" = "limegreen")) +
  labs(
    title = "",
    x     = "Province",
    y     = "Catch (metric tonnes)",
    fill  = "Vessel type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right"
  )

# Scatter plot: reported vs actual catch, coloured by vessel type
ggplot(fish_data, aes(x = reported_catch, y = actual_catch,
                      colour = vessel_type, shape = province)) +
  geom_point(size = 3) +
  geom_abline(intercept = 0, slope = 1,
              linetype = "dashed", colour = "red") +  # 45° = perfect reporting
  annotate("text", x = 4000, y = 3700,
           label = "Perfect reporting (no IUU)",
           colour = "red", size = 3.5, angle = 32) +
  scale_colour_manual(values = c("#0369a1", "#b45309")) +
  labs(title  = "Reported vs Estimated Actual Catch",
       x      = "Reported Catch (t)",
       y      = "Estimated Actual Catch (t)",
       colour = "Vessel Type",
       shape  = "Province") +
  theme_minimal()

# mutate() adds new columns or modifies existing ones.
# Notice: you can reference a new column in the same mutate() call!

fish_data <- fish_data |>
  mutate(
    iuu_catch    = actual_catch - reported_catch,
    iuu_rate     = (iuu_catch / actual_catch) * 100,   # % of actual catch that is IUU
    iuu_loss_usd = iuu_catch * price_per_tonne           # economic loss in USD
  )

# View just the key result columns
fish_data |>
  select(year, province, vessel_type, iuu_catch, iuu_rate, iuu_loss_usd)

# Grand totals
fish_data |>
  summarise(
    total_iuu_catch_t  = sum(iuu_catch),
    total_iuu_loss_usd = sum(iuu_loss_usd),
    mean_iuu_rate_pct  = round(mean(iuu_rate), 1)
  )

# IUU loss by province
fish_data |>
  group_by(province) |>
  summarise(total_loss_usd = sum(iuu_loss_usd), .groups = "drop")

# IUU rate by vessel type (mean)
fish_data |>
  group_by(vessel_type) |>
  summarise(mean_iuu_rate = round(mean(iuu_rate), 1), .groups = "drop")

# Full two-way breakdown: province × vessel type
fish_data |>
  group_by(province, vessel_type) |>
  summarise(
    total_iuu_catch  = sum(iuu_catch),
    total_iuu_loss   = sum(iuu_loss_usd),
    mean_iuu_rate    = round(mean(iuu_rate), 1),
    .groups = "drop"
  ) |>
  arrange(desc(total_iuu_loss))   # sort by largest loss first

# Summarise by year first, then pipe directly into ggplot
fish_data |>
  group_by(year) |>
  summarise(total_loss_m = sum(iuu_loss_usd) / 1e6, .groups = "drop") |>
  ggplot(aes(x = year, y = total_loss_m)) +
  geom_line(colour = "darkred", linewidth = 1.2) +
  geom_point(colour = "darkred", size = 4) +
  geom_label(aes(label = round(total_loss_m, 1)),
             nudge_y = 0.3, size = 3.5) +         # add value labels
  scale_x_continuous(breaks = 2018:2022) +
  labs(title   = "Annual IUU Economic Loss (2018–2022)",
       subtitle = "Combined across both provinces and vessel types",
       x        = "Year",
       y        = "IUU Loss (million USD)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

# Faceted version: one panel per province
fish_data |>
  group_by(year, province) |>
  summarise(total_loss_m = sum(iuu_loss_usd) / 1e6, .groups = "drop") |>
  ggplot(aes(x = year, y = total_loss_m, colour = province)) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  facet_wrap(~ province) +                       # one panel per province
  scale_x_continuous(breaks = 2018:2022) +
  labs(title = "IUU Loss by Province", x = "Year",
       y = "IUU Loss (million USD)") +
  theme_minimal() + theme(legend.position = "none")