---
title: "Prepare dataset"
author: "Esme Ashe-Jepson"
format: html
editor: visual
editor_options: 
  chunk_output_type: console
execute:
  message: false
  warning: false
  options:
    root.dir: "C:/Users/esa38zy/Documents/Research/Alps/Data/R"
bibliography: references.bib
---

## Preparing the datasets

We have a dataset of butterfly body temperatures (field measurements) and upper thermal limits (lab measurements) from central Europe, collected in August 2025. We need to prepare this dataset before we can analyse it.

### Thermoregulation dataset

First we load the dataset.

```{r}
#| label: load original dataset

# Point the session at the project data folder so that the relative file names
# used throughout this document resolve.
# NOTE(review): this absolute path is machine-specific. When the document is
# rendered (rather than run interactively), knitr restores the working
# directory at the end of each chunk, so later chunks then depend on the
# root.dir setting in the YAML header -- confirm that setting takes effect.
setwd("C:/Users/esa38zy/Documents/Research/Alps/Data/R")

# Thermoregulation (field body temperature) dataset.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
buff <- read.csv("Buff.csv", header = TRUE)
```

Now we need to load the necessary packages.

```{r}
#| label: load packages

library (dplyr)
```

First we can make sure that our variables are the correct classes: categorical variables must be factors, some of which are ordered. We can also add in elevation as a new column.

```{r}
#| label: correct classes

# Categorical variables are stored as factors.
buff$Species <- as.factor(buff$Species)
buff$Sex <- as.factor(buff$Sex)
# Condition is ordinal: scores 1 < 2 < 3 < 4 < 5. Any value outside these
# levels becomes NA.
buff$Condition <- factor(buff$Condition,
                         levels = c(1, 2, 3, 4, 5),
                         ordered = TRUE)

# Add in elevation: one row per site in Sites.csv, matched on Site.
elev <- read.csv("Sites.csv", header = TRUE)
buff <- left_join(buff, elev, by = "Site")

# Site is converted only after the join. Joining a factor key against the
# character key read from Sites.csv coerces the result back to character,
# which would silently undo an earlier conversion.
buff$Site <- as.factor(buff$Site)
```

Now we can start to create some new variables. Here we create species-centred wing length (whereby a large value means an individual is large for its species), and species-mean wing length (whereby a large value means the species is large).

```{r}
#| label: thermoregulation new variables

# Per-species wing-length summaries, computed in a single grouped pass.
#   Length.centered: individual wing length minus its species mean
#                    (positive = larger than average for the species).
#   Species_WL:      the species mean itself (large = large species).
# Both use the same NA-tolerant mean, so a single missing wing length no longer
# turns Length.centered into NA for every individual of that species.
buff <- buff %>%
  group_by(Species) %>%
  mutate(
    Length.centered = Wing.length - mean(Wing.length, na.rm = TRUE),
    Species_WL = mean(Wing.length, na.rm = TRUE)
  ) %>%
  ungroup()
```

For the next few variables (basking type, wing colouration), we can combine several datasets. This uses data from [@middleton-welling2020] and [@kang2021]. Here our goal is to pull out the near-infrared reflectance value for the relevant part of the wing (always basal region, dorsal for dorsal baskers, ventral for lateral baskers). If the basking type is unknown, or the selected value is missing, we fall back on the dorsal value.

```{r}
#| label: combine datasets

# External datasets: NIR reflectance per species and species-level traits.
nir <- read.csv("ButterflyNIRData.csv", header = TRUE)
trait <- read.csv("Maghreb_traits.csv", header = TRUE)

# Rename the taxon column so both tables share the join key "Species".
trait <- trait %>%
  rename(Species = Taxa.name)

# Keep only the species present in our data and the columns to be merged.
# NOTE(review): the joins below assume one row per species in each lookup
# table; duplicated species would duplicate observations -- confirm.
nir.2 <- nir %>%
  filter(Species %in% buff$Species) %>%
  select(Species, NirDorsB, NirVentB)

trait.2 <- trait %>%
  filter(Species %in% buff$Species) %>%
  select(Species, BaT_dorsalabsorb_Da, BaT_doesalreflect_Dr, BaT_lateral_La)

# Collapse the three basking indicator columns into one label per species.
# The first indicator equal to 1 wins; species with none flagged get NA.
trait.3 <- trait.2 %>%
  mutate(
    basking_type = case_when(
      BaT_dorsalabsorb_Da == 1 ~ "dorsalabsorb",
      BaT_doesalreflect_Dr == 1 ~ "dorsalreflect",
      BaT_lateral_La == 1 ~ "lateral",
      TRUE ~ NA_character_
    )
  ) %>%
  select(Species, basking_type)

# Merge, derive the NIR value for the relevant wing surface, and tidy up.
buff.clean <- buff %>%
  left_join(trait.3, by = "Species") %>%
  left_join(nir.2, by = "Species") %>%
  mutate(
    # Dorsal baskers use the dorsal basal value, lateral baskers the ventral
    # basal value.
    Nir_value = case_when(
      basking_type %in% c("dorsalabsorb", "dorsalreflect") ~ NirDorsB,
      basking_type == "lateral" ~ NirVentB,
      TRUE ~ NA_real_
    ),
    # Unknown basking type or a missing selected value: default to dorsal.
    Nir_value = coalesce(Nir_value, NirDorsB)
  ) %>%
  # The raw NIR columns are no longer needed once Nir_value exists.
  select(-NirDorsB, -NirVentB) %>%
  mutate(
    basking_type = as.factor(basking_type),
    # Joining on Species against character keys coerces a factor key to
    # character, so the factor class is restored here.
    Species = as.factor(Species)
  )
```

Now we have a formatted dataset for butterfly thermoregulation. We can save this file to load into R for analysis.

```{r}
#| label: thermoregulation save file

# Write the prepared thermoregulation data to an .RData file; load() restores
# it under the name buff.clean.
save(list = "buff.clean", file = "Buff_clean.RData")
```

### Thermal tolerance dataset

We also have a separate thermal tolerance dataset, from a series of heat knockdown assays, which needs to be prepared before analysis too. First we load the dataset.

```{r}
#| label: load ctmax dataset

# Thermal tolerance (heat knockdown assay) dataset.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ctmax <- read.csv("CTmax.csv", header = TRUE)
```

Next we need to make sure the variables are the correct classes.

```{r}
#| label: ctmax correct classes

# Set column classes in a single step.
ctmax <- ctmax %>%
  mutate(
    # Plain categorical variables become unordered factors.
    Site = as.factor(Site),
    Species = as.factor(Species),
    Run = as.factor(Run),
    Sex = as.factor(Sex),
    Recovery = as.factor(Recovery),
    # Condition is ordinal: 1 < 2 < 3. Values outside these levels become NA.
    # NOTE(review): the thermoregulation dataset uses levels 1-5 -- confirm
    # that scores 4 and 5 do not occur here.
    Condition = factor(Condition,
                       levels = c("1", "2", "3"),
                       ordered = TRUE),
    # Constant column of 1s.
    # NOTE(review): presumably the event indicator for the knockdown survival
    # analysis -- confirm.
    A1 = 1
  )
```

Next we want to create some new variables again: species-centred wing length and species-mean wing length. We also add site elevation as a new column.

```{r}
#| label: ctmax new variables

# Per-species wing-length summaries, computed in a single grouped pass.
#   Length.centered: individual wing length minus its species mean
#                    (positive = larger than average for the species).
#   Species_WL:      the species mean itself (large = large species).
# Both use the same NA-tolerant mean, so a single missing wing length no longer
# turns Length.centered into NA for every individual of that species.
ctmax <- ctmax %>%
  group_by(Species) %>%
  mutate(
    Length.centered = Wing.length - mean(Wing.length, na.rm = TRUE),
    Species_WL = mean(Wing.length, na.rm = TRUE)
  ) %>%
  ungroup()

# Add in elevation, using only the sites that occur in the knockdown data.
# `elev` is the site table read from Sites.csv earlier in this document.
elev.2 <- elev %>%
  filter(Site %in% ctmax$Site)

ctmax <- left_join(ctmax, elev.2, by = "Site")
```

Again we want to bring over the relevant colouration value (NIR reflectance) and basking type.

```{r}
#| label: ctmax nir value

# External datasets: NIR reflectance per species and species-level traits.
# They are re-read here so this chunk can be run on its own.
nir <- read.csv("ButterflyNIRData.csv", header = TRUE)
trait <- read.csv("Maghreb_traits.csv", header = TRUE)

# Rename the taxon column so both tables share the join key "Species".
trait <- trait %>%
  rename(Species = Taxa.name)

# Keep only the species present in the knockdown data and the columns to be
# merged.
# NOTE(review): the joins below assume one row per species in each lookup
# table; duplicated species would duplicate observations -- confirm.
nir.2 <- nir %>%
  filter(Species %in% ctmax$Species) %>%
  select(Species, NirDorsB, NirVentB)

trait.2 <- trait %>%
  filter(Species %in% ctmax$Species) %>%
  select(Species, BaT_dorsalabsorb_Da, BaT_doesalreflect_Dr, BaT_lateral_La)

# Collapse the three basking indicator columns into one label per species.
# The first indicator equal to 1 wins; species with none flagged get NA.
trait.3 <- trait.2 %>%
  mutate(
    basking_type = case_when(
      BaT_dorsalabsorb_Da == 1 ~ "dorsalabsorb",
      BaT_doesalreflect_Dr == 1 ~ "dorsalreflect",
      BaT_lateral_La == 1 ~ "lateral",
      TRUE ~ NA_character_
    )
  ) %>%
  select(Species, basking_type)

# Merge and derive the NIR value for the relevant wing surface.
ctmax <- ctmax %>%
  left_join(trait.3, by = "Species") %>%
  left_join(nir.2, by = "Species") %>%
  mutate(
    # Dorsal baskers use the dorsal basal value, lateral baskers the ventral
    # basal value.
    Nir_value = case_when(
      basking_type %in% c("dorsalabsorb", "dorsalreflect") ~ NirDorsB,
      basking_type == "lateral" ~ NirVentB,
      TRUE ~ NA_real_
    ),
    # Unknown basking type or a missing selected value: default to dorsal.
    Nir_value = coalesce(Nir_value, NirDorsB)
  )

# Analysis copy: drop the raw NIR columns and the free-text Notes column, and
# store basking type as a factor.
ct.clean <- ctmax %>%
  select(-NirDorsB, -NirVentB, -Notes) %>%
  mutate(basking_type = as.factor(basking_type))
```

Finally we want to subset to only species with a minimum of 5 records so that our survival curves will be stable.

```{r}
#| label: ctmax subset

# Keep only species with at least 5 records.
ct.clean <- ct.clean %>%
  group_by(Species) %>%
  filter(n() >= 5) %>%
  ungroup() %>%
  mutate(
    # Site and Species are join keys; joining a factor key against a character
    # key coerces it to character, so the factor class is (re)applied here.
    # Doing it after the filter, with droplevels(), keeps the factor levels
    # limited to the sites and species that remain in the data.
    Site = droplevels(as.factor(Site)),
    Species = droplevels(as.factor(Species))
  )
```

Now we can save our data file to load into R for analysis.

```{r}
#| label: ctmax save datafile

# Write the prepared thermal tolerance data to an .RData file; load() restores
# it under the name ct.clean.
save(list = "ct.clean", file = "CTmax_clean.RData")
```

Now we have our data files ready for analysis.