Appendix S10: Code for the statistical analyses used

Title: A workflow to optimize spatial sampling in ecoacoustic studies
Journal: Landscape Ecology

Authors:
Martínez-Arias, V. M.; Paniagua-Villada, C.; Daza, J. M.

Author affiliations at the time the work was conducted:
1. Grupo Herpetológico de Antioquia, Calle 70 No. 52 - 21, Postal Code 050010, Medellín, Colombia.

Corresponding author:
Victor M. Martínez-Arias
📧 Email:
📞 Tel: +57 311 679 7846

0. Exploration graphic

This graphic allows a visual comparison of the indices in the 0–24 kHz range, for all hours and localities. We did not use it.

library(ggplot2)
library(readxl)
library(dplyr)

Statistic <- "mean"  # Summary statistic to plot: set to "mean" or "median"
# Load data (replace path with your own)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")
# Filter based on combined conditions:
#   * BI and NDSI rows (no frequency-range restriction), and
#   * the remaining indices restricted to Frequency_Range "R0-24",
# both for the summary statistic chosen above.
# `.env$Statistic` is required here: inside filter() a bare `Statistic`
# resolves to the data column, so `Statistic == Statistic` would compare the
# column with itself and silently keep every statistic.
filtered_data <- data %>%
  filter(
    (Index %in% c("BI", "NDSI") & Statistic == .env$Statistic) |
      (Index %in% c("ACItf", "ACIft", "NP", "Hf", "H") &
        Frequency_Range == "R0-24" & Statistic == .env$Statistic)
  ) %>%
  # Collapse the subsample names into the legend categories used by the plot;
  # anything that is not one of the four named designs is labelled "Random".
  mutate(
    Subsample_Color = case_when(
      Subsample == "SLIC" ~ "SLIC",
      Subsample == "halfbasins" ~ "HalfBasins",
      Subsample == "basins" ~ "Basins",
      Subsample == "Landcover" ~ "Landcover",
      TRUE ~ "Random"
    )
  )
# Add synthetic category "ALL" (every row relabelled, so it pools all localities)
data_all <- filtered_data %>%
  mutate(Locality = "ALL")
# Merge both datasets
complete_data <- bind_rows(filtered_data, data_all)
# Create plot: subsample performance against sampling percentage, one panel
# per Locality x Index combination.
plot <- ggplot(complete_data, aes(x = Sample_percent, y = Pearson_Correlation)) +
  # Random subsamples are drawn first, small and translucent, as background
  geom_point(data = filter(complete_data, Subsample_Color == "Random"),
             aes(color = Subsample_Color),
             alpha = 0.25, size = 1) +
  # Linear trend fitted on all points of the panel (inherits the global data)
  geom_smooth(method = "lm", color = "blue", se = FALSE, linewidth = 0.8) +
  # Named (non-random) designs are drawn on top with larger markers
  geom_point(data = filter(complete_data, Subsample_Color != "Random"),
             aes(color = Subsample_Color), size = 2) +
  scale_color_manual(
    values = c(
      "SLIC" = "purple",
      "HalfBasins" = "skyblue",
      "Basins" = "navy",
      "Random" = "gray50",
      "Landcover" = "green"
    )
  ) +
  facet_grid(Locality ~ Index) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Sample percent",
    y = "Subsample_performance",
    color = "Sampling method"
  ) +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold")
  )
# Save as PDF (set your desired path).
# The file name is derived from `Statistic` so that a "median" run is not
# written under (and does not overwrite) the "_mean" file.
ggsave(
  filename = paste0("#/0_Subsampleperformance_vs_SamplePercent_", Statistic, ".pdf"),
  plot = plot,
  width = 15.27,
  height = 8.15,
  units = "in"
)

print(plot)

OBJECTIVE 1: SIZE DEPENDENCE

Specific objective 1: Analyze the relationship between sample size and the spatial representativeness of interpolated soundscapes using different sampling strategies in heterogeneous landscapes.

1.1. Generate the Pearson correlation between sampling percent and design precision.

To evaluate how sampling size influences the performance of each subsampling strategy, we calculated a second Pearson correlation coefficient, Pcoef2. This coefficient quantifies the correlation between the proportion of sampled data (i.e., Sample_percent) and the subsampling performance, previously calculated as Pcoef1 (Pearson correlation between full-sample and subsampled raster surfaces).

The analysis was performed across all combinations of localities, subsampling designs, frequency ranges, time periods, acoustic indices, and summary statistics (e.g., mean, median). For each unique combination, we filtered the data and applied ggscatterstats() from the ggstatsplot package to compute the Pearson correlation (Pcoef2), along with associated test statistics.

The resulting outputs were consolidated into a dataset named DF2, which was later used for statistical comparisons between strategies and visualized using violin and density plots.

Used filters: | Locality | Type | Frequency_Range | Time_Period | Statistic | Index | Pearson_Correlation |

Resulting table will be named “DF2”, which includes the calculated Pcoef2.

# Load required libraries
library(ggstatsplot)
library(dplyr)
library(readxl)
library(openxlsx)
library(future.apply)

# Set up parallel plan.
# Leave one core free, but never request fewer than one worker: the guard
# covers a single-core machine (detectCores() - 1 == 0) and an undetermined
# core count (NA), either of which would make plan() fail.
n_workers <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
plan(multisession, workers = n_workers)

# Load data (replace with appropriate file path).
# The grouping columns are coerced to character so they compare cleanly with
# the values stored in `combinations` below.
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx") %>%
  mutate(across(c(Locality, Type, Subsample, Frequency_Range, Time_Period, Statistic, Index), as.character))

# Create duplicate with Locality = "ALL" so every analysis is also run on the
# pooled data set
data_all <- data %>% mutate(Locality = "ALL")
expanded_data <- bind_rows(data, data_all)

# Get unique combinations of filters (one row per correlation to compute)
combinations <- expanded_data %>%
  distinct(Locality, Type, Frequency_Range, Time_Period, Statistic, Index, Subsample)

#' Correlate sample size with subsample performance for one filter combination
#'
#' Subsets the global `expanded_data` to the rows matching `filter_row`, runs
#' `ggscatterstats()` on Sample_percent vs Subsample_performance and returns
#' the extracted statistics tagged with the filter values.
#'
#' @param filter_row One-row data frame with the columns Locality, Type,
#'   Frequency_Range, Time_Period, Statistic, Index and Subsample.
#' @return A data frame binding the subtitle (labelled "Frequentist") and
#'   caption (labelled "Bayesian") statistics, or `NULL` when the subset is too
#'   small, has no variation, or the statistics cannot be computed.
process_combination <- function(filter_row) {
  filtered_data <- expanded_data %>%
    filter(
      Locality == filter_row$Locality,
      Type == filter_row$Type,
      Frequency_Range == filter_row$Frequency_Range,
      Time_Period == filter_row$Time_Period,
      Statistic == filter_row$Statistic,
      Index == filter_row$Index,
      Subsample == filter_row$Subsample
    )

  # Check for sufficient data and variation
  if (nrow(filtered_data) < 3 ||
      length(unique(filtered_data$Sample_percent)) < 2 ||
      length(unique(filtered_data$Subsample_performance)) < 2) return(NULL)

  # Extract correlation statistics. Failures are deliberately turned into
  # NULL so that one problematic combination does not abort the whole run.
  temp_plot <- tryCatch({
    ggscatterstats(
      data = filtered_data,
      x = Sample_percent,
      y = Subsample_performance,
      messages = FALSE
    )
  }, error = function(e) return(NULL))

  if (is.null(temp_plot)) return(NULL)

  stats <- tryCatch(extract_stats(temp_plot), error = function(e) return(NULL))
  if (is.null(stats)) return(NULL)

  # Attach the filter values and the analysis label to one statistics table.
  # The label goes into `Tipo`, NOT `Type`: `Type` already holds the sampling
  # type copied from `filter_row`, and the downstream scripts and the Shiny
  # app select rows with `Tipo == "Frequentist"`.
  add_context <- function(stats_table, analysis_label) {
    if (is.null(stats_table)) return(NULL)  # nothing to annotate
    for (col in names(filter_row)) {
      stats_table[[col]] <- filter_row[[col]]
    }
    stats_table$Tipo <- analysis_label
    stats_table
  }

  bind_rows(
    add_context(stats$subtitle_data, "Frequentist"),
    add_context(stats$caption_data, "Bayesian")
  )
}

# Apply processing to all combinations in parallel.
# seq_len() yields an empty index when `combinations` has no rows, whereas
# 1:nrow() would yield c(1, 0) and index rows that do not exist.
results_list <- future_lapply(seq_len(nrow(combinations)), function(i) {
  process_combination(combinations[i, ])
}, future.seed = TRUE)

# Combine results (NULL entries from skipped combinations are dropped)
final_table <- bind_rows(results_list)

# Remove columns incompatible with Excel (list / language columns)
final_table_clean <- final_table %>%
  select(where(function(col) !any(class(col) %in% c("list", "language"))))

# Export final result
write.xlsx(final_table_clean, "DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx")

1.2. Visual inspection of sampling size effects

To explore the relationship between sample size and subsampling performance across different categorical variables (e.g., locality, index, strategy), we created an interactive visualization tool using a Shiny application. This tool allows the dynamic generation of violin plots showing the distribution of Pcoef2 values (correlation between sample size and subsample performance) across user-defined categories.

The application permits filtering by multiple metadata fields (e.g., frequency range, time period, statistic, or acoustic index) and dynamically updates the visualization accordingly. Violin plots were generated using the ggplot2 package, with optional overlay of jittered data points. Users can export the resulting plots as PDFs for reporting or supplementary use.

This tool is based on the table DF2, and facilitates visual comparison of the influence of sample size across grouping levels in the dataset.

This visual inspection is performed only for the pooled "ALL" dataset.

library(shiny)
library(ggplot2)
library(readxl)
library(dplyr)

# User interface of the violin-plot explorer.
# Left: file upload, choice of grouping variable, font sizes, PDF download
# and the dynamically generated filters. Right: the plot and the filtered table.
sidebar_controls <- sidebarPanel(
  fileInput("archivo", "Upload Excel file", accept = c(".xlsx")),

  selectInput(
    "variable_categorica",
    "Categorical variable to compare:",
    choices = c(
      "Locality", "Frequency_Range", "Time_Period", "Statistic",
      "Index", "Subsample", "Type"
    )
  ),

  numericInput("axis_title_size", "Axis title font size:",
               value = 14, min = 8, max = 30),
  numericInput("axis_text_size", "Axis text font size:",
               value = 12, min = 6, max = 30),

  downloadButton("descargar_pdf", "Download plot as PDF"),

  # Placeholder filled by the server with one selector per remaining column
  uiOutput("filtros_adicionales")
)

results_panel <- mainPanel(
  plotOutput("grafico_violin", height = "600px"),
  tableOutput("tabla_filtrada")
)

ui <- fluidPage(
  titlePanel("Distribution of Pearson Coefficient (Frequentist)"),
  sidebarLayout(sidebar_controls, results_panel)
)

# Server logic of the violin-plot explorer: reads the uploaded workbook,
# builds one filter per non-selected categorical column, and renders /
# exports a violin plot of `estimate` grouped by the selected column.
server <- function(input, output, session) {

  # Categorical columns offered either as grouping variable or as filter
  category_fields <- c("Locality", "Frequency_Range", "Time_Period",
                       "Statistic", "Index", "Subsample", "Type")

  # Columns left over once the grouping variable is taken out
  remaining_fields <- function() {
    setdiff(category_fields, input$variable_categorica)
  }

  # Uploaded table, restricted to frequentist rows with a usable estimate
  datos <- reactive({
    req(input$archivo)
    uploaded <- read_excel(input$archivo$datapath)
    uploaded %>%
      filter(Tipo == "Frequentist", !is.na(estimate))
  })

  # One drop-down ("All" + observed values) per remaining column
  output$filtros_adicionales <- renderUI({
    req(datos())
    build_selector <- function(field) {
      selectInput(
        inputId = paste0("filtro_", field),
        label = field,
        choices = c("All", unique(datos()[[field]])),
        selected = "All"
      )
    }
    lapply(remaining_fields(), build_selector)
  })

  # Apply every filter whose drop-down is set to something other than "All"
  datos_filtrados <- reactive({
    req(datos())
    df <- datos()

    for (field in remaining_fields()) {
      chosen <- input[[paste0("filtro_", field)]]
      if (is.null(chosen) || chosen == "All") {
        next
      }
      df <- df %>% filter(.data[[field]] == chosen)
    }

    df
  })

  # Violin plot of the estimates by the selected categorical variable
  # (a jittered point overlay is available but disabled:
  #  geom_jitter(width = 0.15, alpha = 0.5, size = 1.5, color = "black"))
  plot_violines <- reactive({
    df <- datos_filtrados()
    req(nrow(df) > 0)

    text_theme <- theme(
      axis.title = element_text(size = input$axis_title_size),
      axis.text = element_text(size = input$axis_text_size),
      plot.title = element_blank()
    )

    ggplot(df, aes(x = .data[[input$variable_categorica]], y = estimate)) +
      geom_violin(fill = "skyblue", alpha = 0.7, color = NA) +
      labs(x = input$variable_categorica, y = "Pearson Estimate") +
      theme_minimal() +
      text_theme
  })

  output$grafico_violin <- renderPlot({
    plot_violines()
  })

  # Table of the rows currently feeding the plot
  output$tabla_filtrada <- renderTable({
    datos_filtrados()
  })

  # PDF export of the current plot, named with today's date
  output$descargar_pdf <- downloadHandler(
    filename = function() {
      paste0("violin_plot_pearson_", Sys.Date(), ".pdf")
    },
    content = function(file) {
      ggsave(file, plot = plot_violines(), device = "pdf", width = 10, height = 6)
    }
  )
}

shinyApp(ui = ui, server = server)

1.3. Normality assessment

Before applying a statistical test to evaluate size dependence, normality must be checked. To determine whether parametric tests could be applied to compare the sampling strategies, we evaluated the distribution of Pcoef2 values (i.e., correlations between sampling proportion and subsample performance) across all sampling designs. We applied the Shapiro–Wilk test for normality within each subsample category using the rstatix package. Additionally, we visually inspected the distributions by plotting histograms with density curves, faceted by subsample strategy (e.g., basins, SLIC, ssample40).

Results from the Shapiro–Wilk tests and visual assessments revealed widespread deviations from normality, justifying the use of non-parametric statistical tests in subsequent analyses. All results were saved as part of the reproducible workflow, including the test results and distribution plots.

The input is DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx. Procedure must be done for mean and median

# Load required packages
library(readxl)
library(dplyr)
library(ggplot2)
library(rstatix)   # for normality tests and summaries
library(ggpubr)    # for visualization

# Summary statistic analysed in this run; set to "mean" or "median".
# It drives both the row filter and the names of the exported files, so the
# two cannot drift apart.
target_statistic <- "mean"

# Load dataset (replace with correct file path)
data <- read_excel("#/DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx")
# Filter only Frequentist analyses, remove NA, and keep the chosen statistic.
# `.env$` forces the lookup outside the data frame.
data_frec <- data %>%
  filter(
    Tipo == "Frequentist",
    !is.na(estimate),
    Statistic == .env$target_statistic
  )


# Perform Shapiro-Wilk normality test by Subsample
normality <- data_frec %>%
  group_by(Subsample) %>%
  shapiro_test(estimate)

print("### Shapiro-Wilk normality test by Subsample:")
print(normality)

# Create histogram and density plots.
# after_stat(density) replaces the deprecated `..density..` notation.

plot <- ggplot(data_frec, aes(x = estimate, fill = Subsample)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", alpha = 0.5, bins = 30) +
  geom_density(alpha = 0.7) +
  facet_wrap(~Subsample) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold")
  ) +
  labs(title = NULL, x = "Pearson Estimate", y = "Density")

# Save plot to PDF (update file path)
ggsave(
  filename = paste0("#/R1-3_", target_statistic, "_Distribution_Estimates.pdf"),
  plot = plot,
  width = 15.27,
  height = 8.15,
  units = "in"
)

# Display plot
plot

# Export normality test results
library(writexl)
write_xlsx(normality, paste0("#/R1-3_", target_statistic, "_Shapiro_Normality_Test.xlsx"))

1.4. Dependence of subsampling strategies on sample size

To evaluate whether different subsampling strategies exhibit distinct levels of sensitivity to sample size, we compared the strength of the correlation (Pcoef2) between sample size (Sample_percent) and subsampling performance across strategies.

We first filtered DF2 to include only frequentist estimates where the Pearson correlation was computed between sample size and design performance (parameter1 == Sample_percent and parameter2 == Subsample_performance). The procedure was run separately for the mean and the median statistic; the code below shows the median run.

Sampling strategies were then grouped into two categories: "random" (e.g., ssample*) and "tessellation" (e.g., grid-based designs such as SLIC or basins). For each group, we summarized the distribution of Pcoef2 values through descriptive statistics (mean, median, SD, IQR, CV).

Given the lack of normality (as previously assessed), we employed a non-parametric Wilcoxon rank-sum test to compare Pcoef2 distributions between the two strategy types.

We also applied a Fligner–Killeen test to evaluate differences in dispersion between strategies.

All results were visualized using violin and boxplots. Summary tables and test outputs were exported for reproducibility.

# Load required libraries
library(readxl)
library(dplyr)
library(psych)
library(writexl)
library(ggplot2)

# Read the DF2 correlation table (adjust path)
data <- read_excel("#/DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx")

# Keep the frequentist Pearson correlations between Sample_percent and
# Subsample_performance for the "median" statistic, then tag every row with
# its strategy group: subsample names starting with "ssample" are "random",
# everything else is "tessellation".
data_frec <- data %>%
  filter(
    parameter1 == "Sample_percent",
    parameter2 == "Subsample_performance",
    effectsize == "Pearson correlation",
    Tipo == "Frequentist",
    Statistic == "median",
    !is.na(estimate)
  ) %>%
  mutate(
    Type_group = if_else(grepl("^ssample", Subsample), "random", "tessellation")
  )

# Location and spread of Pcoef2 within each strategy group
summary_dependency <- data_frec %>%
  group_by(Type_group) %>%
  summarise(
    n = n(),
    mean_Pcoef2 = mean(estimate, na.rm = TRUE),
    median_Pcoef2 = median(estimate, na.rm = TRUE),
    sd_Pcoef2 = sd(estimate, na.rm = TRUE),
    iqr_Pcoef2 = IQR(estimate, na.rm = TRUE),
    # coefficient of variation, reusing the two columns computed just above
    cv_Pcoef2 = sd_Pcoef2 / abs(mean_Pcoef2)
  )

print("Summary of sample size dependency by strategy group:")
print(summary_dependency)

# Non-parametric comparison of Pcoef2 between the two strategy groups
wilcox <- wilcox.test(estimate ~ Type_group, data = data_frec)
print("Wilcoxon test between random and tessellation strategies:")
print(wilcox)

# Fisher's Z comparison of correlation strength

random <- data_frec %>% filter(Type_group == "random")
tess <- data_frec %>% filter(Type_group == "tessellation")

# The comparison needs a per-row `Sample percent` column. `data_frec` comes
# from the DF2 correlation table, and the script that builds DF2 does not add
# such a column, so cor() would receive NULL and stop the whole script before
# the Fligner-Killeen test. Run the comparison only when the column is
# available and both groups are large enough; otherwise report and continue.
# NOTE(review): confirm which table this step was originally run against.
sample_col <- "Sample percent"
if (sample_col %in% names(data_frec) && nrow(random) > 3 && nrow(tess) > 3) {
  fisherz <- r.test(
    n = nrow(random), r12 = cor(random[[sample_col]], random$estimate),
    n2 = nrow(tess), r34 = cor(tess[[sample_col]], tess$estimate)
  )

  print("Fisher's Z test for correlation strength:")
  print(fisherz)
} else {
  fisherz <- NULL
  message(
    "Fisher's Z test skipped: column '", sample_col,
    "' is missing or a strategy group has too few rows."
  )
}
# Fligner–Killeen test for dispersion differences
fligner <- fligner.test(estimate ~ Type_group, data = data_frec)
print("Fligner–Killeen test for variance differences:")
print(fligner)

# Quick look at the Pcoef2 distribution per strategy group (violin + box).
# Values outside [-1, 1] are squished onto the axis limits, not dropped.
ggplot(data_frec, aes(x = Type_group, y = estimate, fill = Type_group)) +
  geom_violin(alpha = 0.5) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
  scale_y_continuous(limits = c(-1, 1), oob = scales::squish) +
  theme_minimal(base_size = 14) +
  labs(
    x = "Sampling strategy",
    y = "Pcoef2 (correlation)",
    title = "Sample size dependence (Pcoef2)"
  )

# Write the per-row values and the group summary as two sheets of one workbook
strategy_sheets <- list(
  Pcoef2_Values = data_frec,
  Group_Summary = summary_dependency
)
write_xlsx(strategy_sheets, path = "#/R1-4_MEDIAN_SAMPLING_STRATEGY_COMPARISON.xlsx")
# Export F-test results.
# The figures below are recorded results hard-coded in this script, one set
# per summary statistic; they are not recomputed here.
recorded_f_tests <- list(
  MEDIAN = c(
    F_statistic = 0.79566,
    df1 = 167,
    df2 = 152,
    p_value = 0.149,
    CI_lower = 0.5818262,
    CI_upper = 1.0855697
  ),
  MEAN = c(
    F_statistic = 0.70577,
    df1 = 167,
    df2 = 163,
    p_value = 0.02567,
    CI_lower = 0.5194396,
    CI_upper = 0.9583777
  )
)

# Statistic that `data_frec` (and therefore `summary_dependency`) was
# filtered to. The spread summary is attached only to the workbook of that
# statistic, so the summary of one statistic is never exported under the
# other statistic's file name.
analysed_statistic <- toupper(unique(data_frec$Statistic))

for (stat_label in names(recorded_f_tests)) {
  sheets <- list(
    F_Test_Variance = as.data.frame(t(recorded_f_tests[[stat_label]]))
  )
  if (identical(analysed_statistic, stat_label)) {
    sheets <- c(list(Pcoef2_Spread_Summary = summary_dependency), sheets)
  }
  write_xlsx(
    sheets,
    path = paste0("#/R1-4_", stat_label, "_Pcoef2_VARIANCE_TEST.xlsx")
  )
}

# Publication version of the Pcoef2 plot: untrimmed violins with a narrow
# boxplot, fixed group colours, no legend, y axis held to [-1, 1].
group_colours <- c("random" = "#F8766D", "tessellation" = "#00BFC4")

final_plot <- ggplot(data_frec, aes(x = Type_group, y = estimate, fill = Type_group)) +
  geom_violin(alpha = 0.5, trim = FALSE) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
  scale_fill_manual(values = group_colours) +
  scale_y_continuous(limits = c(-1, 1), oob = scales::squish) +
  coord_cartesian(ylim = c(-1, 1)) +
  labs(
    x = "Sampling strategy",
    y = "Pcoef2 (correlation)",
    title = "Sample size dependency (Pcoef2)"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none")

# Write the figure to PDF
ggsave(
  filename = "#/R1-4_MEDIAN_Pcoef2_SAMPLING_STRATEGY.pdf",
  plot = final_plot,
  units = "in",
  width = 8,
  height = 6
)

OBJECTIVE 2: EXPLORE EFFECTIVENESS OF STRATIFIED SAMPLING DESIGNS

Compare the performance of different sampling strategies, including random and stratified approaches, generated by segmentation methods.

To assess the effectiveness of tessellation-based sampling strategies, we conducted a multi-step statistical analysis using R. First, we calculated two metrics for each subsample: (1) subsample performance in the form of Pearson correlation between the subsample and its full reference dataset; and (2) sampling efficiency, computed as the ratio between Pearson correlation and the sampling effort (Subsample performance/Sample_percent).

We then grouped data by summary statistic type (mean or median) and computed descriptive statistics for each tessellation strategy, including sample size, central tendency (mean, median), dispersion (standard deviation), and coefficient of variation.

Normality of Pearson correlations was assessed using the Shapiro–Wilk test within each subsampling group, using shapiro_test() from the rstatix package.

Given widespread non-normality, we applied Kruskal–Wallis rank-sum tests to compare both raw performance and efficiency across subsampling strategies. Where differences were detected, we used Dunn’s post-hoc tests with Benjamini–Hochberg correction to evaluate pairwise contrasts.

Violin and boxplots were generated to visually compare the distribution of both performance and efficiency across subsampling methods. All procedures were run separately for mean and median statistics, and results (summary tables and plots) were exported to PDF and Excel formats.

Packages we used: dplyr, ggplot2, ggpubr, rstatix, FSA, writexl.

The input database is: DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx

# Load required libraries
library(dplyr)
library(ggplot2)
library(readxl)
library(ggpubr)
library(rstatix)
library(FSA)
library(writexl)

# Read the subsample table (replace path accordingly)
df <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

# Store the grouping columns as character and derive the efficiency metric:
# Pearson correlation per unit of sampling effort.
df <- df %>%
  mutate(
    across(c(Subsample, Type, Index, Statistic), as.character),
    Pearson_rel = Pearson_Correlation / Sample_percent
  )

# Loop through each summary statistic type. For each one: summarise precision
# and efficiency per tessellation strategy, test normality, plot both
# metrics, run Kruskal-Wallis + Dunn tests and export everything.
for (stat in c("mean", "median")) {

  # `.env$stat` keeps the comparison on the loop variable even if the data
  # ever carries a column called `stat`.
  df_tess <- df %>%
    filter(Type == "Tessells", Statistic == .env$stat, !is.na(Pearson_Correlation))

  # The group comparisons below need at least two strategies; skip this
  # statistic with a warning and carry on with the next one otherwise.
  if (n_distinct(df_tess$Subsample) < 2) {
    warning(
      "Fewer than two tessellation strategies for statistic '", stat,
      "'; skipping.",
      call. = FALSE
    )
    next
  }

  # Summary statistics of precision and efficiency
  summary_precision <- df_tess %>%
    group_by(Subsample) %>%
    summarise(
      n = n(),
      mean_Pearson = mean(Pearson_Correlation, na.rm = TRUE),
      median_Pearson = median(Pearson_Correlation, na.rm = TRUE),
      sd_Pearson = sd(Pearson_Correlation, na.rm = TRUE),
      mean_eff = mean(Pearson_rel, na.rm = TRUE),
      median_eff = median(Pearson_rel, na.rm = TRUE),
      sd_eff = sd(Pearson_rel, na.rm = TRUE),
      cv_eff = sd_eff / abs(mean_eff)
    ) %>%
    arrange(desc(median_eff))

  # Shapiro-Wilk normality test. shapiro.test() only accepts between 3 and
  # 5000 observations and rejects constant samples, so only groups meeting
  # all three conditions are tested.
  valid <- df_tess %>%
    group_by(Subsample) %>%
    summarise(n = n(), n_unique = n_distinct(Pearson_Correlation)) %>%
    filter(n >= 3, n <= 5000, n_unique > 1) %>%
    pull(Subsample)

  if (length(valid) > 0) {
    normality_df <- df_tess %>%
      filter(Subsample %in% valid) %>%
      group_by(Subsample) %>%
      shapiro_test(Pearson_Correlation)
  } else {
    # No testable group: export an empty sheet
    normality_df <- data.frame(
      Subsample = character(0),
      variable = character(0),
      statistic = numeric(0),
      p = numeric(0)
    )
  }

  # Precision plot
  precision_plot <- ggplot(df_tess, aes(x = Subsample, y = Pearson_Correlation, fill = Subsample)) +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.2, outlier.shape = NA) +
    theme_minimal() +
    labs(title = paste("Strategy precision (Pearson) -", stat), y = "Pearson Correlation") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(limits = c(-1, 1), oob = scales::squish)

  ggsave(
    paste0("#/Tessellation_KW_Precision_", stat, ".pdf"),
    plot = precision_plot,
    width = 8, height = 6, units = "in"
  )

  # Efficiency plot
  efficiency_plot <- ggplot(df_tess, aes(x = Subsample, y = Pearson_rel, fill = Subsample)) +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.2, outlier.shape = NA) +
    theme_minimal() +
    labs(title = paste("Strategy efficiency (Pearson / Sample%) -", stat), y = "Relative Pearson") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  ggsave(
    paste0("#/Tessellation_KW_Efficiency_", stat, ".pdf"),
    plot = efficiency_plot,
    width = 8, height = 6, units = "in"
  )

  # Kruskal–Wallis tests and post-hoc comparisons (Dunn, BH-adjusted)
  kw_pearson <- kruskal_test(Pearson_Correlation ~ Subsample, data = df_tess)
  posthoc_pearson <- dunn_test(Pearson_Correlation ~ Subsample, data = df_tess, p.adjust.method = "BH")

  kw_eff <- kruskal_test(Pearson_rel ~ Subsample, data = df_tess)
  posthoc_eff <- dunn_test(Pearson_rel ~ Subsample, data = df_tess, p.adjust.method = "BH")

  # Export results to Excel (one workbook per statistic)
  write_xlsx(
    list(
      Summary_Statistics = summary_precision,
      Normality_Test = normality_df,
      KW_Pearson = as.data.frame(kw_pearson),
      PostHoc_Pearson = as.data.frame(posthoc_pearson),
      KW_Efficiency = as.data.frame(kw_eff),
      PostHoc_Efficiency = as.data.frame(posthoc_eff)
    ),
    path = paste0("#/KW_Tessellation_Strategies_", stat, ".xlsx")
  )
}

OBJECTIVE 3:

Specific objective 3: To explore how the interplay of categorical (e.g., landscape, time, index) and continuous variables (e.g., sample size, performance) explains the multivariate structure that differentiates stratified subsampling strategies.

3.1 FAMD analyses

A Factorial Analysis of Mixed Data (FAMD) is conducted to explore the multivariate structure of stratified subsampling strategies by integrating both categorical and continuous variables. The dataset was filtered to retain only relevant observations (e.g., median statistics from tessellation-based designs) and included key landscape, temporal, acoustic, and performance variables. The variable Subsample, which defines the sampling strategy under evaluation, was treated as a supplementary qualitative variable to avoid influencing the dimensional construction.

FAMD was performed using the FactoMineR package, extracting the coordinates of individuals, the contributions of quantitative and qualitative variables, and the proportion of variance explained by each dimension. Results were organized into separate Excel sheets for subsequent visualization and statistical analysis.

library(readxl)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(openxlsx)

# Load dataset (replace path as needed)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

# Filter only valid values and select relevant columns
data_famd <- data %>%
  filter(!is.na(Pearson_Correlation), Statistic == "median", Type == "Tessells") %>%
  select(Locality, Frequency_Range, Time_Period, Index, Subsample, Sample_percent, Pearson_Correlation)

# Convert categorical variables to factors
data_famd <- data_famd %>%
  mutate(across(c(Locality, Frequency_Range, Time_Period, Index, Subsample), as.factor))

# Identify the supplementary qualitative variable (Subsample)
supplementary_var_index <- which(colnames(data_famd) == "Subsample")

# Run FAMD with Subsample as a supplementary variable, so that it is
# projected onto the dimensions without taking part in building them
famd_result <- FAMD(data_famd,
                    ncp = 5,
                    sup.var = supplementary_var_index,
                    graph = FALSE)

# Extract coordinates of individuals and variables; row names are copied into
# a regular column because writeData() does not export row names by default
coord_ind <- as.data.frame(famd_result$ind$coord)
coord_ind$ID <- rownames(coord_ind)

coord_quanti <- as.data.frame(famd_result$quanti.var$coord)
coord_quanti$Variable <- rownames(coord_quanti)

coord_quali <- as.data.frame(famd_result$quali.var$coord)
coord_quali$Category <- rownames(coord_quali)

# Eigenvalue table. Its three columns are the eigenvalue, the percentage of
# variance and the cumulative percentage; the dimension labels live in the
# row names. Label the first column as what it is and export the dimension
# labels as their own column.
eig <- as.data.frame(famd_result$eig)
colnames(eig) <- c("Eigenvalue", "Variance (%)", "Cumulative Variance (%)")
eig <- data.frame(
  Dimension = rownames(eig),
  eig,
  check.names = FALSE,
  row.names = NULL
)

# Save results to Excel
wb <- createWorkbook()
addWorksheet(wb, "Individual_Coordinates")
addWorksheet(wb, "Quantitative_Variables")
addWorksheet(wb, "Qualitative_Variables")
addWorksheet(wb, "Explained_Variance")

writeData(wb, sheet = "Individual_Coordinates", coord_ind)
writeData(wb, sheet = "Quantitative_Variables", coord_quanti)
writeData(wb, sheet = "Qualitative_Variables", coord_quali)
writeData(wb, sheet = "Explained_Variance", eig)

saveWorkbook(wb, "#/FAMD_results_export.xlsx", overwrite = TRUE)

3.2 Graphical abstract plots for FAMD

This script generates a visual summary of the Factorial Analysis of Mixed Data (FAMD) results. It includes a scree plot showing the variance explained by each dimension, along with bar plots displaying the top contributing variables to Dimensions 1 through 4. All plots are styled consistently using a minimal theme and arranged using the patchwork package into a single combined figure. The final output is saved as a high-resolution PDF for publication or reporting purposes.

# Load required libraries
# Ensure 'patchwork' is installed: install.packages("patchwork")
library(factoextra)
library(patchwork)   # For combining ggplot-based charts
library(ggplot2)

# Requires a valid 'famd_result' in the workspace
# (the object returned by the previously run FAMD analysis).

# Scree plot; the visible y range is capped at 17 (raise it if bars are cut)
plot_scree <- fviz_screeplot(famd_result, addlabels = TRUE,
                             ggtheme = theme_minimal()) +
  ggtitle("Scree Plot (Explained Variance by Dimension)") +
  coord_cartesian(ylim = c(0, 17))

# Bar plot of the ten largest variable contributions to one dimension,
# with slanted x labels so long names stay readable
contribution_plot <- function(axis) {
  fviz_contrib(famd_result, choice = "var", axes = axis, top = 10,
               ggtheme = theme_minimal()) +
    ggtitle(paste0("Contribution to Dim.", axis, " (Top 10)")) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot_contrib_dim1 <- contribution_plot(1)
plot_contrib_dim2 <- contribution_plot(2)
plot_contrib_dim3 <- contribution_plot(3)
plot_contrib_dim4 <- contribution_plot(4)

# Layout (patchwork): scree plot on top, then two rows of two contribution plots
contribution_rows <- (plot_contrib_dim1 + plot_contrib_dim2) /
  (plot_contrib_dim3 + plot_contrib_dim4)
combined_plot <- plot_scree / contribution_rows

# Global title for the whole figure
combined_plot <- combined_plot +
  plot_annotation(title = "      FAMD - MEDIAN")

# Show the figure
print(combined_plot)

# Write the figure to PDF (update path accordingly)
ggsave(
  filename = "#/FAMD_summary_plot_MEDIAN.pdf",
  plot = combined_plot,
  device = "pdf",
  units = "in",
  width = 8,
  height = 10
)

3.3 Variables projection and individuals plot

# Load required libraries
library(readxl)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(ggplot2)
library(patchwork)

# Read the subsample table (adjust path accordingly)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

# Median-statistic tessellation rows with a valid correlation; categorical
# columns as factors, as FAMD expects
data_famd <- data %>%
  filter(!is.na(Pearson_Correlation), Statistic == "median", Type == "Tessells") %>%
  select(Locality, Frequency_Range, Time_Period, Index, Subsample, Sample_percent, Pearson_Correlation) %>%
  mutate(across(c(Locality, Frequency_Range, Time_Period, Index, Subsample), as.factor))

# Factorial Analysis of Mixed Data.
# NOTE(review): no `sup.var` is passed here, so Subsample takes part in
# building the dimensions in this run - confirm this is intended.
famd_result <- FAMD(data_famd, ncp = 5, graph = FALSE)

# Variable projection on a pair of dimensions (repelled labels)
variable_panel <- function(dims) {
  fviz_famd_var(famd_result, axes = dims, repel = TRUE) +
    ggtitle(paste0("Variables: Dim ", dims[1], " vs Dim ", dims[2]))
}

# Individual projection on a pair of dimensions: unlabelled points coloured
# by Subsample, with one ellipse per Subsample level
individual_panel <- function(dims) {
  fviz_famd_ind(famd_result, axes = dims, habillage = "Subsample",
                addEllipses = TRUE, label = "none") +
    ggtitle(paste0("Individuals: Dim ", dims[1], " vs Dim ", dims[2]))
}

plot_var_12 <- variable_panel(c(1, 2))
plot_var_23 <- variable_panel(c(2, 3))
plot_var_13 <- variable_panel(c(1, 3))

plot_ind_12 <- individual_panel(c(1, 2))
plot_ind_23 <- individual_panel(c(2, 3))
plot_ind_13 <- individual_panel(c(1, 3))

# Three rows (one per dimension pair), variables left and individuals right
panel_rows <- (plot_var_12 | plot_ind_12) /
  (plot_var_23 | plot_ind_23) /
  (plot_var_13 | plot_ind_13)

combined_plot <- panel_rows +
  plot_annotation(
    title = "FAMD Projections – Variables & Individuals",
    theme = theme(plot.title = element_text(size = 16, face = "bold"))
  )

# Show the figure
print(combined_plot)

# Write the figure to PDF (update path as needed)
ggsave(
  filename = "#/FAMD_variables_individuals_clean_MEDIAN.pdf",
  plot = combined_plot,
  device = "pdf",
  units = "in",
  width = 12,
  height = 14
)

3.4 KRUSKAL–WALLIS TEST

This graph presents the results of a statistical analysis (Kruskal–Wallis test followed by Dunn's post-hoc test) applied to the individual coordinates on the first four dimensions (Dim.1–Dim.4) of the FAMD analysis, with the aim of comparing the stratified sampling methods (Subsample: basins, halfbasins, Landcover, SLIC).

# Load required libraries
library(readxl)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(rstatix)
library(ggpubr)
library(multcompView)
library(tibble)
library(tidyr)
library(ggplot2)
library(patchwork)
library(writexl)

# Load and prepare data
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

# Median-based tessellation rows with a valid correlation; categorical
# variables are converted to factors as required by FAMD
kw_factor_cols <- c("Locality", "Frequency_Range", "Time_Period", "Index", "Subsample")
data_famd <- data %>%
  filter(!is.na(Pearson_Correlation)) %>%
  filter(Statistic == "median") %>%
  filter(Type == "Tessells") %>%
  select(all_of(kw_factor_cols), Sample_percent, Pearson_Correlation) %>%
  mutate(across(all_of(kw_factor_cols), as.factor))

# Run FAMD
famd_result <- FAMD(data_famd, ncp = 5, graph = FALSE)

# Coordinates of the individuals on the first four dimensions, tagged with
# the subsampling strategy of each row
ind_coord <- as.data.frame(famd_result$ind$coord[, 1:4])
ind_coord[["Subsample"]] <- data_famd[["Subsample"]]

# Kruskal-Wallis test plus Dunn post-hoc comparisons (BH-adjusted) of one FAMD
# dimension across subsampling strategies, returned as an annotated boxplot.
#
# Args:
#   df:      data frame holding a `Subsample` grouping column and the numeric
#            coordinate columns `Dim.1`, `Dim.2`, ...
#   dim_num: index k of the dimension to analyse (uses column `Dim.<k>`).
#   alpha:   significance threshold used for the compact letter display and
#            for the pairwise brackets (default 0.05, the original constant).
#
# Returns:
#   A ggplot object: boxplot per strategy, the Kruskal-Wallis label as
#   subtitle, significance letters (only when the global test is significant)
#   and brackets for significant Dunn contrasts.
kruskal_dunn_plot <- function(df, dim_num, alpha = 0.05) {
  dim_col <- paste0("Dim.", dim_num)
  stopifnot(
    is.data.frame(df),
    all(c(dim_col, "Subsample") %in% names(df))
  )
  test_formula <- as.formula(paste0(dim_col, " ~ Subsample"))

  kw <- kruskal_test(data = df, formula = test_formula)
  dunn <- dunn_test(data = df, formula = test_formula, p.adjust.method = "BH")

  letters_df <- NULL
  if (kw$p < alpha) {
    # Symmetric matrix of adjusted p-values between every pair of groups;
    # pairs not reported by dunn_test keep the neutral value 1
    groups <- levels(droplevels(factor(df$Subsample)))
    pmat <- matrix(1, length(groups), length(groups), dimnames = list(groups, groups))
    diag(pmat) <- 0
    g1 <- as.character(dunn$group1)
    g2 <- as.character(dunn$group2)
    # Two-column character matrices index by dimnames, so no row loop is
    # needed (and a zero-row result is handled without the 1:0 pitfall)
    pmat[cbind(g1, g2)] <- dunn$p.adj
    pmat[cbind(g2, g1)] <- dunn$p.adj

    signif_matrix <- pmat < alpha
    diag(signif_matrix) <- FALSE
    # Named `group_letters` so the base constant `letters` is not shadowed
    group_letters <- multcompLetters(signif_matrix)$Letters
    letters_df <- enframe(group_letters, name = "Subsample", value = "Letter")

    # Vertical position of each letter: just above the upper whisker limit or
    # the group maximum, whichever is higher
    pos <- df %>%
      group_by(Subsample) %>%
      summarise(
        q3 = quantile(.data[[dim_col]], 0.75, na.rm = TRUE),
        iqr = IQR(.data[[dim_col]], na.rm = TRUE),
        upper = q3 + 1.5 * iqr,
        group_max = max(.data[[dim_col]], na.rm = TRUE),
        y.position = max(upper, group_max) + 0.15,
        .groups = "drop"
      ) %>%
      # Same key type as letters_df (character) so the join is unambiguous
      mutate(Subsample = as.character(Subsample))

    letters_df <- left_join(letters_df, pos, by = "Subsample")
  }

  # Brackets start above the letters when they exist, otherwise above the data
  ymax <- max(df[[dim_col]], na.rm = TRUE)
  bracket_y <- if (!is.null(letters_df)) {
    max(letters_df$y.position, na.rm = TRUE) + 0.1
  } else {
    ymax + 0.2
  }

  # One bracket per significant contrast, stacked in steps of 0.15
  dunn_plot <- dunn %>%
    filter(p.adj <= alpha) %>%
    arrange(group1, group2) %>%
    mutate(y.position = seq(bracket_y, by = 0.15, length.out = n()))

  plot <- ggboxplot(df, x = "Subsample", y = dim_col, fill = "Subsample", palette = "Set2") +
    labs(
      title = paste("Kruskal-Wallis by Subsampling Strategy - Dimension", dim_num),
      subtitle = get_test_label(kw, detailed = TRUE),
      y = paste("FAMD Coordinate - Dim", dim_num),
      x = "Subsampling Strategy"
    ) +
    theme_minimal(base_size = 14) +
    theme(legend.position = "none")

  if (!is.null(letters_df)) {
    plot <- plot + geom_text(data = letters_df, aes(x = Subsample, y = y.position, label = Letter),
                             inherit.aes = FALSE, size = 5, vjust = -0.5)
  }

  if (nrow(dunn_plot) > 0) {
    plot <- plot + stat_pvalue_manual(dunn_plot, label = "p.adj.signif", tip.length = 0.01)
  } else if (kw$p < alpha) {
    plot <- plot + labs(caption = "Kruskal-Wallis was significant, but no pairwise differences detected.")
  }

  plot
}

# One Kruskal-Wallis/Dunn panel per FAMD dimension (first four dimensions)
dim_panels <- lapply(c(1, 2, 3, 4), function(k) kruskal_dunn_plot(ind_coord, k))
plot_dim1 <- dim_panels[[1]]
plot_dim2 <- dim_panels[[2]]
plot_dim3 <- dim_panels[[3]]
plot_dim4 <- dim_panels[[4]]

# Arrange the four panels as a 2 x 2 grid under a common title
upper_row <- plot_dim1 | plot_dim2
lower_row <- plot_dim3 | plot_dim4
grid_title <- plot_annotation(title = "Kruskal–Wallis + Dunn Tests – FAMD Dimensions")
final_plot <- upper_row / lower_row + grid_title

# Write the grid to PDF
ggsave(
  filename = "#/FAMD_KW_Dunn_Results.pdf",
  plot = final_plot,
  device = "pdf",
  units = "in",
  width = 14,
  height = 12
)

# Export results for each dimension to Excel
library(openxlsx)

# Workbook with one sheet per FAMD dimension. Each sheet stacks, top to bottom:
# the Kruskal-Wallis result, the Dunn pairwise table (BH-adjusted) and the
# compact letter display derived from the adjusted p-values.
wb <- createWorkbook()

for (dim_num in 1:4) {
  dim_col <- paste0("Dim.", dim_num)
  test_formula <- as.formula(paste0(dim_col, " ~ Subsample"))

  kw_result <- kruskal_test(data = ind_coord, formula = test_formula)
  dunn_result <- dunn_test(data = ind_coord, formula = test_formula, p.adjust.method = "BH")

  # Symmetric matrix of adjusted p-values; pairs absent from the Dunn table
  # keep the neutral value 1
  groups <- levels(droplevels(factor(ind_coord$Subsample)))
  pmat <- matrix(1, length(groups), length(groups), dimnames = list(groups, groups))
  diag(pmat) <- 0

  g1 <- as.character(dunn_result$group1)
  g2 <- as.character(dunn_result$group2)
  # Index by dimnames with two-column character matrices instead of looping
  # over 1:nrow(), which would iterate over c(1, 0) for an empty table
  pmat[cbind(g1, g2)] <- dunn_result$p.adj
  pmat[cbind(g2, g1)] <- dunn_result$p.adj

  signif_matrix <- pmat < 0.05
  diag(signif_matrix) <- FALSE
  # Named `group_letters` so the base constant `letters` is not shadowed
  group_letters <- multcompLetters(signif_matrix)$Letters
  letters_df <- enframe(group_letters, name = "Subsample", value = "Significance_Letter")

  sheet_name <- paste0("Dimension_", dim_num)
  # Replace the sheet if it already exists (e.g. when this loop is re-run
  # against the same workbook object)
  if (sheet_name %in% names(wb)) removeWorksheet(wb, sheet = sheet_name)

  addWorksheet(wb, sheet_name)
  writeData(wb, sheet = sheet_name, x = kw_result, startCol = 1, startRow = 1)
  writeData(wb, sheet = sheet_name, x = dunn_result, startCol = 1, startRow = 5)
  # The Dunn table starts at row 5, so this offset leaves blank rows before
  # the letters table
  writeData(wb, sheet = sheet_name, x = letters_df, startCol = 1, startRow = nrow(dunn_result) + 8)
}

saveWorkbook(wb, file = "#/FAMD_KW_Dunn_results_export.xlsx", overwrite = TRUE)

3.5 HEATMAP

library(dplyr)
library(tidyr)
library(ggplot2)
library(ggtext)  # Opcional: para texto enriquecido


# Compact-letter-display results per FAMD dimension and subsampling strategy,
# entered by hand (median statistic only).
letras_df <- tribble(
  ~STATISTIC, ~DIMENSION, ~Subsample, ~Significance_Letter,
  "median","Dim.1", "basins",     "a",
  "median","Dim.1", "halfbasins", "b",
  "median","Dim.1", "Landcover",  "c",
  "median","Dim.1", "SLIC",       "d",
  "median","Dim.2", "basins",     "a",
  "median","Dim.2", "halfbasins", "b",
  "median","Dim.2", "Landcover",  "c",
  "median","Dim.2", "SLIC",       "ab",
  "median","Dim.3", "basins",     "a",
  "median","Dim.3", "halfbasins", "b",
  "median","Dim.3", "Landcover",  "b",
  "median","Dim.3", "SLIC",       "b",
  "median","Dim.4", "basins",     "a",
  "median","Dim.4", "halfbasins", "b",
  "median","Dim.4", "Landcover",  "c",
  "median","Dim.4", "SLIC",       "c"
)

# Pivot to wide format: one row per strategy, one column per
# "<statistic>_<dimension>" combination
letras_matrix <- letras_df %>%
  unite("Dimension_Type", STATISTIC, DIMENSION, sep = "_") %>%
  pivot_wider(names_from = Dimension_Type, values_from = Significance_Letter)

# Back to long format for ggplot
letras_long <- letras_matrix %>%
  pivot_longer(cols = -Subsample, names_to = "Dimension", values_to = "Letter")

# Fix the display order of both axes. The "mean_*" levels have no rows in the
# table above; they are listed so the same code also orders mean-based results
# if they are added.
letras_long$Subsample <- factor(letras_long$Subsample, levels = c("basins", "halfbasins", "Landcover", "SLIC"))
letras_long$Dimension <- factor(letras_long$Dimension,
                                levels = c("mean_Dim.1", "mean_Dim.2", "mean_Dim.3", "mean_Dim.4",
                                           "median_Dim.1", "median_Dim.2", "median_Dim.3", "median_Dim.4"))

# Heatmap: one tile per strategy x dimension, coloured and labelled by letter.
# The plot is kept in an object so that saving does not depend on last_plot().
heatmap_plot <- ggplot(letras_long, aes(x = Dimension, y = Subsample, fill = Letter)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Letter), size = 5) +
  scale_fill_viridis_d(option = "plasma", direction = -1) +
  labs(
    title = "Significance Letter Groupings by FAMD Dimension and Strategy",
    x = "FAMD Dimension", y = "Subsampling Strategy", fill = "Letter"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(heatmap_plot)

# Save to PDF. Uses the same "#/" output-folder placeholder as the rest of
# this appendix instead of a machine-specific absolute path (update as needed).
ggsave(
  filename = "#/3.5-FAMD_heatmap_letters.pdf",
  plot = heatmap_plot,
  width = 10,
  height = 6,
  units = "in",
  device = "pdf"
)

4 OBJECTIVE 4: LANDSCAPE REPRESENTATION

Specific objective 4: Determine the influence of landscape heterogeneity on the performance and representativeness of soundscape information obtained using tessellation-based sampling strategies

library(tidyverse)
library(readxl)
library(ggpubr)
library(openxlsx)  # For exporting to Excel

# Compare, per locality and tessellation method, the distribution of
# L_proxy_mean in the subsamples ("Examinated" and "Poles") against the
# "complete" tessellation: two-sample KS test, approximate 1-D Wasserstein
# distance, and a Q-Q + density figure written to a multi-page PDF.

# Load Excel file
df <- read_excel("#/DF3_Subsampling_Heterogeneity_Comparison.xlsx")

# Filter valid values
df_filtered <- df %>% filter(!is.na(L_proxy_mean))

# Extract unique localities
localities <- unique(df_filtered$Locality)

# Set PDF output path
output_pdf <- "#/Comparative_Plots_Lproxy_mean_POLYPOLES_ALLTOGETHER.pdf"
pdf_opened <- FALSE

# Subsample origins that are compared against the complete tessellation
procedences <- c("Examinated", "Poles")

# Sorted values of `values` resampled to `n` evenly spaced order statistics.
# Mirrors what stats::qqplot() does for samples of unequal size. The previous
# `sort(x)[1:n]` kept only the n SMALLEST values of the larger sample, so the
# upper tail of the complete distribution never entered the comparison.
matched_quantiles <- function(values, n) {
  sorted <- sort(values)
  if (length(sorted) == n) {
    return(sorted)
  }
  if (n == 1L) {
    return(median(sorted))
  }
  approx(seq_along(sorted), sorted, n = n)$y
}

# Empty template that fixes the column names/types of the results table
statistical_results <- data.frame(
  Locality = character(),
  Method = character(),
  Procedence = character(),
  KS_statistic = numeric(),
  KS_p_value = numeric(),
  Wasserstein = numeric(),
  stringsAsFactors = FALSE
)
# Rows are collected in a list and bound once at the end (no rbind in a loop)
results_list <- list()

# tryCatch/finally guarantees the PDF device is closed even if a test or a
# plot fails half-way through the loop
tryCatch({
  for (loc in localities) {
    cat("\n\n📍 Locality:", loc, "\n")

    # `.env$` makes sure the loop variables are used, not same-named columns
    df_loc <- df_filtered %>% filter(Locality == .env$loc)
    full_vals <- df_loc %>% filter(Tessell == "complete") %>% pull(L_proxy_mean)

    if (length(full_vals) == 0) {
      warning("Locality '", loc, "' has no 'complete' tessellation values; skipped.",
              call. = FALSE)
      next
    }

    for (method in unique(df_loc$Tessell)) {
      if (method == "complete") next

      df_method <- df_loc %>%
        filter(Tessell == .env$method, Procedence %in% procedences)
      if (nrow(df_method) == 0) next

      # Statistics and Q-Q coordinates by procedence
      qq_dfs <- list()
      for (proc in procedences) {
        sub_vals <- df_method %>%
          filter(Procedence == .env$proc) %>%
          pull(L_proxy_mean)
        if (length(sub_vals) == 0) next

        n <- min(length(full_vals), length(sub_vals))
        full_q <- matched_quantiles(full_vals, n)
        sub_q <- matched_quantiles(sub_vals, n)

        ks <- ks.test(full_vals, sub_vals)
        # Mean absolute difference between matched quantiles
        wass <- mean(abs(full_q - sub_q))

        cat("🔎 Method:", method, "| Procedence:", proc, "\n")
        cat("   KS p-value:", ks$p.value, "\n")
        cat("   KS statistic:", ks$statistic, "\n")
        cat("   Approx. Wasserstein distance:", round(wass, 4), "\n")

        results_list[[length(results_list) + 1]] <- data.frame(
          Locality = loc,
          Method = method,
          Procedence = proc,
          KS_statistic = as.numeric(ks$statistic),
          KS_p_value = as.numeric(ks$p.value),
          Wasserstein = round(wass, 4),
          stringsAsFactors = FALSE
        )

        qq_dfs[[proc]] <- data.frame(
          complete = full_q,
          sub = sub_q,
          Procedence = proc
        )
      }

      # Open the PDF lazily, only once there is something to draw
      if (!pdf_opened) {
        pdf(output_pdf, width = 14, height = 6)
        pdf_opened <- TRUE
      }

      # Density plot: complete tessellation vs. the subsamples of this method
      p1 <- ggplot(
        df_loc %>%
          filter((Tessell == .env$method & Procedence %in% procedences) |
                   Tessell == "complete"),
        aes(x = L_proxy_mean, fill = interaction(Tessell, Procedence, drop = TRUE))
      ) +
        geom_density(alpha = 0.4) +
        theme_minimal() +
        labs(title = "Density Plot",
             x = "L_proxy_mean", y = "Density", fill = "Tessellation / Procedence")

      # Q–Q plot
      qq_df_all <- bind_rows(qq_dfs)

      p2 <- ggplot(qq_df_all, aes(x = complete, y = sub, color = Procedence)) +
        geom_point(alpha = 0.6) +
        geom_abline(slope = 1, intercept = 0, color = "black", linetype = "dashed") +
        theme_minimal() +
        labs(title = "Q–Q Plot", x = "Complete Distribution", y = method)

      combined <- ggarrange(p2, p1, ncol = 2, widths = c(1, 1),
                            labels = c("A", "B"),
                            common.legend = TRUE, legend = "right")

      combined <- annotate_figure(combined,
                                   top = text_grob(paste0(loc, " / ", method, " / Comparison with Examinated & Poles"),
                                                   face = "bold", size = 14))

      print(combined)
    }
  }
}, finally = {
  if (pdf_opened) dev.off()
})

if (pdf_opened) {
  cat("\n✅ PDF successfully generated with all comparisons and statistics.\n")
} else {
  cat("\n⚠️ No plots were generated. Please check available data.\n")
}

# Binding onto the template keeps the column layout even with zero rows
statistical_results <- bind_rows(statistical_results, results_list)

# Save statistical results to Excel
output_excel <- "#/Comparative_Stats_Lproxy_mean_POLYPOLES_ALLTOGETHER.xlsx"
write.xlsx(statistical_results, output_excel)
---
title: 'Appendix S10: Codes for used statistical analyses'
output:
  html_document:
    toc: true
    toc_depth: '2'
    df_print: paged
  html_notebook:
    toc: true
    toc_depth: 2
  pdf_document:
    toc: true
    toc_depth: '2'
---

### Appendix S10: Codes for used statistical analyses

**Title:** *A workflow to optimize spatial sampling in ecoacoustic studies*\
**Journal:** *Landscape Ecology*

**Authors:**\
Martínez-Arias, V. M.\textsuperscript{1}; Paniagua-Villada, C.\textsuperscript{1}; Daza, J. M.\textsuperscript{1}

**Author affiliations at the time the work was conducted:**\
1. Grupo Herpetológico de Antioquia, Calle 70 No. 52 - 21, Postal Code 050010, Medellín, Colombia.

**Corresponding author:**\
Victor M. Martínez-Arias\
📧 Email: [vmanuel.martinez\@udea.edu.co](mailto:vmanuel.martinez@udea.edu.co){.email}\
📞 Tel: +57 311 679 7846

# 0. Exploration graphic

This graphic allows a visual comparison of the indices in the 0–24 kHz range, for all hours and localities. We did not use it in the final analyses.

```{r}
library(ggplot2)
library(readxl)
library(dplyr)

Statistic <- "mean"  # Set to "mean" or "median"
# Load data (replace path with your own)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")
# Filter based on combined conditions.
# Inside filter() a bare `Statistic` always resolves to the data column, so the
# former `Statistic == Statistic` compared the column with itself and ignored
# the setting above. `.data$` / `.env$` make both sides explicit.
# NOTE(review): the second condition is pinned to "mean" regardless of the
# setting above, and the output file name below also says "mean" - confirm
# whether both should follow `Statistic` as well.
filtered_data <- data %>%
  filter(
    (Index %in% c("BI", "NDSI") & .data$Statistic == .env$Statistic) |
    (Index %in% c("ACItf", "ACIft", "NP", "Hf", "H") &
     Frequency_Range == "R0-24" & .data$Statistic == "mean")
  ) %>%
  mutate(
    # Named tessellation strategies keep their own colour key; every other
    # Subsample value is pooled under "Random"
    Subsample_Color = case_when(
      Subsample == "SLIC" ~ "SLIC",
      Subsample == "halfbasins" ~ "HalfBasins",
      Subsample == "basins" ~ "Basins",
      Subsample == "Landcover" ~ "Landcover",
      TRUE ~ "Random"
    )
  )
# Add synthetic category "ALL" (every row duplicated under a pooled locality)
data_all <- filtered_data %>%
  mutate(Locality = "ALL")
# Merge both datasets
complete_data <- bind_rows(filtered_data, data_all)
# Create plot: faint random subsamples first, then the linear trend, then the
# tessellation strategies on top
plot <- ggplot(complete_data, aes(x = Sample_percent, y = Pearson_Correlation)) +
  geom_point(data = filter(complete_data, Subsample_Color == "Random"),
             aes(color = Subsample_Color),
             alpha = 0.25, size = 1) +
  geom_smooth(method = "lm", color = "blue", se = FALSE, linewidth = 0.8) + 
  geom_point(data = filter(complete_data, Subsample_Color != "Random"),
             aes(color = Subsample_Color), size = 2) +
  scale_color_manual(
    values = c(
      "SLIC" = "purple",
      "HalfBasins" = "skyblue",
      "Basins" = "navy",
      "Random" = "gray50",
      "Landcover" = "green"
    )
  ) +
  facet_grid(Locality ~ Index) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Sample percent",
    y = "Subsample_performance",
    color = "Sampling method"
  ) +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold")
  )
# Save as PDF (set your desired path)
ggsave(
  filename = "#/0_Subsampleperformance_vs_SamplePercent_mean.pdf",
  plot = plot,
  width = 15.27,
  height = 8.15,
  units = "in"
)

print(plot)



```

# OBJECTIVE 1: SAMPLE SIZE DEPENDENCE

S. objective 1 - Analyze the relationship between sample size and the spatial representativeness of interpolated soundscapes using different sampling strategies in heterogeneous landscapes;

## 1.1. Generate the pearson correlation between Sampling percent and design precision.

To evaluate how sampling size influences the performance of each subsampling strategy, we calculated a second Pearson correlation coefficient, Pcoef2. This coefficient quantifies the correlation between the proportion of sampled data (i.e., Sample_percent) and the subsampling performance, previously calculated as Pcoef1 (Pearson correlation between full-sample and subsampled raster surfaces).

The analysis was performed across all combinations of localities, subsampling designs, frequency ranges, time periods, acoustic indices, and summary statistics (e.g., mean, median). For each unique combination, we filtered the data and applied ggscatterstats() from the ggstatsplot package to compute the Pearson correlation (Pcoef2), along with associated test statistics.

The resulting outputs were consolidated into a dataset named DF2, which was later used for statistical comparisons between strategies and visualized using violin and density plots.

Used filters: \| Locality \| Type \| Frequency_Range \| Time_Period \| Statistic \| Index \| Pearson_Correlation \|

Resulting table will be named "DF2", which includes the calculated Pcoef2.

```{r}
# Load required libraries
library(ggstatsplot)
library(dplyr)
library(readxl)
library(openxlsx)
library(future.apply)

# Set up parallel plan: leave one core free but always keep at least one
# worker (detectCores() may return 1 or NA, which would give 0/NA workers)
n_workers <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
plan(multisession, workers = n_workers)

# Load data (replace with appropriate file path)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx") %>%
  mutate(across(c(Locality, Type, Subsample, Frequency_Range, Time_Period, Statistic, Index), as.character))

# Fail early if the two correlated variables are missing; otherwise every
# combination would be skipped and an empty table exported.
# NOTE(review): other chunks read the performance metric from this same file
# as `Pearson_Correlation` - confirm `Subsample_performance` is the column
# name in the version of DF1-1 used here.
stopifnot(all(c("Sample_percent", "Subsample_performance") %in% names(data)))

# Create duplicate with Locality = "ALL"
data_all <- data %>% mutate(Locality = "ALL")
expanded_data <- bind_rows(data, data_all)

# Get unique combinations of filters
combinations <- expanded_data %>%
  distinct(Locality, Type, Frequency_Range, Time_Period, Statistic, Index, Subsample)

# Correlation between Sample_percent and Subsample_performance (Pcoef2) for
# one combination of filter values.
#
# Args:
#   filter_row: one-row data frame taken from `combinations`.
#
# Returns:
#   A data frame stacking the frequentist (subtitle) and Bayesian (caption)
#   statistics extracted from ggscatterstats(), tagged with the filter values
#   and a `Tipo` column; NULL when the subset is too small, has no variation,
#   or the statistics cannot be computed.
process_combination <- function(filter_row) {
  filtered_data <- expanded_data %>%
    filter(
      Locality == filter_row$Locality,
      Type == filter_row$Type,
      Frequency_Range == filter_row$Frequency_Range,
      Time_Period == filter_row$Time_Period,
      Statistic == filter_row$Statistic,
      Index == filter_row$Index,
      Subsample == filter_row$Subsample
    )

  # Check for sufficient data and variation
  if (nrow(filtered_data) < 3 ||
      length(unique(filtered_data$Sample_percent)) < 2 ||
      length(unique(filtered_data$Subsample_performance)) < 2) return(NULL)

  # Extract correlation statistics (best effort: a failing combination is
  # dropped rather than aborting the whole run)
  temp_plot <- tryCatch({
    ggscatterstats(
      data = filtered_data,
      x = Sample_percent,
      y = Subsample_performance,
      messages = FALSE
    )
  }, error = function(e) return(NULL))

  if (is.null(temp_plot)) return(NULL)

  stats <- tryCatch(extract_stats(temp_plot), error = function(e) return(NULL))
  if (is.null(stats)) return(NULL)

  subtitle_data <- stats$subtitle_data
  caption_data <- stats$caption_data

  # Add filtering context to results
  for (col in names(filter_row)) {
    subtitle_data[[col]] <- filter_row[[col]]
    caption_data[[col]] <- filter_row[[col]]
  }

  # The inference framework goes into its own column `Tipo`. Writing it to
  # `Type` (as before) overwrote the data's `Type` value copied just above,
  # and every later step of this workflow filters DF2 on
  # `Tipo == "Frequentist"`.
  subtitle_data$Tipo <- "Frequentist"
  caption_data$Tipo <- "Bayesian"

  bind_rows(subtitle_data, caption_data)
}

# Apply processing to all combinations in parallel
results_list <- future_lapply(seq_len(nrow(combinations)), function(i) {
  filter_row <- combinations[i, ]
  process_combination(filter_row)
}, future.seed = TRUE)

# Release the background workers now that the parallel step is done
plan(sequential)

# Combine results
final_table <- bind_rows(results_list)

# Remove columns incompatible with Excel
final_table_clean <- final_table %>%
  select(where(~ !any(class(.x) %in% c("list", "language"))))

# Export final result
write.xlsx(final_table_clean, "DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx")



```

## 1.2. Visual inspection of sampling size effects

To explore the relationship between sample size and subsampling performance across different categorical variables (e.g., locality, index, strategy), we created an interactive visualization tool using a Shiny application. This tool allows the dynamic generation of violin plots showing the distribution of Pcoef2 values (correlation between sample size and subsample performance) across user-defined categories.

The application permits filtering by multiple metadata fields (e.g., frequency range, time period, statistic, or acoustic index) and dynamically updates the visualization accordingly. Violin plots were generated using the ggplot2 package, with optional overlay of jittered data points. Users can export the resulting plots as PDFs for reporting or supplementary use.

This tool is based on the table DF2, and facilitates visual comparison of the influence of sample size across grouping levels in the dataset.

This visual inspection was performed only for the "ALL" dataset.

```{r}
library(shiny)
library(ggplot2)
library(readxl)
library(dplyr)

# Sidebar: file upload, grouping variable, font sizes, PDF download and the
# server-generated filter selectors
sidebar_controls <- sidebarPanel(
  fileInput("archivo", "Upload Excel file", accept = c(".xlsx")),

  selectInput(
    "variable_categorica",
    "Categorical variable to compare:",
    choices = c("Locality", "Frequency_Range", "Time_Period", "Statistic", "Index", "Subsample", "Type")
  ),

  numericInput("axis_title_size", "Axis title font size:", value = 14, min = 8, max = 30),
  numericInput("axis_text_size", "Axis text font size:", value = 12, min = 6, max = 30),

  downloadButton("descargar_pdf", "Download plot as PDF"),

  uiOutput("filtros_adicionales")
)

# Main area: violin plot on top, filtered table underneath
main_outputs <- mainPanel(
  plotOutput("grafico_violin", height = "600px"),
  tableOutput("tabla_filtrada")
)

ui <- fluidPage(
  titlePanel("Distribution of Pearson Coefficient (Frequentist)"),
  sidebarLayout(sidebar_controls, main_outputs)
)

# Server logic: reads the uploaded DF2 table, builds one selector per
# non-grouping field, filters the data accordingly and renders/exports a
# violin plot of the Pearson estimates.
server <- function(input, output, session) {

  # Fields that can serve either as the grouping variable or as a filter
  filter_fields <- c("Locality", "Frequency_Range", "Time_Period", "Statistic", "Index", "Subsample", "Type")

  # Fields still available as filters once the grouping variable is chosen
  remaining_fields <- function() {
    setdiff(filter_fields, input$variable_categorica)
  }

  # Uploaded table restricted to frequentist rows with a valid estimate
  datos <- reactive({
    req(input$archivo)
    uploaded <- read_excel(input$archivo$datapath)
    frequentist <- filter(uploaded, Tipo == "Frequentist")
    filter(frequentist, !is.na(estimate))
  })

  # One "All"-by-default selector per remaining field
  output$filtros_adicionales <- renderUI({
    req(datos())
    lapply(remaining_fields(), function(field) {
      selectInput(
        inputId = paste0("filtro_", field),
        label = field,
        choices = c("All", unique(datos()[[field]])),
        selected = "All"
      )
    })
  })

  # Apply every selector that is set to something other than "All"
  datos_filtrados <- reactive({
    req(datos())
    subset_df <- datos()

    for (field in remaining_fields()) {
      val <- input[[paste0("filtro_", field)]]
      if (is.null(val) || val == "All") next
      subset_df <- filter(subset_df, .data[[field]] == val)
    }

    subset_df
  })

  # Violin plot of the estimates by the chosen grouping variable
  plot_violines <- reactive({
    plot_data <- datos_filtrados()
    req(nrow(plot_data) > 0)

    group_var <- input$variable_categorica
    font_theme <- theme(
      axis.title = element_text(size = input$axis_title_size),
      axis.text = element_text(size = input$axis_text_size),
      plot.title = element_blank()
    )

    ggplot(plot_data, aes(x = .data[[group_var]], y = estimate)) +
      geom_violin(fill = "skyblue", alpha = 0.7, color = NA) +
      # geom_jitter(width = 0.15, alpha = 0.5, size = 1.5, color = "black") +
      labs(x = group_var, y = "Pearson Estimate") +
      theme_minimal() +
      font_theme
  })

  output$grafico_violin <- renderPlot({
    plot_violines()
  })

  # Table of the rows currently shown in the plot
  output$tabla_filtrada <- renderTable({
    datos_filtrados()
  })

  # PDF export of the current plot, named with today's date
  output$descargar_pdf <- downloadHandler(
    filename = function() {
      paste0("violin_plot_pearson_", Sys.Date(), ".pdf")
    },
    content = function(file) {
      ggsave(file, plot = plot_violines(), device = "pdf", width = 10, height = 6)
    }
  )
}

shinyApp(ui = ui, server = server)

```

## 1.3. Normality assessment

Before applying a statistical test to evaluate sample-size dependence, normality must be checked. To determine whether parametric tests could be applied to compare the sampling strategies, we evaluated the distribution of Pcoef2 values (i.e., correlations between sampling proportion and subsample performance) across all sampling designs. We applied the Shapiro–Wilk test for normality within each subsample category using the rstatix package. Additionally, we visually inspected the distributions by plotting histograms with density curves, faceted by subsample strategy (e.g., basins, SLIC, ssample40).

Results from the Shapiro–Wilk tests and visual assessments revealed widespread deviations from normality, justifying the use of non-parametric statistical tests in subsequent analyses. All results were saved as part of the reproducible workflow, including the test results and distribution plots.

The input is DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx. Procedure must be done for mean and median

```{r}
# Load required packages
library(readxl)
library(dplyr)
library(ggplot2)
library(rstatix)   # for normality tests and summaries
library(ggpubr)    # for visualization

# Load dataset (replace with correct file path)
data <- read_excel("#/DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx")
# Filter only Frequentist analyses, remove NA, and keep Statistic == "mean"
data_frec <- data %>%
  filter(
    Tipo == "Frequentist",
    !is.na(estimate),
    Statistic == "mean"   # Change here if you want to use "median", etc.
  )

# shapiro.test() only accepts between 3 and 5000 values; a single group
# outside that range would abort the whole chunk, so such groups are reported
# and left out of the test (they still appear in the plot below)
group_sizes <- data_frec %>% count(Subsample, name = "n_obs")
testable_groups <- group_sizes %>% filter(n_obs >= 3, n_obs <= 5000)
skipped_groups <- setdiff(group_sizes$Subsample, testable_groups$Subsample)
if (length(skipped_groups) > 0) {
  warning(
    "Shapiro-Wilk skipped (group size outside 3-5000) for: ",
    paste(skipped_groups, collapse = ", "),
    call. = FALSE
  )
}

# Perform Shapiro-Wilk normality test by Subsample
normality <- data_frec %>%
  semi_join(testable_groups, by = "Subsample") %>%
  group_by(Subsample) %>%
  shapiro_test(estimate)

print("### Shapiro-Wilk normality test by Subsample:")
print(normality)

# Create histogram and density plots.
# after_stat(density) replaces the `..density..` notation, deprecated since
# ggplot2 3.4.0 (the version that introduced the `linewidth` argument already
# used elsewhere in this file)
plot <- ggplot(data_frec, aes(x = estimate, fill = Subsample)) +
  geom_histogram(aes(y = after_stat(density)), position = "identity", alpha = 0.5, bins = 30) +
  geom_density(alpha = 0.7) +
  facet_wrap(~Subsample) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold")
  ) +
  labs(title = NULL, x = "Pearson Estimate", y = "Density")

# Save plot to PDF (update file path)
ggsave(
  filename = "#/R1-3_mean_Distribution_Estimates.pdf",
  plot = plot,
  width = 15.27,
  height = 8.15,
  units = "in"
)

# Display plot
plot

# Export normality test results
library(writexl)
write_xlsx(normality, "#/R1-3_mean_Shapiro_Normality_Test.xlsx")


```

## 1.4. Dependence of subsampling strategies on sample size

To evaluate whether different subsampling strategies exhibit distinct levels of sensitivity to sample size, we compared the strength of the correlation (Pcoef2) between sample size (`Sample_percent`) and subsampling performance across strategies.

We first filtered DF2 to include only frequentist estimates where the Pearson correlation was computed between sample size and design performance (`parameter1 == "Sample_percent"` and `parameter2 == "Subsample_performance"`) for the selected summary statistic (mean or median).

Sampling strategies were then grouped into two categories: `"random"` (e.g., `ssample*`) and `"tessellation"` (e.g., grid-based designs such as `SLIC` or `basins`). For each group, we summarized the distribution of Pcoef2 values through descriptive statistics (mean, median, SD, IQR, CV).

Given the lack of normality (as previously assessed), we employed a non-parametric Wilcoxon rank-sum test to compare Pcoef2 distributions between the two strategy types.

We also applied a Fligner–Killeen test to evaluate differences in dispersion between strategies.

All results were visualized using violin and boxplots. Summary tables and test outputs were exported for reproducibility.

```{r}
# Load required libraries
library(readxl)
library(dplyr)
library(psych)
library(writexl)
library(ggplot2)

# Load results table DF2 (adjust path)
data <- read_excel("#/DF2_SampleSize_vs_Subsample_performance_Correlations(Pcoeff2).xlsx")

# The whole comparison is run once per summary statistic, so every exported
# file is built from the matching data. Previously the MEAN variance file was
# written from a median-filtered summary together with pasted F-test numbers.
# "median" runs last, so the objects left in the session (data_frec,
# summary_dependency, final_plot, ...) are the median ones, as before.
for (target_statistic in c("mean", "median")) {
  stat_label <- toupper(target_statistic)
  message("=== Sample-size dependence (Pcoef2), Statistic = ", target_statistic, " ===")

  # Valid frequentist Pearson correlations between Sample_percent and
  # Subsample_performance for the current statistic
  data_frec <- data %>%
    filter(
      parameter1 == "Sample_percent",
      parameter2 == "Subsample_performance",
      effectsize == "Pearson correlation",
      Tipo == "Frequentist",
      Statistic == .env$target_statistic,
      !is.na(estimate)
    )

  # Classify sampling strategy: "ssample*" designs are random, the rest are
  # tessellation-based
  data_frec <- data_frec %>%
    mutate(Type_group = case_when(
      grepl("^ssample", Subsample) ~ "random",
      TRUE ~ "tessellation"
    ))

  # Descriptive statistics for Pcoef2 by strategy group
  summary_dependency <- data_frec %>%
    group_by(Type_group) %>%
    summarise(
      n = n(),
      mean_Pcoef2 = mean(estimate, na.rm = TRUE),
      median_Pcoef2 = median(estimate, na.rm = TRUE),
      sd_Pcoef2 = sd(estimate, na.rm = TRUE),
      iqr_Pcoef2 = IQR(estimate, na.rm = TRUE),
      cv_Pcoef2 = sd(estimate, na.rm = TRUE) / abs(mean(estimate, na.rm = TRUE))
    )

  print("Summary of sample size dependency by strategy group:")
  print(summary_dependency)

  # Wilcoxon rank-sum test
  wilcox <- wilcox.test(estimate ~ Type_group, data = data_frec)
  print("Wilcoxon test between random and tessellation strategies:")
  print(wilcox)

  # Fisher's Z comparison of correlation strength.
  # It needs a per-row `Sample percent` column; the filters above only use
  # `Sample_percent` as a value of `parameter1`, so the test is run only when
  # that column really exists instead of stopping the chunk with an error.
  random <- data_frec %>% filter(Type_group == "random")
  tess <- data_frec %>% filter(Type_group == "tessellation")

  if ("Sample percent" %in% names(data_frec)) {
    fisherz <- r.test(
      n = nrow(random), r12 = cor(random$`Sample percent`, random$estimate),
      n2 = nrow(tess), r34 = cor(tess$`Sample percent`, tess$estimate)
    )
    print("Fisher's Z test for correlation strength:")
    print(fisherz)
  } else {
    message("Fisher's Z test skipped: column 'Sample percent' not found in DF2.")
  }

  # Fligner–Killeen test for dispersion differences
  fligner <- fligner.test(estimate ~ Type_group, data = data_frec)
  print("Fligner–Killeen test for variance differences:")
  print(fligner)

  # F test of equal variances, computed from the data rather than pasted in
  ftest <- var.test(estimate ~ Type_group, data = data_frec)
  f_test_table <- data.frame(
    F_statistic = unname(ftest$statistic),
    df1 = unname(ftest$parameter[1]),
    df2 = unname(ftest$parameter[2]),
    p_value = ftest$p.value,
    CI_lower = ftest$conf.int[1],
    CI_upper = ftest$conf.int[2]
  )

  # Optional plot: distribution of Pcoef2 values (explicit print() is needed
  # inside a loop)
  print(
    ggplot(data_frec, aes(x = Type_group, y = estimate, fill = Type_group)) +
      geom_violin(alpha = 0.5) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
      labs(title = "Sample size dependence (Pcoef2)", y = "Pcoef2 (correlation)", x = "Sampling strategy") +
      theme_minimal(base_size = 14) +
      scale_y_continuous(limits = c(-1, 1), oob = scales::squish)
  )

  # Export values and summaries
  write_xlsx(
    list(
      Pcoef2_Values = data_frec,
      Group_Summary = summary_dependency
    ),
    path = paste0("#/R1-4_", stat_label, "_SAMPLING_STRATEGY_COMPARISON.xlsx")
  )

  # Export F-test results for the current statistic
  write_xlsx(
    list(
      Pcoef2_Spread_Summary = summary_dependency,
      F_Test_Variance = f_test_table
    ),
    path = paste0("#/R1-4_", stat_label, "_Pcoef2_VARIANCE_TEST.xlsx")
  )

  # Save final plot as PDF
  final_plot <- ggplot(data_frec, aes(x = Type_group, y = estimate, fill = Type_group)) +
    geom_violin(alpha = 0.5, trim = FALSE) +
    geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.8) +
    scale_fill_manual(values = c("random" = "#F8766D", "tessellation" = "#00BFC4")) +
    labs(title = "Sample size dependency (Pcoef2)",
         y = "Pcoef2 (correlation)",
         x = "Sampling strategy") +
    theme_minimal(base_size = 14) +
    theme(legend.position = "none") +
    coord_cartesian(ylim = c(-1, 1)) +
    scale_y_continuous(limits = c(-1, 1), oob = scales::squish)

  ggsave(
    filename = paste0("#/R1-4_", stat_label, "_Pcoef2_SAMPLING_STRATEGY.pdf"),
    plot = final_plot,
    width = 8,
    height = 6,
    units = "in"
  )
}

```

##### 

# OBJECTIVE 2: EXPLORE EFFECTIVENESS OF STRATIFIED SAMPLING DESIGNS

Compare the performance of different sampling strategies, including random and stratified approaches, generated by segmentation methods.

To assess the effectiveness of tessellation-based sampling strategies, we conducted a multi-step statistical analysis using R. First, we calculated two metrics for each subsample: (1) subsample performance in the form of Pearson correlation between the subsample and its full reference dataset; and (2) sampling efficiency, computed as the ratio between Pearson correlation and the sampling effort (Subsample performance/Sample_percent).

We then grouped data by summary statistic type (mean or median) and computed descriptive statistics for each tessellation strategy, including sample size, central tendency (mean, median), dispersion (standard deviation), and coefficient of variation.

Normality of Pearson correlations was assessed using the Shapiro–Wilk test within each subsampling group, using shapiro_test() from the rstatix package.

Given widespread non-normality, we applied Kruskal–Wallis rank-sum tests to compare both raw performance and efficiency across subsampling strategies. Where differences were detected, we used Dunn’s post-hoc tests with Benjamini–Hochberg correction to evaluate pairwise contrasts.

Violin and boxplots were generated to visually compare the distribution of both performance and efficiency across subsampling methods. All procedures were run separately for mean and median statistics, and results (summary tables and plots) were exported to PDF and Excel formats.

Packages we used: dplyr, ggplot2, readxl, ggpubr, rstatix, FSA, writexl.

The input database is: DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx

```{r}
# Load required libraries
library(dplyr)
library(ggplot2)
library(readxl)
library(ggpubr)
library(rstatix)
library(FSA)
library(writexl)

# Read the subsample table (replace path accordingly). The grouping columns are
# coerced to character and Pearson_rel is added as the Pearson correlation
# divided by the sampled percentage.
df <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")
df <- df %>%
  mutate(
    across(c(Subsample, Type, Index, Statistic), as.character),
    Pearson_rel = Pearson_Correlation / Sample_percent
  )

# Layers shared by the precision and efficiency figures
violin_box_layers <- list(
  geom_violin(trim = FALSE),
  geom_boxplot(width = 0.2, outlier.shape = NA),
  theme_minimal(),
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
)

# One full pass (summary, normality, plots, tests, export) per statistic
for (stat in c("mean", "median")) {

  # Tessellation rows of the current statistic with a usable correlation
  df_tess <- df %>%
    filter(Type == "Tessells") %>%
    filter(Statistic == stat) %>%
    filter(!is.na(Pearson_Correlation))

  # Per-strategy descriptive statistics, ordered by median efficiency
  summary_precision <- df_tess %>%
    group_by(Subsample) %>%
    summarise(
      n = n(),
      mean_Pearson = mean(Pearson_Correlation, na.rm = TRUE),
      median_Pearson = median(Pearson_Correlation, na.rm = TRUE),
      sd_Pearson = sd(Pearson_Correlation, na.rm = TRUE),
      mean_eff = mean(Pearson_rel, na.rm = TRUE),
      median_eff = median(Pearson_rel, na.rm = TRUE),
      sd_eff = sd(Pearson_rel, na.rm = TRUE),
      cv_eff = sd_eff / abs(mean_eff),
      .groups = "drop"
    ) %>%
    arrange(desc(median_eff))

  # Shapiro-Wilk is only run on strategies with at least three observations
  group_sizes <- count(df_tess, Subsample)
  valid <- group_sizes$Subsample[group_sizes$n >= 3]

  normality_df <- df_tess %>%
    filter(Subsample %in% valid) %>%
    group_by(Subsample) %>%
    shapiro_test(Pearson_Correlation)

  # Precision figure: y scale limited to [-1, 1], out-of-range values squished
  precision_plot <- ggplot(
    df_tess,
    aes(x = Subsample, y = Pearson_Correlation, fill = Subsample)
  ) +
    violin_box_layers +
    labs(
      title = paste("Strategy precision (Pearson) -", stat),
      y = "Pearson Correlation"
    ) +
    scale_y_continuous(limits = c(-1, 1), oob = scales::squish)

  ggsave(
    paste0("#/Tessellation_KW_Precision_", stat, ".pdf"),
    plot = precision_plot,
    width = 8, height = 6, units = "in"
  )

  # Efficiency figure: same layout, free y scale
  efficiency_plot <- ggplot(
    df_tess,
    aes(x = Subsample, y = Pearson_rel, fill = Subsample)
  ) +
    violin_box_layers +
    labs(
      title = paste("Strategy efficiency (Pearson / Sample%) -", stat),
      y = "Relative Pearson"
    )

  ggsave(
    paste0("#/Tessellation_KW_Efficiency_", stat, ".pdf"),
    plot = efficiency_plot,
    width = 8, height = 6, units = "in"
  )

  # Kruskal-Wallis across strategies, then Dunn pairwise contrasts with
  # Benjamini-Hochberg adjustment, for precision and for efficiency
  kw_pearson <- df_tess %>%
    kruskal_test(Pearson_Correlation ~ Subsample)
  posthoc_pearson <- df_tess %>%
    dunn_test(Pearson_Correlation ~ Subsample, p.adjust.method = "BH")

  kw_eff <- df_tess %>%
    kruskal_test(Pearson_rel ~ Subsample)
  posthoc_eff <- df_tess %>%
    dunn_test(Pearson_rel ~ Subsample, p.adjust.method = "BH")

  # One workbook per statistic, one sheet per result table
  result_sheets <- list(
    Summary_Statistics = summary_precision,
    Normality_Test = normality_df,
    KW_Pearson = as.data.frame(kw_pearson),
    PostHoc_Pearson = as.data.frame(posthoc_pearson),
    KW_Efficiency = as.data.frame(kw_eff),
    PostHoc_Efficiency = as.data.frame(posthoc_eff)
  )
  write_xlsx(
    result_sheets,
    path = paste0("#/KW_Tessellation_Strategies_", stat, ".xlsx")
  )
}

```

# OBJECTIVE 3:

Specific objective 3: To explore how the interplay of categorical (e.g., landscape, time, index) and continuous variables (e.g., sample size, performance) explains the multivariate structure that differentiates stratified subsampling strategies.

## 3.1 FAMD analyses

A Factorial Analysis of Mixed Data (FAMD) is conducted to explore the multivariate structure of stratified subsampling strategies by integrating both categorical and continuous variables. The dataset was filtered to retain only relevant observations (e.g., median statistics from tessellation-based designs) and included key landscape, temporal, acoustic, and performance variables. The variable Subsample, which defines the sampling strategy under evaluation, was treated as a supplementary qualitative variable to avoid influencing the dimensional construction.

FAMD was performed using the FactoMineR package, extracting the coordinates of individuals, the contributions of quantitative and qualitative variables, and the proportion of variance explained by each dimension. Results were organized into separate Excel sheets for subsequent visualization and statistical analysis.

```{r}
library(readxl)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(openxlsx)

# Load dataset (replace path as needed)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

# Keep median / tessellation rows with a non-missing correlation and only the
# columns that enter the FAMD
data_famd <- data %>%
  filter(!is.na(Pearson_Correlation), Statistic == "median", Type == "Tessells") %>%
  select(Locality, Frequency_Range, Time_Period, Index, Subsample, Sample_percent, Pearson_Correlation)

# Convert categorical variables to factors
data_famd <- data_famd %>%
  mutate(across(c(Locality, Frequency_Range, Time_Period, Index, Subsample), as.factor))

# Column position of the supplementary qualitative variable (Subsample)
supplementary_var_index <- which(colnames(data_famd) == "Subsample")

# Run FAMD with Subsample as a supplementary variable, so it is projected onto
# the dimensions without taking part in their construction
famd_result <- FAMD(data_famd,
                    ncp = 5,
                    sup.var = supplementary_var_index,
                    graph = FALSE)

# Coordinates of individuals and of the active variables / categories.
# Row names are copied into an explicit column because writeData() does not
# export row names by default.
coord_ind <- as.data.frame(famd_result$ind$coord)
coord_ind$ID <- rownames(coord_ind)

coord_quanti <- as.data.frame(famd_result$quanti.var$coord)
coord_quanti$Variable <- rownames(coord_quanti)

coord_quali <- as.data.frame(famd_result$quali.var$coord)
coord_quali$Category <- rownames(coord_quali)

# Eigenvalue table. famd_result$eig holds three numeric columns (eigenvalue,
# percentage of variance, cumulative percentage) and keeps the dimension labels
# as row names, so the first numeric column is the eigenvalue -- not the
# dimension. The labels are moved into their own "Dimension" column.
eig <- as.data.frame(famd_result$eig)
colnames(eig) <- c("Eigenvalue", "Variance (%)", "Cumulative Variance (%)")
dimension_labels <- rownames(famd_result$eig)
if (is.null(dimension_labels)) {
  dimension_labels <- paste("comp", seq_len(nrow(eig)))
}
eig <- cbind(Dimension = dimension_labels, eig)
rownames(eig) <- NULL

# Save results to Excel, one sheet per table
wb <- createWorkbook()
addWorksheet(wb, "Individual_Coordinates")
addWorksheet(wb, "Quantitative_Variables")
addWorksheet(wb, "Qualitative_Variables")
addWorksheet(wb, "Explained_Variance")

writeData(wb, sheet = "Individual_Coordinates", coord_ind)
writeData(wb, sheet = "Quantitative_Variables", coord_quanti)
writeData(wb, sheet = "Qualitative_Variables", coord_quali)
writeData(wb, sheet = "Explained_Variance", eig)

saveWorkbook(wb, "#/FAMD_results_export.xlsx", overwrite = TRUE)

```

## 3.2 Graphical abstract plots for FAMD

This script generates a visual summary of the Factorial Analysis of Mixed Data (FAMD) results. It includes a scree plot showing the variance explained by each dimension, along with bar plots displaying the top contributing variables to Dimensions 1 through 4. All plots are styled consistently using a minimal theme and arranged using the patchwork package into a single combined figure. The final output is saved as a high-resolution PDF for publication or reporting purposes.

```{r}
# Load required libraries
# Ensure 'patchwork' is installed: install.packages("patchwork")
library(factoextra)
library(patchwork)   # For combining ggplot-based charts
library(ggplot2)

# Prerequisite: 'famd_result' must already exist in the workspace
# (the result object from the previously constructed FAMD analysis).

# Scree plot; the visible y range is fixed to 0-17.
# Adjust the upper limit if needed.
plot_scree <- fviz_screeplot(
  famd_result,
  addlabels = TRUE,
  ggtheme = theme_minimal()
)
plot_scree <- plot_scree +
  ggtitle("Scree Plot (Explained Variance by Dimension)") +
  coord_cartesian(ylim = c(0, 17))

# Bar plot of the ten largest variable contributions to one dimension,
# with slanted x-axis labels
contribution_plot <- function(dim_index) {
  fviz_contrib(
    famd_result,
    choice = "var",
    axes = dim_index,
    top = 10,
    ggtheme = theme_minimal()
  ) +
    ggtitle(paste0("Contribution to Dim.", dim_index, " (Top 10)")) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot_contrib_dim1 <- contribution_plot(1)
plot_contrib_dim2 <- contribution_plot(2)
plot_contrib_dim3 <- contribution_plot(3)
plot_contrib_dim4 <- contribution_plot(4)

# Patchwork layout: scree plot on top, then two rows of two contribution
# plots, with a global title over the whole figure
combined_plot <- (
  plot_scree /
    (plot_contrib_dim1 + plot_contrib_dim2) /
    (plot_contrib_dim3 + plot_contrib_dim4)
) +
  plot_annotation(title = "      FAMD - MEDIAN")

# Display combined plot
print(combined_plot)

# Save the combined plot as PDF (update path accordingly)
ggsave(
  filename = "#/FAMD_summary_plot_MEDIAN.pdf",
  plot = combined_plot,
  device = "pdf",
  width = 8,
  height = 10,
  units = "in"
)

```

## 3.3 Variables projection and individuals plot

```{r}
# Load required libraries
library(readxl)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(ggplot2)
library(patchwork)

# Read the subsample table (adjust path accordingly)
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

# FAMD input: median / tessellation rows with a non-missing correlation,
# five categorical columns (as factors) plus two numeric columns
famd_factor_cols <- c("Locality", "Frequency_Range", "Time_Period", "Index", "Subsample")

data_famd <- data %>%
  filter(!is.na(Pearson_Correlation), Statistic == "median", Type == "Tessells") %>%
  select(all_of(famd_factor_cols), Sample_percent, Pearson_Correlation) %>%
  mutate(across(all_of(famd_factor_cols), as.factor))

# Run FAMD (Factorial Analysis of Mixed Data)
# NOTE(review): no `sup.var` is passed here, so Subsample is an ACTIVE variable
# in this run. Confirm whether it should be supplementary instead.
famd_result <- FAMD(data_famd, ncp = 5, graph = FALSE)

# Variable projection on a pair of dimensions (repel = TRUE for label clarity)
variable_panel <- function(axes) {
  fviz_famd_var(famd_result, axes = axes, repel = TRUE) +
    ggtitle(paste0("Variables: Dim ", axes[1], " vs Dim ", axes[2]))
}

# Individual projection on a pair of dimensions: no point labels, points
# grouped by Subsample with an ellipse per group
individual_panel <- function(axes) {
  fviz_famd_ind(
    famd_result,
    axes = axes,
    habillage = "Subsample",
    addEllipses = TRUE,
    label = "none"
  ) +
    ggtitle(paste0("Individuals: Dim ", axes[1], " vs Dim ", axes[2]))
}

plot_var_12 <- variable_panel(c(1, 2))
plot_var_23 <- variable_panel(c(2, 3))
plot_var_13 <- variable_panel(c(1, 3))

plot_ind_12 <- individual_panel(c(1, 2))
plot_ind_23 <- individual_panel(c(2, 3))
plot_ind_13 <- individual_panel(c(1, 3))

# 2-column x 3-row layout: variables on the left, individuals on the right
figure_title <- plot_annotation(
  title = "FAMD Projections – Variables & Individuals",
  theme = theme(plot.title = element_text(size = 16, face = "bold"))
)
combined_plot <- (plot_var_12 | plot_ind_12) /
  (plot_var_23 | plot_ind_23) /
  (plot_var_13 | plot_ind_13) +
  figure_title

# Display the combined plot
print(combined_plot)

# Save the plot to PDF (update path as needed)
ggsave(
  filename = "#/FAMD_variables_individuals_clean_MEDIAN.pdf",
  plot = combined_plot,
  device = "pdf",
  width = 12,
  height = 14,
  units = "in"
)

```

## 3.4 KRUSKAL–WALLIS TEST

This graph presents the results of a statistical analysis (Kruskal–Wallis + post-hoc Dunn) applied to the individual coordinates on the first four dimensions (Dim.1–Dim.4) of the FAMD analysis, with the aim of comparing the stratified sampling methods (Subsample: basins, halfbasins, Landcover, SLIC).

```{r}
# Load required libraries
library(readxl)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(rstatix)
library(ggpubr)
library(multcompView)
library(tibble)
library(tidyr)
library(ggplot2)
library(patchwork)
library(writexl)

# Load and prepare data
data <- read_excel("#/DF1-1_Interpolations_variograms_and_error_for_subsamples.xlsx")

data_famd <- data %>%
  filter(!is.na(Pearson_Correlation), Statistic == "median", Type == "Tessells") %>%
  select(Locality, Frequency_Range, Time_Period, Index, Subsample, Sample_percent, Pearson_Correlation) %>%
  mutate(across(c(Locality, Frequency_Range, Time_Period, Index, Subsample), as.factor))

# Run FAMD
# NOTE(review): no `sup.var` is passed, so Subsample is an ACTIVE variable in
# this run and helps build the very dimensions that are then tested against
# Subsample below. Confirm whether it should be supplementary; the call is left
# unchanged so the tests keep reproducing the current results.
famd_result <- FAMD(data_famd, ncp = 5, graph = FALSE)

ind_coord <- as.data.frame(famd_result$ind$coord[, 1:4])  # Keep only first four dimensions
ind_coord$Subsample <- data_famd$Subsample

# Kruskal-Wallis test and Dunn post-hoc (Benjamini-Hochberg adjusted) of one
# coordinate column against Subsample. Returns list(kw = ..., dunn = ...).
run_kw_dunn <- function(df, dim_col) {
  test_formula <- as.formula(paste0(dim_col, " ~ Subsample"))
  list(
    kw = kruskal_test(data = df, formula = test_formula),
    dunn = dunn_test(data = df, formula = test_formula, p.adjust.method = "BH")
  )
}

# Compact letter display from a Dunn result: groups whose adjusted p-value is
# below `alpha` get different letters. Pairs absent from `dunn` are treated as
# not different (p = 1). Returns a named character vector, one entry per group.
dunn_letters <- function(dunn, groups, alpha = 0.05) {
  pmat <- matrix(1, length(groups), length(groups), dimnames = list(groups, groups))
  # seq_len() keeps the loop empty when the Dunn table has no rows
  for (i in seq_len(nrow(dunn))) {
    pmat[dunn$group1[i], dunn$group2[i]] <- dunn$p.adj[i]
    pmat[dunn$group2[i], dunn$group1[i]] <- dunn$p.adj[i]
  }
  signif_matrix <- pmat < alpha
  diag(signif_matrix) <- FALSE
  multcompLetters(signif_matrix)$Letters
}

# Boxplot of one FAMD dimension by Subsample, annotated with the Kruskal-Wallis
# result, significance letters (only when Kruskal-Wallis p < 0.05) and brackets
# for the Dunn contrasts with adjusted p <= 0.05.
#   df      : data frame with columns Dim.<k> and a factor column Subsample
#   dim_num : dimension number k
# Returns a ggplot object.
kruskal_dunn_plot <- function(df, dim_num) {
  dim_col <- paste0("Dim.", dim_num)

  tests <- run_kw_dunn(df, dim_col)
  kw <- tests$kw
  dunn <- tests$dunn
  kw_significant <- kw$p < 0.05

  letters_df <- NULL
  if (kw_significant) {
    group_letters <- dunn_letters(dunn, levels(df$Subsample))
    letters_df <- enframe(group_letters, name = "Subsample", value = "Letter")

    # Letter height: just above the higher of (Q3 + 1.5 * IQR) and the group
    # maximum. Subsample is converted to character to match the letter table.
    label_pos <- df %>%
      group_by(Subsample) %>%
      summarise(
        upper_fence = quantile(.data[[dim_col]], 0.75) + 1.5 * IQR(.data[[dim_col]]),
        group_max = max(.data[[dim_col]], na.rm = TRUE),
        y.position = max(upper_fence, group_max) + 0.15
      ) %>%
      mutate(Subsample = as.character(Subsample))

    letters_df <- left_join(letters_df, label_pos, by = "Subsample")
  }

  # Brackets start above the letters when present, otherwise above the data
  ymax <- max(df[[dim_col]], na.rm = TRUE)
  bracket_y <- if (!is.null(letters_df)) {
    max(letters_df$y.position, na.rm = TRUE) + 0.1
  } else {
    ymax + 0.2
  }

  dunn_plot <- dunn %>%
    filter(p.adj <= 0.05) %>%
    arrange(group1, group2) %>%
    mutate(y.position = seq(bracket_y, by = 0.15, length.out = n()))

  plot <- ggboxplot(df, x = "Subsample", y = dim_col, fill = "Subsample", palette = "Set2") +
    labs(
      title = paste("Kruskal-Wallis by Subsampling Strategy - Dimension", dim_num),
      subtitle = get_test_label(kw, detailed = TRUE),
      y = paste("FAMD Coordinate - Dim", dim_num),
      x = "Subsampling Strategy"
    ) +
    theme_minimal(base_size = 14) +
    theme(legend.position = "none")

  if (!is.null(letters_df)) {
    plot <- plot + geom_text(data = letters_df, aes(x = Subsample, y = y.position, label = Letter),
                             inherit.aes = FALSE, size = 5, vjust = -0.5)
  }

  if (nrow(dunn_plot) > 0) {
    plot <- plot + stat_pvalue_manual(dunn_plot, label = "p.adj.signif", tip.length = 0.01)
  } else if (kw_significant) {
    plot <- plot + labs(caption = "Kruskal-Wallis was significant, but no pairwise differences detected.")
  }

  plot
}

# Generate plots for the first four dimensions
plot_dim1 <- kruskal_dunn_plot(ind_coord, 1)
plot_dim2 <- kruskal_dunn_plot(ind_coord, 2)
plot_dim3 <- kruskal_dunn_plot(ind_coord, 3)
plot_dim4 <- kruskal_dunn_plot(ind_coord, 4)

# Combine all plots into a grid layout
final_plot <- (plot_dim1 | plot_dim2) / (plot_dim3 | plot_dim4) +
  plot_annotation(title = "Kruskal–Wallis + Dunn Tests – FAMD Dimensions")

# Save final plot to PDF
ggsave(
  filename = "#/FAMD_KW_Dunn_Results.pdf",
  plot = final_plot,
  device = "pdf",
  width = 14,
  height = 12,
  units = "in"
)

# Export results for each dimension to Excel
library(openxlsx)

wb <- createWorkbook()

for (dim_num in 1:4) {
  dim_col <- paste0("Dim.", dim_num)

  # Same helpers as the plots, so the exported tables and the figure agree.
  # Letters are exported for every dimension, whatever the Kruskal-Wallis p.
  tests <- run_kw_dunn(ind_coord, dim_col)
  kw_result <- tests$kw
  dunn_result <- tests$dunn

  group_letters <- dunn_letters(dunn_result, levels(ind_coord$Subsample))
  letters_df <- enframe(group_letters, name = "Subsample", value = "Significance_Letter")

  # One sheet per dimension: Kruskal-Wallis on top, Dunn table from row 5,
  # letters a few rows below the Dunn table
  sheet_name <- paste0("Dimension_", dim_num)
  addWorksheet(wb, sheet_name)
  writeData(wb, sheet = sheet_name, x = kw_result, startCol = 1, startRow = 1)
  writeData(wb, sheet = sheet_name, x = dunn_result, startCol = 1, startRow = 5)
  writeData(wb, sheet = sheet_name, x = letters_df, startCol = 1, startRow = nrow(dunn_result) + 8)
}

saveWorkbook(wb, file = "#/FAMD_KW_Dunn_results_export.xlsx", overwrite = TRUE)


```

## 3.5 HEATMAP

```{r}
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggtext)  # Optional: rich-text support


# Significance letters per statistic, FAMD dimension and subsampling strategy.
# The values are typed in by hand -- presumably copied from the Kruskal-Wallis /
# Dunn export of the previous section; update them if that analysis is re-run.
letras_df <- tribble(
  ~STATISTIC, ~DIMENSION, ~Subsample, ~Significance_Letter,
  "median","Dim.1", "basins",     "a",
  "median","Dim.1", "halfbasins", "b",
  "median","Dim.1", "Landcover",  "c",
  "median","Dim.1", "SLIC",       "d",
  "median","Dim.2", "basins",     "a",
  "median","Dim.2", "halfbasins", "b",
  "median","Dim.2", "Landcover",  "c",
  "median","Dim.2", "SLIC",       "ab",
  "median","Dim.3", "basins",     "a",
  "median","Dim.3", "halfbasins", "b",
  "median","Dim.3", "Landcover",  "b",
  "median","Dim.3", "SLIC",       "b",
  "median","Dim.4", "basins",     "a",
  "median","Dim.4", "halfbasins", "b",
  "median","Dim.4", "Landcover",  "c",
  "median","Dim.4", "SLIC",       "c"
)

# Pivot to wide format: one row per strategy, one column per
# statistic/dimension pair
letras_matrix <- letras_df %>%
  unite("Dimension_Type", STATISTIC, DIMENSION, sep = "_") %>%
  pivot_wider(names_from = Dimension_Type, values_from = Significance_Letter)

# Back to long format for ggplot
letras_long <- letras_matrix %>%
  pivot_longer(cols = -Subsample, names_to = "Dimension", values_to = "Letter")

# Fix the display order of both axes. The "mean_*" levels have no rows in the
# table above; they only reserve the ordering for when mean results are added.
letras_long$Subsample <- factor(letras_long$Subsample, levels = c("basins", "halfbasins", "Landcover", "SLIC"))
letras_long$Dimension <- factor(letras_long$Dimension,
                                levels = c("mean_Dim.1", "mean_Dim.2", "mean_Dim.3", "mean_Dim.4",
                                           "median_Dim.1", "median_Dim.2", "median_Dim.3", "median_Dim.4"))

# Heatmap: one tile per strategy x dimension, coloured and labelled by letter
heatmap_plot <- ggplot(letras_long, aes(x = Dimension, y = Subsample, fill = Letter)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Letter), size = 5) +
  scale_fill_viridis_d(option = "plasma", direction = -1) +
  labs(
    title = "Significance Letter Groupings by FAMD Dimension and Strategy",
    x = "FAMD Dimension", y = "Subsampling Strategy", fill = "Letter"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Display the heatmap
print(heatmap_plot)

# Save the heatmap as PDF (update path accordingly). The plot is passed as a
# named object so the saved figure does not depend on which plot was last
# created, and the path uses the same "#/" placeholder as the other chunks
# instead of a machine-specific absolute path.
ggsave(
  filename = "#/3.5-FAMD_heatmap_letters.pdf",
  plot = heatmap_plot,
  width = 10,
  height = 6,
  units = "in",
  device = "pdf"
)


```

# 4 OBJECTIVE 4: LANDSCAPE REPRESENTATION

Specific objective 4: Determine the influence of landscape heterogeneity on the performance and representativeness of soundscape information obtained using tessellation-based sampling strategies

```{r}
library(tidyverse)
library(readxl)
library(ggpubr)
library(openxlsx)  # For exporting to Excel

# Load Excel file
df <- read_excel("#/DF3_Subsampling_Heterogeneity_Comparison.xlsx")

# Filter valid values
df_filtered <- df %>% filter(!is.na(L_proxy_mean))

# Extract unique localities
localities <- unique(df_filtered$Locality)

# Set PDF output path
output_pdf <- "#/Comparative_Plots_Lproxy_mean_POLYPOLES_ALLTOGETHER.pdf"
pdf_opened <- FALSE

# Procedence categories compared against the complete tessellation
procedences <- c("Examinated", "Poles")

# Pair the quantiles of two samples of possibly different size.
# Both samples are evaluated at the same n = min(length) evenly spaced
# probabilities, so the shorter sample contributes its own sorted values and
# the longer one is linearly interpolated across its WHOLE range (the approach
# base R's qqplot() takes). Taking the first n sorted values of the longer
# sample instead would compare the subsample with only the lower tail of the
# reference distribution.
# Returns a data frame with columns `complete` and `sub`.
match_quantiles <- function(reference, subsample) {
  ref_sorted <- sort(reference)
  sub_sorted <- sort(subsample)
  n <- min(length(ref_sorted), length(sub_sorted))
  probs <- if (n > 1) seq(0, 1, length.out = n) else 0.5
  data.frame(
    complete = quantile(ref_sorted, probs = probs, names = FALSE, type = 7),
    sub = quantile(sub_sorted, probs = probs, names = FALSE, type = 7)
  )
}

# Typed, empty results table: keeps the column layout even when no comparison
# is produced. Result rows are collected in a list and bound once at the end.
statistical_results <- data.frame(
  Locality = character(),
  Method = character(),
  Procedence = character(),
  KS_statistic = numeric(),
  KS_p_value = numeric(),
  Wasserstein = numeric(),
  stringsAsFactors = FALSE
)
result_rows <- list()

for (loc in localities) {
  cat("\n\n📍 Locality:", loc, "\n")

  df_loc <- df_filtered %>% filter(Locality == loc)
  full_vals <- df_loc %>% filter(Tessell == "complete") %>% pull(L_proxy_mean)

  # Without reference values there is nothing to compare against
  if (length(full_vals) == 0) {
    cat("   ⚠️ No 'complete' reference values for this locality; skipping.\n")
    next
  }

  for (method in unique(df_loc$Tessell)) {
    if (is.na(method) || method == "complete") next

    df_sub_all <- df_loc %>% filter(Tessell == method & Procedence %in% procedences)
    if (nrow(df_sub_all) == 0) next

    # Statistics and Q-Q data by procedence (computed in a single pass)
    qq_dfs <- list()
    for (proc in procedences) {
      sub_vals <- df_sub_all %>% filter(Procedence == proc) %>% pull(L_proxy_mean)
      if (length(sub_vals) == 0) next

      ks <- ks.test(full_vals, sub_vals)

      # Approximate 1-Wasserstein distance: mean absolute gap between the
      # matched quantiles of the two samples
      matched <- match_quantiles(full_vals, sub_vals)
      wass <- mean(abs(matched$complete - matched$sub))

      cat("🔎 Method:", method, "| Procedence:", proc, "\n")
      cat("   KS p-value:", ks$p.value, "\n")
      cat("   KS statistic:", ks$statistic, "\n")
      cat("   Approx. Wasserstein distance:", round(wass, 4), "\n")

      result_rows[[length(result_rows) + 1]] <- data.frame(
        Locality = as.character(loc),
        Method = as.character(method),
        Procedence = proc,
        KS_statistic = as.numeric(ks$statistic),
        KS_p_value = as.numeric(ks$p.value),
        Wasserstein = round(wass, 4),
        stringsAsFactors = FALSE
      )

      matched$Procedence <- proc
      qq_dfs[[proc]] <- matched
    }

    # Open the PDF device lazily, on the first figure
    if (!pdf_opened) {
      pdf(output_pdf, width = 14, height = 6)
      pdf_opened <- TRUE
    }

    # Density plot: complete tessellation vs. the current method
    p1 <- ggplot(
      df_loc %>% filter((Tessell == method & Procedence %in% procedences) | Tessell == "complete"),
      aes(x = L_proxy_mean, fill = interaction(Tessell, Procedence, drop = TRUE))
    ) +
      geom_density(alpha = 0.4) +
      theme_minimal() +
      labs(title = "Density Plot",
           x = "L_proxy_mean", y = "Density", fill = "Tessellation / Procedence")

    # Q-Q plot from the matched quantiles; the dashed line is y = x
    qq_df_all <- bind_rows(qq_dfs)

    p2 <- ggplot(qq_df_all, aes(x = complete, y = sub, color = Procedence)) +
      geom_point(alpha = 0.6) +
      geom_abline(slope = 1, intercept = 0, color = "black", linetype = "dashed") +
      theme_minimal() +
      labs(title = "Q–Q Plot", x = "Complete Distribution", y = method)

    combined <- ggarrange(p2, p1, ncol = 2, widths = c(1, 1),
                          labels = c("A", "B"),
                          common.legend = TRUE, legend = "right")

    combined <- annotate_figure(combined,
                                 top = text_grob(paste0(loc, " / ", method, " / Comparison with Examinated & Poles"),
                                                 face = "bold", size = 14))

    print(combined)
  }
}

if (pdf_opened) {
  dev.off()
  cat("\n✅ PDF successfully generated with all comparisons and statistics.\n")
} else {
  cat("\n⚠️ No plots were generated. Please check available data.\n")
}

# Bind all result rows once, then save statistical results to Excel
statistical_results <- bind_rows(statistical_results, result_rows)

output_excel <- "#/Comparative_Stats_Lproxy_mean_POLYPOLES_ALLTOGETHER.xlsx"
write.xlsx(statistical_results, output_excel)

```
