Module_Preservation

#load packages
.libPaths("/share/lasallelab/programs/comethyl/R_4.1.0/rtest")
AnnotationHub::setAnnotationHubOption("CACHE", value = "/share/lasallelab/programs/comethyl/R_4.1.0/rtest")
library(tidyverse)
library(WGCNA)
library(comethyl)
library(Biobase)

# Set Global Options ####
options(stringsAsFactors = FALSE)
Sys.setenv(R_THREADS = 1)
enableWGCNAThreads(nThreads = 2)

# Create and Filter BSseq Object -----------------------------------------------
# Filter Sample Trait Table ####
setwd("Preservation_EARLI")
colData <- openxlsx::read.xlsx("sample_info_preservation.xlsx", rowNames = TRUE)

# Read Bismark CpG Reports ####
setwd("/placenta_cytosine_reports")
bs <- getCpGs(colData, file = "Preservation_EARLI/Preservation_Unfiltered_BSseq.rds") # 29.4M CpGs (29401795)
setwd("Preservation_EARLI")
bs_disc <- readRDS("Filtered_BSseq.rds") # 13.056694M CpGs (w cutoffs of cov=4 and perSample=0.8)
bs <- IRanges::subsetByOverlaps(bs, ranges = bs_disc) # 13.056694M CpGs (filtered for CpGs in discovery BSseq)
bs <- filterCpGs(bs, cov = 1, perSample = 0.01, # 20.6M CpGs (lost 102K CpGs with no reads, 0.5%)
                 file = "Preservation_Filtered_BSseq.rds")
rm(bs_disc)

# Call Regions and Build Network Using CpG Clusters ----------------------------
# Call Regions ####
regions_disc <- readRDS("Modules.rds") %>%
  .[["regions"]] %>% select(chr, start, end, RegionID) # Use discovery regions
regions_disc <- with(regions_disc,
                     GenomicRanges::GRanges(seqnames = chr,
                                            ranges = IRanges::IRanges(start = start, end = end),
                                            name = RegionID))
regions <- getRegions(bs, custom = regions_disc, n = 1, save = FALSE)
regions <- select(regions, RegionID = name, chr:methSD)
write_tsv(regions, file = "Unfiltered_Regions.txt")
plotRegionStats(regions, maxQuantile = 0.99,
                file = "Unfiltered_Region_Plots.pdf")
plotSDstats(regions, maxQuantile = 0.99,
            file = "Unfiltered_SD_Plots.pdf")

# Examine Region Totals at Different Cutoffs ####
regionTotals <- getRegionTotals(regions,
                                file = "Region_Totals.txt")
plotRegionTotals(regionTotals, file = "Region_Totals.pdf")

# Filter Regions ####
regions <- filterRegions(regions, covMin = 5, methSD = 0.05, # 196,967 regions (199,945 regions in discovery)
                         file = "Filtered_Regions.txt")
plotRegionStats(regions, maxQuantile = 0.99,
                file = "Filtered_Region_Plots.pdf")
plotSDstats(regions, maxQuantile = 0.99,
            file = "Filtered_SD_Plots.pdf")

# Adjust Methylation Data for PCs ####
meth <- getRegionMeth(regions, bs = bs,
                      file = "Region_Methylation.rds")
mod <- model.matrix(~1, data = bsseq::pData(bs))
methAdj <- adjustRegionMeth(meth, mod = mod, # Top 16 PCs
                            file = "Adjusted_Region_Methylation.rds")
getDendro(methAdj, distance = "euclidean") %>%
  plotDendro(file = "Sample_Dendrogram.pdf",
             expandY = c(0.25, 0.08))

# Select Soft Power Threshold ####
sft <- getSoftPower(methAdj, powerVector = 1:30, corType = "pearson",
                    file = "Soft_Power.rds")
plotSoftPower(sft, file = "Soft_Power_Plots.pdf")

# Get Comethylation Modules ####
#try power of 23 to get high fit 
modules <- getModules(methAdj, power = 23, regions = regions, corType = "pearson",
                      nThreads = 2, file = "Modules.rds")

# Examine Correlations between Modules and Samples ####
MEs <- modules$MEs
moduleDendro <- getDendro(MEs, distance = "bicor")
plotDendro(moduleDendro, labelSize = 2.5, nBreaks = 5, width = 48, height = 8,
           expandX = c(0.015, 0.015), expandY = c(0.15, 0.08),
           file = "Module_ME_Dendrogram.pdf")
moduleCor <- getCor(MEs, corType = "bicor")
plotHeatmap(moduleCor, rowDendro = moduleDendro, colDendro = moduleDendro,
            axis.text.size = 4, rowDendroMargins = c(-1.55, 0.95, -0.1, -0.5),
            colDendroMargins = c(0.95, -0.5, -0.5, 0.8),
            file = "Module_Correlation_Heatmap.pdf",
            width = 22, height = 19)

sampleDendro <- getDendro(MEs, transpose = TRUE, distance = "bicor")
plotDendro(sampleDendro, labelSize = 3, nBreaks = 5,
           file = "Sample_ME_Dendrogram.pdf")
sampleCor <- getCor(MEs, transpose = TRUE, corType = "bicor")
plotHeatmap(sampleCor, rowDendro = sampleDendro, colDendro = sampleDendro,
            file = "Sample_Correlation_Heatmap.pdf")

plotHeatmap(MEs, rowDendro = sampleDendro, colDendro = moduleDendro,
            legend.title = "Module\nEigennode", legend.position = c(0.37,0.89),
            file = "Sample_ME_Heatmap.pdf",
            width = 33, axis.text.size = 6)


# Compare CpG Cluster Modules Between Discovery and Replication ----------------
# Setup ####
modules_disc <- readRDS("Modules.rds")
methAdj_disc <- readRDS("Adjusted_Region_Methylation.rds")
modules_rep <- readRDS("Preservation_EARLI/Modules.rds")
methAdj_rep <- readRDS("Preservation_EARLI/Adjusted_Region_Methylation.rds")

# Calculate Module Preservation ####
# Change columns from region IDs to genomic coordinates (region IDs don't correspond)
table(colnames(methAdj_disc) == modules_disc$regions$RegionID) # All TRUE #199,944
colnames(methAdj_disc) <- with(modules_disc$regions, paste(chr, start, end, sep = "_"))
table(colnames(methAdj_rep) == modules_rep$regions$RegionID) # All TRUE #196,966
colnames(methAdj_rep) <- with(modules_rep$regions, paste(chr, start, end, sep = "_"))
table(colnames(methAdj_disc) %in% colnames(methAdj_rep))
# FALSE   TRUE
# 2,978 196,966
table(colnames(methAdj_rep) %in% colnames(methAdj_disc))
# TRUE
# 196,966

# Prepare multi-set inputs
multiData <- list(Discovery = list(data = methAdj_disc),
                  Replication = list(data = methAdj_rep))
multiColor <- list(Discovery = modules_disc$regions$module,
                   Replication = modules_rep$regions$module)
rm(methAdj_disc, methAdj_rep, modules_disc, modules_rep)

# Run modulePreservation
preservation <- modulePreservation(multiData, multiColor = multiColor,
                                   networkType = "signed", randomSeed = 5,
                                   goldName = "random", verbose = 3) #100 permutations
saveRDS(preservation, file = "/share/lasallelab/Jules/MARBLES_PCB/Preservation_EARLI/Module_Preservation.rds")

# Collect Results ####
categories <- c("quality", "preservation", "accuracy", "referenceSeparability",
                "testSeparability")
tables <- c("observed", "Z", "log.p")
pattern <- c("Z." = "", "Z" = "", "log.p." = "", "log.p" = "")
preservationStats <- lapply(categories, function(x){
  category <- lapply(tables, function(y){
    as.data.frame(preservation[[x]][[y]]$ref.Discovery$inColumnsAlsoPresentIn.Replication) %>%
      rownames_to_column(var = "module") %>%
      pivot_longer(!module & !moduleSize, names_to = "statistic",
                   values_to = y) %>%
      mutate(statistic = str_replace_all(statistic, pattern = fixed(pattern)))
  })
  names(category) <- tables
  category <- full_join(category$observed, y = category$Z,
                        by = c("module", "moduleSize", "statistic")) %>%
    full_join(y = category$log.p,
              by = c("module", "moduleSize", "statistic")) %>%
    mutate(category = x)
})
preservationStats <- bind_rows(preservationStats) %>%
  filter(!statistic %in% c("medianRank.qual", "meanClusterCoeff.qual",
                           "medianRank.pres", "medianRankConnectivity.pres",
                           "medianRankDensity.pres", "cor.clusterCoeff",
                           "meanClusterCoeff.pres", "coClustering")) %>%
  mutate(log.p = as.numeric(log.p)) %>%
  select(module, moduleSize, category, statistic:log.p) %>%
  arrange(module)
write_tsv(preservationStats,
          file = "Preservation_EARLI/Module_Preservation_Statistics.txt")

# Summarize Results ####
# Which modules are good quality?
filter(preservationStats, statistic == "summary.qual" & Z > 10) %>% pull(module) #27/29
filter(preservationStats, statistic == "summary.qual" & Z > 2 & Z <= 10) %>% pull(module) #none
filter(preservationStats, statistic == "summary.qual" & Z <= 2) %>% pull(module) # grey, random

# Which modules are preserved?
filter(preservationStats, statistic == "summary.pres" & Z > 10) %>% pull(module) #27/29
filter(preservationStats, statistic == "summary.pres" & Z > 2 & Z <= 10) %>% pull(module) #none
filter(preservationStats, statistic == "summary.pres" & Z <= 2) %>% pull(module) #grey, random

# Which modules are accurate?
filter(preservationStats, statistic == "accuracy" & Z > 10) %>% pull(module) #20/29
# [1] "black"         "blue"          "brown"         "cyan"         
#[5] "darkgreen"     "darkgrey"      "darkturquoise" "green"        
#[9] "greenyellow"   "grey"          "grey60"        "lightcyan"    
#[13] "lightgreen"    "midnightblue"  "pink"          "purple"       
#[17] "salmon"        "tan"           "turquoise"     "yellow" 
filter(preservationStats, statistic == "accuracy" & Z > 2 & Z <= 10) %>% pull(module) #none
filter(preservationStats, statistic == "accuracy" & Z <= 2) %>% pull(module)
#[1] "darkorange"  "darkred"     "lightyellow" "magenta"     "orange"     
#[6] "red"         "royalblue"   "white"   

# Which modules separate in the reference set?
filter(preservationStats, statistic == "separability.qual" & Z > 10) %>% pull(module) # none
filter(preservationStats, statistic == "separability.qual" & Z > 2 & Z <= 10) %>% pull(module)
#[1] "black"        "blue"         "brown"        "green"        "grey"        
#[6] "magenta"      "midnightblue" "turquoise"    "yellow" 
filter(preservationStats, statistic == "separability.qual" & Z <= 2) %>% pull(module) #20/29
#[1] "cyan"          "darkgreen"     "darkgrey"      "darkorange"   
#[5] "darkred"       "darkturquoise" "greenyellow"   "grey60"       
#[9] "lightcyan"     "lightgreen"    "lightyellow"   "orange"       
#[13] "pink"          "purple"        "random"        "red"          
#[17] "royalblue"     "salmon"        "tan"           "white"  

# Which modules separate in the test set?
filter(preservationStats, statistic == "separability.pres" & Z > 10) %>% pull(module) # none
filter(preservationStats, statistic == "separability.pres" & Z > 2 & Z <= 10) %>% pull(module) #grey, grey60
filter(preservationStats, statistic == "separability.pres" & Z <= 2) %>% pull(module) #27/29

# Visualize Results ####
plot_data <- select(preservationStats, module, moduleSize, statistic, Z) %>%
  mutate(statistic = factor(statistic, levels = unique(statistic))) %>%
  filter(!module %in% c("grey", "random") & is.finite(Z))
scatterplot <- ggplot(plot_data) +
  geom_hline(yintercept = 2, lty = 2, color = "blue", size = 0.9) +
  geom_hline(yintercept = 10, lty = 2, color = "darkgreen", size = 0.9) +
  geom_point(aes(x = moduleSize, y = Z, color = module), size = 1.2) +
  facet_wrap(vars(statistic), nrow = 6, scales = "free_y") +
  scale_x_continuous(breaks = scales::breaks_pretty(n = 3)) +
  scale_y_continuous(breaks = scales::breaks_pretty(n = 3),
                     expand = expansion(0.15)) +
  scale_color_identity() +
  theme_bw(base_size = 25) +
  theme(legend.position = "none", panel.grid.major = element_blank(),
        panel.border = element_rect(color = "black", size = 1.25),
        axis.ticks = element_line(size = 0.9),
        panel.grid.minor = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(size = 8.5, margin = unit(c(1,0,0.2,0), "lines")),
        axis.text = element_text(color = "black", size = 10),
        axis.title = element_text(size = 12),
        panel.spacing.x = unit(0.2, "lines"),
        panel.spacing.y = unit(-0.5, "lines"),
        plot.margin = unit(c(0,1,1,1), "lines")) +
  xlab("Module Size") +
  ylab("Z-score")
ggsave("/share/lasallelab/Jules/MARBLES_PCB/Preservation_EARLI/Module_Preservation_Statistics_Plot.pdf",
       plot = scatterplot, dpi = 600, width = 9, height = 9, units = "in")