-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathModule_Preservation
238 lines (212 loc) · 11.7 KB
/
Module_Preservation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#load packages
.libPaths("/share/lasallelab/programs/comethyl/R_4.1.0/rtest")
AnnotationHub::setAnnotationHubOption("CACHE", value = "/share/lasallelab/programs/comethyl/R_4.1.0/rtest")
library(tidyverse)
library(WGCNA)
library(comethyl)
library(Biobase)
# Set Global Options ####
options(stringsAsFactors = FALSE)
Sys.setenv(R_THREADS = 1)
enableWGCNAThreads(nThreads = 2)
# Create and Filter BSseq Object -----------------------------------------------
# Filter Sample Trait Table ####
setwd("Preservation_EARLI")
colData <- openxlsx::read.xlsx("sample_info_preservation.xlsx", rowNames = TRUE)
# Read Bismark CpG Reports ####
setwd("/placenta_cytosine_reports")
bs <- getCpGs(colData, file = "Preservation_EARLI/Preservation_Unfiltered_BSseq.rds") # 29.4M CpGs (29401795)
setwd("Preservation_EARLI")
bs_disc <- readRDS("Filtered_BSseq.rds") # 13.056694M CpGs (w cutoffs of cov=4 and perSample=0.8)
bs <- IRanges::subsetByOverlaps(bs, ranges = bs_disc) # 13.056694M CpGs (filtered for CpGs in discovery BSseq)
bs <- filterCpGs(bs, cov = 1, perSample = 0.01, # 20.6M CpGs (lost 102K CpGs with no reads, 0.5%)
file = "Preservation_Filtered_BSseq.rds")
rm(bs_disc)
# Call Regions and Build Network Using CpG Clusters ----------------------------
# Call Regions ####
regions_disc <- readRDS("Modules.rds") %>%
.[["regions"]] %>% select(chr, start, end, RegionID) # Use discovery regions
regions_disc <- with(regions_disc,
GenomicRanges::GRanges(seqnames = chr,
ranges = IRanges::IRanges(start = start, end = end),
name = RegionID))
regions <- getRegions(bs, custom = regions_disc, n = 1, save = FALSE)
regions <- select(regions, RegionID = name, chr:methSD)
write_tsv(regions, file = "Unfiltered_Regions.txt")
plotRegionStats(regions, maxQuantile = 0.99,
file = "Unfiltered_Region_Plots.pdf")
plotSDstats(regions, maxQuantile = 0.99,
file = "Unfiltered_SD_Plots.pdf")
# Examine Region Totals at Different Cutoffs ####
regionTotals <- getRegionTotals(regions,
file = "Region_Totals.txt")
plotRegionTotals(regionTotals, file = "Region_Totals.pdf")
# Filter Regions ####
regions <- filterRegions(regions, covMin = 5, methSD = 0.05, # 196,967 regions (199,945 regions in discovery)
file = "Filtered_Regions.txt")
plotRegionStats(regions, maxQuantile = 0.99,
file = "Filtered_Region_Plots.pdf")
plotSDstats(regions, maxQuantile = 0.99,
file = "Filtered_SD_Plots.pdf")
# Adjust Methylation Data for PCs ####
meth <- getRegionMeth(regions, bs = bs,
file = "Region_Methylation.rds")
mod <- model.matrix(~1, data = bsseq::pData(bs))
methAdj <- adjustRegionMeth(meth, mod = mod, # Top 16 PCs
file = "Adjusted_Region_Methylation.rds")
getDendro(methAdj, distance = "euclidean") %>%
plotDendro(file = "Sample_Dendrogram.pdf",
expandY = c(0.25, 0.08))
# Select Soft Power Threshold ####
sft <- getSoftPower(methAdj, powerVector = 1:30, corType = "pearson",
file = "Soft_Power.rds")
plotSoftPower(sft, file = "Soft_Power_Plots.pdf")
# Get Comethylation Modules ####
#try power of 23 to get high fit
modules <- getModules(methAdj, power = 23, regions = regions, corType = "pearson",
nThreads = 2, file = "Modules.rds")
# Examine Correlations between Modules and Samples ####
MEs <- modules$MEs
moduleDendro <- getDendro(MEs, distance = "bicor")
plotDendro(moduleDendro, labelSize = 2.5, nBreaks = 5, width = 48, height = 8,
expandX = c(0.015, 0.015), expandY = c(0.15, 0.08),
file = "Module_ME_Dendrogram.pdf")
moduleCor <- getCor(MEs, corType = "bicor")
plotHeatmap(moduleCor, rowDendro = moduleDendro, colDendro = moduleDendro,
axis.text.size = 4, rowDendroMargins = c(-1.55, 0.95, -0.1, -0.5),
colDendroMargins = c(0.95, -0.5, -0.5, 0.8),
file = "Module_Correlation_Heatmap.pdf",
width = 22, height = 19)
sampleDendro <- getDendro(MEs, transpose = TRUE, distance = "bicor")
plotDendro(sampleDendro, labelSize = 3, nBreaks = 5,
file = "Sample_ME_Dendrogram.pdf")
sampleCor <- getCor(MEs, transpose = TRUE, corType = "bicor")
plotHeatmap(sampleCor, rowDendro = sampleDendro, colDendro = sampleDendro,
file = "Sample_Correlation_Heatmap.pdf")
plotHeatmap(MEs, rowDendro = sampleDendro, colDendro = moduleDendro,
legend.title = "Module\nEigennode", legend.position = c(0.37,0.89),
file = "Sample_ME_Heatmap.pdf",
width = 33, axis.text.size = 6)
# Compare CpG Cluster Modules Between Discovery and Replication ----------------
# Setup ####
modules_disc <- readRDS("Modules.rds")
methAdj_disc <- readRDS("Adjusted_Region_Methylation.rds")
modules_rep <- readRDS("Preservation_EARLI/Modules.rds")
methAdj_rep <- readRDS("Preservation_EARLI/Adjusted_Region_Methylation.rds")
# Calculate Module Preservation ####
# Change columns from region IDs to genomic coordinates (region IDs don't correspond)
table(colnames(methAdj_disc) == modules_disc$regions$RegionID) # All TRUE #199,944
colnames(methAdj_disc) <- with(modules_disc$regions, paste(chr, start, end, sep = "_"))
table(colnames(methAdj_rep) == modules_rep$regions$RegionID) # All TRUE #196,966
colnames(methAdj_rep) <- with(modules_rep$regions, paste(chr, start, end, sep = "_"))
table(colnames(methAdj_disc) %in% colnames(methAdj_rep))
# FALSE TRUE
# 2,978 196,966
table(colnames(methAdj_rep) %in% colnames(methAdj_disc))
# TRUE
# 196,966
# Prepare multi-set inputs
multiData <- list(Discovery = list(data = methAdj_disc),
Replication = list(data = methAdj_rep))
multiColor <- list(Discovery = modules_disc$regions$module,
Replication = modules_rep$regions$module)
rm(methAdj_disc, methAdj_rep, modules_disc, modules_rep)
# Run modulePreservation
preservation <- modulePreservation(multiData, multiColor = multiColor,
networkType = "signed", randomSeed = 5,
goldName = "random", verbose = 3) #100 permutations
saveRDS(preservation, file = "/share/lasallelab/Jules/MARBLES_PCB/Preservation_EARLI/Module_Preservation.rds")
# Collect Results ####
categories <- c("quality", "preservation", "accuracy", "referenceSeparability",
"testSeparability")
tables <- c("observed", "Z", "log.p")
pattern <- c("Z." = "", "Z" = "", "log.p." = "", "log.p" = "")
preservationStats <- lapply(categories, function(x){
category <- lapply(tables, function(y){
as.data.frame(preservation[[x]][[y]]$ref.Discovery$inColumnsAlsoPresentIn.Replication) %>%
rownames_to_column(var = "module") %>%
pivot_longer(!module & !moduleSize, names_to = "statistic",
values_to = y) %>%
mutate(statistic = str_replace_all(statistic, pattern = fixed(pattern)))
})
names(category) <- tables
category <- full_join(category$observed, y = category$Z,
by = c("module", "moduleSize", "statistic")) %>%
full_join(y = category$log.p,
by = c("module", "moduleSize", "statistic")) %>%
mutate(category = x)
})
preservationStats <- bind_rows(preservationStats) %>%
filter(!statistic %in% c("medianRank.qual", "meanClusterCoeff.qual",
"medianRank.pres", "medianRankConnectivity.pres",
"medianRankDensity.pres", "cor.clusterCoeff",
"meanClusterCoeff.pres", "coClustering")) %>%
mutate(log.p = as.numeric(log.p)) %>%
select(module, moduleSize, category, statistic:log.p) %>%
arrange(module)
write_tsv(preservationStats,
file = "Preservation_EARLI/Module_Preservation_Statistics.txt")
# Summarize Results ####
# Which modules are good quality?
filter(preservationStats, statistic == "summary.qual" & Z > 10) %>% pull(module) #27/29
filter(preservationStats, statistic == "summary.qual" & Z > 2 & Z <= 10) %>% pull(module) #none
filter(preservationStats, statistic == "summary.qual" & Z <= 2) %>% pull(module) # grey, random
# Which modules are preserved?
filter(preservationStats, statistic == "summary.pres" & Z > 10) %>% pull(module) #27/29
filter(preservationStats, statistic == "summary.pres" & Z > 2 & Z <= 10) %>% pull(module) #none
filter(preservationStats, statistic == "summary.pres" & Z <= 2) %>% pull(module) #grey, random
# Which modules are accurate?
filter(preservationStats, statistic == "accuracy" & Z > 10) %>% pull(module) #20/29
# [1] "black" "blue" "brown" "cyan"
#[5] "darkgreen" "darkgrey" "darkturquoise" "green"
#[9] "greenyellow" "grey" "grey60" "lightcyan"
#[13] "lightgreen" "midnightblue" "pink" "purple"
#[17] "salmon" "tan" "turquoise" "yellow"
filter(preservationStats, statistic == "accuracy" & Z > 2 & Z <= 10) %>% pull(module) #none
filter(preservationStats, statistic == "accuracy" & Z <= 2) %>% pull(module)
#[1] "darkorange" "darkred" "lightyellow" "magenta" "orange"
#[6] "red" "royalblue" "white"
# Which modules separate in the reference set?
filter(preservationStats, statistic == "separability.qual" & Z > 10) %>% pull(module) # none
filter(preservationStats, statistic == "separability.qual" & Z > 2 & Z <= 10) %>% pull(module)
#[1] "black" "blue" "brown" "green" "grey"
#[6] "magenta" "midnightblue" "turquoise" "yellow"
filter(preservationStats, statistic == "separability.qual" & Z <= 2) %>% pull(module) #20/29
#[1] "cyan" "darkgreen" "darkgrey" "darkorange"
#[5] "darkred" "darkturquoise" "greenyellow" "grey60"
#[9] "lightcyan" "lightgreen" "lightyellow" "orange"
#[13] "pink" "purple" "random" "red"
#[17] "royalblue" "salmon" "tan" "white"
# Which modules separate in the test set?
filter(preservationStats, statistic == "separability.pres" & Z > 10) %>% pull(module) # none
filter(preservationStats, statistic == "separability.pres" & Z > 2 & Z <= 10) %>% pull(module) #grey, grey60
filter(preservationStats, statistic == "separability.pres" & Z <= 2) %>% pull(module) #27/29
# Visualize Results ####
plot_data <- select(preservationStats, module, moduleSize, statistic, Z) %>%
mutate(statistic = factor(statistic, levels = unique(statistic))) %>%
filter(!module %in% c("grey", "random") & is.finite(Z))
scatterplot <- ggplot(plot_data) +
geom_hline(yintercept = 2, lty = 2, color = "blue", size = 0.9) +
geom_hline(yintercept = 10, lty = 2, color = "darkgreen", size = 0.9) +
geom_point(aes(x = moduleSize, y = Z, color = module), size = 1.2) +
facet_wrap(vars(statistic), nrow = 6, scales = "free_y") +
scale_x_continuous(breaks = scales::breaks_pretty(n = 3)) +
scale_y_continuous(breaks = scales::breaks_pretty(n = 3),
expand = expansion(0.15)) +
scale_color_identity() +
theme_bw(base_size = 25) +
theme(legend.position = "none", panel.grid.major = element_blank(),
panel.border = element_rect(color = "black", size = 1.25),
axis.ticks = element_line(size = 0.9),
panel.grid.minor = element_blank(),
strip.background = element_blank(),
strip.text = element_text(size = 8.5, margin = unit(c(1,0,0.2,0), "lines")),
axis.text = element_text(color = "black", size = 10),
axis.title = element_text(size = 12),
panel.spacing.x = unit(0.2, "lines"),
panel.spacing.y = unit(-0.5, "lines"),
plot.margin = unit(c(0,1,1,1), "lines")) +
xlab("Module Size") +
ylab("Z-score")
ggsave("/share/lasallelab/Jules/MARBLES_PCB/Preservation_EARLI/Module_Preservation_Statistics_Plot.pdf",
plot = scatterplot, dpi = 600, width = 9, height = 9, units = "in")