Skip to content

Commit a8cebab

Browse files
authored
Merge pull request #115 from NIDAP-Community/dev
Dev
2 parents 3b8b779 + b5675af commit a8cebab

12 files changed

+616843
-80
lines changed

R/filtering.R

Lines changed: 95 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,21 @@
1919
#' @return A list containing the ....
2020

2121
# To call function, must have data = raw object, dsp.obj = QC demoData,
22-
# loq.cutoff 2 is recommended, loq.min 2 is recommend,
23-
# cut.segment = remove segments with less than 10% of the genes detected; .05-.1 recommended,
22+
# loq.cutoff 2 is recommended,
23+
# loq.min 2 is recommend,
24+
# segment.gene.rate.cutoff = remove segments with less than x% of the gene set detected; .05-.1 recommended,
25+
# study.gene.rate.cutoff = remove genes detected in less than x% of segments; .05-.2 recommended,
2426
# goi = goi (genes of interest). Must be a vector of genes (i.e c("PDCD1", "CD274")),
25-
filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
27+
filtering <- function(object,
28+
loq.cutoff = 2,
29+
loq.min = 2,
30+
segment.gene.rate.cutoff = 0.05,
31+
study.gene.rate.cutoff = 0.05,
32+
sankey.exclude.slide = FALSE,
33+
goi) {
2634

2735
if(class(object)[1] != "NanoStringGeoMxSet"){
28-
stop(paste0("Error: You have the wrong data class, must be NanoStringGeoMxSet" ))
36+
stop(paste0("Error: The input object must be a NanoStringGeoMxSet" ))
2937
}
3038

3139
# run reductions ====
@@ -35,22 +43,18 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
3543
##4.4Limit of Quantification
3644
# Define LOQ SD threshold and minimum value
3745
if(class(loq.cutoff)[1] != "numeric"){
38-
stop(paste0("Error: You have the wrong data class, must be numeric" ))
46+
stop(paste0("Error: LOQ cutoff must be numeric" ))
3947
}
4048
if(class(loq.min)[1] != "numeric"){
41-
stop(paste0("Error: You have the wrong data class, must be numeric" ))
49+
stop(paste0("Error: LOQ min must be numeric" ))
4250
}
4351
# Define Modules
44-
#pkc.file <- pkc.file
4552
pkc.file <- annotation(object)
4653
if(class(pkc.file)[1] != "character"){
47-
stop(paste0("Error: You have the wrong data class, must be character" ))
54+
stop(paste0("The pkc file name must be character" ))
4855
}
4956
modules <- gsub(".pkc", "", pkc.file)
5057

51-
# Collapse probes to gene targets
52-
#target_Data <- aggregateCounts(data)
53-
5458
# Calculate LOQ per module tested
5559
loq <- data.frame(row.names = colnames(object))
5660
for(module in modules) {
@@ -78,7 +82,7 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
7882
# ensure ordering since this is stored outside of the geomxSet
7983
loq.mat <- loq.mat[fData(object)$TargetName, ]
8084

81-
##4.5.1S egment Gene Detection
85+
# Evaluate and Filter Segment Gene Detection Rate
8286
# Save detection rate information to pheno data
8387
pData(object)$GenesDetected <-
8488
colSums(loq.mat, na.rm = TRUE)
@@ -102,33 +106,63 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
102106
y = "Segments, #",
103107
fill = "Segment Type")
104108

105-
106109
# cut percent genes detected at 1, 5, 10, 15
107-
tab<- kable(table(pData(object)$DetectionThreshold, pData(object)$class))
108-
if(class(cut.segment)[1] != "numeric"){
109-
stop(paste0("Error: You have the wrong data class, must be numeric" ))
110+
segment.table <- kable(table(pData(object)$DetectionThreshold,
111+
pData(object)$class))
112+
if(class(segment.gene.rate.cutoff)[1] != "numeric"){
113+
stop(paste0("Error: segment.gene.rate.cutoff must be numeric" ))
114+
}
115+
if(segment.gene.rate.cutoff > 1 | segment.gene.rate.cutoff < 0){
116+
stop(paste0("Error: segment.gene.rate.cutoff must be between 0-1" ))
110117
}
111-
object <- object[, pData(object)$GeneDetectionRate >= cut.segment]
112-
if(cut.segment > 1 | cut.segment < 0){
113-
stop(paste0("Error: You need perecentage in decimals between 0-1" ))
118+
119+
# Filter the data using the cutoff for gene detection rate
120+
object <-
121+
object[, pData(object)$GeneDetectionRate >= segment.gene.rate.cutoff]
122+
123+
# Create a post-filtering Sankey Plot
124+
125+
# Create a count matrix with or without slide name
126+
if(sankey.exclude.slide == TRUE){
127+
count.mat <-
128+
count(pData(object), class, region, segment)
129+
} else {
130+
count.mat <-
131+
count(pData(object), slide_name, class, region, segment)
114132
}
115133

116-
# select the annotations we want to show, use `` to surround column names with
117-
# spaces or special symbols
118-
count.mat <- count(pData(object), `slide_name`, class, region, segment)
119134
if(class(object)[1] != "NanoStringGeoMxSet"){
120135
stop(paste0("Error: You have the wrong data class, must be NanoStringGeoMxSet" ))
121136
}
122-
# simplify the slide_names
123-
count.mat$`slide_name` <- gsub("disease", "d", gsub("normal", "n", count.mat$`slide_name`))
124-
# gather the data and plot in order: class, slide_name, region, segment
125-
test.gr <- gather_set_data(count.mat, 1:4)
126-
test.gr$x <-factor(test.gr$x, levels = c("class", "slide_name", "region", "segment"))
137+
138+
# Gather the data and plot in order: class, slide name, region, segment
139+
# gather_set_data creates x, id, y, and n fields within sankey.count.data
140+
# Establish the levels of the Sankey with or without the slide name
141+
if(sankey.exclude.slide == TRUE){
142+
sankey.count.data <- gather_set_data(count.mat, 1:3)
143+
sankey.count.data$x <-
144+
factor(
145+
sankey.count.data$x,
146+
levels = c("class", "region", "segment")
147+
)
148+
# For position of Sankey 100 segment scale
149+
adjust.scale.pos = 1
150+
} else {
151+
sankey.count.data <- gather_set_data(count.mat, 1:4)
152+
sankey.count.data$x <-
153+
factor(
154+
sankey.count.data$x,
155+
levels = c("class", "slide_name", "region", "segment")
156+
)
157+
# For position of Sankey 100 segment scale
158+
adjust.scale.pos = 0
159+
}
160+
127161
# plot Sankey
128-
sankey.plot<- ggplot(test.gr, aes(x, id = id, split = y, value = n)) +
162+
sankey.plot <- ggplot(sankey.count.data, aes(x, id = id, split = y, value = n)) +
129163
geom_parallel_sets(aes(fill = region), alpha = 0.5, axis.width = 0.1) +
130164
geom_parallel_sets_axes(axis.width = 0.2) +
131-
geom_parallel_sets_labels(color = "white", size = 5) +
165+
geom_parallel_sets_labels(color = "gray", size = 5, angle = 0) +
132166
theme_classic(base_size = 17) +
133167
theme(legend.position = "bottom",
134168
axis.ticks.y = element_blank(),
@@ -137,12 +171,23 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
137171
scale_y_continuous(expand = expansion(0)) +
138172
scale_x_discrete(expand = expansion(0)) +
139173
labs(x = "", y = "") +
140-
annotate(geom = "segment", x = 4.25, xend = 4.25, y = 20,
141-
yend = 120, lwd = 2) +
142-
annotate(geom = "text", x = 4.19, y = 70, angle = 90, size = 5,
143-
hjust = 0.5, label = "100 segments")
144-
145-
##4.5.2 Gene Detection Rate
174+
annotate(geom = "segment",
175+
x = (4.25 - adjust.scale.pos),
176+
xend = (4.25 - adjust.scale.pos),
177+
y = 20,
178+
yend = 120,
179+
lwd = 2) +
180+
annotate(geom = "text",
181+
x = (4.19 - adjust.scale.pos),
182+
y = 70,
183+
angle = 90,
184+
size = 5,
185+
hjust = 0.5,
186+
label = "100 segments")
187+
188+
189+
190+
# Evaluate and Filter Study-wide Gene Detection Rate
146191
# Calculate detection rate:
147192
loq.mat <- loq.mat[, colnames(object)]
148193
fData(object)$DetectedSegments <- rowSums(loq.mat, na.rm = TRUE)
@@ -154,11 +199,21 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
154199
if(class(goi)[1] != "character"){
155200
stop(paste0("Error: You have the wrong data class, must be character vector" ))
156201
}
157-
goi.df <- data.frame(Gene = goi,
202+
goi.table <- data.frame(Gene = goi,
158203
Number = fData(object)[goi, "DetectedSegments"],
159204
DetectionRate = percent(fData(object)[goi, "DetectionRate"]))
205+
#goi.table <- capture.output(print(goi.df, row.name = FALSE))
160206

161207
## 4.5.3 Gene Filtering
208+
209+
# Check that the study gene rate cutoff is correctly entered
210+
if(class(study.gene.rate.cutoff)[1] != "numeric"){
211+
stop(paste0("Error: study.gene.rate.cutoff must be numeric" ))
212+
}
213+
if(study.gene.rate.cutoff > 1 | study.gene.rate.cutoff < 0){
214+
stop(paste0("Error: study.gene.rate.cutoff must be between 0-1" ))
215+
}
216+
162217
# Plot detection rate:
163218
plot.detect <- data.frame(Freq = c(1, 5, 10, 20, 30, 50))
164219
plot.detect$Number <-
@@ -181,15 +236,12 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
181236
labs(x = "% of Segments",
182237
y = "Genes Detected, % of Panel > loq")
183238

184-
# Subset to target genes detected in at least 10% of the samples.
185-
# Also manually include the negative control probe, for downstream use
239+
# Subset for genes above the study gene detection rate cutoff
240+
# Manually include the negative control probe, for downstream use
186241
negative.probe.fData <- subset(fData(object), CodeClass == "Negative")
187242
neg.probes <- unique(negative.probe.fData$TargetName)
188-
object <- object[fData(object)$DetectionRate >= 0.1 |
243+
object <- object[fData(object)$DetectionRate >= study.gene.rate.cutoff |
189244
fData(object)$TargetName %in% neg.probes, ]
190245

191-
# retain only detected genes of interest
192-
goi <- goi[goi %in% rownames(object)]
193-
194-
return(list("stacked.bar.plot" = stacked.bar.plot, "tab" = tab, "sankey.plot" = sankey.plot, "genes.detected.plot" = genes.detected.plot, "object" = object))
246+
return(list("stacked.bar.plot" = stacked.bar.plot, "segment.table" = segment.table, "goi.table" = goi.table, "sankey.plot" = sankey.plot, "genes.detected.plot" = genes.detected.plot, "object" = object))
195247
}

R/study_design.R

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@ studyDesign <- function(dcc.files,
6060
region.col = "region",
6161
segment.col = "segment",
6262
area.col = "area",
63-
nuclei.col = "nuclei") {
63+
nuclei.col = "nuclei",
64+
sankey.exclude.slide = FALSE) {
6465

6566
# load all input data into a GeoMX object
6667
object <-
@@ -131,8 +132,17 @@ studyDesign <- function(dcc.files,
131132

132133
# select the annotations we want to show, use `` to surround column
133134
# names with spaces or special symbols
134-
count.mat <-
135-
count(pData(object), `slide_name`, class, region, segment)
135+
136+
# Create a count matrix with or without slide name
137+
if(sankey.exclude.slide == TRUE){
138+
count.mat <-
139+
count(pData(object), class, region, segment)
140+
} else {
141+
count.mat <-
142+
count(pData(object), slide_name, class, region, segment)
143+
}
144+
145+
136146

137147
# Remove any rows with NA values
138148
na.per.column <- colSums(is.na(count.mat))
@@ -144,14 +154,28 @@ studyDesign <- function(dcc.files,
144154
}
145155

146156

147-
# gather the data and plot in order: class, slide name, region, segment
157+
# Gather the data and plot in order: class, slide name, region, segment
148158
# gather_set_data creates x, id, y, and n fields within sankey.count.data
149-
sankey.count.data <- gather_set_data(count.mat, 1:4)
150-
sankey.count.data$x <-
151-
factor(
152-
sankey.count.data$x,
153-
levels = c("class", "slide_name", "region", "segment")
154-
)
159+
# Establish the levels of the Sankey with or without the slide name
160+
if(sankey.exclude.slide == TRUE){
161+
sankey.count.data <- gather_set_data(count.mat, 1:3)
162+
sankey.count.data$x <-
163+
factor(
164+
sankey.count.data$x,
165+
levels = c("class", "region", "segment")
166+
)
167+
# For position of Sankey 100 segment scale
168+
adjust.scale.pos = 1
169+
} else {
170+
sankey.count.data <- gather_set_data(count.mat, 1:4)
171+
sankey.count.data$x <-
172+
factor(
173+
sankey.count.data$x,
174+
levels = c("class", "slide_name", "region", "segment")
175+
)
176+
# For position of Sankey 100 segment scale
177+
adjust.scale.pos = 0
178+
}
155179

156180
# plot Sankey diagram
157181
sankey.plot <-
@@ -167,7 +191,7 @@ studyDesign <- function(dcc.files,
167191
geom_parallel_sets_labels(color = "gray",
168192
size = 5,
169193
angle = 0) +
170-
theme_classic(base_size = 17) +
194+
theme_classic(base_size = 14) +
171195
theme(
172196
legend.position = "bottom",
173197
axis.ticks.y = element_blank(),
@@ -179,15 +203,15 @@ studyDesign <- function(dcc.files,
179203
labs(x = "", y = "") +
180204
annotate(
181205
geom = "segment",
182-
x = 4.25,
183-
xend = 4.25,
206+
x = (4.25 - adjust.scale.pos),
207+
xend = (4.25 - adjust.scale.pos),
184208
y = 20,
185209
yend = 120,
186210
lwd = 2
187211
) +
188212
annotate(
189213
geom = "text",
190-
x = 4.19,
214+
x = (4.19 - adjust.scale.pos),
191215
y = 70,
192216
angle = 90,
193217
size = 5,
Binary file not shown.

0 commit comments

Comments
 (0)