1919# ' @return A list containing the ....
2020
2121# To call function, must have data = raw object, dsp.obj = QC demoData,
22- # loq.cutoff 2 is recommended, loq.min 2 is recommend,
23- # cut.segment = remove segments with less than 10% of the genes detected; .05-.1 recommended,
22+ # loq.cutoff 2 is recommended,
23+ # loq.min 2 is recommend,
24+ # segment.gene.rate.cutoff = remove segments with less than x% of the gene set detected; .05-.1 recommended,
25+ # study.gene.rate.cutoff = remove genes detected in less than x% of segments; .05-.2 recommended,
2426# goi = goi (genes of interest). Must be a vector of genes (i.e c("PDCD1", "CD274")),
25- filtering <- function (object , loq.cutoff , loq.min , cut.segment , goi ) {
27+ filtering <- function (object ,
28+ loq.cutoff = 2 ,
29+ loq.min = 2 ,
30+ segment.gene.rate.cutoff = 0.05 ,
31+ study.gene.rate.cutoff = 0.05 ,
32+ sankey.exclude.slide = FALSE ,
33+ goi ) {
2634
2735 if (class(object )[1 ] != " NanoStringGeoMxSet" ){
28- stop(paste0(" Error: You have the wrong data class, must be NanoStringGeoMxSet" ))
36+ stop(paste0(" Error: The input object must be a NanoStringGeoMxSet" ))
2937 }
3038
3139 # run reductions ====
@@ -35,22 +43,18 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
3543 # #4.4Limit of Quantification
3644 # Define LOQ SD threshold and minimum value
3745 if (class(loq.cutoff )[1 ] != " numeric" ){
38- stop(paste0(" Error: You have the wrong data class, must be numeric" ))
46+ stop(paste0(" Error: LOQ cutoff must be numeric" ))
3947 }
4048 if (class(loq.min )[1 ] != " numeric" ){
41- stop(paste0(" Error: You have the wrong data class, must be numeric" ))
49+ stop(paste0(" Error: LOQ min must be numeric" ))
4250 }
4351 # Define Modules
44- # pkc.file <- pkc.file
4552 pkc.file <- annotation(object )
4653 if (class(pkc.file )[1 ] != " character" ){
47- stop(paste0(" Error: You have the wrong data class, must be character" ))
54+ stop(paste0(" The pkc file name must be character" ))
4855 }
4956 modules <- gsub(" .pkc" , " " , pkc.file )
5057
51- # Collapse probes to gene targets
52- # target_Data <- aggregateCounts(data)
53-
5458 # Calculate LOQ per module tested
5559 loq <- data.frame (row.names = colnames(object ))
5660 for (module in modules ) {
@@ -78,7 +82,7 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
7882 # ensure ordering since this is stored outside of the geomxSet
7983 loq.mat <- loq.mat [fData(object )$ TargetName , ]
8084
81- # #4.5.1S egment Gene Detection
85+ # Evaluate and Filter Segment Gene Detection Rate
8286 # Save detection rate information to pheno data
8387 pData(object )$ GenesDetected <-
8488 colSums(loq.mat , na.rm = TRUE )
@@ -102,33 +106,63 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
102106 y = " Segments, #" ,
103107 fill = " Segment Type" )
104108
105-
106109 # cut percent genes detected at 1, 5, 10, 15
107- tab <- kable(table(pData(object )$ DetectionThreshold , pData(object )$ class ))
108- if (class(cut.segment )[1 ] != " numeric" ){
109- stop(paste0(" Error: You have the wrong data class, must be numeric" ))
110+ segment.table <- kable(table(pData(object )$ DetectionThreshold ,
111+ pData(object )$ class ))
112+ if (class(segment.gene.rate.cutoff )[1 ] != " numeric" ){
113+ stop(paste0(" Error: segment.gene.rate.cutoff must be numeric" ))
114+ }
115+ if (segment.gene.rate.cutoff > 1 | segment.gene.rate.cutoff < 0 ){
116+ stop(paste0(" Error: segment.gene.rate.cutoff must be between 0-1" ))
110117 }
111- object <- object [, pData(object )$ GeneDetectionRate > = cut.segment ]
112- if (cut.segment > 1 | cut.segment < 0 ){
113- stop(paste0(" Error: You need perecentage in decimals between 0-1" ))
118+
119+ # Filter the data using the cutoff for gene detection rate
120+ object <-
121+ object [, pData(object )$ GeneDetectionRate > = segment.gene.rate.cutoff ]
122+
123+ # Create a post-filtering Sankey Plot
124+
125+ # Create a count matrix with or without slide name
126+ if (sankey.exclude.slide == TRUE ){
127+ count.mat <-
128+ count(pData(object ), class , region , segment )
129+ } else {
130+ count.mat <-
131+ count(pData(object ), slide_name , class , region , segment )
114132 }
115133
116- # select the annotations we want to show, use `` to surround column names with
117- # spaces or special symbols
118- count.mat <- count(pData(object ), `slide_name` , class , region , segment )
119134 if (class(object )[1 ] != " NanoStringGeoMxSet" ){
120135 stop(paste0(" Error: You have the wrong data class, must be NanoStringGeoMxSet" ))
121136 }
122- # simplify the slide_names
123- count.mat $ `slide_name` <- gsub(" disease" , " d" , gsub(" normal" , " n" , count.mat $ `slide_name` ))
124- # gather the data and plot in order: class, slide_name, region, segment
125- test.gr <- gather_set_data(count.mat , 1 : 4 )
126- test.gr $ x <- factor (test.gr $ x , levels = c(" class" , " slide_name" , " region" , " segment" ))
137+
138+ # Gather the data and plot in order: class, slide name, region, segment
139+ # gather_set_data creates x, id, y, and n fields within sankey.count.data
140+ # Establish the levels of the Sankey with or without the slide name
141+ if (sankey.exclude.slide == TRUE ){
142+ sankey.count.data <- gather_set_data(count.mat , 1 : 3 )
143+ sankey.count.data $ x <-
144+ factor (
145+ sankey.count.data $ x ,
146+ levels = c(" class" , " region" , " segment" )
147+ )
148+ # For position of Sankey 100 segment scale
149+ adjust.scale.pos = 1
150+ } else {
151+ sankey.count.data <- gather_set_data(count.mat , 1 : 4 )
152+ sankey.count.data $ x <-
153+ factor (
154+ sankey.count.data $ x ,
155+ levels = c(" class" , " slide_name" , " region" , " segment" )
156+ )
157+ # For position of Sankey 100 segment scale
158+ adjust.scale.pos = 0
159+ }
160+
127161 # plot Sankey
128- sankey.plot <- ggplot(test.gr , aes(x , id = id , split = y , value = n )) +
162+ sankey.plot <- ggplot(sankey.count.data , aes(x , id = id , split = y , value = n )) +
129163 geom_parallel_sets(aes(fill = region ), alpha = 0.5 , axis.width = 0.1 ) +
130164 geom_parallel_sets_axes(axis.width = 0.2 ) +
131- geom_parallel_sets_labels(color = " white " , size = 5 ) +
165+ geom_parallel_sets_labels(color = " gray " , size = 5 , angle = 0 ) +
132166 theme_classic(base_size = 17 ) +
133167 theme(legend.position = " bottom" ,
134168 axis.ticks.y = element_blank(),
@@ -137,12 +171,23 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
137171 scale_y_continuous(expand = expansion(0 )) +
138172 scale_x_discrete(expand = expansion(0 )) +
139173 labs(x = " " , y = " " ) +
140- annotate(geom = " segment" , x = 4.25 , xend = 4.25 , y = 20 ,
141- yend = 120 , lwd = 2 ) +
142- annotate(geom = " text" , x = 4.19 , y = 70 , angle = 90 , size = 5 ,
143- hjust = 0.5 , label = " 100 segments" )
144-
145- # #4.5.2 Gene Detection Rate
174+ annotate(geom = " segment" ,
175+ x = (4.25 - adjust.scale.pos ),
176+ xend = (4.25 - adjust.scale.pos ),
177+ y = 20 ,
178+ yend = 120 ,
179+ lwd = 2 ) +
180+ annotate(geom = " text" ,
181+ x = (4.19 - adjust.scale.pos ),
182+ y = 70 ,
183+ angle = 90 ,
184+ size = 5 ,
185+ hjust = 0.5 ,
186+ label = " 100 segments" )
187+
188+
189+
190+ # Evaluate and Filter Study-wide Gene Detection Rate
146191 # Calculate detection rate:
147192 loq.mat <- loq.mat [, colnames(object )]
148193 fData(object )$ DetectedSegments <- rowSums(loq.mat , na.rm = TRUE )
@@ -154,11 +199,21 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
154199 if (class(goi )[1 ] != " character" ){
155200 stop(paste0(" Error: You have the wrong data class, must be character vector" ))
156201 }
157- goi.df <- data.frame (Gene = goi ,
202+ goi.table <- data.frame (Gene = goi ,
158203 Number = fData(object )[goi , " DetectedSegments" ],
159204 DetectionRate = percent(fData(object )[goi , " DetectionRate" ]))
205+ # goi.table <- capture.output(print(goi.df, row.name = FALSE))
160206
161207 # # 4.5.3 Gene Filtering
208+
209+ # Check that the study gene rate cutoff is correctly entered
210+ if (class(study.gene.rate.cutoff )[1 ] != " numeric" ){
211+ stop(paste0(" Error: study.gene.rate.cutoff must be numeric" ))
212+ }
213+ if (study.gene.rate.cutoff > 1 | study.gene.rate.cutoff < 0 ){
214+ stop(paste0(" Error: study.gene.rate.cutoff must be between 0-1" ))
215+ }
216+
162217 # Plot detection rate:
163218 plot.detect <- data.frame (Freq = c(1 , 5 , 10 , 20 , 30 , 50 ))
164219 plot.detect $ Number <-
@@ -181,15 +236,12 @@ filtering <- function(object, loq.cutoff, loq.min, cut.segment, goi) {
181236 labs(x = " % of Segments" ,
182237 y = " Genes Detected, % of Panel > loq" )
183238
184- # Subset to target genes detected in at least 10% of the samples.
185- # Also manually include the negative control probe, for downstream use
239+ # Subset for genes above the study gene detection rate cutoff
240+ # Manually include the negative control probe, for downstream use
186241 negative.probe.fData <- subset(fData(object ), CodeClass == " Negative" )
187242 neg.probes <- unique(negative.probe.fData $ TargetName )
188- object <- object [fData(object )$ DetectionRate > = 0.1 |
243+ object <- object [fData(object )$ DetectionRate > = study.gene.rate.cutoff |
189244 fData(object )$ TargetName %in% neg.probes , ]
190245
191- # retain only detected genes of interest
192- goi <- goi [goi %in% rownames(object )]
193-
194- return (list (" stacked.bar.plot" = stacked.bar.plot , " tab" = tab , " sankey.plot" = sankey.plot , " genes.detected.plot" = genes.detected.plot , " object" = object ))
246+ return (list (" stacked.bar.plot" = stacked.bar.plot , " segment.table" = segment.table , " goi.table" = goi.table , " sankey.plot" = sankey.plot , " genes.detected.plot" = genes.detected.plot , " object" = object ))
195247}
0 commit comments