-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiffExp.R
203 lines (154 loc) · 7.58 KB
/
diffExp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# @ charles.k.w
# V2021.01.10
# Usage: Rscript diffExp.R currentWorkDir samplesheet genecount
# options(echo=TRUE) # if you want see commands in output file
args <- commandArgs(trailingOnly = TRUE)
if(length(args) != 3){
print("Usage: Rscript diffExp.R currentWorkDir samplesheet genecount")
exit(1)
}
currentWorkDir <- "/home/wangk/lab/shenlan/result/DAY7_vs_WT5"
genecountsFile <- "CIRC.GENECOUNT.txt"
samplesheetFile <- "samplesheet.csv"
genecountsAnno <- "GENECOUNTS.DAY7_vs_WT5.txt"
lfc_cutoff <- log2 (1.5) #0.584962501
pval_cutoff <- - log10(0.05) #1.301029996
padj_cutoff <- 0.05
setwd(currentWorkDir)
#if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("refGenome")
#library(refGenome)
#library(Rsubread)
library(DESeq2)
library(ggplot2)
library(clusterProfiler)
library(biomaRt)
library(ReactomePA)
library(DOSE)
library(KEGG.db)
library(org.Mm.eg.db)
library(org.Hs.eg.db)
library(pheatmap)
library(genefilter)
library(RColorBrewer)
library(GO.db)
library(topGO)
library(dplyr)
library(gage)
library(ggsci)
library(AnnotationDbi)
organismDb <- org.Mm.eg.db
# - - - - - - - - - - - -
genecountsData <- read.table(genecountsFile, header = TRUE, sep = "\t", quote = "", row.names = 1)
head(genecountsData)
rownum <- nrow(genecountsData)
colnum <- ncol(genecountsData)
metasheetData <- read.table(samplesheetFile, header = TRUE, sep = ",", quote = "", row.names = NULL)
metasheetData <- metasheetData[match(colnames(genecountsData), metasheetData$Sample), ]
head (metasheetData)
# An example of the sample sheet file:
### Sample,Group,Replicate,Condition
### A_M6,M6,Rep1,treatment
### B_M6,M6,Rep2,treatment
### A_ME1,ME1,Rep1,control
### B_ME1,ME1,Rep2,control
datasetMatrix <- DESeqDataSetFromMatrix(countData = genecountsData, colData = metasheetData, design = ~ Condition)
# Find differential expressed genes using DESEq2
datasetMatrix <- DESeq(datasetMatrix)
# - - - - - - - - - - - - -
# Differential expression analysis
# - - - - - - - - - - - - -
# comparison using treatment and control, and dort the result acooding to pvalues
comparison_res <- results(datasetMatrix, contrast=c("Condition", "Day7", "WT5"))
deseq = comparison_res[order(comparison_res$pvalue),]
deseq.df <- as.data.frame(deseq)
# Subset for only significant genes (q < padj_cutoff & FC > lfc_cutoff)
results_sig <- subset(deseq.df, padj <= padj_cutoff & abs(log2FoldChange) >= lfc_cutoff)
head(results_sig)
# - - - - - - - - - - - - -
# Export table result data
# - - - - - - - - - - - - -
# Write normalized gene counts for all samples to a .txt file
# write.table(x = as.data.frame(counts(datasetMatrix), normalized = T), file = 'gene_count_normalized.txt', sep = '\t', quote = F, col.names = NA)
# Write the annotated results table to a .txt file
# write.table(x = as.data.frame(deseq.df), file = "all_filtered_gene_annotated.txt", sep = '\t', quote = F, col.names = NA)
# Write significant normalized gene counts to a .txt file
# write.table(x = counts(datasetMatrix[row.names(results_sig)], normalized = T), file = 'gene_count_normalized_significant.txt', sep = '\t', quote = F, col.names = NA)
# Write significant annotated results table to a .txt file
# write.table(x = as.data.frame(results_sig), file = "all_filtered_gene_annotated_significant.txt", sep = '\t', quote = F, col.names = NA)
results_gene_count <- as.data.frame(counts(datasetMatrix, normalized = T))
results_gene_count$GENEID <- row.names(results_gene_count)
results_annotation <- deseq.df
results_annotation$GENEID <- row.names(results_annotation)
file_merged <- merge(results_gene_count, results_annotation, by = "GENEID", sort = F)
write.table(file_merged, genecountsAnno, sep = '\t', quote = FALSE, row.names = FALSE, col.names = TRUE)
if (file.exists("./figures")){
} else {
dir.create("./figures")
}
# -------------------------
# Convert all samples to rlog
# perform the rlog transformation and used as gene expression matrix for heatmap and PCA application
# -------------------------
datasetMatrix_rlog <- rlog(datasetMatrix, blind=FALSE)
head(assay(datasetMatrix_rlog), 3)
# -------------------------
# Sample distance heatmap
# ------------------------
sampleDists <- dist( t( assay(datasetMatrix_rlog) ) )
sampleDists
# We visualize the distances in a heatmap in a Figure below, using the function pheatmap from the pheatmap package.
# Heatmap of sample-to-sample distances using the rlog-transformed values
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- metasheetData$Sample #rownames(metasheetData)
colnames(sampleDistMatrix) <- metasheetData$Sample #rownames(metasheetData)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
p <- pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
angle_col = 90, col=colors, fontsize = 8)
ggsave(p, filename="figures/Heatmap_sample_to_sample_distances.pdf", width=9, height=8, units=c("cm"), colormodel="srgb")
# - - - - - - - - - - - - -
# PCA plot
# - - - - - - - - - - - - -
# PCA plot using the rlog-transformed values. Each sample is color-coded according to their affiliations.
# Plot PCA by column variable
p <- plotPCA(datasetMatrix_rlog, intgroup = "Group", ntop = 1000) # + theme_bw() + ggsave('figures/pca_plot.pdf')
ggsave(p, filename="figures/pca_plot.pdf", width=9, height=5, units=c("cm"), colormodel="srgb")
# - - - - - - - - - - - - -
# MA plot
# - - - - - - - - - - - - -
pdf(file='figures/MA_plot.pdf', width=5, height=4, colormodel="srgb")
plotMA(deseq, ylim = c(-5, 5))
dev.off()
# - - - - - - - - - - - - -
# Volcano plot
# - - - - - - - - - - - - -
# Load libraries
library(ggplot2)
library(RColorBrewer)
# Gather Log-fold change and FDR-corrected pvalues from deseq2 results
data <- data.frame(pval = -log10(deseq$padj), lfc = deseq$log2FoldChange, row.names = row.names(deseq))
# Remove any rows that have NA as an entry
data <- na.omit(data)
# Color the points which are up or down
## If fold-change > 1.5 and pvalue < 0.05 (Increased significant)
## If fold-change < -1.5 and pvalue < 0.05 (Decreased significant)
data <- mutate(data, color = case_when(data$lfc >= lfc_cutoff & data$pval >= pval_cutoff ~ "Increased",
data$lfc <= - lfc_cutoff & data$pval >= pval_cutoff ~ "Decreased",
data$lfc > - lfc_cutoff & data$lfc < lfc_cutoff ~ "Nonsignificant",
data$pval < pval_cutoff ~ "Nonsignificant"))
# Make a basic ggplot2 object with x-y values
p <- ggplot(data, aes(x = lfc, y = pval, color = color)) +
ggtitle(label = "Volcano Plot", subtitle = "Colored by fold-change direction") +
geom_point(size = 1.0, alpha = 0.8, na.rm = T) +
scale_color_manual(name = "Directionality", values = c(Increased = "#CD4F39", Decreased = "#008B00", Nonsignificant = "darkgray")) +
theme_bw(base_size = 14) + # change overall theme
theme(legend.position = "right") + # change the legend
xlab(expression(log[2]("Treatment" / "Control"))) + # Change X-Axis label
ylab(expression(-log[10]("adjusted p-value"))) + # Change Y-Axis label
geom_hline(yintercept = 1.301029996, colour = "darkgrey") + # Add p-adj value cutoff line
scale_y_continuous(trans = "log1p") # Scale yaxis due to large p-value
ggsave(p, filename="figures/Volcano_plot.pdf", width=12, height=8, units=c("cm"), colormodel="srgb")