forked from StatGenPRD/STOPGAP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSTOPGAP2_run.R
201 lines (156 loc) · 11.4 KB
/
STOPGAP2_run.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#--------Run STOPGAP2 R functions to generate STOPGAP2 datasets -----------------#
# Author: Judong Shen and Kijoung Song
setwd("STOPGAP.working.directory")
library(gplots)
library(plyr)
library(reshape)
#library(cba)
#library(ggplot2)
source("STOPGAP2_functions.R")
# (1) merge the grasp, nhgri and gwasdb datasets
p.threshold <- 1E-04
grasp.import(file = "./Data/GraspFullDataset2.zip", p.thres=p.threshold)
NHGRI.import(file = "./Data/gwas_catalog_v1.0-associations_e85_r2016-08-01.tsv", p.thres=p.threshold)
gwasdb.import(file = "./Data/gwasdb_20150819_snp_trait.gz", p.thres=p.threshold)
phewas.import(file = "./Data/phewas-catalog.csv", p.thres=1E-04)
data.merge(nhgri, grasp, gwasdb, phewas)
# Identify the unique GWAS SNPs
gwas.snps(gwas.file="stopgap_4sources_clean.RData")
# Coordinate lookup and LD calculation based on the 1KG data for the new SNPs identified in this version
# if you want to calcuate LD SNPs based on your data, please see 'README.txt'.
#run.ld()
# Update rsIDs in the gwas data (data) to dbSNP141 version
rsID.update(gwas.file="stopgap_4sources_clean.RData",
coor.file="./STOPGAP2_LDResults/rsID_Coordinates.txt")
# Find all variants in LD with GWAS variants at r2 > 0.5
ld.snps(gwas.data="stopgap_4sources_dbSNP141.RData",
ld.path="./STOPGAP2_LDResults")
# updata.ld.data: updata ld data:
# Remove SNP.ld with two positions in both ld.snps and ld.snps.r2
# Use chr:pos to replace the missing ld SNP ids in both ld.snps and ld.snps.r2
updata.ld.data(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.RData",
ld.r2.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.r2.RData")
# Update the ld.snps.r2 and ld_snps data by including the GWAS SNPs data which don't have any LD information there
update.ld.snps.r2(gwas.file="stopgap_4sources_dbSNP141.RData",
ld.file = "./STOPGAP2_LDResults/gwas_LD/ld.snps.r2.updated.RData",
rsid.file = "./STOPGAP2_LDResults/rsID_Coordinates.txt")
# Update the ld.snps.r2 and ld_snps data by removing the LD SNPs more than 500kb away from the GWAS SNPs,
# If any GWAS SNPs get deleted, put them back by including r2=1 there
update1.ld.snps.r2(gwas.file="stopgap_4sources_dbSNP141.RData",
ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.r2.updated.plusGWASsnps.RData",
dis = 1000)
# Update the ld.snps.r2 data by adding the AF information from 1KG phase I data ...
r2.frq(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.r2.updated.plusGWASsnps.500kb.RData",
frq.file="./Data/1KG_AF/gws.frq.RData")
# For each LD SNP, (1) identify each dhscor where SNP falls within promoter or distal genomic region;
# (2) add gene, cor (if in promoter, set cor=1), and an indicator if it is in a distal or promoter region.
ld.dhscor(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
dhscor.file="./Data/genomic_resource/dhscor.RData",
mpos.rng.p = c("Chr.pdhs", "Start.pdhs", "Stop.pdhs"),
mpos.rng.d = c("Chr.ddhs", "Start.ddhs", "Stop.ddhs"))
# ----- Merge the CHIA-PET data
# ld.chiapet: For each LD SNP, (1) indentify each chiapet where SNP falls within promoter or distal genomic region;
# (2) add gene, cor (if in promoter, set cor=1), and an indicator if it is in a distal or promoter region.
ld.chiapet(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
chiapet.file="./Data/genomic_resource/CHIA.PET_5celltypes.RData",
mpos.rng.d = c("dChr", "dStart", "dEnd"))
# ld.eQTL: For each LD SNP, (1) indentify each eQTL where SNP falls within promoter or distal genomic region;
# (2) add gene, cor (if in promoter, set cor=1), and an indicator if it is in a distal or promoter region.
# (2) add gene, cor (if in promoter, set cor=1), and an indicator if it is in a distal or promoter region.
ld.eQTL(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
eQTL.file="./Data/genomic_resource/eQTL_single_44Tissues.RData")
ld.multi.eQTL(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData" ,
eQTL.multi.file="./Data/genomic_resource/eQTL_multi_9Tissues.RData")
# ld.fantom5: For each LD SNP, (1) indentify each fantom5 where SNP falls within promoter or distal genomic region;
# (2) add gene, tissue and an indicator if it is in a distal region (note no promoter region here).
ld.fantom5(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
fantom5.file="./Data/genomic_resource/FANTOM5.RData",
mpos.rng.d = c("chr", "startpos", "endpos"))
# ld.CHIC: For each LD SNP, (1) indentify each CHIC where SNP falls within promoter or distal genomic region;
# (2) add gene, tissue and an indicator if it is in a distal region (note no promoter region here).
ld.CHIC(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
CHIC.file="./Data/genomic_resource/chic.RData",
mpos.rng.d = c("chr", "startpos", "endpos"))
# add cato to ld snps
ld.cato(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
cato.file="./Data/genomic_resource/cato.RData")
# gwas.ld.deltaSVM.RData is generated and downloaded at Data/genomic_resource
ld.deltaSVM()
# gwas.ld.phylop.RData is generated and downloaded at Data/genomic_resource
ld.phylop()
# ld.Posgene: For each LD SNP, (1) identify genes falling in the 5Kb windows (upstream 5kb, downstream 5kb)
ld.posgene(ld.file="./STOPGAP2_LDResults/gwas_LD/ld.snps.updated.plusGWASsnps.500kb.RData",
Posgene.file="/Data/genomic_resource/gencode_v19_clean.RData")
# ld.vep: Merge gwas ld SNPs and VEP information.
# gwas.ld.vep.RData is generated and downloaded at Data/genomic_resource
ld.vep()
# ld.vep.simplify: Simplify the full version of merged gwas ld SNPs and VEP data.
ld.vep.simplify(vep.file="./VarGeneMapping/gwas.ld.vep.full.RData",
severity.file = "./VarGeneMapping/VEP_Consequence_w_severity.txt")
#merge.ld.simp: Merge based on SNP.ld and gene from each data
# % of unique (ld) GWAS SNPs with at least one genes
merge.ld.simp(dhscor.file ="./VarGeneMapping/gwas.ld.dhscor.RData",
chiapet.file = "./VarGeneMapping/gwas.ld.chiapet.RData",
fantom5.file = "./VarGeneMapping/gwas.ld.fantom5.RData",
CHIC.file="./VarGeneMapping/gwas.ld.CHIC.RData",
eQTL.s.file = "./VarGeneMapping/gwas.ld.eQTL.single.RData",
eQTL.m.file = "./VarGeneMapping/gwas.ld.eQTL.multi.RData",
cato.file="./VarGeneMapping/gwas.ld.cato.RData",
delta.file="./VarGeneMapping/gwas.ld.deltaSVM.RData",
phylop.file="./VarGeneMapping/gwas.ld.phylop.RData",
posgene.file = "./VarGeneMapping/gwas.ld.posgene.RData",
vep.file.simplified = "./VarGeneMapping/gwas.ld.vep.simplify.RData",
Posigene.file="./Data/gencode_v19_clean.RData")
# Subset of the var2gene data by using the union of RefSeq and Gencode gene names
# Also Update the var2gene mapping data by removing any rows with conflicting chr.ld with those from the chr information from gencode data
var2gene.filter(var2gene.file = "./VarGeneMapping/gwas.ld.var2gene.simplified.RData",
gencode.file = "./Data/gencode_v19_clean.RData")
# gwas.mesh: add mesh terms to the GWAS data.
# Run it on Windows only
gwas.mesh(gwas.file="./stopgap_4sources_dbSNP141.RData",
disease2mesh.file = "./Data/MeSH_Aug2016.txt")
# Merge GWAS data, LD r2 data and var2gene_vepSimp data together
gwas.ld.var2gene1(gwas.file="stopgap_4sources_dbSNP141_msh.RData",
ld.file = "./STOPGAP2_LDResults/gwas_LD/ld.snps.r2.updated.plusGWASsnps.500kb.AF.RData",
var2gene.file = "./VarGeneMapping/Var2Gene_vepSimp.RData")
rm.mhc.genes(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_chr6.RData",
lRmMHC=TRUE)
# Add gene scores to the final Mergedata_VEPsimplified_withGene_r2_0.7.RData data
gene.score(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_chr",
severity.file = "./VarGeneMapping/VEP_Consequence_w_severity.txt")
# gene.score1(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.5_rmMHC.RData",
# severity.file = "./VarGeneMapping/VEP_Consequence_w_severity.txt")
#
# gene.rank1(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.5_rmMHC_genescore.RData")
#ld.data: Get the LD data as a starting point for SNP locus clustering:
ld.data(data.file="./LocusClustering/gwas_r2.RData")
# locus.cluster1: clustering all the GWAS SNPs and LD SNPs into locus clusters based on LD r2 information between the GWAS SNPs and LD SNPs
# Note the ld data in the ld.file has been sorted by p-values
# This algorithm does not allow the LD (not GWAS) snps presenting in mulitiple locus clusters
# Run scripts at /GWD/bioinfo/projects/statgen3/PGx6691_STOPGAP/STOPGAP2.3/scripts/locus.cluster
locus.cluster(ld.file="./LocusClustering/ld")
#Fine mapping of casual variants using PICS
fine.map.pics(cluster.file = "./LocusClustering/STOPGAP_SNP_Cluster_",
gwas.file="stopgap_4sources_dbSNP141_msh.RData")
# For each unique GWAS SNP, rank its LD SNPs' genes based on their gene scores and r2 if gene scores tie
gene.rank(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_rmMHC_genescore_chr")
# Add cluser number data to the best LD data
# Also add the ranks based on the p-values of snp.gwas within each cluster in each pubmedid/disease combination
add.cluster(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_rmMHC_genescore_generank_bestLD_chr",
cluster.file = "./LocusClustering/STOPGAP_SNP_Cluster_")
#combine stopgap.data.bestld datasets
Combine.bestld.data(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_MHC_genescore_generank_bestLD_cluster_chr")
# Generate the bestld data per gene-MeSH.
# Rank by the p-value bins and then by gene scores for bestld data per gene-MeSH.
# P-value bins (-log10(P)): > 12 as 1; 8-12 as 2 and <8 as 3.
mesh.gene(data.file="./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_MHC_genescore_generank_bestLD_cluster.RData")
# merge.orphanet: Merge stopgap.gene.mesh with latest Orphanet data processed on 11/13/2104
# Create merge between Orphanet, and stopgap.gene.mesh
# Change "Link" to "pubmedid" and add the OrphID as "pubmedid" when the Source=="Orphanet".
# Set Rank=1, pval2 as 0 and GeneScore=max(stopgap.gene.mesh$gene.score)
# Remove MSH.TOP and OrphID columms and change the names to being consistent with stopgap.gene.mesh
merge.orphanet(gm.file = "./Mesh_gene/STOPGAP_r2_0.7_rmMHC_bestLD_gene_mesh.RData",
orphanet.file = "./Data/orphanet.RData",
gencode.file="./Data/gencode_v19_clean.RData")
revice.bestld(gm.file = "./GWAS_LD_var2gene/Mergedata_VEPsimplified_withGene_r2_0.7_MHC_genescore_generank_bestLD_cluster.RData",
gencode.file="./Data/gencode_v19_clean.RData" )