From 629f5e92716171975bf99d3663ab370a0c427d30 Mon Sep 17 00:00:00 2001 From: szandavi Date: Mon, 6 May 2024 13:42:30 -0400 Subject: [PATCH 01/13] new susie method --- ldsc/src/main/scala/MakeSuSiE.scala | 73 ++++++++++++++++++ susie/.editorconfig | 14 ++++ susie/.scalafmt.conf | 4 + susie/LICENSE.txt | 29 +++++++ susie/README.md | 14 ++++ susie/built.sbt | 70 +++++++++++++++++ susie/project/build.properties | 1 + susie/project/plugins.sbt | 1 + susie/src/main/resources/install-susie.sh | 63 +++++++++++++++ susie/src/main/resources/makeSuSiE.py | 93 +++++++++++++++++++++++ susie/src/main/scala/MakeSuSiE.scala | 54 +++++++++++++ susie/src/main/scala/Susie.scala | 27 +++++++ susie/version.sbt | 1 + 13 files changed, 444 insertions(+) create mode 100644 ldsc/src/main/scala/MakeSuSiE.scala create mode 100644 susie/.editorconfig create mode 100644 susie/.scalafmt.conf create mode 100644 susie/LICENSE.txt create mode 100644 susie/README.md create mode 100644 susie/built.sbt create mode 100644 susie/project/build.properties create mode 100644 susie/project/plugins.sbt create mode 100644 susie/src/main/resources/install-susie.sh create mode 100644 susie/src/main/resources/makeSuSiE.py create mode 100644 susie/src/main/scala/MakeSuSiE.scala create mode 100644 susie/src/main/scala/Susie.scala create mode 100644 susie/version.sbt diff --git a/ldsc/src/main/scala/MakeSuSiE.scala b/ldsc/src/main/scala/MakeSuSiE.scala new file mode 100644 index 00000000..c69cf05a --- /dev/null +++ b/ldsc/src/main/scala/MakeSuSiE.scala @@ -0,0 +1,73 @@ +package org.broadinstitute.dig.aggregator.methods.susie + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws.emr._ +import org.broadinstitute.dig.aws.Ec2.Strategy +import org.broadinstitute.dig.aws.MemorySize + +class MakeSuSiE(implicit context: Context) extends Stage { + import MemorySize.Implicits._ + + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/*/") 
+ // val mixedDatasets: Input.Source = Input.Source.Success("variants/*/*/*/") + + /** Source inputs. */ + override val sources: Seq[Input.Source] = Seq(ancestrySpecific, mixedDatasets) + + /** Map inputs to their outputs. */ + override val rules: PartialFunction[Input, Outputs] = { + case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") + // case mixedDatasets(_, _, phenotype) => Outputs.Named(s"$phenotype/Mixed") + } + + /** Just need a single machine with no applications, but a good drive. */ + override def cluster: ClusterDef = super.cluster.copy( + instances = 1, + applications = Seq.empty, + masterVolumeSizeInGB = 100, + bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-susie.sh"))) + ) + + override def make(output: String): Job = { + val input = MakeSuSiEInput.fromString(output) + new Job(Job.Script(resourceUri("makeSuSiE.py"), input.flags:_*)) + } + + /** Before the jobs actually run, perform this operation. + */ + override def prepareJob(output: String): Unit = { + val input = MakeSuSiEInput.fromString(output) + context.s3.rm(input.outputDirectory + "/") + } + + /** On success, write the _SUCCESS file in the output directory. 
+ */ + override def success(output: String): Unit = { + val input = MakeSuSiEInput.fromString(output) + context.s3.touch(input.outputDirectory + "/_SUCCESS") + () + } +} + +case class MakeSuSiEInput( + clump: String, + varId2rsId: String, + ld-folder: String, + out-folder: String +) { + def outputDirectory: String = s"out/susie/$phenotype/ancestry=$ancestry" + + def flags: Seq[String] = Seq(s"--clump=$MakeSuSiE.ancestrySpecific($phenotype, $ancestry)", + s"--varId2rsId=$ancestry", + s"--ld-folder=$ancestry", + s"--out-folder=$outputDirectory") +} + +object MakeSumstatsInput { + def fromString(output: String): MakeSumstatsInput = { + output.split("/").toSeq match { + case Seq(phenotype, ancestry) => MakeSumstatsInput(phenotype, ancestry) + } + } +} + diff --git a/susie/.editorconfig b/susie/.editorconfig new file mode 100644 index 00000000..587935a6 --- /dev/null +++ b/susie/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*] +insert_final_newline = true + +[*.java] +indent_style = space +indent_size = 4 +trim_trailing_whitespace = true + +[*.{scala,sbt}] +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true diff --git a/susie/.scalafmt.conf b/susie/.scalafmt.conf new file mode 100644 index 00000000..ca683bf8 --- /dev/null +++ b/susie/.scalafmt.conf @@ -0,0 +1,4 @@ +version = "2.4.2" +align=more +docstrings=ScalaDoc +maxColumn=120 diff --git a/susie/LICENSE.txt b/susie/LICENSE.txt new file mode 100644 index 00000000..0d0952ea --- /dev/null +++ b/susie/LICENSE.txt @@ -0,0 +1,29 @@ +Copyright 2020 + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. diff --git a/susie/README.md b/susie/README.md new file mode 100644 index 00000000..b9d83a00 --- /dev/null +++ b/susie/README.md @@ -0,0 +1,14 @@ +# susie + +This is the documentation about the method. + +Please put some details here about the method, what its inputs are, what its +outputs are, where it reads from, and where it writes to. + +## Stages + +These are the stages of susie. + +### SusieStage + +A description of what this stage does. 
diff --git a/susie/built.sbt b/susie/built.sbt new file mode 100644 index 00000000..bda37340 --- /dev/null +++ b/susie/built.sbt @@ -0,0 +1,70 @@ +val Versions = new { + val Aggregator = "0.3.4-SNAPSHOT" + val Scala = "2.13.2" +} + +// set the version of scala to compile with +scalaVersion := Versions.Scala + +// add scala compile flags +scalacOptions ++= Seq( + "-feature", + "-deprecation", + "-unchecked", + "-Ywarn-value-discard" +) + +// add required libraries +libraryDependencies ++= Seq( + "org.broadinstitute.dig" %% "dig-aggregator-core" % Versions.Aggregator +) + +// set the organization this method belongs to +organization := "org.broadinstitute.dig" + +// entry point when running this method +mainClass := Some("org.broadinstitute.dig.aggregator.methods.susie.Susie") + +// enables buildInfo, which bakes git version info into the jar +enablePlugins(GitVersioning) + +// get the buildInfo task +val buildInfoTask = taskKey[Seq[File]]("buildInfo") + +// define execution code for task +buildInfoTask := { + val file = (resourceManaged in Compile).value / "version.properties" + + // log where the properties will be written to + streams.value.log.info(s"Writing version info to $file...") + + // collect git versioning information + val branch = git.gitCurrentBranch.value + val lastCommit = git.gitHeadCommit.value + val describedVersion = git.gitDescribedVersion.value + val anyUncommittedChanges = git.gitUncommittedChanges.value + val remoteUrl = (scmInfo in ThisBuild).value.map(_.browseUrl.toString) + val buildDate = java.time.Instant.now + + // map properties + val properties = Map[String, String]( + "branch" -> branch, + "lastCommit" -> lastCommit.getOrElse(""), + "remoteUrl" -> remoteUrl.getOrElse(""), + "uncommittedChanges" -> anyUncommittedChanges.toString, + "buildDate" -> buildDate.toString + ) + + // build properties content + val contents = properties.toList.map { + case (key, value) if value.length > 0 => s"$key=$value" + case _ => "" + } + + // output the 
version information from git to version.properties + IO.write(file, contents.mkString("\n")) + Seq(file) +} + +// add the build info task output to resources +(resourceGenerators in Compile) += buildInfoTask.taskValue diff --git a/susie/project/build.properties b/susie/project/build.properties new file mode 100644 index 00000000..e67343ae --- /dev/null +++ b/susie/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.5.0 diff --git a/susie/project/plugins.sbt b/susie/project/plugins.sbt new file mode 100644 index 00000000..23d5057a --- /dev/null +++ b/susie/project/plugins.sbt @@ -0,0 +1 @@ +addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") diff --git a/susie/src/main/resources/install-susie.sh b/susie/src/main/resources/install-susie.sh new file mode 100644 index 00000000..d54eeba0 --- /dev/null +++ b/susie/src/main/resources/install-susie.sh @@ -0,0 +1,63 @@ +#!/bin/bash -xe + +# susie method +## Developed with python 3 and R + +SuSiE_ROOT=/mnt/var/susie + +# install to the root directory +sudo mkdir -p "$SuSiE_ROOT" +cd "$SuSiE_ROOT" + +# install yum dependencies +sudo yum install -y python3-devel +sudo yum install -y R + +# Install R-4.1.0 +# sudo wget https://cdn.rstudio.com/r/centos-7/pkgs/R-4.1.0-1-1.x86_64.rpm +# sudo yum install -y R-4.1.0-1-1.x86_64.rpm +# sudo rm R-4.1.0-1-1.x86_64.rpm + +# # find R directory +# for cmd in $(ls /usr/bin); do +# if echo "$cmd" | grep -qi "R"; then +# whereis $cmd +# fi +# done + + +# install R dependencies +sudo R -e "install.packages('dplyr', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('tidyr', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('base', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('stats', repos='http://cran.rstudio.com/')" +# sudo R -e "install.packages('https://cran.r-project.org/src/contrib/Archive/coloc/coloc_5.1.0.tar.gz', repos = NULL, type = 'source')" +sudo R -e "install.packages('coloc', repos='http://cran.rstudio.com/')" +sudo R -e 
"install.packages('sjmisc', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('susieR', repos='http://cran.rstudio.com/')" +# sudo R -e "install.packages('https://cran.hafro.is/contrib/main/00Archive/susieR/susieR_0.11.42.tar.gz', repos = NULL, type = 'source')" +sudo R -e "install.packages('stringr', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('Matrix', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('jsonlite', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('data.table', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('parallel', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('strengejacke', repos='http://cran.rstudio.com/')" +sudo R -e "install.packages('http://www.well.ox.ac.uk/~gav/resources/rbgen_v1.1.5.tgz', repos = NULL, type = 'source')" + +# install python dependencies +pip3 install -U pandas +pip3 install -U numpy +pip3 install -U fsspec + +# pull down LD bfiles +sudo mkdir -p ./1000G_EUR_plink +sudo aws s3 cp s3://dig-analysis-bin/susie/1000G_EUR_plink/ ./1000G_EUR_plink/ --recursive + +# fetch snps for mapping +sudo aws s3 cp "s3://dig-analysis-bin/snps/dbSNP_common_GRCh37.csv" ./snps.csv + +sudo aws s3 cp s3://dig-analysis-bin/susie/SuSiE.r ./ +sudo aws s3 cp s3://dig-analysis-bin/susie/plink ./ +sudo aws s3 cp s3://dig-analysis-bin/susie/plink_ld_snp_list.sh ./ +sudo chmod 777 ./plink_ld_snp_list.sh +sudo chmod 777 ./plink \ No newline at end of file diff --git a/susie/src/main/resources/makeSuSiE.py b/susie/src/main/resources/makeSuSiE.py new file mode 100644 index 00000000..7ab2bb3f --- /dev/null +++ b/susie/src/main/resources/makeSuSiE.py @@ -0,0 +1,93 @@ +#!/usr/bin/python3 +from optparse import OptionParser +import pandas as pd +import numpy as np +import shutil +import subprocess +import os + +s3_in=os.environ['INPUT_PATH'] +s3_out=os.environ['OUTPUT_PATH'] + +# def finds json files in the directory +def make_json_files(directory): + 
subprocess.check_call(['aws', 's3', 'cp', directory, 'input/', '--recursive']) + subprocess.run('cat input/*.json > input.json', shell=True) + shutil.rmtree('input') + +def make_ld_files(directory): + subprocess.check_call(['aws', 's3', 'cp', directory, 'input/', '--recursive']) + subprocess.run('cat input/*.ld > snp_ld.ld', shell=True) + shutil.rmtree('input') + +def main(): + usage = "usage: %prog [options]" + parser = OptionParser(usage) + parser.add_option("","--phenotype", default=None) + parser.add_option("","--ancestry", default=None) + + (options, args) = parser.parse_args() + + clump_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-clumped/{options.phenotype}/ancestry={options.ancestry}' + var2rs_path = '/mnt/var/susie/snps.csv' + out_path = f'{s3_out}/out/susie/staging/{options.phenotype}/ancestry={options.ancestry}' + + # read all files in the clump path + make_json_files(clump_path) + + # read var2rs file + df_var_rs_Id = pd.read_csv(var2rs_path,sep='\t') + + # create the tmp out directory + out_directory = 'data' + if not os.path.exists(out_directory): + os.makedirs(out_directory,exist_ok=True) + + # read clump + df_clump = pd.read_json('input.json', lines=True) + + # sort clump based on the varId + df_clump.sort_values('varId',inplace = True) + df_var_filter = df_var_rs_Id[df_var_rs_Id['varId'].isin(df_clump['varId'])] + + # only common variants + df_clump = df_clump[df_clump['varId'].isin(df_var_filter['varId'])] + df_clump.sort_values('varId',inplace = True) + df_var_filter.sort_values('varId',inplace = True) + + + # add dbSNP into the clump files + df_clump['dbSNP'] = df_var_filter['dbSNP'].to_numpy() + + df_clump = df_clump.rename(columns={'dbSNP':'rsId','reference':'ref'}) + + # for loop over clump ids + for i in sorted(df_clump['clump'].unique()): + # filter gwas based on the clump id + df_susie = df_clump[df_clump['clump']==i] + chrom = df_susie['chromosome'].to_numpy()[0] + gwas_susie_file_name = out_directory+'/'+'clump_'+str(i)+'.csv' 
+ df_susie.to_csv(gwas_susie_file_name,sep='\t',index=False) + df_susie['rsId'].to_csv(f'{out_directory}/snps.txt',sep='\t',index=False,header=False) + + # calculate LD for snps list + subprocess.call(["bash", "/mnt/var/susie/plink_ld_snp_list.sh", f'{chrom}', f'{out_directory}/snps.txt', f'{out_directory}/snps_ld']) + + # Call the Bash script of SuSiE with its arguments + argument1_gwas = gwas_susie_file_name + argument2_ld = f'{out_directory}/snps_ld.ld' + argument3_out = out_directory + subprocess.call(['Rscript','/mnt/var/susie/SuSiE.r','--gwas',gwas_susie_file_name,'--ld',argument2_ld, '--out',argument3_out]) + os.remove(argument1_gwas) + + os.remove(f'{out_directory}/snps.txt') + os.remove(f'{out_directory}/snps_ld.ld') + os.remove(f'{out_directory}/snps_ld.log') + os.remove(f'{out_directory}/snps_ld.nosex') + subprocess.check_call(['touch', f'{out_directory}/_SUCCESS']) + subprocess.check_call(['aws','s3','cp',f'{out_directory}/',out_path,'--recursive']) + os.remove('input.json') + shutil.rmtree(out_directory) + +if __name__ == '__main__': + main() diff --git a/susie/src/main/scala/MakeSuSiE.scala b/susie/src/main/scala/MakeSuSiE.scala new file mode 100644 index 00000000..5a989057 --- /dev/null +++ b/susie/src/main/scala/MakeSuSiE.scala @@ -0,0 +1,54 @@ +package org.broadinstitute.dig.aggregator.methods.susie + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws.emr._ +import org.broadinstitute.dig.aws.Ec2.Strategy +import org.broadinstitute.dig.aws.MemorySize + +class MakeSuSiE(implicit context: Context) extends Stage { + import MemorySize.Implicits._ + + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/*/") + // val mixedDatasets: Input.Source = Input.Source.Success("variants/*/*/*/") + + /** Source inputs. */ + override val sources: Seq[Input.Source] = Seq(ancestrySpecific) + + /** Map inputs to their outputs. 
*/ + override val rules: PartialFunction[Input, Outputs] = { + case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") + // case mixedDatasets(_, _, phenotype) => Outputs.Named(s"$phenotype/Mixed") + } + + /** Just need a single machine with no applications, but a good drive. */ + override def cluster: ClusterDef = super.cluster.copy( + instances = 1, + applications = Seq.empty, + masterVolumeSizeInGB = 100, + bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-susie.sh"))) + ) + + override def make(output: String): Job = { + val input = MakeSuSiEInput.fromString(output) + new Job(Job.Script(resourceUri("makeSuSiE.py"), input.flags:_*)) + } + +} + + +case class MakeSuSiEInput( + phenotype: String, + ancestry: String +) { + + def flags: Seq[String] = Seq(s"--phenotype=$phenotype", s"--ancestry=$ancestry") +} + +object MakeSuSiEInput { + def fromString(output: String): MakeSuSiEInput = { + output.split("/").toSeq match { + case Seq(phenotype, ancestry) => MakeSuSiEInput(phenotype, ancestry) + } + } +} + diff --git a/susie/src/main/scala/Susie.scala b/susie/src/main/scala/Susie.scala new file mode 100644 index 00000000..d051bd37 --- /dev/null +++ b/susie/src/main/scala/Susie.scala @@ -0,0 +1,27 @@ +package org.broadinstitute.dig.aggregator.methods.susie + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws._ +import org.broadinstitute.dig.aws.emr._ + +/** This is your aggregator method. + * + * All that needs to be done here is to implement the initStages function, + * which adds stages to the method in the order they should be executed. + * + * When you are ready to run it, use SBT from the CLI: + * + * sbt run [args] + * + * See the README of the dig-aggregator-core project for a complete list of + * CLI arguments available. + */ +object Susie extends Method { + + /** Add all stages used in this method here. 
Stages must be added in the + * order they should be serially executed. + */ + override def initStages(implicit context: Context) = { + addStage(new MakeSuSiE) + } +} diff --git a/susie/version.sbt b/susie/version.sbt new file mode 100644 index 00000000..e7654440 --- /dev/null +++ b/susie/version.sbt @@ -0,0 +1 @@ +version in ThisBuild := "0.1.0" From a4572725815a3b8d0be5f10fb81ccb1b14a6d164 Mon Sep 17 00:00:00 2001 From: szandavi Date: Mon, 6 May 2024 14:27:58 -0400 Subject: [PATCH 02/13] clean code --- susie/src/main/resources/install-susie.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/susie/src/main/resources/install-susie.sh b/susie/src/main/resources/install-susie.sh index d54eeba0..a2e79fd9 100644 --- a/susie/src/main/resources/install-susie.sh +++ b/susie/src/main/resources/install-susie.sh @@ -13,29 +13,15 @@ cd "$SuSiE_ROOT" sudo yum install -y python3-devel sudo yum install -y R -# Install R-4.1.0 -# sudo wget https://cdn.rstudio.com/r/centos-7/pkgs/R-4.1.0-1-1.x86_64.rpm -# sudo yum install -y R-4.1.0-1-1.x86_64.rpm -# sudo rm R-4.1.0-1-1.x86_64.rpm - -# # find R directory -# for cmd in $(ls /usr/bin); do -# if echo "$cmd" | grep -qi "R"; then -# whereis $cmd -# fi -# done - # install R dependencies sudo R -e "install.packages('dplyr', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('tidyr', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('base', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('stats', repos='http://cran.rstudio.com/')" -# sudo R -e "install.packages('https://cran.r-project.org/src/contrib/Archive/coloc/coloc_5.1.0.tar.gz', repos = NULL, type = 'source')" sudo R -e "install.packages('coloc', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('sjmisc', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('susieR', repos='http://cran.rstudio.com/')" -# sudo R -e 
"install.packages('https://cran.hafro.is/contrib/main/00Archive/susieR/susieR_0.11.42.tar.gz', repos = NULL, type = 'source')" sudo R -e "install.packages('stringr', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('Matrix', repos='http://cran.rstudio.com/')" sudo R -e "install.packages('jsonlite', repos='http://cran.rstudio.com/')" From 2fcb8a3550f09dc728bcf667b7ecd559403c127c Mon Sep 17 00:00:00 2001 From: szandavi Date: Thu, 16 May 2024 13:06:27 -0400 Subject: [PATCH 03/13] clean up code --- susie/built.sbt | 2 +- susie/project/build.properties | 2 +- susie/src/main/resources/makeSuSiE.py | 41 +++++++++++++-------------- susie/src/main/scala/MakeSuSiE.scala | 3 +- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/susie/built.sbt b/susie/built.sbt index bda37340..461df957 100644 --- a/susie/built.sbt +++ b/susie/built.sbt @@ -1,5 +1,5 @@ val Versions = new { - val Aggregator = "0.3.4-SNAPSHOT" + val Aggregator = "0.3.5-SNAPSHOT" val Scala = "2.13.2" } diff --git a/susie/project/build.properties b/susie/project/build.properties index e67343ae..46e43a97 100644 --- a/susie/project/build.properties +++ b/susie/project/build.properties @@ -1 +1 @@ -sbt.version=1.5.0 +sbt.version=1.8.2 diff --git a/susie/src/main/resources/makeSuSiE.py b/susie/src/main/resources/makeSuSiE.py index 7ab2bb3f..b1229df6 100644 --- a/susie/src/main/resources/makeSuSiE.py +++ b/susie/src/main/resources/makeSuSiE.py @@ -15,69 +15,66 @@ def make_json_files(directory): subprocess.run('cat input/*.json > input.json', shell=True) shutil.rmtree('input') -def make_ld_files(directory): - subprocess.check_call(['aws', 's3', 'cp', directory, 'input/', '--recursive']) - subprocess.run('cat input/*.ld > snp_ld.ld', shell=True) - shutil.rmtree('input') - def main(): usage = "usage: %prog [options]" parser = OptionParser(usage) - parser.add_option("","--phenotype", default=None) - parser.add_option("","--ancestry", default=None) + parser.add_option("", "--phenotype", 
default=None) + parser.add_option("", "--ancestry", default=None) - (options, args) = parser.parse_args() + args = parser.parse_args() - clump_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-clumped/{options.phenotype}/ancestry={options.ancestry}' + clump_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-clumped/{args.phenotype}/ancestry={args.ancestry}' var2rs_path = '/mnt/var/susie/snps.csv' - out_path = f'{s3_out}/out/susie/staging/{options.phenotype}/ancestry={options.ancestry}' + out_path = f'{s3_out}/out/susie/staging/{args.phenotype}/ancestry={args.ancestry}' # read all files in the clump path make_json_files(clump_path) # read var2rs file - df_var_rs_Id = pd.read_csv(var2rs_path,sep='\t') + df_var_rs_Id = pd.read_csv(var2rs_path, sep='\t') # create the tmp out directory out_directory = 'data' if not os.path.exists(out_directory): - os.makedirs(out_directory,exist_ok=True) + os.makedirs(out_directory, exist_ok=True) # read clump df_clump = pd.read_json('input.json', lines=True) # sort clump based on the varId - df_clump.sort_values('varId',inplace = True) + df_clump.sort_values('varId', inplace = True) df_var_filter = df_var_rs_Id[df_var_rs_Id['varId'].isin(df_clump['varId'])] # only common variants df_clump = df_clump[df_clump['varId'].isin(df_var_filter['varId'])] - df_clump.sort_values('varId',inplace = True) - df_var_filter.sort_values('varId',inplace = True) + df_clump.sort_values('varId', inplace = True) + df_var_filter.sort_values('varId', inplace = True) # add dbSNP into the clump files df_clump['dbSNP'] = df_var_filter['dbSNP'].to_numpy() - df_clump = df_clump.rename(columns={'dbSNP':'rsId','reference':'ref'}) + df_clump = df_clump.rename(columns={'dbSNP':'rsId', 'reference':'ref'}) # for loop over clump ids for i in sorted(df_clump['clump'].unique()): # filter gwas based on the clump id df_susie = df_clump[df_clump['clump']==i] chrom = df_susie['chromosome'].to_numpy()[0] - gwas_susie_file_name = out_directory+'/'+'clump_'+str(i)+'.csv' 
- df_susie.to_csv(gwas_susie_file_name,sep='\t',index=False) - df_susie['rsId'].to_csv(f'{out_directory}/snps.txt',sep='\t',index=False,header=False) + gwas_susie_file_name = f'{out_directory}/clump_{i}.csv' + df_susie.to_csv(gwas_susie_file_name, sep='\t', index=False) + df_susie['rsId'].to_csv(f'{out_directory}/snps.txt', sep='\t', index=False, header=False) # calculate LD for snps list - subprocess.call(["bash", "/mnt/var/susie/plink_ld_snp_list.sh", f'{chrom}', f'{out_directory}/snps.txt', f'{out_directory}/snps_ld']) + subprocess.call(["bash", "/mnt/var/susie/plink_ld_snp_list.sh", f'{chrom}', + f'{out_directory}/snps.txt', f'{out_directory}/snps_ld']) # Call the Bash script of SuSiE with its arguments argument1_gwas = gwas_susie_file_name argument2_ld = f'{out_directory}/snps_ld.ld' argument3_out = out_directory - subprocess.call(['Rscript','/mnt/var/susie/SuSiE.r','--gwas',gwas_susie_file_name,'--ld',argument2_ld, '--out',argument3_out]) + subprocess.call(['Rscript', '/mnt/var/susie/SuSiE.r', '--gwas', gwas_susie_file_name, + '--ld', argument2_ld, '--out', argument3_out]) os.remove(argument1_gwas) os.remove(f'{out_directory}/snps.txt') @@ -85,7 +82,7 @@ def main(): os.remove(f'{out_directory}/snps_ld.log') os.remove(f'{out_directory}/snps_ld.nosex') subprocess.check_call(['touch', f'{out_directory}/_SUCCESS']) - subprocess.check_call(['aws','s3','cp',f'{out_directory}/',out_path,'--recursive']) + subprocess.check_call(['aws', 's3', 'cp', f'{out_directory}/', out_path, '--recursive']) os.remove('input.json') shutil.rmtree(out_directory) diff --git a/susie/src/main/scala/MakeSuSiE.scala b/susie/src/main/scala/MakeSuSiE.scala index 5a989057..cf55bf92 100644 --- a/susie/src/main/scala/MakeSuSiE.scala +++ b/susie/src/main/scala/MakeSuSiE.scala @@ -8,8 +8,7 @@ import org.broadinstitute.dig.aws.MemorySize class MakeSuSiE(implicit context: Context) extends Stage { import MemorySize.Implicits._ - val ancestrySpecific: Input.Source = 
Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/*/") - // val mixedDatasets: Input.Source = Input.Source.Success("variants/*/*/*/") + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") /** Source inputs. */ override val sources: Seq[Input.Source] = Seq(ancestrySpecific) From 88eb34deab6cc09f9e2a1c7ad337b5f25a9f9237 Mon Sep 17 00:00:00 2001 From: szandavi Date: Thu, 16 May 2024 13:20:49 -0400 Subject: [PATCH 04/13] clean up code --- susie/src/main/scala/Susie.scala | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 susie/src/main/scala/Susie.scala diff --git a/susie/src/main/scala/Susie.scala b/susie/src/main/scala/Susie.scala deleted file mode 100644 index d051bd37..00000000 --- a/susie/src/main/scala/Susie.scala +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.dig.aggregator.methods.susie - -import org.broadinstitute.dig.aggregator.core._ -import org.broadinstitute.dig.aws._ -import org.broadinstitute.dig.aws.emr._ - -/** This is your aggregator method. - * - * All that needs to be done here is to implement the initStages function, - * which adds stages to the method in the order they should be executed. - * - * When you are ready to run it, use SBT from the CLI: - * - * sbt run [args] - * - * See the README of the dig-aggregator-core project for a complete list of - * CLI arguments available. - */ -object Susie extends Method { - - /** Add all stages used in this method here. Stages must be added in the - * order they should be serially executed. 
- */ - override def initStages(implicit context: Context) = { - addStage(new MakeSuSiE) - } -} From 13233a9053d862c875bfe6094bfebc8d23f5ba0a Mon Sep 17 00:00:00 2001 From: szandavi Date: Thu, 16 May 2024 15:00:46 -0400 Subject: [PATCH 05/13] back susie.scala --- susie/src/main/scala/Susie.scala | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 susie/src/main/scala/Susie.scala diff --git a/susie/src/main/scala/Susie.scala b/susie/src/main/scala/Susie.scala new file mode 100644 index 00000000..d051bd37 --- /dev/null +++ b/susie/src/main/scala/Susie.scala @@ -0,0 +1,27 @@ +package org.broadinstitute.dig.aggregator.methods.susie + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws._ +import org.broadinstitute.dig.aws.emr._ + +/** This is your aggregator method. + * + * All that needs to be done here is to implement the initStages function, + * which adds stages to the method in the order they should be executed. + * + * When you are ready to run it, use SBT from the CLI: + * + * sbt run [args] + * + * See the README of the dig-aggregator-core project for a complete list of + * CLI arguments available. + */ +object Susie extends Method { + + /** Add all stages used in this method here. Stages must be added in the + * order they should be serially executed. 
+ */ + override def initStages(implicit context: Context) = { + addStage(new MakeSuSiE) + } +} From adf46c8e64700fa7b724f4d06fd728cdb0cf6800 Mon Sep 17 00:00:00 2001 From: szandavi Date: Thu, 16 May 2024 16:10:20 -0400 Subject: [PATCH 06/13] clean up code - ancestry specific --- susie/src/main/scala/MakeSuSiE.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/susie/src/main/scala/MakeSuSiE.scala b/susie/src/main/scala/MakeSuSiE.scala index cf55bf92..1dacf3e6 100644 --- a/susie/src/main/scala/MakeSuSiE.scala +++ b/susie/src/main/scala/MakeSuSiE.scala @@ -15,7 +15,8 @@ class MakeSuSiE(implicit context: Context) extends Stage { /** Map inputs to their outputs. */ override val rules: PartialFunction[Input, Outputs] = { - case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") + case ancestrySpecific(phenotype) => Outputs.Named(s"$phenotype/EU") + // case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") // case mixedDatasets(_, _, phenotype) => Outputs.Named(s"$phenotype/Mixed") } From eb34f385412b449d267e074f2029e857edc3f668 Mon Sep 17 00:00:00 2001 From: szandavi Date: Thu, 16 May 2024 16:52:05 -0400 Subject: [PATCH 07/13] clean up code - ancestry specific --- susie/src/main/resources/makeSuSiE.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/susie/src/main/resources/makeSuSiE.py b/susie/src/main/resources/makeSuSiE.py index b1229df6..dd1817f8 100644 --- a/susie/src/main/resources/makeSuSiE.py +++ b/susie/src/main/resources/makeSuSiE.py @@ -21,7 +21,7 @@ def main(): parser.add_option("", "--phenotype", default=None) parser.add_option("", "--ancestry", default=None) - args = parser.parse_args() + (args,_) = parser.parse_args() clump_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-clumped/{args.phenotype}/ancestry={args.ancestry}' var2rs_path = '/mnt/var/susie/snps.csv' From b42dd19d2fbf74998eb3769c2b0ba474907e80e9 Mon Sep 17 
00:00:00 2001 From: szandavi Date: Fri, 17 May 2024 10:08:11 -0400 Subject: [PATCH 08/13] add safe_remove function ' --- susie/src/main/resources/makeSuSiE.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/susie/src/main/resources/makeSuSiE.py b/susie/src/main/resources/makeSuSiE.py index dd1817f8..788f8d7f 100644 --- a/susie/src/main/resources/makeSuSiE.py +++ b/susie/src/main/resources/makeSuSiE.py @@ -15,6 +15,17 @@ def make_json_files(directory): subprocess.run('cat input/*.json > input.json', shell=True) shutil.rmtree('input') +def safe_remove(file_path): + try: + os.remove(file_path) + print(f"File {file_path} successfully removed.") + except FileNotFoundError: + print(f"File {file_path} does not exist.") + except PermissionError: + print(f"Permission denied: cannot remove {file_path}.") + except Exception as e: + print(f"An error occurred while trying to remove {file_path}: {e}") + def main(): usage = "usage: %prog [options]" parser = OptionParser(usage) @@ -75,15 +86,15 @@ def main(): argument3_out = out_directory subprocess.call(['Rscript', '/mnt/var/susie/SuSiE.r', '--gwas', gwas_susie_file_name, '--ld', argument2_ld, '--out', argument3_out]) - os.remove(argument1_gwas) + safe_remove(argument1_gwas) - os.remove(f'{out_directory}/snps.txt') - os.remove(f'{out_directory}/snps_ld.ld') - os.remove(f'{out_directory}/snps_ld.log') - os.remove(f'{out_directory}/snps_ld.nosex') + safe_remove(f'{out_directory}/snps.txt') + safe_remove(f'{out_directory}/snps_ld.ld') + safe_remove(f'{out_directory}/snps_ld.log') + safe_remove(f'{out_directory}/snps_ld.nosex') subprocess.check_call(['touch', f'{out_directory}/_SUCCESS']) subprocess.check_call(['aws', 's3', 'cp', f'{out_directory}/', out_path, '--recursive']) - os.remove('input.json') + safe_remove('input.json') shutil.rmtree(out_directory) if __name__ == '__main__': From 33aa908b5f456582f72b134a55287fcd05b2eedc Mon Sep 17 00:00:00 2001 From: szandavi Date: Wed, 18 Dec 2024 
06:30:45 -0500 Subject: [PATCH 09/13] add Finemap into susie branch --- susie/.Rapp.history | 0 susie/src/main/resources/install-finemap.sh | 72 +++++++++++++++++++++ susie/src/main/resources/makeFinemap.py | 67 +++++++++++++++++++ susie/src/main/scala/Finemap.scala | 27 ++++++++ susie/src/main/scala/MakeFinemap.scala | 54 ++++++++++++++++ 5 files changed, 220 insertions(+) create mode 100644 susie/.Rapp.history create mode 100644 susie/src/main/resources/install-finemap.sh create mode 100644 susie/src/main/resources/makeFinemap.py create mode 100644 susie/src/main/scala/Finemap.scala create mode 100644 susie/src/main/scala/MakeFinemap.scala diff --git a/susie/.Rapp.history b/susie/.Rapp.history new file mode 100644 index 00000000..e69de29b diff --git a/susie/src/main/resources/install-finemap.sh b/susie/src/main/resources/install-finemap.sh new file mode 100644 index 00000000..ea20dbcb --- /dev/null +++ b/susie/src/main/resources/install-finemap.sh @@ -0,0 +1,72 @@ +#!/bin/bash -xe + +# susie method +## Developed with python 3 and R + +finemap_ROOT=/mnt/var/cojo + +# install to the root directory +sudo mkdir -p "$finemap_ROOT" +cd "$finemap_ROOT" + +# install yum dependencies +sudo yum install -y python3-devel + + +# Install conda +cd $finemap_ROOT +wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh +bash miniconda.sh -b -p $finemap_ROOT/miniconda +echo export PATH="$finemap_ROOT/miniconda/bin:\$PATH" >> ~/.profile +. ~/.profile + +# Install GCTA +cd $finemap_ROOT +mkdir -p ~/software/gcta +cd ~/software/gcta +# Note that this URL may change - old versions aren't accessible at the same URL +wget https://cnsgenomics.com/software/gcta/bin/gcta_1.93.2beta.zip +unzip gcta_1.93.2beta.zip +cd gcta_1.93.2beta +echo export PATH="$PWD:\$PATH" >> ~/.profile +. 
~/.profile + +# Install plink +mkdir -p ~/software/plink +cd ~/software/plink +wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip +unzip plink_linux_x86_64_20201019.zip +echo export PATH="$PWD:\$PATH" >> ~/.profile +. ~/.profile + +# Install FINEMAP +mkdir -p ~/software/finemap +cd ~/software/finemap +wget http://www.christianbenner.com/finemap_v1.4_x86_64.tgz +tar -zxf finemap_v1.4_x86_64.tgz +ln -s finemap_v1.4_x86_64/finemap_v1.4_x86_64 finemap +sudo apt-get install libgomp1 # Not present by default it seems +echo export PATH="$PWD:\$PATH" >> ~/.profile +. ~/.profile + +# Install JRE +sudo apt install -yf openjdk-8-jre-headless openjdk-8-jdk +# sudo update-java-alternatives --list +# sudo update-java-alternatives --set java-1.8.0-openjdk-amd64 + +# Install parallel +sudo apt install -yf parallel + +echo COMPLETE + + +# pull down LD bfiles +sudo mkdir -p ./bfiles +sudo aws s3 cp s3://dig-analysis-bin/cojo/bfiles/ ./bfiles/ --recursive + +# pull down finemap dir +sudo mkdir -p ./finemapping +sudo aws s3 cp s3://dig-analysis-bin/cojo/finemapping/ ./finemapping/ --recursive + +# fetch snps for mapping +sudo aws s3 cp "s3://dig-analysis-bin/snps/dbSNP_common_GRCh37.csv" ./snps.csv diff --git a/susie/src/main/resources/makeFinemap.py b/susie/src/main/resources/makeFinemap.py new file mode 100644 index 00000000..fa81d77b --- /dev/null +++ b/susie/src/main/resources/makeFinemap.py @@ -0,0 +1,67 @@ +#!/usr/bin/python3 +from optparse import OptionParser +import pandas as pd +import numpy as np +import shutil +import subprocess +import os + +s3_in=os.environ['INPUT_PATH'] +s3_out=os.environ['OUTPUT_PATH'] + +# def finds json files in the directory +def make_json_files(directory): + subprocess.check_call(['aws', 's3', 'cp', directory, 'input/', '--recursive']) + subprocess.run('zstdcat input/*.json.zst | jq -s '.' 
> input/input.json', shell=True) + +def safe_remove(file_path): + try: + os.remove(file_path) + print(f"File {file_path} successfully removed.") + except FileNotFoundError: + print(f"File {file_path} does not exist.") + except PermissionError: + print(f"Permission denied: cannot remove {file_path}.") + except Exception as e: + print(f"An error occurred while trying to remove {file_path}: {e}") + +def main(): + usage = "usage: %prog [options]" + parser = OptionParser(usage) + parser.add_option("", "--phenotype", default=None) + parser.add_option("", "--ancestry", default=None) + + (args,_) = parser.parse_args() + + pheno_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-specific/{phenotype}/ancestry={ancestry}/' + var2rs_path = '/mnt/var/cojo/snps.csv' + bfiles = '/mnt/var/cojo/bfiles' + finemap_dir = '/mnt/var/cojo/finemapping' + config_file = '/mnt/var/cojo/finemapping/analysis.config.yaml' + out_path = f'{s3_out}/out/cojo/staging/{args.phenotype}/ancestry={args.ancestry}' + + # read all files in the clump path + make_json_files(pheno_path) + + # create the tmp out directory + out_directory = 'data' + if not os.path.exists(out_directory): + os.makedirs(out_directory, exist_ok=True) + + subprocess.call(['bash', '/mnt/var/cojo/finemapping/run_finemap_pipeline.sh', + '--input','input' + '--bfiles', bfiles, + '--config_file',config_file, + '--dbsnp_file',var2rs_path, + '--output', out_directory, + '--finemap_dir',finemap_dir + ]) + + subprocess.check_call(['touch', f'{out_directory}/_SUCCESS']) + subprocess.check_call(['aws', 's3', 'cp', f'{out_directory}/', out_path, '--recursive']) + safe_remove('input/input.json') + shutil.rmtree('input') + shutil.rmtree(out_directory) + +if __name__ == '__main__': + main() diff --git a/susie/src/main/scala/Finemap.scala b/susie/src/main/scala/Finemap.scala new file mode 100644 index 00000000..ee8f5dd1 --- /dev/null +++ b/susie/src/main/scala/Finemap.scala @@ -0,0 +1,27 @@ +package 
org.broadinstitute.dig.aggregator.methods.susie + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws._ +import org.broadinstitute.dig.aws.emr._ + +/** This is your aggregator method. + * + * All that needs to be done here is to implement the initStages function, + * which adds stages to the method in the order they should be executed. + * + * When you are ready to run it, use SBT from the CLI: + * + * sbt run [args] + * + * See the README of the dig-aggregator-core project for a complete list of + * CLI arguments available. + */ +object Susie extends Method { + + /** Add all stages used in this method here. Stages must be added in the + * order they should be serially executed. + */ + override def initStages(implicit context: Context) = { + addStage(new MakeFinemap) + } +} diff --git a/susie/src/main/scala/MakeFinemap.scala b/susie/src/main/scala/MakeFinemap.scala new file mode 100644 index 00000000..f2d68e6a --- /dev/null +++ b/susie/src/main/scala/MakeFinemap.scala @@ -0,0 +1,54 @@ +package org.broadinstitute.dig.aggregator.methods.susie + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws.emr._ +import org.broadinstitute.dig.aws.Ec2.Strategy +import org.broadinstitute.dig.aws.MemorySize + +class MakeSuSiE(implicit context: Context) extends Stage { + import MemorySize.Implicits._ + + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") + + /** Source inputs. */ + override val sources: Seq[Input.Source] = Seq(ancestrySpecific) + + /** Map inputs to their outputs. 
*/ + override val rules: PartialFunction[Input, Outputs] = { + case ancestrySpecific(phenotype) => Outputs.Named(s"$phenotype/EU") + // case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") + // case mixedDatasets(_, _, phenotype) => Outputs.Named(s"$phenotype/Mixed") + } + + /** Just need a single machine with no applications, but a good drive. */ + override def cluster: ClusterDef = super.cluster.copy( + instances = 1, + applications = Seq.empty, + masterVolumeSizeInGB = 100, + bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-finemap.sh"))) + ) + + override def make(output: String): Job = { + val input = MakeSuSiEInput.fromString(output) + new Job(Job.Script(resourceUri("makeFinemap.py"), input.flags:_*)) + } + +} + + +case class MakeSuSiEInput( + phenotype: String, + ancestry: String +) { + + def flags: Seq[String] = Seq(s"--phenotype=$phenotype", s"--ancestry=$ancestry") +} + +object MakeSuSiEInput { + def fromString(output: String): MakeSuSiEInput = { + output.split("/").toSeq match { + case Seq(phenotype, ancestry) => MakeSuSiEInput(phenotype, ancestry) + } + } +} + From 47c8996aab753e5fd02391a7f82ac8282fdaece0 Mon Sep 17 00:00:00 2001 From: szandavi Date: Wed, 18 Dec 2024 07:06:28 -0500 Subject: [PATCH 10/13] change input in MakeFinemap.scala --- susie/src/main/scala/MakeFinemap.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/susie/src/main/scala/MakeFinemap.scala b/susie/src/main/scala/MakeFinemap.scala index f2d68e6a..35a3d2e5 100644 --- a/susie/src/main/scala/MakeFinemap.scala +++ b/susie/src/main/scala/MakeFinemap.scala @@ -8,7 +8,7 @@ import org.broadinstitute.dig.aws.MemorySize class MakeSuSiE(implicit context: Context) extends Stage { import MemorySize.Implicits._ - val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") + val ancestrySpecific: Input.Source = 
Input.Source.Success("out/metaanalysis/bottom-line/ancestry-specific/*/ancestry=EU/") /** Source inputs. */ override val sources: Seq[Input.Source] = Seq(ancestrySpecific) From 313e6809f16b6b8d01c9373ca1e0d6fe259e522c Mon Sep 17 00:00:00 2001 From: szandavi Date: Wed, 18 Dec 2024 10:16:18 -0500 Subject: [PATCH 11/13] finemap --- finemap/.editorconfig | 14 ++++ finemap/.scalafmt.conf | 4 + finemap/LICENSE.txt | 29 +++++++ finemap/README.md | 14 ++++ finemap/built.sbt | 70 +++++++++++++++++ finemap/project/build.properties | 1 + finemap/project/plugins.sbt | 1 + finemap/src/main/resources/install-finemap.sh | 76 +++++++++++++++++++ finemap/src/main/resources/makeFinemap.py | 67 ++++++++++++++++ finemap/src/main/scala/Finemap.scala | 27 +++++++ finemap/src/main/scala/MakeFinemap.scala | 54 +++++++++++++ finemap/version.sbt | 1 + 12 files changed, 358 insertions(+) create mode 100644 finemap/.editorconfig create mode 100644 finemap/.scalafmt.conf create mode 100644 finemap/LICENSE.txt create mode 100644 finemap/README.md create mode 100644 finemap/built.sbt create mode 100644 finemap/project/build.properties create mode 100644 finemap/project/plugins.sbt create mode 100644 finemap/src/main/resources/install-finemap.sh create mode 100644 finemap/src/main/resources/makeFinemap.py create mode 100644 finemap/src/main/scala/Finemap.scala create mode 100644 finemap/src/main/scala/MakeFinemap.scala create mode 100644 finemap/version.sbt diff --git a/finemap/.editorconfig b/finemap/.editorconfig new file mode 100644 index 00000000..587935a6 --- /dev/null +++ b/finemap/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*] +insert_final_newline = true + +[*.java] +indent_style = space +indent_size = 4 +trim_trailing_whitespace = true + +[*.{scala,sbt}] +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true diff --git a/finemap/.scalafmt.conf b/finemap/.scalafmt.conf new file mode 100644 index 00000000..ca683bf8 --- /dev/null +++ b/finemap/.scalafmt.conf @@ -0,0 
+1,4 @@ +version = "2.4.2" +align=more +docstrings=ScalaDoc +maxColumn=120 diff --git a/finemap/LICENSE.txt b/finemap/LICENSE.txt new file mode 100644 index 00000000..0d0952ea --- /dev/null +++ b/finemap/LICENSE.txt @@ -0,0 +1,29 @@ +Copyright 2020 + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. diff --git a/finemap/README.md b/finemap/README.md new file mode 100644 index 00000000..35b131c6 --- /dev/null +++ b/finemap/README.md @@ -0,0 +1,14 @@ +# finemap + +This is the documentation about the method. 
+ +Please put some details here about the method, what its inputs are, what its +outputs are, where it reads from, and where it writes to. + +## Stages + +These are the stages of finemap. + +### FinemapStage + +A description of what this stage does. diff --git a/finemap/built.sbt b/finemap/built.sbt new file mode 100644 index 00000000..c018bd9e --- /dev/null +++ b/finemap/built.sbt @@ -0,0 +1,70 @@ +val Versions = new { + val Aggregator = "0.3.1-SNAPSHOT" + val Scala = "2.13.2" +} + +// set the version of scala to compile with +scalaVersion := Versions.Scala + +// add scala compile flags +scalacOptions ++= Seq( + "-feature", + "-deprecation", + "-unchecked", + "-Ywarn-value-discard" +) + +// add required libraries +libraryDependencies ++= Seq( + "org.broadinstitute.dig" %% "dig-aggregator-core" % Versions.Aggregator +) + +// set the oranization this method belongs to +organization := "org.broadinstitute.dig" + +// entry point when running this method +mainClass := Some("org.broadinstitute.dig.aggregator.methods.finemap.Finemap") + +// enables buildInfo, which bakes git version info into the jar +enablePlugins(GitVersioning) + +// get the buildInfo task +val buildInfoTask = taskKey[Seq[File]]("buildInfo") + +// define execution code for task +buildInfoTask := { + val file = (resourceManaged in Compile).value / "version.properties" + + // log where the properties will be written to + streams.value.log.info(s"Writing version info to $file...") + + // collect git versioning information + val branch = git.gitCurrentBranch.value + val lastCommit = git.gitHeadCommit.value + val describedVersion = git.gitDescribedVersion.value + val anyUncommittedChanges = git.gitUncommittedChanges.value + val remoteUrl = (scmInfo in ThisBuild).value.map(_.browseUrl.toString) + val buildDate = java.time.Instant.now + + // map properties + val properties = Map[String, String]( + "branch" -> branch, + "lastCommit" -> lastCommit.getOrElse(""), + "remoteUrl" -> remoteUrl.getOrElse(""), + 
"uncommittedChanges" -> anyUncommittedChanges.toString, + "buildDate" -> buildDate.toString + ) + + // build properties content + val contents = properties.toList.map { + case (key, value) if value.length > 0 => s"$key=$value" + case _ => "" + } + + // output the version information from git to versionInfo.properties + IO.write(file, contents.mkString("\n")) + Seq(file) +} + +// add the build info task output to resources +(resourceGenerators in Compile) += buildInfoTask.taskValue diff --git a/finemap/project/build.properties b/finemap/project/build.properties new file mode 100644 index 00000000..e67343ae --- /dev/null +++ b/finemap/project/build.properties @@ -0,0 +1 @@ +sbt.version=1.5.0 diff --git a/finemap/project/plugins.sbt b/finemap/project/plugins.sbt new file mode 100644 index 00000000..23d5057a --- /dev/null +++ b/finemap/project/plugins.sbt @@ -0,0 +1 @@ +addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") diff --git a/finemap/src/main/resources/install-finemap.sh b/finemap/src/main/resources/install-finemap.sh new file mode 100644 index 00000000..21efc858 --- /dev/null +++ b/finemap/src/main/resources/install-finemap.sh @@ -0,0 +1,76 @@ +#!/bin/bash -xe + +# susie method +## Developed with python 3 and R + +finemap_ROOT=/mnt/var/cojo + +# install to the root directory +sudo mkdir -p "$finemap_ROOT" +cd "$finemap_ROOT" + +# install yum dependencies +sudo yum install -y python3-devel + + +# Install conda +cd $finemap_ROOT +sudo wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh +sudo bash miniconda.sh -b -p $finemap_ROOT/miniconda +echo export PATH="$finemap_ROOT/miniconda/bin:\$PATH" >> ~/.profile +. 
~/.profile + +# Install GCTA +cd $finemap_ROOT +sudo mkdir -p ~/software/gcta +cd ~/software/gcta +# Note that this URL may change - old versions aren't accessible at the same URL +sudo wget https://cnsgenomics.com/software/gcta/bin/gcta_1.93.2beta.zip +sudo unzip gcta_1.93.2beta.zip +cd gcta_1.93.2beta +echo export PATH="$PWD:\$PATH" >> ~/.profile +. ~/.profile + +# Install plink +sudo mkdir -p ~/software/plink +cd ~/software/plink +sudo wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip +sudo unzip plink_linux_x86_64_20201019.zip +echo export PATH="$PWD:\$PATH" >> ~/.profile +. ~/.profile + +# Install FINEMAP +sudo mkdir -p ~/software/finemap +cd ~/software/finemap +sudo wget http://www.christianbenner.com/finemap_v1.4_x86_64.tgz +sudo tar -zxf finemap_v1.4_x86_64.tgz +sudo ln -s finemap_v1.4_x86_64/finemap_v1.4_x86_64 finemap +sudo apt-get install libgomp1 # Not present by default it seems +echo export PATH="$PWD:\$PATH" >> ~/.profile +. ~/.profile + +# Install JRE +sudo apt install -yf openjdk-8-jre-headless openjdk-8-jdk +# sudo update-java-alternatives --list +# sudo update-java-alternatives --set java-1.8.0-openjdk-amd64 + +# Install parallel +sudo apt install -yf parallel + +echo COMPLETE + + +# pull down LD bfiles +sudo mkdir -p ./bfiles +sudo aws s3 cp s3://dig-analysis-bin/cojo/bfiles/ ./bfiles/ --recursive + +# pull down finemap dir +sudo mkdir -p ./finemapping +sudo aws s3 cp s3://dig-analysis-bin/cojo/finemapping/ ./finemapping/ --recursive + +sudo chmod 777 ./finemapping/combine_results.sh +sudo chmod 777 ./finemapping/run_finemap_pipeline.sh + + +# fetch snps for mapping +sudo aws s3 cp "s3://dig-analysis-bin/snps/dbSNP_common_GRCh37.csv" ./snps.csv diff --git a/finemap/src/main/resources/makeFinemap.py b/finemap/src/main/resources/makeFinemap.py new file mode 100644 index 00000000..fa81d77b --- /dev/null +++ b/finemap/src/main/resources/makeFinemap.py @@ -0,0 +1,67 @@ +#!/usr/bin/python3 +from optparse import OptionParser 
+import pandas as pd +import numpy as np +import shutil +import subprocess +import os + +s3_in=os.environ['INPUT_PATH'] +s3_out=os.environ['OUTPUT_PATH'] + +# def finds json files in the directory +def make_json_files(directory): + subprocess.check_call(['aws', 's3', 'cp', directory, 'input/', '--recursive']) + subprocess.run("zstdcat input/*.json.zst | jq -s '.' > input/input.json", shell=True) + +def safe_remove(file_path): + try: + os.remove(file_path) + print(f"File {file_path} successfully removed.") + except FileNotFoundError: + print(f"File {file_path} does not exist.") + except PermissionError: + print(f"Permission denied: cannot remove {file_path}.") + except Exception as e: + print(f"An error occurred while trying to remove {file_path}: {e}") + +def main(): + usage = "usage: %prog [options]" + parser = OptionParser(usage) + parser.add_option("", "--phenotype", default=None) + parser.add_option("", "--ancestry", default=None) + + (args,_) = parser.parse_args() + + pheno_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-specific/{args.phenotype}/ancestry={args.ancestry}/' + var2rs_path = '/mnt/var/cojo/snps.csv' + bfiles = '/mnt/var/cojo/bfiles' + finemap_dir = '/mnt/var/cojo/finemapping' + config_file = '/mnt/var/cojo/finemapping/analysis.config.yaml' + out_path = f'{s3_out}/out/cojo/staging/{args.phenotype}/ancestry={args.ancestry}' + + # read all files in the clump path + make_json_files(pheno_path) + + # create the tmp out directory + out_directory = 'data' + if not os.path.exists(out_directory): + os.makedirs(out_directory, exist_ok=True) + + subprocess.call(['bash', '/mnt/var/cojo/finemapping/run_finemap_pipeline.sh', + '--input','input', + '--bfiles', bfiles, + '--config_file',config_file, + '--dbsnp_file',var2rs_path, + '--output', out_directory, + '--finemap_dir',finemap_dir + ]) + + subprocess.check_call(['touch', f'{out_directory}/_SUCCESS']) + subprocess.check_call(['aws', 's3', 'cp', f'{out_directory}/', out_path, '--recursive']) + 
safe_remove('input/input.json') + shutil.rmtree('input') + shutil.rmtree(out_directory) + +if __name__ == '__main__': + main() diff --git a/finemap/src/main/scala/Finemap.scala b/finemap/src/main/scala/Finemap.scala new file mode 100644 index 00000000..ee8f5dd1 --- /dev/null +++ b/finemap/src/main/scala/Finemap.scala @@ -0,0 +1,27 @@ +package org.broadinstitute.dig.aggregator.methods.finemap + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws._ +import org.broadinstitute.dig.aws.emr._ + +/** This is your aggregator method. + * + * All that needs to be done here is to implement the initStages function, + * which adds stages to the method in the order they should be executed. + * + * When you are ready to run it, use SBT from the CLI: + * + * sbt run [args] + * + * See the README of the dig-aggregator-core project for a complete list of + * CLI arguments available. + */ +object Finemap extends Method { + + /** Add all stages used in this method here. Stages must be added in the + * order they should be serially executed. + */ + override def initStages(implicit context: Context) = { + addStage(new MakeFinemap) + } +} diff --git a/finemap/src/main/scala/MakeFinemap.scala b/finemap/src/main/scala/MakeFinemap.scala new file mode 100644 index 00000000..35a3d2e5 --- /dev/null +++ b/finemap/src/main/scala/MakeFinemap.scala @@ -0,0 +1,54 @@ +package org.broadinstitute.dig.aggregator.methods.finemap + +import org.broadinstitute.dig.aggregator.core._ +import org.broadinstitute.dig.aws.emr._ +import org.broadinstitute.dig.aws.Ec2.Strategy +import org.broadinstitute.dig.aws.MemorySize + +class MakeFinemap(implicit context: Context) extends Stage { + import MemorySize.Implicits._ + + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-specific/*/ancestry=EU/") + + /** Source inputs. */ + override val sources: Seq[Input.Source] = Seq(ancestrySpecific) + + /** Map inputs to their outputs. 
*/ + override val rules: PartialFunction[Input, Outputs] = { + case ancestrySpecific(phenotype) => Outputs.Named(s"$phenotype/EU") + // case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") + // case mixedDatasets(_, _, phenotype) => Outputs.Named(s"$phenotype/Mixed") + } + + /** Just need a single machine with no applications, but a good drive. */ + override def cluster: ClusterDef = super.cluster.copy( + instances = 1, + applications = Seq.empty, + masterVolumeSizeInGB = 100, + bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-finemap.sh"))) + ) + + override def make(output: String): Job = { + val input = MakeSuSiEInput.fromString(output) + new Job(Job.Script(resourceUri("makeFinemap.py"), input.flags:_*)) + } + +} + + +case class MakeSuSiEInput( + phenotype: String, + ancestry: String +) { + + def flags: Seq[String] = Seq(s"--phenotype=$phenotype", s"--ancestry=$ancestry") +} + +object MakeSuSiEInput { + def fromString(output: String): MakeSuSiEInput = { + output.split("/").toSeq match { + case Seq(phenotype, ancestry) => MakeSuSiEInput(phenotype, ancestry) + } + } +} + diff --git a/finemap/version.sbt b/finemap/version.sbt new file mode 100644 index 00000000..e7654440 --- /dev/null +++ b/finemap/version.sbt @@ -0,0 +1 @@ +version in ThisBuild := "0.1.0" From 8997070bfc635bc29eb6f8049540d46585920c1c Mon Sep 17 00:00:00 2001 From: szandavi Date: Wed, 18 Dec 2024 13:40:33 -0500 Subject: [PATCH 12/13] update susie for finemap --- susie/src/main/scala/Finemap.scala | 27 ------------- susie/src/main/scala/MakeFinemap.scala | 54 -------------------------- susie/src/main/scala/MakeSuSiE.scala | 9 +++-- 3 files changed, 6 insertions(+), 84 deletions(-) delete mode 100644 susie/src/main/scala/Finemap.scala delete mode 100644 susie/src/main/scala/MakeFinemap.scala diff --git a/susie/src/main/scala/Finemap.scala b/susie/src/main/scala/Finemap.scala deleted file mode 100644 index ee8f5dd1..00000000 --- 
a/susie/src/main/scala/Finemap.scala +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.dig.aggregator.methods.susie - -import org.broadinstitute.dig.aggregator.core._ -import org.broadinstitute.dig.aws._ -import org.broadinstitute.dig.aws.emr._ - -/** This is your aggregator method. - * - * All that needs to be done here is to implement the initStages function, - * which adds stages to the method in the order they should be executed. - * - * When you are ready to run it, use SBT from the CLI: - * - * sbt run [args] - * - * See the README of the dig-aggregator-core project for a complete list of - * CLI arguments available. - */ -object Susie extends Method { - - /** Add all stages used in this method here. Stages must be added in the - * order they should be serially executed. - */ - override def initStages(implicit context: Context) = { - addStage(new MakeFinemap) - } -} diff --git a/susie/src/main/scala/MakeFinemap.scala b/susie/src/main/scala/MakeFinemap.scala deleted file mode 100644 index 35a3d2e5..00000000 --- a/susie/src/main/scala/MakeFinemap.scala +++ /dev/null @@ -1,54 +0,0 @@ -package org.broadinstitute.dig.aggregator.methods.susie - -import org.broadinstitute.dig.aggregator.core._ -import org.broadinstitute.dig.aws.emr._ -import org.broadinstitute.dig.aws.Ec2.Strategy -import org.broadinstitute.dig.aws.MemorySize - -class MakeSuSiE(implicit context: Context) extends Stage { - import MemorySize.Implicits._ - - val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-specific/*/ancestry=EU/") - - /** Source inputs. */ - override val sources: Seq[Input.Source] = Seq(ancestrySpecific) - - /** Map inputs to their outputs. 
*/ - override val rules: PartialFunction[Input, Outputs] = { - case ancestrySpecific(phenotype) => Outputs.Named(s"$phenotype/EU") - // case ancestrySpecific(phenotype, ancestry) => Outputs.Named(s"$phenotype/${ancestry.split('=').last}") - // case mixedDatasets(_, _, phenotype) => Outputs.Named(s"$phenotype/Mixed") - } - - /** Just need a single machine with no applications, but a good drive. */ - override def cluster: ClusterDef = super.cluster.copy( - instances = 1, - applications = Seq.empty, - masterVolumeSizeInGB = 100, - bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-finemap.sh"))) - ) - - override def make(output: String): Job = { - val input = MakeSuSiEInput.fromString(output) - new Job(Job.Script(resourceUri("makeFinemap.py"), input.flags:_*)) - } - -} - - -case class MakeSuSiEInput( - phenotype: String, - ancestry: String -) { - - def flags: Seq[String] = Seq(s"--phenotype=$phenotype", s"--ancestry=$ancestry") -} - -object MakeSuSiEInput { - def fromString(output: String): MakeSuSiEInput = { - output.split("/").toSeq match { - case Seq(phenotype, ancestry) => MakeSuSiEInput(phenotype, ancestry) - } - } -} - diff --git a/susie/src/main/scala/MakeSuSiE.scala b/susie/src/main/scala/MakeSuSiE.scala index 1dacf3e6..27e7e236 100644 --- a/susie/src/main/scala/MakeSuSiE.scala +++ b/susie/src/main/scala/MakeSuSiE.scala @@ -8,7 +8,8 @@ import org.broadinstitute.dig.aws.MemorySize class MakeSuSiE(implicit context: Context) extends Stage { import MemorySize.Implicits._ - val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") + // val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-specific/*/ancestry=EU/") /** Source inputs. 
*/ override val sources: Seq[Input.Source] = Seq(ancestrySpecific) @@ -25,12 +26,14 @@ class MakeSuSiE(implicit context: Context) extends Stage { instances = 1, applications = Seq.empty, masterVolumeSizeInGB = 100, - bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-susie.sh"))) + // bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-susie.sh"))) + bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-finemap.sh"))) ) override def make(output: String): Job = { val input = MakeSuSiEInput.fromString(output) - new Job(Job.Script(resourceUri("makeSuSiE.py"), input.flags:_*)) + // new Job(Job.Script(resourceUri("makeSuSiE.py"), input.flags:_*)) + new Job(Job.Script(resourceUri("makeFinemap.py"), input.flags:_*)) } } From 7d5b241f93162ef5da3104ef94e7bf6a3addee20 Mon Sep 17 00:00:00 2001 From: szandavi Date: Wed, 18 Dec 2024 15:54:32 -0500 Subject: [PATCH 13/13] back susie to normal --- susie/src/main/resources/install-finemap.sh | 72 --------------------- susie/src/main/resources/makeFinemap.py | 67 ------------------- susie/src/main/scala/MakeSuSiE.scala | 9 +-- 3 files changed, 3 insertions(+), 145 deletions(-) delete mode 100644 susie/src/main/resources/install-finemap.sh delete mode 100644 susie/src/main/resources/makeFinemap.py diff --git a/susie/src/main/resources/install-finemap.sh b/susie/src/main/resources/install-finemap.sh deleted file mode 100644 index ea20dbcb..00000000 --- a/susie/src/main/resources/install-finemap.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash -xe - -# susie method -## Developed with python 3 and R - -finemap_ROOT=/mnt/var/cojo - -# install to the root directory -sudo mkdir -p "$finemap_ROOT" -cd "$finemap_ROOT" - -# install yum dependencies -sudo yum install -y python3-devel - - -# Install conda -cd $finemap_ROOT -wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh -bash miniconda.sh -b -p $finemap_ROOT/miniconda -echo export 
PATH="$finemap_ROOT/miniconda/bin:\$PATH" >> ~/.profile -. ~/.profile - -# Install GCTA -cd $finemap_ROOT -mkdir -p ~/software/gcta -cd ~/software/gcta -# Note that this URL may change - old versions aren't accessible at the same URL -wget https://cnsgenomics.com/software/gcta/bin/gcta_1.93.2beta.zip -unzip gcta_1.93.2beta.zip -cd gcta_1.93.2beta -echo export PATH="$PWD:\$PATH" >> ~/.profile -. ~/.profile - -# Install plink -mkdir -p ~/software/plink -cd ~/software/plink -wget http://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20201019.zip -unzip plink_linux_x86_64_20201019.zip -echo export PATH="$PWD:\$PATH" >> ~/.profile -. ~/.profile - -# Install FINEMAP -mkdir -p ~/software/finemap -cd ~/software/finemap -wget http://www.christianbenner.com/finemap_v1.4_x86_64.tgz -tar -zxf finemap_v1.4_x86_64.tgz -ln -s finemap_v1.4_x86_64/finemap_v1.4_x86_64 finemap -sudo apt-get install libgomp1 # Not present by default it seems -echo export PATH="$PWD:\$PATH" >> ~/.profile -. ~/.profile - -# Install JRE -sudo apt install -yf openjdk-8-jre-headless openjdk-8-jdk -# sudo update-java-alternatives --list -# sudo update-java-alternatives --set java-1.8.0-openjdk-amd64 - -# Install parallel -sudo apt install -yf parallel - -echo COMPLETE - - -# pull down LD bfiles -sudo mkdir -p ./bfiles -sudo aws s3 cp s3://dig-analysis-bin/cojo/bfiles/ ./bfiles/ --recursive - -# pull down finemap dir -sudo mkdir -p ./finemapping -sudo aws s3 cp s3://dig-analysis-bin/cojo/finemapping/ ./finemapping/ --recursive - -# fetch snps for mapping -sudo aws s3 cp "s3://dig-analysis-bin/snps/dbSNP_common_GRCh37.csv" ./snps.csv diff --git a/susie/src/main/resources/makeFinemap.py b/susie/src/main/resources/makeFinemap.py deleted file mode 100644 index fa81d77b..00000000 --- a/susie/src/main/resources/makeFinemap.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/python3 -from optparse import OptionParser -import pandas as pd -import numpy as np -import shutil -import subprocess -import os - 
-s3_in=os.environ['INPUT_PATH'] -s3_out=os.environ['OUTPUT_PATH'] - -# def finds json files in the directory -def make_json_files(directory): - subprocess.check_call(['aws', 's3', 'cp', directory, 'input/', '--recursive']) - subprocess.run('zstdcat input/*.json.zst | jq -s '.' > input/input.json', shell=True) - -def safe_remove(file_path): - try: - os.remove(file_path) - print(f"File {file_path} successfully removed.") - except FileNotFoundError: - print(f"File {file_path} does not exist.") - except PermissionError: - print(f"Permission denied: cannot remove {file_path}.") - except Exception as e: - print(f"An error occurred while trying to remove {file_path}: {e}") - -def main(): - usage = "usage: %prog [options]" - parser = OptionParser(usage) - parser.add_option("", "--phenotype", default=None) - parser.add_option("", "--ancestry", default=None) - - (args,_) = parser.parse_args() - - pheno_path = f'{s3_in}/out/metaanalysis/bottom-line/ancestry-specific/{phenotype}/ancestry={ancestry}/' - var2rs_path = '/mnt/var/cojo/snps.csv' - bfiles = '/mnt/var/cojo/bfiles' - finemap_dir = '/mnt/var/cojo/finemapping' - config_file = '/mnt/var/cojo/finemapping/analysis.config.yaml' - out_path = f'{s3_out}/out/cojo/staging/{args.phenotype}/ancestry={args.ancestry}' - - # read all files in the clump path - make_json_files(pheno_path) - - # create the tmp out directory - out_directory = 'data' - if not os.path.exists(out_directory): - os.makedirs(out_directory, exist_ok=True) - - subprocess.call(['bash', '/mnt/var/cojo/finemapping/run_finemap_pipeline.sh', - '--input','input' - '--bfiles', bfiles, - '--config_file',config_file, - '--dbsnp_file',var2rs_path, - '--output', out_directory, - '--finemap_dir',finemap_dir - ]) - - subprocess.check_call(['touch', f'{out_directory}/_SUCCESS']) - subprocess.check_call(['aws', 's3', 'cp', f'{out_directory}/', out_path, '--recursive']) - safe_remove('input/input.json') - shutil.rmtree('input') - shutil.rmtree(out_directory) - -if __name__ == 
'__main__': - main() diff --git a/susie/src/main/scala/MakeSuSiE.scala b/susie/src/main/scala/MakeSuSiE.scala index 27e7e236..1dacf3e6 100644 --- a/susie/src/main/scala/MakeSuSiE.scala +++ b/susie/src/main/scala/MakeSuSiE.scala @@ -8,8 +8,7 @@ import org.broadinstitute.dig.aws.MemorySize class MakeSuSiE(implicit context: Context) extends Stage { import MemorySize.Implicits._ - // val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") - val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-specific/*/ancestry=EU/") + val ancestrySpecific: Input.Source = Input.Source.Success("out/metaanalysis/bottom-line/ancestry-clumped/*/ancestry=EU/") /** Source inputs. */ override val sources: Seq[Input.Source] = Seq(ancestrySpecific) @@ -26,14 +25,12 @@ class MakeSuSiE(implicit context: Context) extends Stage { instances = 1, applications = Seq.empty, masterVolumeSizeInGB = 100, - // bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-susie.sh"))) - bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-finemap.sh"))) + bootstrapScripts = Seq(new BootstrapScript(resourceUri("install-susie.sh"))) ) override def make(output: String): Job = { val input = MakeSuSiEInput.fromString(output) - // new Job(Job.Script(resourceUri("makeSuSiE.py"), input.flags:_*)) - new Job(Job.Script(resourceUri("makeFinemap.py"), input.flags:_*)) + new Job(Job.Script(resourceUri("makeSuSiE.py"), input.flags:_*)) } }