Skip to content

Commit 0e83994

Browse files
committed
Merge pull request #3 from dmlond/master
Actual
2 parents b247cb3 + f9206e3 commit 0e83994

14 files changed

+417
-7
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ packer_cache/
55
*sai*
66
*fasta*
77
*fastq*
8-
*sam*
8+
.*sam*
9+
*samtools*

bin/pipeline.docker.sh

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#!/bin/bash
2+
3+
#make sure this script runs from the ROOT of the project
4+
# Resolve the script's own directory and move one level up (the project root).
# The path is quoted so directories containing spaces work, and the script
# aborts if the cd fails so later steps never run in the wrong directory.
cd "$(dirname "$0")/.." || exit 1
5+
6+
# this is a simple pipeline that maps FASTQ reads to a reference genome (in FASTA format).
7+
8+
# Here we define where our data reside. Perhaps we may need to modify this depending on
9+
# how we run the pipeline.
10+
DATA=data
11+
12+
# Here we define the number of cores we will use for the calculations. Perhaps we may need
13+
# to modify this depending on the configuration of our VM
14+
CORES=2
15+
16+
# The location of the reference genome in relation to the data folder
17+
REFERENCE=$DATA/Pf3D7_v2.1.5.fasta
18+
19+
# The location of the reads in relation to the data folder
20+
READS_1=$DATA/ERR022523_1.fastq.gz
21+
READS_2=$DATA/ERR022523_2.fastq.gz
22+
FASTQS="$READS_1 $READS_2"
23+
24+
# recreate BWA index if not exists
25+
if [ ! -e $REFERENCE.bwt ]; then
26+
echo "going to index $REFERENCE"
27+
28+
# Warning: "-a bwtsw" does not work for short genomes,
29+
# while "-a is" and "-a div" do not work for long
30+
# genomes. Please choose "-a" according to the length
31+
# of the genome.
32+
docker-compose run bwa index -a bwtsw $REFERENCE
33+
else
34+
echo "$REFERENCE already indexed"
35+
fi
36+
37+
# lists of produced files. These will be assigned values as we run the pipeline
38+
SAIS=""
39+
SAM=""
40+
41+
# iterate over FASTQ files
42+
for FASTQ in $FASTQS; do
43+
44+
# create new names from the stem of the FASTA and FASTQ files
45+
LOCALFASTA=`echo $REFERENCE | sed -e 's/.*\///'`
46+
LOCALFASTQ=`echo $FASTQ | sed -e 's/.*\///'`
47+
OUTFILE=$DATA/$LOCALFASTQ-$LOCALFASTA.sai
48+
49+
# grow the list of *.sai files
50+
SAIS="$SAIS $OUTFILE"
51+
52+
# create a name for the SAM file
53+
SAM=`echo $OUTFILE | sed -e "s/_.*/-$LOCALFASTA.sam/"`
54+
55+
# note: we don't do basic QC here, because that might mean
56+
# that the mate pairs in the FASTQ files go out of order,
57+
# which will result in the bwa sampe step taking an inordinate
58+
# amount of time
59+
60+
# do bwa aln if needed
61+
if [ ! -e $OUTFILE ]; then
62+
echo "going to align $FASTQ against $REFERENCE"
63+
64+
# use $CORES threads
65+
docker-compose run bwa aln -t $CORES $REFERENCE $FASTQ -f $OUTFILE
66+
else
67+
echo "alignment $OUTFILE already created"
68+
fi
69+
done
70+
71+
# do bwa sampe if needed
72+
if [ ! -e $SAM ]; then
73+
74+
# create paired-end SAM file
75+
# Log the command about to run; the reference genome path is held in
# $REFERENCE (there is no $FASTA variable in this script).
echo "going to run bwa sampe $REFERENCE $SAIS $FASTQS -f $SAM"
76+
docker-compose run bwa sampe $REFERENCE $SAIS $FASTQS -f $SAM
77+
else
78+
echo "sam file $SAM already created"
79+
fi
80+
81+
# do samtools filter if needed
82+
if [ ! -e $SAM.filtered ]; then
83+
# -bS = input is SAM, output is BAM
84+
# -F 4 = remove unmapped reads
85+
# -q 50 = remove reads with mapping qual < 50
86+
# Log the command about to run: -o names the filtered output file and the
# SAM file is the positional input.
echo "going to run samtools view -bS -F 4 -q 50 -o $SAM.filtered $SAM"
87+
docker-compose run samtools view -bS -F 4 -q 50 -o $SAM.filtered $SAM
88+
docker-compose run gzip -9 $SAM
89+
else
90+
echo "sam file $SAM.filtered already created"
91+
fi
92+
93+
# do samtools sorting if needed
94+
if [ ! -e $SAM.sorted.bam ]; then
95+
96+
# sorting is needed for indexing
97+
echo "going to run samtools sort $SAM.filtered $SAM.sorted"
98+
docker-compose run samtools sort $SAM.filtered $SAM.sorted
99+
else
100+
echo "sam file $SAM.sorted already created"
101+
fi
102+
103+
# create index for BAM file if needed
104+
if [ ! -e $SAM.sorted.bam.bai ]; then
105+
106+
# this should result in faster processing
107+
echo "going to run samtools index $SAM.sorted.bam"
108+
docker-compose run samtools index $SAM.sorted.bam
109+
else
110+
echo "BAM file index $SAM.sorted.bam.bai already created"
111+
fi

conf/docker/bwa/Dockerfile

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
FROM ubuntu:trusty
2+
MAINTAINER Darin London <[email protected]>
3+
4+
RUN apt-get update \
5+
&& apt-get install -y wget \
6+
&& apt-get install -y bzip2 \
7+
&& apt-get install -y tar \
8+
&& apt-get install -y build-essential \
9+
&& apt-get install -y zlib1g-dev
10+
ADD install_bwa.sh install_bwa.sh
11+
# this downloads the bwa source, makes it, moves it into place, then removes
12+
# the downloads in one transaction to make sure downloads do not remain
13+
# in the image
14+
RUN ./install_bwa.sh
15+
16+
# this creates a default command that gets
17+
# run when the container is run without arguments
18+
# it will print the usage + version of bwa and exit
19+
CMD bwa

conf/docker/bwa/install_bwa.sh

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash
2+
3+
# download and extract bwa source
4+
wget -O bwa-0.7.12.tar.bz2 http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.12.tar.bz2/download
5+
tar jxf bwa-0.7.12.tar.bz2
6+
# build bwa and move it into /usr/local/bin
7+
cd bwa-0.7.12
8+
make
9+
mv bwa /usr/local/bin
10+
# clean up to minimize the size of the resulting image
11+
cd ..
12+
rm -rf bwa-0.7.12*

conf/docker/raw/Dockerfile

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
FROM centos:latest
2+
RUN ["/usr/sbin/useradd", "bwa_user"]
3+
RUN ["/usr/bin/yum", "install", "-y", "wget"]
4+
RUN ["mkdir", "-p", "/home/bwa_user/data"]
5+
RUN ["mkdir","-p","/home/bwa_user/data"]
6+
RUN ["chown","bwa_user","/home/bwa_user/data"]
7+
RUN ["chgrp","bwa_user","/home/bwa_user/data"]
8+
RUN ["chmod","777","/home/bwa_user/data"]
9+
ADD download_plasmodium_raw.sh /usr/local/bin/download_plasmodium_raw.sh
10+
VOLUME ["/home/bwa_user/data"]
11+
WORKDIR /home/bwa_user/data
12+
USER bwa_user
13+
CMD "/usr/local/bin/download_plasmodium_raw.sh"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash
2+
3+
wget -O /home/bwa_user/data/ERR022523_1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR022/ERR022523/ERR022523_1.fastq.gz
4+
wget -O /home/bwa_user/data/ERR022523_2.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR022/ERR022523/ERR022523_2.fastq.gz

conf/docker/samtools/Dockerfile

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
FROM ubuntu:trusty
2+
MAINTAINER Darin London <[email protected]>
3+
4+
RUN apt-get update \
5+
&& apt-get install -y wget \
6+
&& apt-get install -y bzip2 \
7+
&& apt-get install -y gzip \
8+
&& apt-get install -y tar \
9+
&& apt-get install -y build-essential \
10+
&& apt-get install -y zlib1g-dev \
11+
&& apt-get install -y ncurses-dev
12+
ADD install_samtools.sh install_samtools.sh
13+
# this downloads the samtools source, makes it, moves it into place, then removes
14+
# the downloads in one transaction to make sure downloads do not remain
15+
# in the image
16+
RUN ./install_samtools.sh
17+
18+
# this creates a default command that gets
19+
# run when the container is run without arguments
20+
# it will print the usage + version of samtools and exit
21+
CMD samtools
+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/bin/bash
2+
wget -O samtools-1.2.tar.bz2 http://sourceforge.net/projects/samtools/files/samtools/1.2/samtools-1.2.tar.bz2/download
3+
tar jxf samtools-1.2.tar.bz2
4+
# build samtools and move it into /usr/local/bin
5+
cd samtools-1.2
6+
make
7+
mv samtools /usr/local/bin
8+
# clean up to minimize the size of the resulting image
9+
cd ..
10+
rm -rf samtools-1.2*

docker-compose.yml

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
bwa:
2+
build: conf/docker/bwa
3+
volumes:
4+
- ./:/wdir
5+
working_dir: /wdir
6+
entrypoint: bwa
7+
command: ''
8+
samtools:
9+
build: conf/docker/samtools
10+
volumes:
11+
- ./:/wdir
12+
working_dir: /wdir
13+
entrypoint: samtools
14+
gzip:
15+
build: conf/docker/samtools
16+
volumes:
17+
- ./:/wdir
18+
working_dir: /wdir
19+
entrypoint: gzip
File renamed without changes.
109 KB
Binary file not shown.

docs/2015-05-13/README.md

+11-6
Original file line numberDiff line numberDiff line change
@@ -44,14 +44,19 @@ Schedule
4444
The outline for today is as follows:
4545

4646
- _Session 1_: Recap from yesterday: How Vagrant and Puppet can automate the creation and
47-
configuration of compute environments and how to run analyses inside a VM. Brief
47+
configuration of compute environments and how to run analyses inside a VM. To capture our
48+
understanding of yesterday's progress, we will each make a mindmap with XMind. Give it a
49+
name that includes your computer (t1, t2, etc.), add the file to your git repository and
50+
send us a pull request. This way we have all of them together. Then we will have a brief
4851
aside on how to organize data, e.g. as produced by different runs of a pipeline or
4952
different steps in a larger analysis.
50-
- _Session 2_: If all has gone well, we will be able to access the folder
51-
`arangs2015/data` on the host by navigating to `/vagrant_data/` on the VM. Verify that
52-
this is the case and that you can read from it (e.g. by accessing the README.md using
53-
`more`) as well as write to it (e.g. `touch foo` should create an empty file `foo`).
54-
Modify the pipeline shell script to point it to the right folder and run it.
53+
- _Session 2_: We are going to make our own vagrant box file to share with others. The end
54+
result will be something [like this](https://atlas.hashicorp.com/Naturalis/boxes/arangs2015),
55+
which you can install with `vagrant init Naturalis/arangs2015` (etc.). A box file
56+
is [a combination of the virtual hard drive of the VM and metadata](http://docs.vagrantup.com/v2/boxes/format.html).
57+
This bundling is made using the [packer program](https://packer.io/), which you should install.
58+
The bundler requires some extra scripts and config files, which we will adapt from
59+
[here](https://github.com/hashicorp/atlas-packer-vagrant-tutorial).
5560
- _Session 3_: Docker introduction. We will now begin to look at a newer technology that has emerged within the last few years, Docker. In this session,
5661
we will go over the basic concepts of the Docker system, and get to know its
5762
similarities and differences with Virtualization. We will then learn about the docker ecosystem on registry.docker.hub. We will then install the
+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
![GTPB](http://gtpb.igc.gulbenkian.pt/bicourses/images/GTPB2015logo.png "GTPB")
2+
3+
Introducing Docker
4+
==================
5+
6+
[Docker](https://www.docker.com) has some similarities with Virtualization Technologies:
7+
8+
- both involve the creation of reusable images
9+
- both involve running one or more instances of an image on a Host machine
10+
- images can be transported from one Host to another and run successfully
11+
so long as the hosting software is installed
12+
13+
Docker images differ from Virtualization images in many important ways.
14+
15+
- They are 5-10 times smaller
16+
- They depend on and use much more of the host linux resources
17+
- They are less secure
18+
- Instances are called Containers
19+
- Containers can be instantiated and run within seconds
20+
- Containers can be plugged in to the Host tty, STDIN, STDOUT, and STDERR
21+
22+
The primary difference between a Docker image and a VM image is tied to
23+
a philosophical difference.
24+
25+
VM images are created to host an entire machine architecture which is run as if it were its own machine, completely oblivious to its host.
26+
27+
Docker images are designed to host a single application and its dependencies. They are designed to run on the host as if natively installed. To compose a pipeline, you use or create docker images for each application required, and run containers from the host more or less hooked in to the host, similar to the way you would run a natively installed application.
28+
29+
Docker Ecosystem
30+
----------------
31+
32+
**Docker Machine**
33+
34+
Host systems must install and run the Docker daemon. The daemon can only run on a modern (version created within the last 2 years) Linux Kernel. Almost all flavors of Linux (Fedora, Redhat, Ubuntu, Debian) use the Linux Kernel, and can host the daemon on them natively. Some flavors of \*Nix (Mac OSX in particular), do not use the Linux Kernel. They must run the docker daemon inside a VirtualMachine built on one of the Linux flavors with a modern kernel. This can introduce a bit more complexity, but it also introduces the powerful concept of using external docker hosts 'in the cloud'.
35+
36+
The docker daemon runs a web service in the background and listens to special ports for requests to manage docker images and containers. It provides a REST interface API that can be used by any client. Typically, it uses an encrypted connection called TLS, which is a standard system used by many network client-server communications. TLS requires that each client generate an encrypted certificate (not the same as used by GitHub) to be used when they communicate with the service. The primary client that uses the REST interface is the docker commandline interface.
37+
38+
The [docker-machine](https://docs.docker.com/machine) command automates the process of getting a docker host running on any computer with a supported Virtualization system (Virtualbox and VMware are supported). It makes it much easier to get Docker up and running if you do not have Systems Administration expertise. It does this by:
39+
- downloading a special VM image for a specified VM management system preconfigured to host and run the docker daemon
40+
- generating TLS certificates
41+
- starting and stopping the VM
42+
- Providing an easy way to configure the Environment needed by the Docker commandline interface (see below)
43+
The docker-machine command can also be used to create docker machines on many cloud [hosting systems](https://docs.docker.com/machine/#using-docker-machine-with-a-cloud-provider), which may be attractive to those wanting to purchase more powerful compute environments than are provided by their own machine, or institution.
44+
45+
**Docker**
46+
47+
The [docker commandline interface](https://docs.docker.com/reference/commandline/cli/) is written in the Go programming language. There are versions available for every known operating system (even Windows 10!). It is designed to interface with the Docker Machine daemon over the network using its REST interface. By compartmentalizing the docker interface from the docker machine, it is possible to use the same docker command to interface with a docker machine running anywhere on the network.
48+
49+
The client must run in the context of a special set of Environment variables:
50+
* DOCKER_TLS_VERIFY (1 if using TLS, default)
51+
* DOCKER_CERT_PATH (path to TLS certificate if using TLS)
52+
* DOCKER_HOST (url and port to the Docker Host daemon service)
53+
54+
The docker commandline interface provides the full set of tools needed to create and manage docker images and image container instances.
55+
56+
* pull images from a Docker Registry (it knows about the Official Docker Registry by default)
57+
* push images to a Docker Registry (requires login)
58+
* list images
59+
* build images from a build context (more about this tomorrow)
60+
* remove images
61+
* tag images (acts like an alias)
62+
* run container instances of images
63+
* list containers
64+
* start and stop existing container instances (background only)
65+
* pause/unpause existing containers (foreground and background)
66+
* kill a running container (stop is preferred but kill can be used to stop a runaway container process)
67+
* rm stopped/killed container instances
68+
* inspect container instances (running or stopped)
69+
* Dump the log (STDOUT) from a running container
70+
* save and load a tar file of an image (can be used instead of a registry to move docker images from one machine to another)
71+
* exec a command in a running container (allows you to interact with, and change the state of a running container)
72+
73+
There are many arguments that you can provide to the [Run](https://docs.docker.com/reference/run/) command:
74+
* container naming (docker provides default names to all containers, sometimes humorous), you can specifically name a container at run time
75+
* interactivity mode (interactive or daemon mode)
76+
* attach the host tty (we will demonstrate this) to an interactive container
77+
* mount local directories to the container file system
78+
* connect one container to another container to make a private network between them
79+
* mount volumes from other, special containers, called volume containers, to the container file system
80+
* set the user, group, working directory to be used inside the container
81+
* set environment variables
82+
* override the default entrypoint or command (more on this tomorrow)
83+
* connect host and container STDIN, STDOUT, and STDERR
84+
* expose container ports to the host
85+
86+
**Docker Registry**
87+
88+
Docker hosts a worldwide [Registry](https://registry.hub.docker.com/) of Docker images. Anyone with docker can share their own images with the world. Images shared on the Docker Registry cannot be private. It is possible to [host your own registry](http://docs.docker.com/registry/deploying/).
89+
90+
The Docker commandline tool is preconfigured to know about and use the official
91+
Docker Registry.
92+
93+
- docker pull i will pull the image i down onto your host
94+
- docker run i will pull the image i down if it is not present, and then run a container of i
95+
96+
Lesson Plan
97+
-----------
98+
99+
- install docker-machine and docker
100+
- explore the Docker Registry
101+
- run some docker images
102+
- with and without docker pull
103+
- with and without local storage
104+
- with exposed ports
105+
- connected to other container systems/services
106+
- inspect information about containers
107+
- inspect the log from running containers
108+
- remove images
109+
- remove containers (with volumes)
110+
111+
Resources
112+
---------
113+
- https://www.docker.com/
114+
- https://docs.docker.com/machine/
115+
- https://docs.docker.com/compose/
116+
- https://docs.docker.com/userguide/
117+
- https://docs.docker.com/reference/commandline/cli/
118+
- https://registry.hub.docker.com
119+
- https://registry.hub.docker.com/u/tutum/hello-world/

0 commit comments

Comments
 (0)