This is a template script written by Scott Hunicke-Smith to illustrate how to run exome analysis much faster on Lonestar. It requires only two fastq files (a read 1/read 2 pair) and two parameters (a split size and a batch size). It is NOT optimized or highly robust, and it relies on several sub-scripts in Scott's home directory and in the BioITeam corral directories.
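
For reference, an invocation might look like this (the fastq names and the 1500000/2 parameter values match the examples further down; the log file name is arbitrary, and nohup is just one way to keep the long-running driver from being killed at logout):

nohup ./fastExon.sh Sample_5_L003_R1.cat.fastq Sample_5_L003_R2.cat.fastq 1500000 2 > fastExon.driver.log 2>&1 &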

The script itself needs to be run on a head node, or somewhere else where it won't be killed partway through:

fastExon.sh
#!/bin/bash
# Copyright 2012 Scott Hunicke-Smith and the University of Texas at Austin

module load python
module load bwa
module load samtools
module load java64

r1file=$1
r2file=$2
splitSize=$3
batchSize=$4
queue="normal"

echo "Starting: `date`"
# 1. Split input fastq's as one job; store job #
echo "split -d -l $splitSize $r1file r1.
split -d -l $splitSize $r2file r2. " > split.script
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j split.script -l split.sge -a DNAdenovo -n split -t 1:00:00
qsub split.sge &> split.sge.sublog
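# qsub reports "Your job <id> ("name") has been submitted"; the job id is the 3rd field of the last line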
splitJID=`tail -1 split.sge.sublog | awk '{print $3}'`
echo "Submitted $splitJID to split input files at `date`"

echo "Waiting for split to finish"
while qstat | grep $splitJID ; do
 echo `date`
 sleep 30
done

# 2. Move a set of splits into their own directory
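# Walk the r1 splits in order; every $batchSize-th file triggers creation of a new b.<i>
# directory, into which the accumulated batch of r1/r2 split pairs is moved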
i=0
fileList=""
subdirList=""
for file in $( ls r1.* ) ; do
   fileExt="${file##*.}"
   if [ `expr $i % $batchSize` -eq `expr $batchSize - 1` ]
   then
      mkdir b.$i
      subdirList="$subdirList b.$i"
      fileList="$fileList $fileExt"
      for dataFiles in $fileList ; do
         mv r2.$dataFiles b.$i
         mv r1.$dataFiles b.$i
      done
      fileList=""
   else
      fileList="$fileList $fileExt"
   fi
   i=`expr $i + 1`
done
# And the residual set, if any (all remaining splits go into one final directory):
if ls r1.* > /dev/null 2>&1 ; then
   mkdir b.$i
   subdirList="$subdirList b.$i"
   mv r1.* b.$i
   mv r2.* b.$i
fi


# 3. Launch exome_step1.sh on each split within its own directory; store job numbers
mapJIDs=""
for subdir in $subdirList ; do
  cd $subdir
  echo "Creating launcher for all files in $subdir: `date`"
  rm -f map.script
  for file in $( ls r1.* ) ; do
     fileExt="${file##*.}"
     echo "Run exome_step1.sh on r1.$fileExt and r2.$fileExt"
     echo "/home1/01057/sphsmith/local/bin/exome_step1.bash r1.$fileExt r2.$fileExt /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.$fileExt >& mapped.$fileExt.log" >> map.script
  done
  /home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j map.script -l map.sge -a DNAdenovo -n map.$subdir -t 1:00:00 -w 2
  qsub map.sge &> map.sge.sublog
  mapJID=`tail -1 map.sge.sublog | awk '{print $3}'`
  mapJIDs="$mapJIDs,`tail -1 map.sge.sublog | awk '{print $3}'`"
  echo "Submitted $mapJID to split input files in $subdir at `date`"
  cd ..
done
echo "Waiting for mapping to finish"
while qstat | grep $mapJID ; do
  echo `date`
  sleep 30
done

echo "Finished: `date`"

# 4. Launch job to combine final chr files across all directories
echo "Creating launcher for merging by reference sequence: `date`"
subdir=`ls -d b.* | head -1`
subdirExt="${subdir##*.}"
evalcmd="ls b.$subdirExt/*.mapped.*.sorted.bam | awk -F [./] '{print \$3}' | sort | uniq"
refList=`eval $evalcmd`
# Randomize this list so large and small reference sequences are mixed up
refList=`echo $refList | awk 'BEGIN {srand() } {for (i=1;i<=NF;i++) {print rand() "\t" $i "\n"}}' | sort -n | cut -f 2`
echo $refList

rm -f merge.script
for refs in $refList ; do
   echo "Merging $refs "
   echo "samtools merge -f $refs.sorted.bam b.*/$refs.mapped.*.sorted.bam; samtools index $refs.sorted.bam" >> merge.script
done
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j merge.script -l merge.sge -a DNAdenovo -n merge -t 1:00:00 -w 4
echo "Submitting job; queue start contingent on $mapJIDs completing first"
qsub merge.sge -hold_jid $mapJIDs &> merge.sge.sublog
mergeJID=`tail -1 merge.sge.sublog | awk '{print $3}'`
echo "Submitted $mergeJID to merge output files at `date`"
echo "Waiting for merging to finish"
while qstat | grep $mergeJID ; do
  echo `date`
  sleep 30
done

# 5. Launch GATK on each reference sequence's sorted bam file
echo "Creating launcher for variant calling by reference sequence: `date`"
subdir=`ls -d b.* | head -1`
subdirExt="${subdir##*.}"
evalcmd="ls b.$subdirExt/*.mapped.*.sorted.bam | awk -F [./] '{print \$3}' | sort | uniq"
refList=`eval $evalcmd`
# Randomize this list so large and small reference sequences are mixed up
refList=`echo $refList | awk 'BEGIN {srand() } {for (i=1;i<=NF;i++) {print rand() "\t" $i "\n"}}' | sort -n | cut -f 2`
echo $refList
rm -f variants.script
for refs in $refList ; do
   echo "GATK via exome_step2.bash on $refs.sorted.bam"
   echo "/home1/01057/sphsmith/local/bin/exome_step2.bash $refs.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf $refs >& variants.$refs.log" >> variants.script
done
# Note that the -w 2 option here sets how many GATK processes run per node - might need optimization
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j variants.script -l variants.sge -a DNAdenovo -n variants -t 4:00:00 -w 2
sed -i s/'module load launcher'/'module load launcher\nmodule load java64\nmodule load samtools'/ variants.sge
qsub variants.sge -hold_jid $mergeJID &> variants.sge.sublog
variantsJID=`tail -1 variants.sge.sublog | awk '{print $3}'`
echo "Submitted $variantsJID to call variants at `date`"

echo "Waiting for variant calling to finish"
while qstat | grep $variantsJID ; do
  echo `date`
  sleep 30
done

# 6. Merge all bam files & vcf files
echo "samtools merge -f $r1file.sorted.bam *.sorted.bam" > merge2.script
/home1/01057/sphsmith/local/bin/launcher_creator.py -q $queue -j merge2.script -l merge2.sge -a DNAdenovo -n merge2 -t 1:00:00 -w 4
qsub merge2.sge -hold_jid $variantsJID &> merge2.sge.sublog
merge2JID=`tail -1 merge2.sge.sublog | awk '{print $3}'`
echo "Submitted $merge2JID to merge output files at `date`"
echo "Waiting for merging to finish"
while qstat | grep $merge2JID ; do
  echo `date`
  sleep 30
done
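# Concatenate the per-chromosome VCFs: take the header lines from one of them (chrX here),
# then append the variant records from all of them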
grep '^#' chrX.sorted.bam.snps.vcf > $r1file.snps.vcf
grep -v '^#' chr*.sorted.bam.snps.vcf >> $r1file.snps.vcf

echo "Fast exon analysis is complete at: `date`"

It uses the TACC "launcher" functionality to do the following:

  1. Create one job on one node which splits the two input fastq files into chunks of $splitSize lines each, using split.sge and split.script, then wait for that job to finish (see the parameter-sizing sketch after this list).
  2. Create as many subdirectories as needed so the split output files can be mapped $batchSize per directory. Lonestar nodes have two six-core sockets (12 cores per node), so a good choice is a $batchSize of two, which lets each mapping task use 6 threads.
  3. Create a map.script in each subdirectory with one exome_step1.bash command per split pair ($batchSize lines per directory) and submit map.sge to do the mapping. Note that the embedded mapping script exome_step1.bash splits the mapping output into chromosome-specific files during the bwa sampe step. This mapping script is also where multi-threading for bwa is set; that should, of course, be parameterized.
  4. Merge all the chromosome-specific files from these subdirectories back to the run directory using merge.sge and merge.script.
  5. Run GATK on each of these chromosome-specific files, with two GATK processes per node (currently hardcoded in the script), using variants.sge and variants.script. Since human chromosomes are numbered roughly in decreasing order of size (chr1 > chr2 > chr3, etc.), the list is randomized so that the biggest chromosomes don't all wind up on one node (a simpler shuf-based alternative is sketched after this list).
  6. Merge the final GATK chromosome-specific variant calls - both the BAM files and the VCF files - using merge2.sge and merge2.script.
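
The script does not validate $splitSize or $batchSize; a minimal sketch (not part of the script) of how one might size them, with a hypothetical read-count target:

# Each fastq record is 4 lines, so $splitSize must be a multiple of 4
# or split will break a read across two files.
readsPerChunk=375000                  # hypothetical target: read pairs per split file
splitSize=$(( readsPerChunk * 4 ))    # 1500000 lines, as in the split.script example below
batchSize=2                           # two split pairs per directory; with the launcher's
                                      # -w 2 (two tasks per node), each bwa run can use
                                      # 6 of a node's 12 cores
echo "splitSize=$splitSize batchSize=$batchSize"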

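As an aside, on a system with GNU coreutils, the awk/sort shuffle the script uses to randomize the chromosome list could be written more simply with shuf; this is an alternative, not what the script does:

refList=`echo $refList | tr ' ' '\n' | shuf | tr '\n' ' '`
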
Examples of the various .sge and .script files are shown below.

Benchmark analysis on ~40 million read pairs from a single human exome experiment shows that this script takes about 2 hours, versus about 15 hours if the same processes are all run on a single node.

split.sge
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory.  Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
#         <------ Setup Parameters ------>
#
#$ -N split
#$ -pe 12way 12
#$ -q normal
#$ -o split.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
#   <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
#	#$ -pe <parallel environment> <number of slots>
#	#$ -l h_rt=hours:minutes:seconds to specify run time limit
# 	#$ -N <job name>
# 	#$ -q <queue name>
# 	#$ -o <job output file>
#	   NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE     $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE   split.script
setenv WORKDIR        .
#
# Variable description:
#
#  EXECUTABLE     = full path to the job launcher executable
#  CONTROL_FILE   = text input file which specifies
#                   executable for each process
#                   (should be located in WORKDIR)
#  WORKDIR        = location of working directory
#
#      <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------

#----------------
# Error Checking
#----------------

if ( ! -e $WORKDIR ) then
        echo " "
	echo "Error: unable to change to working directory."
	echo "       $WORKDIR"
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $EXECUTABLE ) then
	echo " "
	echo "Error: unable to find launcher executable $EXECUTABLE."
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $WORKDIR/$CONTROL_FILE ) then
	echo " "
	echo "Error: unable to find input control file $CONTROL_FILE."
	echo " "
	echo "Job not submitted."
	exit
endif


#----------------
# Job Submission
#----------------

cd $WORKDIR/
echo " WORKING DIR:   $WORKDIR/"

$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE

echo " "
echo " Parameteric Job Complete"
echo " "


split.script
split -d -l 1500000 Sample_5_L003_R1.cat.fastq r1.
split -d -l 1500000 Sample_5_L003_R2.cat.fastq r2.
merge.sge
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory.  Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
#         <------ Setup Parameters ------>
#
#$ -N merge
#$ -pe 4way 72
#$ -q normal
#$ -o merge.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
#   <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
#	#$ -pe <parallel environment> <number of slots>
#	#$ -l h_rt=hours:minutes:seconds to specify run time limit
# 	#$ -N <job name>
# 	#$ -q <queue name>
# 	#$ -o <job output file>
#	   NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE     $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE   merge.script
setenv WORKDIR        .
#
# Variable description:
#
#  EXECUTABLE     = full path to the job launcher executable
#  CONTROL_FILE   = text input file which specifies
#                   executable for each process
#                   (should be located in WORKDIR)
#  WORKDIR        = location of working directory
#
#      <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------

#----------------
# Error Checking
#----------------

if ( ! -e $WORKDIR ) then
        echo " "
	echo "Error: unable to change to working directory."
	echo "       $WORKDIR"
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $EXECUTABLE ) then
	echo " "
	echo "Error: unable to find launcher executable $EXECUTABLE."
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $WORKDIR/$CONTROL_FILE ) then
	echo " "
	echo "Error: unable to find input control file $CONTROL_FILE."
	echo " "
	echo "Job not submitted."
	exit
endif


#----------------
# Job Submission
#----------------

cd $WORKDIR/
echo " WORKING DIR:   $WORKDIR/"

$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE

echo " "
echo " Parameteric Job Complete"
echo " "
merge.script
samtools merge -f chr6.sorted.bam b.*/chr6.mapped.*.sorted.bam; samtools index chr6.sorted.bam
samtools merge -f chrX.sorted.bam b.*/chrX.mapped.*.sorted.bam; samtools index chrX.sorted.bam
samtools merge -f chr17.sorted.bam b.*/chr17.mapped.*.sorted.bam; samtools index chr17.sorted.bam
samtools merge -f chr21.sorted.bam b.*/chr21.mapped.*.sorted.bam; samtools index chr21.sorted.bam
samtools merge -f chr5.sorted.bam b.*/chr5.mapped.*.sorted.bam; samtools index chr5.sorted.bam
samtools merge -f chrY.sorted.bam b.*/chrY.mapped.*.sorted.bam; samtools index chrY.sorted.bam
samtools merge -f chr4.sorted.bam b.*/chr4.mapped.*.sorted.bam; samtools index chr4.sorted.bam
samtools merge -f chr19.sorted.bam b.*/chr19.mapped.*.sorted.bam; samtools index chr19.sorted.bam
samtools merge -f chr13.sorted.bam b.*/chr13.mapped.*.sorted.bam; samtools index chr13.sorted.bam
samtools merge -f chr16.sorted.bam b.*/chr16.mapped.*.sorted.bam; samtools index chr16.sorted.bam
samtools merge -f chr7.sorted.bam b.*/chr7.mapped.*.sorted.bam; samtools index chr7.sorted.bam
samtools merge -f chr9.sorted.bam b.*/chr9.mapped.*.sorted.bam; samtools index chr9.sorted.bam
samtools merge -f chr14.sorted.bam b.*/chr14.mapped.*.sorted.bam; samtools index chr14.sorted.bam
samtools merge -f chr11.sorted.bam b.*/chr11.mapped.*.sorted.bam; samtools index chr11.sorted.bam
samtools merge -f chr22.sorted.bam b.*/chr22.mapped.*.sorted.bam; samtools index chr22.sorted.bam
samtools merge -f chr1.sorted.bam b.*/chr1.mapped.*.sorted.bam; samtools index chr1.sorted.bam
samtools merge -f chr10.sorted.bam b.*/chr10.mapped.*.sorted.bam; samtools index chr10.sorted.bam
samtools merge -f chr15.sorted.bam b.*/chr15.mapped.*.sorted.bam; samtools index chr15.sorted.bam
samtools merge -f chr18.sorted.bam b.*/chr18.mapped.*.sorted.bam; samtools index chr18.sorted.bam
samtools merge -f chr3.sorted.bam b.*/chr3.mapped.*.sorted.bam; samtools index chr3.sorted.bam
samtools merge -f chr20.sorted.bam b.*/chr20.mapped.*.sorted.bam; samtools index chr20.sorted.bam
samtools merge -f chr8.sorted.bam b.*/chr8.mapped.*.sorted.bam; samtools index chr8.sorted.bam
samtools merge -f chr2.sorted.bam b.*/chr2.mapped.*.sorted.bam; samtools index chr2.sorted.bam
samtools merge -f chr12.sorted.bam b.*/chr12.mapped.*.sorted.bam; samtools index chr12.sorted.bam
map.sge
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory.  Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
#         <------ Setup Parameters ------>
#
#$ -N map.b.1
#$ -pe 2way 12
#$ -q normal
#$ -o map.b.1.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
#   <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
#	#$ -pe <parallel environment> <number of slots>
#	#$ -l h_rt=hours:minutes:seconds to specify run time limit
# 	#$ -N <job name>
# 	#$ -q <queue name>
# 	#$ -o <job output file>
#	   NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE     $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE   map.script
setenv WORKDIR        .
#
# Variable description:
#
#  EXECUTABLE     = full path to the job launcher executable
#  CONTROL_FILE   = text input file which specifies
#                   executable for each process
#                   (should be located in WORKDIR)
#  WORKDIR        = location of working directory
#
#      <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------

#----------------
# Error Checking
#----------------

if ( ! -e $WORKDIR ) then
        echo " "
	echo "Error: unable to change to working directory."
	echo "       $WORKDIR"
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $EXECUTABLE ) then
	echo " "
	echo "Error: unable to find launcher executable $EXECUTABLE."
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $WORKDIR/$CONTROL_FILE ) then
	echo " "
	echo "Error: unable to find input control file $CONTROL_FILE."
	echo " "
	echo "Job not submitted."
	exit
endif


#----------------
# Job Submission
#----------------

cd $WORKDIR/
echo " WORKING DIR:   $WORKDIR/"

$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE

echo " "
echo " Parameteric Job Complete"
echo " "
map.script
/home1/01057/sphsmith/local/bin/exome_step1.bash r1.00 r2.00 /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.00 >& mapped.00.log
/home1/01057/sphsmith/local/bin/exome_step1.bash r1.01 r2.01 /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta mapped.01 >& mapped.01.log

variants.sge
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory.  Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
#         <------ Setup Parameters ------>
#
#$ -N variants
#$ -pe 2way 144
#$ -q normal
#$ -o variants.o$JOB_ID
#$ -l h_rt=4:00:00
#$ -V
#$ -cwd
#   <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
#	#$ -pe <parallel environment> <number of slots>
#	#$ -l h_rt=hours:minutes:seconds to specify run time limit
# 	#$ -N <job name>
# 	#$ -q <queue name>
# 	#$ -o <job output file>
#	   NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
module load java64
module load samtools
setenv EXECUTABLE     $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE   variants.script
setenv WORKDIR        .
#
# Variable description:
#
#  EXECUTABLE     = full path to the job launcher executable
#  CONTROL_FILE   = text input file which specifies
#                   executable for each process
#                   (should be located in WORKDIR)
#  WORKDIR        = location of working directory
#
#      <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------

#----------------
# Error Checking
#----------------

if ( ! -e $WORKDIR ) then
        echo " "
	echo "Error: unable to change to working directory."
	echo "       $WORKDIR"
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $EXECUTABLE ) then
	echo " "
	echo "Error: unable to find launcher executable $EXECUTABLE."
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $WORKDIR/$CONTROL_FILE ) then
	echo " "
	echo "Error: unable to find input control file $CONTROL_FILE."
	echo " "
	echo "Job not submitted."
	exit
endif


#----------------
# Job Submission
#----------------

cd $WORKDIR/
echo " WORKING DIR:   $WORKDIR/"

$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE

echo " "
echo " Parameteric Job Complete"
echo " "
variants.script
/home1/01057/sphsmith/local/bin/exome_step2.bash chrY.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chrY >& variants.chrY.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr22.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr22 >& variants.chr22.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr9.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr9 >& variants.chr9.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr3.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr3 >& variants.chr3.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr21.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr21 >& variants.chr21.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr5.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr5 >& variants.chr5.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr16.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr16 >& variants.chr16.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr19.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr19 >& variants.chr19.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr18.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr18 >& variants.chr18.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr4.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr4 >& variants.chr4.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr12.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr12 >& variants.chr12.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr15.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr15 >& variants.chr15.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr14.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr14 >& variants.chr14.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chrX.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chrX >& variants.chrX.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr6.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr6 >& variants.chr6.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr13.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr13 >& variants.chr13.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr8.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr8 >& variants.chr8.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr7.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr7 >& variants.chr7.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr11.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr11 >& variants.chr11.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr20.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr20 >& variants.chr20.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr10.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr10 >& variants.chr10.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr1.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr1 >& variants.chr1.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr2.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr2 >& variants.chr2.log
/home1/01057/sphsmith/local/bin/exome_step2.bash chr17.sorted.bam /scratch/01057/sphsmith/hg19/Homo_sapiens.GRCh37.60.dna.fasta /work/01057/sphsmith/dbsnp/dbsnp_132.hg19.vcf chr17 >& variants.chr17.log
merge2.sge
#!/bin/csh
#
# Simple SGE script for submitting multiple serial
# jobs (e.g. parametric studies) using a script wrapper
# to launch the jobs.
#
# To use, build the launcher executable and your
# serial application(s) and place them in your WORKDIR
# directory.  Then, edit the CONTROL_FILE to specify
# each executable per process.
#-------------------------------------------------------
#-------------------------------------------------------
#
#         <------ Setup Parameters ------>
#
#$ -N merge2
#$ -pe 4way 12
#$ -q normal
#$ -o merge2.o$JOB_ID
#$ -l h_rt=1:00:00
#$ -V
#$ -cwd
#   <------ You MUST Specify a Project String ----->
#$ -A DNAdenovo
#------------------------------------------------------
#
# Usage:
#	#$ -pe <parallel environment> <number of slots>
#	#$ -l h_rt=hours:minutes:seconds to specify run time limit
# 	#$ -N <job name>
# 	#$ -q <queue name>
# 	#$ -o <job output file>
#	   NOTE: The env variable $JOB_ID contains the job id.
#
module load launcher
setenv EXECUTABLE     $TACC_LAUNCHER_DIR/init_launcher
setenv CONTROL_FILE   merge2.script
setenv WORKDIR        .
#
# Variable description:
#
#  EXECUTABLE     = full path to the job launcher executable
#  CONTROL_FILE   = text input file which specifies
#                   executable for each process
#                   (should be located in WORKDIR)
#  WORKDIR        = location of working directory
#
#      <------ End Setup Parameters ------>
#--------------------------------------------------------
#--------------------------------------------------------

#----------------
# Error Checking
#----------------

if ( ! -e $WORKDIR ) then
        echo " "
	echo "Error: unable to change to working directory."
	echo "       $WORKDIR"
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $EXECUTABLE ) then
	echo " "
	echo "Error: unable to find launcher executable $EXECUTABLE."
	echo " "
	echo "Job not submitted."
	exit
endif

if ( ! -f $WORKDIR/$CONTROL_FILE ) then
	echo " "
	echo "Error: unable to find input control file $CONTROL_FILE."
	echo " "
	echo "Job not submitted."
	exit
endif


#----------------
# Job Submission
#----------------

cd $WORKDIR/
echo " WORKING DIR:   $WORKDIR/"

$TACC_LAUNCHER_DIR/paramrun $EXECUTABLE $CONTROL_FILE

echo " "
echo " Parameteric Job Complete"
echo " "
merge2.script
samtools merge -f Sample_5_L003_R1.cat.fastq.sorted.bam *.sorted.bam