-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathExmAdHoc.2.MergeBams.sh
executable file
·133 lines (111 loc) · 5.81 KB
/
ExmAdHoc.2.MergeBams.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
#$ -cwd -l mem=12G,time=6:: -N MrgBam
# This script takes a a list of bam files and merges them into a single bam file. It is primarily for use in merging bams from a single sample that are from different read groups (e.g. have been sequenced on different lanes and aligned separately)
# The script can be pipelined directly to the GATK local indel realignment stage of the pipeline
# The script can also call the ExmAln.3a.BamMetrics script to get basics stats about the merged bam
# InpFil - (required) - A file with a list of paths to bams to be merged. List file name must end ".list". Note that the name of the input file will be used to generate the name of the output file
# RefFil - (required) - shell file containing variables with locations of reference files and resource directories; see list below for required variables for required variables
# LogFil - (optional) - File for logging progress
# TgtBed - (optional) - Exome capture kit targets bed file (must end .bed for GATK compatability) ; may be specified using a code corresponding to a variable in the RefFil giving the path to the target file- only required if calling pipeline
# PipeLine - P -(flag) - will start the GATK realign and recalibrate pipeline using the files generated by this script, will also call the ExmAln.3a.BamMetrics script
# Metrix - M -(flag) - will call the ExmAln.3a.BamMetrics script once the bams are merged; the -P flag will call both
# Help - H - (flag) - get usage information
#list of variables required in reference file:
# $EXOMPPLN - directory containing exome analysis pipeline scripts
#list of required tools:
# samtools <http://samtools.sourceforge.net/> <http://sourceforge.net/projects/samtools/files/>
# java <http://www.oracle.com/technetwork/java/javase/overview/index.html>
# picard <http://picard.sourceforge.net/> <http://sourceforge.net/projects/picard/files/>
## This file also requires exome.lib.sh - which contains various functions used throughout the Exome analysis scripts; this file should be in the same directory as this script
###############################################################
#set default arguments
usage="
ExmAdHoc.2.MergeBams.sh -i <InputFile> -r <reference_file> -t <target intervals file> -l <logfile> -PH
-i (required) - Path \".list\" file containing a multiple paths to bams. Name of input file used as name of output.
-r (required) - shell file containing variables with locations of reference files and resource directories
-l (optional) - Log file
-t (optional) - Exome capture kit targets or other genomic intervals bed file (must end .bed for GATK compatability); this file is required if calling the pipeline but otherwise can be omitted
-P (flag) - Initiate exome analysis pipeline after completion of script
-M (flag) - Call the Bam metrics script once the bams are merged but not the full exome analysis pipeline
-H (flag) - echo this message and exit
"
Metrix="false"
PipeLine="false"
#get arguments
while getopts i:r:l:t:PMH opt; do
case "$opt" in
i) InpFil="$OPTARG";;
r) RefFil="$OPTARG";;
l) LogFil="$OPTARG";;
t) TgtBed="$OPTARG";;
P) PipeLine="true";;
M) Metrix="true";;
H) echo "$usage"; exit;;
esac
done
#check all required paramaters present
if [[ ! -e "$InpFil" ]] || [[ ! -e "$RefFil" ]]; then echo "Missing/Incorrect required arguments"; echo "$usage"; exit; fi
#Call the RefFil to load variables
RefFil=`readlink -f $RefFil`
source $RefFil
#Load script library
source $EXOMPPLN/exome.lib.sh #library functions begin "func"
#set local variables
InpFil=`readlink -f $InpFil` #resolve absolute path to bam
echo $InpFil
BamNam=`basename $InpFil | sed s/.list//` # a name for the output files
echo $BamNam
if [[ -z "$LogFil" ]]; then LogFil=$BamNam.MrgBam.log; fi # a name for the log file
MrgDir=wd.$BamNam.merge # directory in which processing will be done
MrgFil=$BamNam.merged.bam #filename for bwa-mem aligned file
SrtFil=$BamNam.merged.sorted.bam #filename for bwa-mem aligned file
DdpFil=$BamNam.merged.mkdup.bam #filename for bwa-mem aligned file
FlgStat=$BamNam.merged.flagstat #output file for bam flag stats
IdxStat=$BamNam.merged.idxstats #output file for bam index stats
mkdir -p $MrgDir # create working directory
cd $MrgDir # move into working directory
TmpLog=$BamNam.MrgBam.temp.log #temporary log file
TmpDir=$BamNam.MrgBam.tempdir; mkdir -p $TmpDir #temporary directory
#start log
ProcessName="Merge with samtools"
funcWriteStartLog
#Merge with samtools
StepName="Merge with Samtools"
StepCmd="samtools merge -b $InpFil $MrgFil" #commandtoberun
funcRunStep
#Sort the bam file by coordinate
StepName="Sort Bam using PICARD"
StepCmd="java -Xmx4G -Djava.io.tmpdir=$TmpDir -jar $PICARD SortSam
INPUT=$MrgFil
OUTPUT=$SrtFil
SORT_ORDER=coordinate
CREATE_INDEX=TRUE"
funcRunStep
rm $MrgFil #removed the "Aligned bam"
#Mark the duplicates
StepName="Mark PCR Duplicates using PICARD"
StepCmd="java -Xmx4G -Djava.io.tmpdir=$TmpDir -jar $PICARD MarkDuplicates
INPUT=$SrtFil
OUTPUT=$DdpFil
METRICS_FILE=$DdpFil.dup.metrics.txt
CREATE_INDEX=TRUE"
funcRunStep
rm $SrtFil ${SrtFil/bam/bai} #removed the "Sorted bam"
#Get flagstat
StepName="Output flag stats using Samtools"
StepCmd="samtools flagstat $DdpFil > $FlgStat"
funcRunStep
#get index stats
StepName="Output idx stats using Samtools"
StepCmd="samtools idxstats $DdpFil > $IdxStat"
funcRunStep
#Call next steps of pipeline if requested
NextJob="Run Genotype VCF"
NextCmd="$EXOMPPLN/ExmAln.2.HaplotypeCaller_GVCFmode.sh -i $DdpFil -r $RefFil -t $TgtBed -l $LogFil -P -B"
funcPipeLine
if [[ "$Metrix" == "true" ]]; then PipeLine="true"; fi #if the bam metrics were requested this will activate the next job
NextJob="Get basic bam metrics"
NextCmd="$EXOMPPLN/ExmAln.3a.Bam_metrics.sh -i $DdpFil -r $RefFil -l $LogFil"
funcPipeLine
#End Log
funcWriteEndLog