-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathExmAln.1b.ReAlign_Bam_with_BWAmem.sh
executable file
·238 lines (214 loc) · 12.7 KB
/
ExmAln.1b.ReAlign_Bam_with_BWAmem.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/bin/bash
#$ -cwd -pe smp 6 -l mem=2G,time=12:: -N BamBWABam
# This script takes a bam file and reverts it to sam format and then realigns with BWA mem
# The process is:
# InputBam --bamshuf--> Randomly Shuffled Bam --bam2fq--> Fastq --bwa mem--> Sam --> Bam
# See http://gatkforums.broadinstitute.org/discussion/2908/howto-revert-a-bam-file-to-fastq-format and associated discussion for rationale, particularly with regard to shuffling the bam, not clear if it is essential, but it probably can't hurt (though it is a little time consuming)
# bam2fq generates and interleaved fastq, if the Bam contains singletons these disrupt the order of the the interleaved read pairs and bwa mem throws an error, therefore they are removed with -s flag and dumped into a separate fastq.gz file
# IMPORTANT READ GROUP CONSIDERATION - if the bam files contains multiple read groups these must be split up and processed separately if you wish to maintain this information as the bam --> bam2fq --> bwa mem --> sam is read group illiterate and will reassign all reads in the bam to the same read group. The script will by default do this for you by calling a new set of jobs, one for each readgroup. Alternatively you can force this with the -R flag.
# You can use "samtools view -h BAMFILE -r ReadGroupID | samtools -bS > BAMFILE_RGID" to split the file
# Note regarding Quality Scores:
# The bam2fq call will use the OQ field (old as in unrecalibrated quality scores) if it is present.
# It is a good idea to check the QS and see what coding they are in. If they are in pre-Illumina 1.8 coding then use the -F flag if calling the pipeline so that GATK adjusts them to the new coding scale.
# Usage notes:
# InpFil - (required) - Path to Bam file to be aligned. Alternatively a file with a list of bams can be provided and the script run in an array job. List file name must end ".list"
# RefFil - (required) - shell file containing variables with locations of reference files and resource directories; see list below for required variables
# LogFil - (optional) - File for logging progress
# TgtBed - (optional) - Exome capture kit targets bed file (must end .bed for GATK compatability) ; may be specified using a code corresponding to a variable in the RefFil giving the path to the target file- only required if calling pipeline
# RGNum - (optional) - Process only a single read group provide the read group ID. Practically there is no need to use this option. The script will automatically detect multiple read groups within the bam and will call a new process for each one then wait for them to finish before calling the merge script.
# PipeLine - P -(flag) - will start the GATK realign and recalibrate pipeline using the files generated by this script
# Flag - F - Fix mis-encoded base quality scores - Rewrite the qulaity scores from Illumina 1.5+ to Illumina 1.8+, will subtract 31 from all quality scores; used to fix encoding in some datasets (especially older Illumina ones) which starts at Q64 (see https://en.wikipedia.org/wiki/FASTQ_format#Encoding)
# Help - H - (flag) - get usage information
#list of variables required in reference file:
# $REF - reference genome in fasta format - must have been indexed using 'bwa index ref.fa'
# $EXOMPPLN - directory containing exome analysis pipeline scripts,
#list of required tools:
# samtools <http://samtools.sourceforge.net/> <http://sourceforge.net/projects/samtools/files/>
# bwa mem <http://bio-bwa.sourceforge.net/> <http://sourceforge.net/projects/bio-bwa/files/>
# java <http://www.oracle.com/technetwork/java/javase/overview/index.html>
# picard <http://picard.sourceforge.net/> <http://sourceforge.net/projects/picard/files/>
# seqtk <https://github.com/lh3/seqtk>
## This file also requires exome.lib.sh - which contains various functions used throughout the Exome analysis scripts; this file should be in the same directory as this script
###############################################################
#set default arguments
usage="
ExmAln.1b.ReAlign_Bam_with_BWAmem.sh -i <InputFile> -r <reference_file> -t <target intervals file> -a <readgroup_ID> -s <NewSampleName> -l <logfile> -PRFH
-i (required) - Path to Bam file
-r (required) - shell file containing variables with locations of reference files and resource directories
-l (optional) - Log file
-t (optional) - Exome capture kit targets or other genomic intervals bed file (must end .bed for GATK compatability); this file is required if calling the pipeline but otherwise can be omitted
-a (optional) - The RG ID to process, used when processing individual read groups separately
-s (optional) - A new sample name to place in the SM field of the RG header
-P (flag) - Initiate exome analysis pipeline after completion of script
-F (flag) - Fix mis-encoded base quality scores - Rewrite the qulaity scores from Illumina 1.5+ to Illumina 1.8+
-H (flag) - echo this message and exit
"
PipeLine="false"
FixMisencoded="false"
SplitReads="false"
#get arguments
while getopts i:r:l:t:a:s:PFH opt; do
case "$opt" in
i) InpFil="$OPTARG";;
r) RefFil="$OPTARG";;
l) LogFil="$OPTARG";;
t) TgtBed="$OPTARG";;
a) RGID="$OPTARG";;
s) NewSam="$OPTARG";;
P) PipeLine="true";;
F) FixMisencoded="true";;
H) echo "$usage"; exit;;
esac
done
#check all required paramaters present
if [[ ! -e "$InpFil" ]] || [[ ! -e "$RefFil" ]]; then echo "Missing/Incorrect required arguments"; echo "$usage"; exit; fi
#Call the RefFil to load variables
RefFil=`readlink -f $RefFil`
source $RefFil
#Load script library
source $EXOMPPLN/exome.lib.sh #library functions begin "func"
#set local variables
BamFil=`readlink -f $InpFil` #resolve absolute path to bam
BamNam=`basename $BamFil`
BamNam=`basename $BamFil | sed s/.bam$//` # a name for the output files
if [[ -z "$LogFil" ]]; then LogFil=$BamNam.BbB.log; fi # a name for the log file
AlnDir=wd.$BamNam.align # directory in which processing will be done
RgIdLst=$BamNam.byRGID.list # a file containing a list of RG ID specific bamfiles
AlnFil=$BamNam.bwamem.bam #filename for bwa-mem aligned file
SngFil=$BamNam.singletons.gz #output file for the singletons to be dumped to
SrtFil=$BamNam.bwamem.sorted.bam #output file for sorted bam
DdpFil=$BamNam.bwamem.mkdup.bam #output file with PCR duplicates marked
FlgStat=$BamNam.bwamem.flagstat #output file for bam flag stats
IdxStat=$BamNam.idxstats #output file for bam index stats
AlnDir=`readlink -f $AlnDir` #resolve absolute path to working directory
mkdir -p $AlnDir # create working directory
cd $AlnDir # move into working directory
TmpLog=$BamNam.BwB$RGID.temp.log #temporary log file
TmpDir=$BamNam.BwB$RGID.tempdir; mkdir -p $TmpDir #temporary directory
#start log
ProcessName="Align with BWA"
if [[ $RGID ]]; then ProcessName="Split by readgroups and Align with BWA"; fi
funcWriteStartLog
echo " Build of reference files: "$BUILD >> $TmpLog
echo "----------------------------------------------------------------" >> $TmpLog
#get ReadGroupHeader from input BAM and check readgroup header
RgHeader=$(samtools view -H $BamFil | grep -m 1 ^@RG | awk '{ gsub("\t","\\t") } { print }')
if [[ $RgHeader == "" ]]; then #check that we have a RG header and if not write a warning to the log file and exit
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" >> $TmpLog
echo " Problem with ReadGroup header" >> $TmpLog
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" >> $TmpLog
cat $TmpLog >> $LogFil
rm $TmpLog
exit 1
fi
#check for multiple readgroups and split if necessary
if [[ ! $RGID ]]; then
NumOfRG=$(samtools view -H $BamFil | grep ^@RG | wc -l)
if [[ $NumOfRG -gt 1 ]]; then
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" >> $TmpLog
echo "There are "$NumOfRG" Readgroups in the bam:" >> $TmpLog
samtools view -H $BamFil | grep ^@RG >> $TmpLog
echo >> $TmpLog
echo "They will be processed separately and remerged....">> $TmpLog
cd ../ #move back out of working directory
if [[ "$PipeLine" == "true" ]]; then KeepPipe="true"; fi ## need to set the PipeLine status to true to call the seperate realignment jobs, but we need to reset the status in the final merge job
PipeLine="true"
LogFil=$AlnDir/$LogFil
TmpLog=$AlnDir/$TmpLog
TmpDir=$AlnDir/$TmpDir
for RGID in `samtools view -H $BamFil | grep ^@RG | sed s/.*ID://g | sed 's/\t.*//g'`; do
echo $RGID
NextCmd="$EXOMPPLN/ExmAln.1b.ReAlign_Bam_with_BWAmem.sh -i $BamFil -a $RGID -r $RefFil -l $LogFil"
if [[ $TgtBed ]]; then NextCmd=$NextCmd" -t $TgtBed"; fi
if [[ "$FixMisencoded" == "true" ]]; then NextCmd=$NextCmd" -F"; fi
funcPipeLine
pids[${RGi}]=$! #collect the pids of the child processes for the wait command
done
#End Log
echo "pids: "${pids[*]}
funcWriteEndLog
echo "Creating mergelist and sending merge command with hold." >> $TmpLog
samtools view -H $BamFil | grep ^@RG | sed "s/.*\<ID:/$BamNam.RGID_/" | sed -e 's/\s\+.*/.bwamem.mkdup.bam/' | awk -v PWD=$PWD '{ print PWD"/"$0 }' > $RgIdLst
NextCmd="$EXOMPPLN/ExmAdHoc.2.MergeBams.sh -i $RgIdLst -r $RefFil -t $TgtBed -l $LogFil"
if [[ "$KeepPipe" == "true" ]]; then
NextCmd=$NextCmd" -P"
else
NextCmd=$NextCmd" -M"
fi
for pid in ${pids[*]}; do wait $pid; done #wait until all of the realignment processes to run before finally calling the merge job
#funcPipeLine
exit 0
fi
else
echo "Splitting Bam by readgroups..." >> $TmpLog
echo "ReadGroup: " $RGID >> $TmpLog
RgHeader=$(samtools view -H $BamFil | grep ^@RG | head -n $RGID | tail -n 1 | awk '{ gsub("\t","\\t") } { print }')
AlnFil=$BamNam.RGID_$RGID.bwamem.bam
SngFil=$BamNam.RGID_$RGID.singletons.gz #output file for the singletons to be dumped to
SrtFil=$BamNam.RGID_$RGID.bwamem.sorted.bam #output file for sorted bam
DdpFil=$BamNam.RGID_$RGID.bwamem.mkdup.bam #output file with PCR duplicates marked
FlgStat=$BamNam.RGID_$RGID.bwamem.flagstat #output file for bam flag stats
IdxStat=$BamNam.RGID_$RGID.idxstats #output file for bam index stats
fi
#Replace Sample Name in RG header if necessary
if [[ $NewSam ]]; then
echo -e "Old ReadGroup header: $RgHeader" >> $TmpLog
RgHeader=$(echo $RgHeader | perl -pe 's|SM:.*?\\t|SM:'$NewSam'\\t|')
echo -e "New ReadGroup header: $RgHeader" >> $TmpLog
else
echo -e "ReadGroup header: $RgHeader" >> $TmpLog
fi
###Align using BWA mem algorithm
# use HTSlib to shuffle the bam | then tranform it to an interleaved fastq discarding an singletons to a separate file as they mess up the interleaving | transform sam back to bam
#StepCmd="samtools bamshuf -uOn 128 $BamFil tmp | samtools bam2fq -s $SngFil -O - | gzip | bwa mem -M -R \"$RgHeader\" -t 6 -p $REF - | samtools view -bS - > $AlnFil"
StepName="Align with BWA mem"
StepCmd="samtools bamshuf -uOn 128 $BamFil tmp |
samtools bam2fq -s $SngFil -O - |"
if [[ $FixMisencoded == "true" ]]; then
StepCmd=$StepCmd" seqtk seq -Q64 -V - |"
fi
StepCmd=$StepCmd" gzip | bwa mem -M -R \"$RgHeader\" -t 6 -p $REF - |
samtools view -bS - > $AlnFil"
funcRunStep
#Sort the bam file by coordinate
StepName="Sort Bam using PICARD"
StepCmd="java -Xmx4G -XX:ParallelGCThreads=1 -Djava.io.tmpdir=$TmpDir -jar $PICARD SortSam
INPUT=$AlnFil
OUTPUT=$SrtFil
SORT_ORDER=coordinate
CREATE_INDEX=TRUE"
funcRunStep
rm $AlnFil #removed the "Aligned bam"
#Mark the duplicates
StepName="Mark PCR Duplicates using PICARD"
StepCmd="java -Xmx4G -XX:ParallelGCThreads=1 -Djava.io.tmpdir=$TmpDir -jar $PICARD MarkDuplicates
INPUT=$SrtFil
OUTPUT=$DdpFil
METRICS_FILE=$DdpFil.dup.metrics.txt
CREATE_INDEX=TRUE"
funcRunStep
rm $SrtFil ${SrtFil/bam/bai} #removed the "Sorted bam"
if [[ ! $RGID ]]; then
#Call next steps of pipeline if requested
NextJob="Run Genotype VCF"
NextCmd='$EXOMPPLN/ExmAln.2.HaplotypeCaller_GVCFmode.sh -i $DdpFil -r $RefFil -t $TgtBed -l $LogFil -B > MakegVCF.$BamNam.$$.o 2>&1'
funcPipeLine
#Always call QC
PipeLine="true"
NextJob="Get basic bam metrics"
NextCmd="$EXOMPPLN/ExmAln.3a.Bam_metrics.sh -i $DdpFil -r $RefFil -l $LogFil > GetBamMets.$BamNam.$$.o 2>&1"
funcPipeLine
funcPipeLineNextJob="Run Depth of Coverage"
NextCmd='$EXOMPPLN/ExmAln.8a.DepthofCoverage.sh -i $DdpFil -r $RefFil -t $TgtBed -l $LogFil -B > DoC.$BamNam.$$.o 2>&1'
funcPipeLine
#Get flagstat
StepName="Output flag stats using Samtools"
StepCmd="samtools flagstat $DdpFil > $FlgStat"
funcRunStep
#get index stats
StepName="Output idx stats using Samtools"
StepCmd="samtools idxstats $DdpFil > $IdxStat"
funcRunStep
fi
#End Log
funcWriteEndLog