-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcondenseAlleles.pl
executable file
·129 lines (95 loc) · 3.57 KB
/
condenseAlleles.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use File::Path;
use File::Basename;
our @input;
#Call subroutine to parse command-line arguments
parseArgs();
#iterate through input files
@input = glob "@input";
foreach my $file ( @input ){
my $name = "";
my %data;
#Open each file
open ( FAS, "$file" ) || die "Derp: Cannot open $file!";
while (<FAS>){
chomp $_;
if ($_ =~ /\>/ ){
$_ =~ />(\S+)/;
$name = "$1";
}else{
push @{$data{$name}}, $_;
}
}
consense( \%data, $file );
}
close FAS;
###########################################SUBROUTINES#########################################
sub parseArgs{
my $usage="\ncondenseAlleles.pl takes FASTA files with individual alleles encoded separately, and generates a consensus sequence for each individual
Usage: $0 --i /path/to/*.fasta
Mandatory Variables
-i, --input - path to input files in FASTA format\n\n";
my $result = GetOptions
(
'input|s=s{1,}' => \@input,
);
@input or die "\nDerp: Input not specified!\n\n$usage";
}
############################################################################################
#This subroutine takes a hash of arrays, where the array stores the alleles for each sample (key in the hash), and spits out a consensus sequence for any heterozygote.
#
#WARNING: This script may contain excessive warnings!
#
#WARNING: OVERWRITES THE ORIGINAL FILE!!
#
#ANOTHER WARNING: Not written to accomodate polyploids, or paralogous loci. I will be implementing this as a part of post-processing the output of Stacks, which should have removed any loci with individuals having more than 2 alleles (as these indicate presence of paralogs, which I don't want).
#
#YET ANOTHER ANOTHER WARNING: This script also assumes that alleles are of the same length, and pre-aligned.
sub consense{
my $datref = $_[0];
my $file = $_[1];
#Empty current file to start rewriting (all info stored in %info hash now)
open ( OUT, ">$file" ) || die "Derp: Oh noes! I'm in the subroutine and cannot open $file!";
foreach my $key ( sort {$a <=> $b } ( keys %$datref ) ) {
print OUT "\>$key\n"; #Print FASTA header
if ( exists $$datref{$key}->[1] ) { #Check if sequence has multiple alleles
#Load sequences into arrays for comparison
my @allele1 = split //, $$datref{$key}->[0];
my @allele2 = split //, $$datref{$key}->[1];
for ( my $i=0; $i <= length $$datref{$key}->[0]; $i++ ){
#This is messy and sucks. Fix later with Bio::AlignIO and consensus_iupac function
if ( uc ( $allele1[$i] ) eq uc ( $allele2[$i] ) ) {
uc ( $allele1[$i] ) eq "A" and print OUT "A";
uc ( $allele1[$i] ) eq "G" and print OUT "G";
uc ( $allele1[$i] ) eq "T" and print OUT "T";
uc ( $allele1[$i] ) eq "C" and print OUT "C";
}else{
if ( uc $allele1[$i] eq "A" ){
uc $allele2[$i] eq "G" and print OUT "R";
uc $allele2[$i] eq "C" and print OUT "M";
uc $allele2[$i] eq "T" and print OUT "W";
}elsif ( uc $allele1[$i] eq "G" ){
uc $allele2[$i] eq "A" and print OUT "R";
uc $allele2[$i] eq "C" and print OUT "S";
uc $allele2[$i] eq "T" and print OUT "K";
}elsif ( uc $allele1[$i] eq "T" ){
uc $allele2[$i] eq "A" and print OUT "W";
uc $allele2[$i] eq "C" and print OUT "Y";
uc $allele2[$i] eq "G" and print OUT "K";
}elsif ( uc $allele1[$i] eq "C" ){
uc $allele2[$i] eq "A" and print OUT "M";
uc $allele2[$i] eq "G" and print OUT "S";
uc $allele2[$i] eq "T" and print OUT "Y";
}
}
}
print OUT "\n";
}else{
print OUT $$datref{$key}->[0],"\n";
}
}
#close OUT;
}