-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathpfam2gff.py
executable file
·354 lines (327 loc) · 18.3 KB
/
pfam2gff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
#!/usr/bin/env python
#
# pfam2gff.py v1.0 created 2016-03-03
#
# Sequence Ontology terms from:
# https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/master/subsets/SOFA.obo
'''
pfam2gff.py last modified 2022-11-14
EXAMPLE USAGE:
to convert to protein gff, where domains are protein coordinates
pfam2gff.py -i proteins.pfam.tab > proteins.pfam.gtf
to convert go genome gff, where domains are in genomic coordinates
must use -g with transcript coordinates, can be exon or CDS
pfam2gff.py -i proteins.pfam.tab -g genes.gff > genome.pfam.gtf
if using TransDecoder for peptide prediction, use mode -T
and use the TransDecoder genome GFF file for -g
this corrects for the CDS IDs as ID=cds.gene123 by removing the cds.
pfam2gff.py -i transdecoder.pfam.tab -g transdecoder_genome.gff3 -T > transdecoder.pfam.gff
if using AUGUSTUS, use -T, and -d __ for extract_features.py proteins
pfam2gff.py -i augustus.prots.pfam.tab -g augustus.exons.gff -T -d __ > augustus.pfam.gff
GENERATE AUGUSTUS EXONS BY:
reformatgff.py -a augustus.gff > augustus.exons.gff
GENERATE AUGUSTUS PROTS BY:
extract_features.py -p augustus.prots.fasta augustus.gff
GENERATE PFAM TABULAR BY:
hmmscan --cpu 4 --domtblout proteins.pfam.tab ~/PfamScan/data/Pfam-A.hmm transcripts.transdecoder.pep > proteins.pfam.log
'''
import os
import sys
import time
import argparse
import re
import gzip
from collections import defaultdict
from itertools import chain
def cds_to_intervals(gtffile, genesplit, keepexons, transdecoder, jgimode, nogenemode):
'''convert protein or gene intervals from gff to dictionary where mrna IDs are keys and lists of intervals are values'''
# this should probably be a class
geneintervals = defaultdict(list)
genestrand = {}
genescaffold = {}
commentlines = 0
linecounter = 0
transcounter = 0
exoncounter = 0
# allow gzipped files
if gtffile.rsplit('.',1)[-1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write("# Parsing gff from {} as gzipped {}\n".format(gtffile, time.asctime() ) )
else: # otherwise assume normal open for fasta format
opentype = open
sys.stderr.write("# Parsing gff from {} {}\n".format(gtffile, time.asctime() ) )
# extract gene or CDS information
for line in opentype(gtffile,'r'):
line = line.strip()
if line: # ignore empty lines
if line[0]=="#": # count comment lines, just in case
commentlines += 1
else:
linecounter += 1
lsplits = line.split("\t")
scaffold = lsplits[0]
feature = lsplits[2]
strand = lsplits[6]
attributes = lsplits[8]
if attributes.find("ID")==0: # indicates gff3 format
geneid = re.search('ID=([\w.|-]+)', attributes).group(1)
elif attributes.find("Parent")==0: # gff3 format but no ID
geneid = re.search('Parent=([\w.|-]+)', attributes).group(1)
elif attributes.find("gene_id")==0: # indicates gtf format
geneid = re.search('transcript_id "([\w.|-]+)";', attributes).group(1)
elif jgimode and attributes.find("name")==0: # for JGI like: name "fgeneshTA2_pg.C_scaffold_1000001";
geneid = re.search('name "([\w.|-]+)";', attributes).group(1)
if transdecoder: # meaning CDS IDs will start with cds.gene.123|m.1
geneid = geneid.replace("cds.","") # simply remove the cds.
geneid = geneid.replace(".cds","") # also works for AUGUSTUS
if genesplit: # if splitting, split all gene IDs
geneid = geneid.split(genesplit,1)[0]
if feature=="transcript" or feature=="mRNA":
transcounter += 1
genestrand[geneid] = strand
genescaffold[geneid] = scaffold
elif feature=="CDS" or (keepexons and feature=="exon"):
exoncounter += 1
boundaries = ( int(lsplits[3]), int(lsplits[4]) )
if nogenemode: # gtf contains only exon and CDS, so get gene info from each CDS
geneid = re.search('transcript_id "([\w.|-]+)";', attributes).group(1)
# this may reassign multiple times
genestrand[geneid] = strand
genescaffold[geneid] = scaffold
geneintervals[geneid].append(boundaries)
# sys.stderr.write("{} {} {}\n".format(geneid, scaffold, boundaries) )
sys.stderr.write("# Counted {} lines and {} comments {}\n".format(linecounter, commentlines, time.asctime() ) )
sys.stderr.write("# Counted {} exons for {} transcripts\n".format(exoncounter, transcounter) )
sys.stderr.write("# Gene IDs taken as {} from {}\n".format(geneid, attributes) )
return geneintervals, genestrand, genescaffold
def parse_pfam_domains(pfamtabular, evaluecutoff, lengthcutoff, programname, outputtype, donamechop, debugmode=False, jgimode=False, geneintervals=None, genestrand=None, genescaffold=None):
'''parse domains from hmm domtblout and write to stdout as protein gff or genome gff'''
domaincounter = 0
protnamedict = defaultdict(int) # presence of a name, or domain count
evalueRemovals = 0
shortRemovals = 0
intervalproblems = 0
intervalcounts = 0
writecount = 0
# for protein GFF, keep domains in dict for later sorting by position
protboundstoline = defaultdict(dict)
# allow hmmscan or interproscan format
is_pfam_format, is_interproscan = False, False
# allow gzipped files
if pfamtabular.rsplit('.',1)[-1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write("# Parsing hmmscan PFAM tabular {} as gzipped {}\n".format(pfamtabular, time.asctime() ) )
else: # otherwise assume normal open for fasta format
opentype = open
sys.stderr.write("# Parsing hmmscan PFAM tabular {} {}\n".format(pfamtabular, time.asctime() ) )
for line in opentype(pfamtabular, 'rt'):
line = line.strip()
if not line or line[0]=="#": # skip comment lines
continue # also catch for empty line, which would cause IndexError
domaincounter += 1
# try to autodetect format
if line.count("\t") == 12 or line.count("\t") == 14: # should be 15 column tab separated
is_interproscan = True
else: # default, should have no tabs
is_pfam_format = True
if is_pfam_format:
# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target
#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------
lsplits = line.split(None, 22)
targetname = lsplits[0]
pfamacc = lsplits[1].rsplit('.',1)[0] # accs as PF00530.13, so chop off .13
#queryid = lsplits[3].rsplit('|',1)[0] # chop transcript from transdecoder peptide
queryid = lsplits[3] # for transdecoder peptides, |m.123 is needed for interval identification
if donamechop:
queryid = queryid.rsplit(donamechop,1)[0]
protnamedict[queryid] = 1
evalue = float(lsplits[11]) # or [6] for full seq or [12]
domscore = lsplits[13]
domstart = int(lsplits[17])
domend = int(lsplits[18])
domnumber = lsplits[9]
# include target description in the Name, change disallowed characters to -
targetdescription = lsplits[22].replace("=","-").replace(",","-").replace(";","-")
targetdescription = targetdescription.replace(" ","_")
domainlength = domend - domstart + 1 # bases 1 to 6 should have length 6
fractioncov = domainlength/float(lsplits[2])
# filter matches by evalue, length, etc
if fractioncov < lengthcutoff: # skip domains that are too short
shortRemovals += 1
continue
if evalue >= evaluecutoff: # skip domains with bad evalue
evalueRemovals += 1
continue
elif is_interproscan:
# 0 1 2 3 4 5 6 7 8 9
# example 538ce3cc6086b9317efa6f6b9b76fbf0 1174 PANTHER PTHR32154 PYRUVATE-FLAVODOXIN OXIDOREDUCTASE-RELATED 1 1165 0.0 T 12-11-2022 - -
lsplits = line.split("\t")
targetname = lsplits[4]
pfamacc = lsplits[12]
queryid = lsplits[0] #
protnamedict[queryid] = 1
evalue = lsplits[8] # or [6] for full seq or [12]
domscore = "-"
domstart = int(lsplits[6])
domend = int(lsplits[7])
domnumber = protnamedict.get(queryid,0)
# include target description in the Name, change disallowed characters to -
targetdescription = lsplits[5].replace("=","-").replace(",","-").replace(";","-")
targetdescription = targetdescription.replace(" ","_")
domainlength = domend - domstart + 1 # bases 1 to 6 should have length 6
programname = "InterProScan"
else:
sys.stderr.write("# WARNING: cannot detect format of line{}:\n{}\n".format(domaincounter, line))
### FOR GENOME GFF ###
if geneintervals: # if gene intervals are given in genomic coordinates
genomeintervals = [] # to have empty iterable
# convert domain protein positions to transcript nucleotide
# protein position 1 becomes nucleotide position 1, position 2 becomes nucleotide 4, 3 to 7
domstart = (domstart - 1) * 3 + 1
domend = domend * 3 # end is necessarily the end of a codon
domainlength_nucl = domend - domstart + 1 # recalculate for nucleotide coordinates
scaffold = genescaffold.get(queryid, None)
strand = genestrand.get(queryid, None)
# convert transcript nucleotide to genomic nucleotide, and split at exon bounds
if strand=='+':
genomeintervals = get_intervals(geneintervals[queryid], domstart, domainlength_nucl, doreverse=False)
elif strand=='-': # implies '-'
genomeintervals = get_intervals(geneintervals[queryid], domstart, domainlength_nucl, doreverse=True)
elif strand=='.': # strand is specified as '.'
sys.stderr.write("WARNING: no strand given for {}, using forward\n".format(queryid) )
genomeintervals = get_intervals(geneintervals[queryid], domstart, domainlength_nucl, doreverse=False)
else: # strand is None, meaning queryid is not in genestrand dict
sys.stderr.write("WARNING: no match in GFF for query {}\n".format(queryid) )
continue
intervalcounts += len(genomeintervals)
if not len(genomeintervals):
sys.stderr.write("WARNING: no intervals for {} in {}\n".format(targetname, queryid) )
intervalproblems += 1
continue
writecount += 1
# make Parent feature
allpositions = list(chain(*genomeintervals))
parentstart = min(allpositions)
parentend = max(allpositions)
# ID consists of: query gene, "target name" of up to 21 characters, domain number
# ID=g1.t1.VWA.1
# Name consists of: PFAM accession, target name, target description
# Name=PF00092.VWA.von_Willebrand_factor_type_A_domain
outline = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t.\tID={10}.{8}.{9};Name={7}.{8}.{11}\n".format(scaffold, programname, outputtype, parentstart, parentend, domscore, strand, pfamacc, targetname, domnumber, queryid, targetdescription)
sys.stdout.write(outline+os.linesep)
# make child features for each interval
for interval in genomeintervals:
# thus ID appears as protein.targetname.number,
# so avic1234.G2F.1, and uses ID in most browsers
outline = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t.\tParent={10}.{8}.{9};Name={7}.{8}.{11}\n".format(scaffold, programname, outputtype, interval[0], interval[1], domscore, strand, pfamacc, targetname, domnumber, queryid, targetdescription)
sys.stdout.write(outline)
### FOR PROTEIN GFF ###
else: # for protein GFF, make outline for later sorting
writecount += 1
boundaries = (domstart,domend)
if debugmode:
bitlength = float(lsplits[13])/domainlength
outline = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{9:.3f}\t{10:.3f}\tID={6}.{7}.{8};Name={6}.{7}.{8}\n".format(queryid, programname, outputtype, domstart, domend, domscore, pfamacc, targetname, domnumber, bitlength, fractioncov)
else:
outline = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t.\t.\tID={6}.{7}.{8};Name={6}.{7}.{8}\n".format(queryid, programname, outputtype, domstart, domend, domscore, pfamacc, targetname, domnumber)
protboundstoline[queryid][boundaries] = outline
# report totals
sys.stderr.write("# Found {} domains for {} proteins, wrote {} {}\n".format(domaincounter, len(protnamedict), writecount, time.asctime() ) )
if geneintervals: # in genome GFF mode, check if any CDS intervals were actually collected
if intervalcounts:
sys.stderr.write("# Wrote {} domain intervals\n".format(intervalcounts) )
else:
sys.stderr.write("WARNING: NO DOMAINS WRITTEN, CHECK OPTIONS -d AND -D\n")
sys.stderr.write("# Removed {} domain hits by shortness\n".format(shortRemovals) )
sys.stderr.write("# Removed {} domain hits by evalue\n".format(evalueRemovals) )
if intervalproblems:
sys.stderr.write("# {} genes have domains extending beyond gene bounds\n".format(intervalproblems) )
if protboundstoline: # should be empty unless in protein GFF mode, meaning no genomic intervals
for protid, boundlines in protboundstoline.items(): # sort proteins by start position
for bounds in sorted(boundlines.keys()):
sys.stdout.write(boundlines[bounds])
# NO RETURN
def get_intervals(intervals, domstart, domlength, doreverse=True):
'''return a list of intervals with genomic positions for the domain'''
# example domain arrangement for forward strand
# intervals from 50,101 127,185 212,300
# protein domain 71,101 127,185 212,256
# in nucleotides 31 59 45
# in amino acids 10.3 19.6 15 = 45
# for domstart at 22 and domlength of 135
# basestostart is always from transcript N-terminus nucleotide
# so for forward transcripts, basestostart would be 22, so that 50+22-1=71
basestostart = int(domstart) # this value always should be 1 or greater
genomeintervals = [] # will contain a list of tuples
for interval in sorted(intervals, key=lambda x: x[0], reverse=doreverse):
intervallength = interval[1]-interval[0]+1 # corrected number of bases
if basestostart >= intervallength: # ignore intervals before the start of the domain
# print(interval, domstart, basestostart, domlength, intervallength+os.linesep)
basestostart -= intervallength
# in example, 101-50+1 = 52, 22 < 52, so else
else: # bases to start is fewer than length of the interval, meaning domain must start here
if doreverse: # reverse strand domains
# if domain continues past an interval, domstart should be equal to interval[1]
domstart = interval[1] - basestostart + 1 # correct for base numbering at end of interval
if domstart - interval[0] + 1 >= domlength: # if the remaining part of the domain ends before the start of the interval
# then define the last boundary and return the interval list
genomebounds = (domstart-domlength+1, domstart) # subtract remaining length
genomeintervals.append(genomebounds)
return genomeintervals
else:
genomebounds = (interval[0], domstart)
genomeintervals.append(genomebounds)
domlength -= (domstart - interval[0] + 1)
basestostart = 1 # start at the next interval
else: # for forward stranded domains
# first interval, domstart should be 50+22-1 = 71
# second interval, domstart should be 127+1-1 = 127
# third interval, domstart should be 212+1-1 = 212
domstart = interval[0] + basestostart - 1 # correct for base numbering
if interval[1] - domstart + 1 >= domlength: # if the remaining part of the domain ends before the end of the interval
# then define the last boundary and return the interval list
genomebounds = (domstart, domstart+domlength-1) # add remaining length for last interval
genomeintervals.append(genomebounds)
return genomeintervals
else:
# first interval, genomebounds should be 71,101
# domlength should be 135 - (101-71+1 = 31) = 104
# second interval, genomebounds should be interval[0], domstart, so 127
# domlength should be 104 - (185-127+1 = 59) = 45
genomebounds = (domstart, interval[1])
genomeintervals.append(genomebounds)
domlength -= (interval[1] - domstart + 1)
basestostart = 1 # next domstart should be interval[0] for next interval
if domlength < 1: # catch for if all domain length is accounted for
return genomeintervals
else:
sys.stderr.write("WARNING: cannot finish domain at {} for {} in {}\n".format(domstart, domlength, intervals) )
return genomeintervals
def main(argv, wayout):
if not len(argv):
argv.append("-h")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-i','--input', help="PFAM domain information as hmmscan tabular, or InterProScan .tsv file")
parser.add_argument('-d','--gene-delimiter', help="optional delimiter for gene names in GFF, cuts off end split")
parser.add_argument('-D','--prot-delimiter', help="optional delimiter for protein names in PFAM table, cuts off end split")
parser.add_argument('-e','--evalue', type=float, default=1e-1, help="evalue cutoff for domain filtering [1e-1]")
parser.add_argument('-g','--genes', help="genes or proteins in gff format")
parser.add_argument('-J','--JGI', action="store_true", help="use presets for JGI format files")
parser.add_argument('-l','--length-cutoff', type=float, default=0.3, help="minimum length coverage of domains [0.3]")
parser.add_argument('-n','--no-genes', action="store_true", help="genes are not defined, get gene ID for each exon")
parser.add_argument('-p','--program', help="program for 2nd column in output [hmmscan]", default="hmmscan")
parser.add_argument('-t','--type', help="gff type [PFAM]", default="PFAM")
# this could more properly be SO:0000349 protein_match
parser.add_argument('-T','--transdecoder', action="store_true", help="use presets for TransDecoder genome gff")
parser.add_argument('-x','--exons', action="store_true", help="exons define coding sequence")
parser.add_argument('--debug', action="store_true", help="debug some output options")
args = parser.parse_args(argv)
if args.genes:
geneintervals, genestrand, genescaffold = cds_to_intervals(args.genes, args.gene_delimiter, args.exons, args.transdecoder, args.JGI, args.no_genes)
parse_pfam_domains(args.input, args.evalue, args.length_cutoff, args.program, args.type, args.prot_delimiter, args.debug, args.JGI, geneintervals, genestrand, genescaffold)
else: # assume protein gff
parse_pfam_domains(args.input, args.evalue, args.length_cutoff, args.program, args.type, args.prot_delimiter, args.debug)
if __name__ == "__main__":
main(sys.argv[1:],sys.stdout)