-
Notifications
You must be signed in to change notification settings - Fork 27
/
removeredundantgff.py
executable file
·92 lines (82 loc) · 3.61 KB
/
removeredundantgff.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
# removeredundantgff.py
# v1.0 2015-07-23
'''
removeredundantgff.py last modified 2015-07-24
remove identical gene predictions from gff3 file
EXAMPLE USAGE:
removeredundantgff.py -g genewise.gff3 > genewise.nonredundant.gff3
IF EXONS WERE PREVIOUSLY EXCLUDED, GET RANGE INFORMATION FROM CDS:
removeredundantgff.py -g genewise.gff3 -C > genewise.nonredundant.gff3
'''
import sys
import argparse
import time
import re
from collections import defaultdict
def main(argv, wayout):
if not len(argv):
argv.append("-h")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-g','--gff', help="gff3 format file")
parser.add_argument('-d','--delimiter', default=".", help="delimiter for separating names and IDs [.]")
parser.add_argument('-C','--cds', action="store_true", help="ranges are by CDS rather than exons")
parser.add_argument('-E','--exons', action="store_false", help="exclude exons when writing output")
parser.add_argument('-v','--verbose', help="verbose output", action="store_true")
args = parser.parse_args(argv)
exonScaffold = defaultdict(lambda: defaultdict(list)) # dict of dicts of lists, as scaffold[gene][exons]
geneorder = [] # keep order of genes read in the file, to make output deterministic
scaffoldbygene = defaultdict(list) # all scaffold hits by each gene, might only be one each
linesbyid = defaultdict(list) # this stores all GFF output lines by the gene ID
genecount, mrnacount, exoncount, cdscount = 0,0,0,0
print >> sys.stderr, "Starting exon parsing on {}".format(args.gff), time.asctime()
for line in open(args.gff,'r').readlines():
line = line.rstrip()
if not line or line[0]=="#":
continue
lsplits = line.split("\t")
if lsplits[2]=="gene":
genecount += 1
transid = re.search("ID=([\w.]+);", lsplits[8]).group(1)
geneorder.append(transid)
scaffoldbygene[transid].append( lsplits[0] )
linesbyid[transid].append(line)
elif lsplits[2]=="mRNA":
mrnacount += 1
linesbyid[transid].append(line)
elif lsplits[2]=="exon":
exoncount += 1
if not args.cds:
contig, startpos, endpos, strand = lsplits[0], int(lsplits[3]), int(lsplits[4]), lsplits[6]
boundaries = (startpos,endpos)
#if strand=="+":
exonScaffold[contig][transid].append(boundaries)
#else: # strand=="-":
if args.exons:
linesbyid[transid].append(line)
elif lsplits[2]=="CDS":
cdscount += 1
if args.cds:
contig, startpos, endpos, strand = lsplits[0], int(lsplits[3]), int(lsplits[4]), lsplits[6]
boundaries = (startpos,endpos)
exonScaffold[contig][transid].append(boundaries)
linesbyid[transid].append(line)
print >> sys.stderr, "Counted {} gene and {} mRNA predictions".format(genecount, mrnacount), time.asctime()
print >> sys.stderr, "Counted {} exons and {} CDS".format(exoncount, cdscount), time.asctime()
printcount = 0
exonsorted = defaultdict(dict) # dict of scaffold and then exon boundaries as list of tuples
print >> sys.stderr, "Starting duplicate gene sorting", time.asctime()
for gene in geneorder:
for sc in scaffoldbygene[gene]:
# this generates a string of the sorted list of exon boundaries
exonorder = str(sorted(exonScaffold[sc][gene]))
# such as "[(19899, 20013), (20080, 20137), (20218, 20335)]"
### TODO allow removal of substring matches
if not exonorder in exonsorted[sc]:
printcount += 1
exonsorted[sc][exonorder] = True
for line in linesbyid[gene]:
print >> wayout, line
print >> sys.stderr, "Printed {} non redundant predictions".format(printcount), time.asctime()
if __name__ == "__main__":
main(sys.argv[1:],sys.stdout)