-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblastnExtract.py
More file actions
executable file
·218 lines (181 loc) · 9.01 KB
/
blastnExtract.py
File metadata and controls
executable file
·218 lines (181 loc) · 9.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/home/mplace/anaconda3/bin/python
"""
@Program: blastnExtract.py
@Purpose: Extract a gene plus & minus user defined window from a sequence
assembly. Program assumes you are running in the blast file directory.
@Input: 1) User list of blast tab files, assumes header for each read looks
something like :
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
Assembly file are assumed to be in a different dir
and named the same way,
2) Directory where sequence assemblies are located. Sequence Assemblies
are assumed to have been indexed with samtools.
example: samtools faidx sample.fa
3) Window around blast coordinates to extract from assembly, 1000 = 1000 base pairs
up & down stream of cds start and stop position.
@Dependencies:
Python 3,
BioPython,
samtools (for indexing assembly and extracting region)
@Output: ORF specific fasta files, one for each orf with Sequence extracted
in fasta format by samtools
@author: Mike Place
@Date: 8/05/2015
"""
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from collections import OrderedDict
import argparse
import os
import re
import subprocess
import sys
def extract( refFile, strainName, assemblyDir, window ): #, assemblyDir ):
"""
Use samtools faidx reference.fasta chr:start-end to extract cds region
from reference. Then call writeFasta().
example:
samtools faidx yjm555.fa Chr07:1060052-1060879
BioPython Seq and SeqIO are used to write output.
Each ORF file created has all the sequences across all strains
for that ORF.
"""
reference = assemblyDir + strainName + ".fa"
print(strainName) # hold final sequences for printing
for k,v in refFile.items():
orfResults = k + "-results.fa"
out = open(orfResults, 'a')
rvse = False
# check for start position > end position (reverse strand)
if v[2] > v[3]:
tmp = v[3]
v[3] = v[2]
v[2] = tmp
rvse =True
# check for start position less than zero
if v[2] - window < 0:
v[2] = 0
else:
v[2] = v[2] - window
v[3] += window # Set right side of window
loc = str(v[2]) + "-" + str(v[3]) # create start-end for samtools extraction
# Now run samtools to extract sequence from reference assembly
cmd = [ 'samtools', 'faidx', reference, v[0] + ":" + loc ]
output = subprocess.Popen(cmd, stdout=subprocess.PIPE,stderr=subprocess.PIPE).communicate()
result = output[0].decode('utf-8')
# format result for writing to file
tmpSeq = result.split()
seqName = re.sub(r">", "", tmpSeq.pop(0) )
orfSeq = ''.join(tmpSeq)
orfSeq = Seq(orfSeq)
if rvse:
orfSeq = orfSeq.reverse_complement()
rvse = False
record = SeqRecord(orfSeq, id = seqName, description = strainName)
SeqIO.write(record, out, "fasta")
def processFiles( blastFiles, assemblyDir, window ):
"""
Loop through the blast tab files, get list of regions to extract for each file
then call extract(). The region to extract is defined as the "winner", i.e.
the best hit for a given ORF based on percent ID and bitscore
blastFiles = list of blast tab files to process one at a time
assemblyDir = directory (different from cds file dir) contains sequence assemblies
window = window to extract on both sides of orf
"""
for f in blastFiles:
with open(f) as blastF:
# orderedDict, key = orf (ORF1) value = list, [0] = chrom [1] = %id [2] = start , [3] = end [4] = bitscore
orf = OrderedDict()
strainName = f.split('.')[0] # get strain name
for line in blastF:
if line.startswith('qseqid'): # skip header
continue
else:
line = line.rstrip()
tabs = line.split()
names = tabs[0].split('.') # split strain.ORF#
if names[1] in orf:
if float(tabs[11]) > orf[names[1]][4] : # is current bitscore > old bitscore
orf[names[1]][0] = tabs[1]
orf[names[1]][1] = float(tabs[2])
orf[names[1]][2] = int(tabs[8])
orf[names[1]][3] = int(tabs[9])
orf[names[1]][4] = float(tabs[11])
else: # if orf not a key, add it
bestHit = []
bestHit.append(tabs[1]) # chrom
bestHit.append(float(tabs[2])) # percent id
bestHit.append(int(tabs[8])) # start position
bestHit.append(int(tabs[9])) # end position
bestHit.append(float(tabs[11])) # bitsscore
orf[names[1]] = bestHit
extract( orf, strainName, assemblyDir, window )
#for k,v in orf.items():
# print(k,v)
def main():
"""
Main
"""
# Variables
assemblies = [] # list of assembly files to extract sequence from
blastFiles = [] # cds files to process, in curr working dir
cmdparser = argparse.ArgumentParser(description="Extract gene/cds sequence from Yeast Genome assemblies",
usage='%(prog)s -f <list_file> -d <directory> [-w size_bp]' ,prog='extractGene.py' )
cmdparser.add_argument('-f', '--file', action='store', dest='FILE',
help='List of gene/cds fasta files to process.' , metavar='')
cmdparser.add_argument('-d', '--dir' , action='store', dest='DIR' , help='Directory path containing assembly fasta files.', metavar='')
cmdparser.add_argument('-w', '--win' , action='store', dest='WIN' , help='Window size, optional default= 1000bp', metavar='')
cmdparser.add_argument('-i', '--info', action='store_true', dest='INFO', help='Detailed description of program.')
cmdResults = vars(cmdparser.parse_args())
# if no args print help
if len(sys.argv) == 1:
print("")
cmdparser.print_help()
sys.exit(1)
if cmdResults['INFO']:
print("\n extractBlastn.py -f list.txt -w window [ -i ]")
print("\n Purpose: Extract a gene plus & minus user defined window from a sequence assembly.")
print("\n Input : file w/ list of blastn table files, 1 per strain, path to assemblies, window size (default 1000 bp)")
print("\n Output : fasta files for each orf.")
print("\n Usage : extractGene.py -f list.txt -d /home/GLBRCORG/user/assemblyDir -w 200")
print(" ")
print("\tTo see Python Docs for this program:")
print("\n\tOpen python console and enter")
print("\timport sys")
print("\tsys.path.append('/full/path/to/script')")
print("\timport extractGene")
print("\thelp(extractGene)")
print("\n\tSee Mike Place for any problems or suggestions.")
sys.exit(1)
if cmdResults['FILE']:
inFile = cmdResults['FILE']
if not os.path.exists(inFile):
print("\n\t-f input file does not exist.\n")
cmdparser.print_help()
sys.exit(1)
else:
# open file containing a list of cds files to process.
with open( inFile, 'r') as filelist:
for blast in filelist:
blast = blast.rstrip()
blastFiles.append(blast)
if cmdResults['DIR']:
assemblyDir = cmdResults['DIR']
if not os.path.exists(assemblyDir):
print("\n\t-d assembly directory does not exist.\n")
cmdparser.print_help()
sys.exit(1)
else:
# Here we are assuming the .fai index file for reference exists
for f in os.listdir(assemblyDir):
if re.search(r'\.fa$',f):
assemblies.append(f)
if cmdResults['WIN']:
window = int(cmdResults['WIN'])
else:
window = 1000
# start the real work
processFiles(blastFiles, assemblyDir, window)
if __name__ == "__main__":
main()