-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblast.py
More file actions
156 lines (106 loc) · 4.65 KB
/
Copy pathblast.py
File metadata and controls
156 lines (106 loc) · 4.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/python3
#
# Description:
# - Wrapper script for running BLAST through Python
# - Automatically detects which BLAST program to use (BLASTN, BLASTP, TBLASTN, BLASTX)
# - Specify query, db, and output directory with other BLAST options supported
# - Outputs params file and TSV of BLAST hits
#
# Requirements
# - The BLAST suite must be in your $PATH variable (in this script, BLAST is loaded as a module)
#
import os
import sys
import argparse
from Bio.Blast.Applications import *
from Bio import SeqIO
from datetime import datetime
def main(arguments):
parser = argparse.ArgumentParser(description = __doc__, formatter_class = argparse.RawDescriptionHelpFormatter)
# Required parameters
parser.add_argument("-q", "--query", help="Query file", required=True)
parser.add_argument("-d", "--db", help="Query file", required=True)
parser.add_argument("-o", "--out", help="Output directory", required=True)
# Optional parameters
parser.add_argument("-e", "--evalue", help="E-value threshold (default: 10)", default="10.0")
parser.add_argument("-t", "--threads", help="Threads (default: 1)", type=int, default=1)
parser.add_argument("--max_hsps", help="Max HSPs (default: 1)", type=int, default=1)
parser.add_argument("--max_targets", help="Max targets (default: 15)", type=int, default=15)
parser.add_argument("--min_hsp_cov", help="Min HSP coverage [0-100] (default: 0.0)", type=float, default=0.0)
parser.add_argument("--query_is_prot", help="Query is protein", action="store_true")
parser.add_argument("--query_is_nucl", help="Query is nucleotide", action="store_true")
args = parser.parse_args(arguments) # Get args as args.name
### Generate output file names and sets file paths as absolute ###
query_basename = os.path.basename(args.query)
db_basename = os.path.basename(args.db)
out_prefix = "%s__%s__%s" % (query_basename, db_basename, args.evalue)
out_tsv = args.out + "/" + out_prefix + ".tsv"
out_params = args.out + "/" + out_prefix + ".params"
if not os.path.isabs(args.query):
args.query = os.path.abspath(os.getcwd()) + "/" + args.query
if not os.path.isabs(args.db):
args.db = os.path.abspath(os.getcwd()) + "/" + args.db
### Determine the query type ###
query_type = ""
if args.query_is_nucl:
query_type = "N"
elif args.query_is_prot:
query_type = "P"
else: # We infer from the first 5000 characters of the sequence
query_seqs = ""
for record in SeqIO.parse(args.query, "fasta"):
query_seqs += str(record.seq).upper()
if len(query_seqs) > 5000:
break
# Check if any of the following amino acids are present in the query sequences
# It's possible that you could have a protein query without these amino acids and the check would fail
# If that happens, run with --query_is_prot
amino_acids = ["R", "D", "Q", "E", "H", "I", "L", "K", "M", "F", "P", "S", "W", "Y", "V"]
if any([(aa in query_seqs) for aa in amino_acids]):
query_type = "P"
else:
query_type = "N"
### Determine database type ###
db_type = ""
if os.path.isfile(args.db + ".nin"):
db_type = "N"
elif os.path.isfile(args.db + ".pin"):
db_type = "P"
else:
print("Error: Cannot find BLAST db for %s" % args.db)
exit()
### Determine which BLASTP program to use ###
os.system("module load blast")
blast = None
if query_type == "N" and db_type == "N":
blast = NcbiblastnCommandline
elif query_type == "P" and db_type == "P":
blast = NcbiblastpCommandline
elif query_type == "N" and db_type == "P":
blast = NcbiblastxCommandline
elif query_type == "P" and db_type == "N":
blast = NcbitblastnCommandline
### Run BLAST ###
outfmt = ["qseqid", "sacc", "stitle", "qstart", "qend", "sstart", "send", "qlen", "slen", "length", "mismatch", "pident", "qcovs", "qcovhsp", "score", "bitscore", "evalue"]
if blast is not None:
if not os.path.isdir(args.out):
os.mkdir(args.out)
### Create params log ###
vars_args = vars(args)
log = ["Date\t%s" % datetime.now()]
log += [ "%s\t%s" % (name, vars_args[name]) for name in vars_args]
log += ["\nColumn information"]
log += ["%s\t%s" % (i, outfmt[i]) for i in range(len(outfmt))]
print("\n****************\n")
print("Running BLAST\n")
print("\n".join(log) + "\n")
print("\n****************\n")
with open(out_params, "w") as f:
f.writelines("\n".join(log) + "\n")
### Run BLAST ###
cmd = blast(query = args.query, db = args.db, evalue = args.evalue, outfmt = "6 " + " ".join(outfmt), out = out_tsv, max_target_seqs = args.max_targets, max_hsps = args.max_hsps, qcov_hsp_perc = args.min_hsp_cov, num_threads=args.threads)
stdout,stderr = cmd()
else:
print("Unable to determine correct BLAST version")
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))