-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfindDiffFasta.py
More file actions
executable file
·101 lines (80 loc) · 3.03 KB
/
findDiffFasta.py
File metadata and controls
executable file
·101 lines (80 loc) · 3.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/home/mplace/anaconda3/bin/python
"""
Name: findDiffFasta.py
Purpose: compare the sequences of 2 fasta files, output sequences in the first
file but not in the second
Input: 2 fasta files
Output: single fasta file, sequences in first but not the second
Author: Mike Place
Date: 1/29/2015
"""
from Bio import SeqIO
import argparse
import textwrap
def main():
"""
Start Here
"""
# handle command line arguments
cmdparser = argparse.ArgumentParser(description="Compare 2 fasta files & output seq in first not in second file", prog='compareFasta.py' )
cmdparser.add_argument('-f1','--file1', action='store', required='true',
dest='FILE1', help='REQUIRED, fasta file (.fasta, .fa , .fsa)')
cmdparser.add_argument('-f2','--file2', action='store', required='true',
dest='FILE2', help='REQUIRED, fasta file (.fasta, .fa , .fsa)')
cmdparser.add_argument('-o', '--output', action='store', required='false', dest='OUT', help='Diff.fasta')
cmdResults = vars(cmdparser.parse_args())
firSeq = {}
firName = {}
secSeq = {}
secName = {}
result = {}
if cmdResults['FILE1'] is not None:
f1 = cmdResults['FILE1']
f2 = cmdResults['FILE2']
fname = f1.split('.')
sname = f2.split('.')
if cmdResults['OUT'] is not None:
outFile = cmdResults['OUT']
else:
outFile = "Unique.fasta"
# parse the first fasta file, get unique set
for seq1 in SeqIO.parse(f1,"fasta"):
tseq = str(seq1.seq)
name = str(seq1.id)
if name in firName:
firName[name] = firName[name] + 1
else:
firName[name] = 1
if tseq in firSeq: # if sequence is found, concatenate gene name
firSeq[tseq] = firSeq[tseq] + "|" + seq1.id + "_" + fname[0]
else:
firSeq[tseq] = seq1.id + "_" + fname[0]
# parse the second fasta file, get unique set
for seq2 in SeqIO.parse(f2,"fasta"):
nseq = str(seq2.seq)
name = str(seq2.id)
if name in secName:
secName[name] = secName[name] + 1
else:
secName[name] = 1
if nseq in secSeq:
secSeq[nseq] = secSeq[nseq] + "|" + seq2.id + "_" + sname[0]
else:
secSeq[nseq] = seq2.id + "_" + sname[0]
#Search secSeq dict with firSeq keys
for k,v in firSeq.items():
if k in secSeq:
continue #result[k] = v + "|" + secSeq[k]
else:
result[k] = v
# print output in fasta format
fileHandle = open(outFile, 'w')
for k,v in result.items():
fileHandle.write(">%s\n" %(v))
#print (">%s" %(v))
data = (textwrap.wrap(k,width=80) )
#[ print(i) for i in data ]
for i in data:
fileHandle.write("%s\n" %(i))
if __name__ == "__main__":
main()