forked from brettbarbaro/csvcomplete
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsvcomplete.py
More file actions
140 lines (111 loc) · 4.32 KB
/
csvcomplete.py
File metadata and controls
140 lines (111 loc) · 4.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 2 10:29:18 2016
@author: Brett Barbaro
"""
# -*- coding: utf-8 -*-
import csv # for dealing with .csv files
import urllib # for getting stuff off the web
import os
from HTMLParser import HTMLParser
from Bio import Entrez
Entrez.email = "X@Y.com"
# GOAL: input a csv file containing accession numbers
# output: csv file with accession #s and sequences
print("hello")
# import random # only for color assignments
# import math
# import numpy
# import json

# Location of the input csv file - Brett
csvpath = "/Users/mac/Documents/OLSON/Programs/csvcomplete/syn1_0_cut.csv"

# Split the path into its directory (head) and file name (tail).
head, tail = os.path.split(csvpath)
model_dir = head + '/'

# os.path.splitext splits on the last dot only, so file names with several
# dots (or none at all) no longer raise ValueError as tail.split('.') did.
csvname, ext = os.path.splitext(tail)
ext = ext[1:]  # drop the leading dot so ext keeps its previous form, e.g. 'csv'

pdbpath = model_dir + 'PDB' + os.sep

# Rows of the csv file; filled in later, header row first.
all_data = []
class MyHTMLParser(HTMLParser):
    """HTML parser that collects attribute values or text from one kind of tag.

    Configure and run a search with print_p_contents(); the results are then
    available in self.stored.

    Attributes set by a search:
        lookforTag:  name of the tag to watch for.
        lookforData: text a watched tag must contain for its attributes to be
                     stored, or None to disable the text comparison.
        lookforAttr: list of attribute names whose values are collected, or
                     None to take the value of the tag's first attribute.
        gatherData:  when lookforData is None, store the tag's text instead
                     of its attributes.
        tag_stack:   True while the parser is inside a watched tag.
        tag_attr:    attribute value(s) captured from the current watched tag.
        stored:      results accumulated during the search.
        stored_attr: initialised to an empty list; not otherwise used here.
    """

    def __init__(self, *args, **kwargs):
        # Call the base initializer explicitly rather than through super():
        # this file targets Python 2, where HTMLParser is an old-style class.
        HTMLParser.__init__(self, *args, **kwargs)
        # Give every search attribute a default so that the handlers also
        # work when feed() is called without print_p_contents() first.
        self.lookforTag = "a"
        self.lookforData = "[Show]"
        self.lookforAttr = None
        self.gatherData = False
        self.tag_stack = False
        self.tag_attr = []
        self.stored = []
        self.stored_attr = []

    def print_p_contents(self, html, lookforTag="a", lookforData="[Show]",
                         lookforAttr=None, gatherData=False):
        """Configure a search, reset earlier results and parse html.

        Despite its name this method prints nothing; read the results from
        self.stored afterwards.
        """
        self.lookforTag = lookforTag
        self.lookforData = lookforData
        self.lookforAttr = lookforAttr
        self.gatherData = gatherData
        self.tag_stack = False
        self.tag_attr = []
        self.stored = []
        self.stored_attr = []
        self.feed(html)

    def handle_starttag(self, tag, attrs):
        """Record the attributes of a watched tag when it opens."""
        if tag != self.lookforTag:
            return
        self.tag_stack = True
        self.tag_attr = []
        if not attrs:
            return
        if self.lookforAttr is not None:
            # Collect the values of the requested attributes, in the order
            # the names appear in lookforAttr.
            for wanted in self.lookforAttr:
                for attr in attrs:
                    if attr[0] == wanted:
                        self.tag_attr.append(attr[1])
        else:
            # No attribute names requested: keep the first attribute's value
            # (None when the attribute was written without a value).
            self.tag_attr = attrs[0][1]

    def handle_endtag(self, tag):
        """Leave the watched tag; any closing tag ends the capture."""
        self.tag_stack = False

    def handle_data(self, data):
        """Store text or captured attributes for data inside a watched tag."""
        if not self.tag_stack:
            return
        if self.lookforData is None:
            if self.gatherData:
                self.stored.append(data)
            elif self.tag_attr:
                # Truthiness instead of len(): tag_attr may be None when the
                # first attribute had no value, and len(None) raises TypeError.
                self.stored.append(self.tag_attr)
        elif data == self.lookforData:
            self.stored.append(self.tag_attr)
# Load every row of the input csv into all_data (header row first).
# The file is opened in Universal-newline mode so it can read Mac Excel output .csv
with open(csvpath, 'rU') as csvfile:
    all_data.extend(csv.reader(csvfile))

# This establishes a dictionary with the header names in it, mapping each
# name to its column index.
# After this, columns can be indicated with e.g. "name = all_data[x][headers['NAME']]".
# The headers must be correctly labeled.
headers = {'test': 'headers test works'}
if all_data:  # an empty input file has no header row to index
    headers.update((name, index) for index, name in enumerate(all_data[0]))
def acc2seq(accession):
    """Fetch a protein record from Entrez and return (header, sequence).

    accession: accession number passed to Entrez.efetch as the record id.

    Returns a tuple of two strings: the FASTA definition line (the first
    line of the response, complete with its leading '>') and the sequence
    with all line breaks removed.

    Raises ValueError when the response is empty.
    """
    handle = Entrez.efetch(db="protein", id=accession, rettype="fasta")
    try:
        response = handle.read()
    finally:
        handle.close()  # release the connection even if reading fails

    # In FASTA the definition line is the first line and the sequence is
    # everything after it. Splitting on lines instead of on ']' keeps
    # working when the definition line contains no ']' or several of them.
    lines = response.splitlines()
    if not lines:
        raise ValueError("empty FASTA response for accession %r" % (accession,))
    header = lines[0]
    sequence = ''.join(lines[1:])
    return header, sequence
# Can also be used instead of Entrez package:
# urllibrary = 'http://www.ncbi.nlm.nih.gov/protein/ADH21625.1?report=fasta&log$=seqview&format=Excel#'
# for row in range(1):
# response = urllib.urlopen(urllibrary)
# print response.read()
#
# Look up the sequence for every data row (the header row is skipped) and
# store it in that row's SEQUENCE column.
for record in all_data[1:]:
    accession = record[headers['ACCESSION']]
    header, sequence = acc2seq(accession)
    record[headers['SEQUENCE']] = sequence

# Write all_data, including the header row, to "<csvname>_complete.csv"
# in the same directory as the input file.
output_path = model_dir + csvname + '_complete.csv'
with open(output_path, 'wb') as csvfile:
    csv.writer(csvfile).writerows(all_data)
print("done")