-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathparse_binary_get_enrich_table_binonly.py
More file actions
117 lines (108 loc) · 3.85 KB
/
parse_binary_get_enrich_table_binonly.py
File metadata and controls
117 lines (108 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import sys, os
# Command-line setup.
#   sys.argv[1]: tab-delimited binary matrix file. Its first line is a header
#                whose columns from the third onward are the feature names.
#   sys.argv[2]: label of the gene type treated as the positive class.
# The enrichment table is written next to the input file, with the suffix
# '_enrichment_table.txt'.
if len(sys.argv) < 3:
    # Validate before opening anything so a missing argument does not leave
    # an empty output file behind.
    sys.exit("usage: %s <binary_matrix_file> <positive_gene_type>" % sys.argv[0])
matrix_file = open(sys.argv[1], 'r')  # binary matrix file with all features and genetype comparisons: binary_matrix-domain_matrix.txt_comparisons.txt
output = open(sys.argv[1] + '_enrichment_table.txt', 'w')
gene_type = str(sys.argv[2])  # pos class
genetype_dict = {}  # gene type -> list of genes of that type
genecond_dict = {}  # gene -> list of per-feature values (as strings)
header_fields = matrix_file.readline().strip().split('\t')
feature_list = header_fields[2:]  # first two columns are gene and gene type
# get_genes() discards the first line it reads as the header. Rewind so that
# the line it discards is the real header and not the first data row, which
# was previously dropped from every count.
matrix_file.seek(0)
print("getting genes")
def get_genes(inp, D, D2, feature_list):
    """Fill the gene-type and gene-condition lookups from a matrix file.

    The first line read from *inp* is discarded as a header. Every following
    line is stripped and split on tabs into: gene, gene type, and zero or
    more per-feature values.

    Args:
        inp: iterable text stream that also supports ``readline()``.
        D: dict updated in place; maps gene type -> list of genes, one entry
            appended per row (a gene repeated in the file is appended again).
        D2: dict updated in place; maps gene -> list of feature value
            strings. Only the first row seen for a gene is kept.
        feature_list: unused; kept so existing callers keep working.

    Returns:
        None. The results are the mutations of *D* and *D2*.
    """
    inp.readline()  # skip the header line
    for line in inp:
        fields = line.strip().split('\t')
        if len(fields) < 2:
            # Blank or truncated line (e.g. a trailing newline at the end of
            # the file): there is no gene/type pair to record.
            continue
        gene, gen_type = fields[0], fields[1]
        # Write to the dictionaries that were passed in rather than to
        # module-level globals, so the function works for any caller.
        D.setdefault(gen_type, []).append(gene)
        D2.setdefault(gene, fields[2:])
# Load every gene row into the two lookups (gene type -> genes, gene -> values).
get_genes(matrix_file, genetype_dict, genecond_dict, feature_list)
gene_num = len(genecond_dict)

# For each feature, collect the genes whose value is 1 (pos) or 0 (neg).
# Values that are neither 0 nor 1 put the gene in neither list.
feature_dict_pos = {}
feature_dict_neg = {}
n = len(feature_list)
print("getting pos and neg features", n)
for i, raw_name in enumerate(feature_list):
    name = str(raw_name)
    for gene, feature_cond in genecond_dict.items():
        if not feature_cond:
            # A gene with no feature columns at all counts as negative
            # for every feature.
            feature_dict_neg.setdefault(name, []).append(gene)
            continue
        # NOTE: a row with fewer cells than there are features raises
        # IndexError here, and a non-numeric cell raises ValueError.
        value = float(feature_cond[i])
        if value == float(1):
            feature_dict_pos.setdefault(name, []).append(gene)
        elif value == float(0):
            feature_dict_neg.setdefault(name, []).append(gene)
## make table for enrichment
# One output row per feature that has at least one positive gene:
#   feature, count1, count2, count3, count4   (tab-separated)
#   count1: genes of the positive class that have the feature
#   count2: genes of the positive class that lack the feature
#   count3: genes of any other class that have the feature
#   count4: genes of any other class that lack the feature
print ("getting enrichment and writing")

# The positive-class gene set is the same for every feature, so build it
# once. A set keeps each membership test O(1) instead of scanning a list.
if gene_type not in genetype_dict:
    # Previously this case failed with a NameError on the first feature.
    print("warning: gene type %r not found in matrix file; "
          "positive-class counts will all be 0" % gene_type,
          file=sys.stderr)
pos_class_genes = set(genetype_dict.get(gene_type, ()))

try:
    for feature, gene_list_pos in feature_dict_pos.items():
        gene_list_neg = feature_dict_neg.get(feature, [])
        count1 = sum(1 for gene in gene_list_pos if gene in pos_class_genes)
        count2 = sum(1 for gene in gene_list_neg if gene in pos_class_genes)
        # Whatever is not in the positive class belongs to the other classes.
        count3 = len(gene_list_pos) - count1
        count4 = len(gene_list_neg) - count2
        output.write('%s\t%i\t%i\t%i\t%i\n' % (feature, count1, count2, count3, count4))
finally:
    # Close both handles even if writing fails part-way through.
    output.close()
    matrix_file.close()