-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathPathway_gene_coexpression.py
More file actions
144 lines (130 loc) · 4.78 KB
/
Pathway_gene_coexpression.py
File metadata and controls
144 lines (130 loc) · 4.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#Pathway_coexpression.py
# calculate all pairwise PCCs for all pathways
# module load SciPy
import random
import sys, os
import os.path
from scipy.stats import pearsonr
# module load scipy ! before running this code
# for 3 pathways it took 1min to run this script
from scipy.stats import scoreatpercentile
from numpy import median
from itertools import combinations
print('''
inp1 = Expression dataset, rows are genes and columns are samples, with header (File)
inp2 = Percentile to use for random background, Measure:PCC (Integer:95 for 95th percentile)
inp3 = Pathway annotation first column pathway, second column gene with header(File)
inp4 = Directory to output files (Path)
inp5 = Number of gene pairs to pick randomly for background distribution (Integer: 500000)
''')
expression = sys.argv[1] #"/mnt/home/uygunsah/projects/1_Expression_Database/expression_matrix/5_Stress_FC"
perc = int(sys.argv[2]) #95
aracyc = sys.argv[3] #aracyc, go, or manual annotation
direc = sys.argv[4] #"/mnt/home/uygunsah/projects/4_Plastid/_June2016_resubmission/PCC_distr/"
pairs = int(sys.argv[5])
#all pairwise-gene pccs
def calculate_pairwise_PCC(list_of_genes):
pcc_path_list = []
for combo in combinations(list_of_genes, 2):
gen1=list(combo)[0]
gen2=list(combo)[1]
gen1exp = dict_e[gen1]
gen2exp = dict_e[gen2]
pcc_path = pearsonr(gen1exp, gen2exp)[0]
pcc_path_list.append(pcc_path)
return pcc_path_list
#ec function
def calculate_EC(pcc_list, threshold, ngenes):
count = 0
for p2 in pcc_list:
if p2 > threshold:
count = count + 1
ec2 = float(count)/float((ngenes*(ngenes-1)/2))
return ec2
# expression dictionary
expression_open = open(expression, "r")
line1 = expression_open.readline()
line1 = expression_open.readline()
dict_e = {}
gen_list = []
while line1:
info = line1.strip().split()
gen_list.append(info[0])
dict_e[info[0]] = [float(i) for i in info[1:]]
line1 = expression_open.readline()
# get random pcc distribution and print to file, also print pcc95
oup_random = open("random_pairs_PCC_%s" %pairs, "w")
pcc_list =[]
for i in range(0, pairs):
a = random.choice(gen_list)
b = random.choice(gen_list)
x = dict_e[a]
y = dict_e[b]
pcc = pearsonr(x, y)[0]
oup_random.write("%f\n" %pcc)
pcc_list.append(pcc)
pcc95 = scoreatpercentile(pcc_list, perc)
#pcc95 = 0.404774
threshold2 = pcc95
# pathway dictionary
aracyc_op = open(aracyc, "r")
line = aracyc_op.readline()
line = aracyc_op.readline()
dict = {}
while line:
info = line.strip().split("\t")
pathway = info[0]
gene = info[1]
if pathway not in dict:
dict[pathway] = [gene]
else:
if gene not in dict[pathway]:
dict[pathway].append(gene)
line = aracyc_op.readline()
# pathway PCCs and EC calculations
output = open("EC_pathways", "w")
all_pathway_genes = []
for bio in dict:
pcc_path_list = []
new_list = []
genez = dict[bio] # genes in pathway
for el in genez:
if el in dict_e: # check if genes in pathway is present in expression dataset
new_list.append(el)
if el not in all_pathway_genes:
all_pathway_genes.append(el)
leng_genez = float(len(new_list))
if leng_genez > 2: # only consider pathways with more than 2 genes
if os.path.isfile(direc+"PCC_distr_%s" % bio): #if the file already present, pass
print bio
pass
else:
pcc_path_list = calculate_pairwise_PCC(new_list)
#print bio, "writing"
out = open("PCC_distr_%s" % bio, "w")
#out.write("%s\n%s\n" % (bio, str(med))) # name of pathway and median PCC as header
for num in pcc_path_list:
out.write("%s\n" % str(num))
out.close()
ec = calculate_EC(pcc_path_list, threshold2,len(new_list))
output.write("%s\t%f\n" % (bio, ec))
output.close()
# random ECs, randomly form pathways 50 times for pathway size of 3-100 genes
output = open("EC_random", "w")
ec2 = []
for a in range(1, 50):
for sayi in range(3,100):
rand_gen_list=[]
pcc_random = []
count2_random = 0
for fcv in range(1, sayi+1):
rand1 = random.choice(gen_list)
if rand1 in rand_gen_list:
rand1 = random.choice(gen_list)
rand_gen_list.append(rand1)
else:
rand_gen_list.append(rand1)
pcc_random = calculate_pairwise_PCC(rand_gen_list)
ec_random = calculate_EC(pcc_random, threshold2,len(rand_gen_list))
output.write("%i\t%f\n" % (sayi, ec_random))
output.close()