-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNucleotideDivForPop.py
More file actions
84 lines (77 loc) · 2.68 KB
/
NucleotideDivForPop.py
File metadata and controls
84 lines (77 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
"""
Created on Wed Apr 17 09:10:45 2013
@author: nathanieldavidchu
Calculates nucleotide diversity for all populations.
Usage:
./NucleotideDivForPop.py -i <inputhaplotypefile> -h <inputhaplodepthsfile> -p <poplist> -o <output>
"""
def NucleotideDiv(radtagsfilename, haplotagsfilename, pop1):
""" (file, file, str) -> (float)
Return nucleotide diversity for a given population, using only RAD tags that have fewer than 4 SNPs
"""
import csv
import itertools
count = 0
markersused = 0
with open(radtagsfilename, "U") as f:
radtags = csv.reader(f, delimiter="\t")
radheader = radtags.next()
radindex1 = radheader.index(pop1)
snpindex = radheader.index("Num SNPs")
radpopA = []
snps = []
for i in radtags:
if int(i[snpindex]) < 4 and i[radindex1] != "":
radpopA.append(i[radindex1].split("/"))
snps.append(int(i[snpindex]))
with open(haplotagsfilename, "r+") as g:
haplotags = csv.reader(g, delimiter = "\t")
haploheader = haplotags.next()
haploindex1 = haploheader.index(pop1)
snpindex = radheader.index("Num SNPs")
happopA = []
for j in haplotags:
if int(j[snpindex]) < 4 and j[haploindex1] != "":
happopA.append(map(int, j[haploindex1].split("/")))
for k in range(len(radpopA)):
if len(radpopA[k]) == 1:
markersused += 1
else:
combA = sorted(zip(radpopA[k], happopA[k]))
radpopAset = set(radpopA[k])
if len(radpopAset) == len(radpopA[k]):
pass
else:
haplodepths = []
for l in radpopAset:
indicies = [m for m, x in enumerate(radpopA[k]) if x == l]
readcount = 0
for n in indicies:
readcount += happopA[k][n]
haplodepths.append(readcount)
combA = zip(radpopAset, haplodepths)
sumreadsA = sum([x[1] for x in combA])
Nucdiv = 0
for o in itertools.combinations(combA, 2):
pi = sum(1 for x, y in zip(o[0][0], o[1][0]) if x != y) / 42
Nucdiv += pi * (o[0][1]/sumreadsA) * (o[1][1]/sumreadsA)
count += Nucdiv
markersused += 1
return count/markersused
print "radtags used = ", markersused
print "Nucleodiversity =", count/markersused
def NucleotideDivAllPop(radtagsfilename, haplotagsfilename, poplist, output):
with open(output, "w+") as h:
for i in poplist:
h.write(i + " " + str(NucleotideDiv(radtagsfilename, haplotagsfilename, i)))
h.write("\n")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Calculate nucleotide diversity for all populations")
parser.add_argument('-i', '--inputhaplo')
parser.add_argument('-h', '--inputdepths')
parser.add_argument('-p', '--poplist')
parser.add_argument('-o', '--output')
args = parser.parse_args()
NucleotideDivAllPop(args.inputhaplo, args.inputdepths, args.poplist, args.output)