From 7b9d7d991b394c0188efe978445447efc3900b8f Mon Sep 17 00:00:00 2001
From: Soren Holm
Date: Mon, 23 Oct 2017 15:17:00 -0400
Subject: [PATCH 1/3] Replaces in header

---
Review notes (v2):
- ln.strip() discarded its result, so every line kept its own "\n" and got a
  second one appended; the line ending is now removed with rstrip("\n").
- The file now ends with a newline instead of a whitespace-only last line.
- "index" blob ids are omitted because the file contents changed.

 challenge1.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 challenge1.py

diff --git a/challenge1.py b/challenge1.py
new file mode 100644
--- /dev/null
+++ b/challenge1.py
@@ -0,0 +1,28 @@
+"""Write a cleaned-up copy of Cflorida.vcf to CfloridaCounts.txt."""
+import re
+
+inputFile = open("Cflorida.vcf","r")
+outputFile = open("CfloridaCounts.txt","w")
+
+# NOTE(review): the "." in these patterns is unescaped, so it matches any one
+# character, not only a literal dot -- confirm that is intended.
+regexTX = re.compile(r"(CF|cf)\w*.[Aa]\w*.")
+regexFL = re.compile(r"(CF|cf).[Gg]\w*.")
+
+for ln in inputFile:
+    # Drop the line ending; every write below appends exactly one "\n".
+    ln = ln.rstrip("\n")
+    if ln.startswith("##"):
+        # "##" lines are copied through without substitutions.
+        outputFile.write(ln + "\n")
+        continue
+    if ln.startswith("#"):
+        # Single-"#" line: apply the regexFL rewrite first, then regexTX.
+        ln = re.sub(regexFL,"Cf.Gai.",ln)
+        ln = re.sub(regexTX,"Cf.Sfa.",ln)
+        outputFile.write(ln + "\n")
+    else:
+        outputFile.write(ln + "\n")
+
+inputFile.close()
+outputFile.close()

From 6a014f3b6caac09247ee6b95aacf637979914cb0 Mon Sep 17 00:00:00 2001
From: zoeloh <31715186+zoeloh@users.noreply.github.com>
Date: Tue, 24 Oct 2017 21:01:46 -0400
Subject: [PATCH 2/3] Create challenge2.py

Finished challenge 2
---
Review notes (v2):
- sequenceFile and outputFile were never closed; both are closed before the
  plotting code runs.
- A blank line in indivIDs.txt raised IndexError on fields[0]; it is skipped.
- Imports moved to the top of the file; "from plotnine import *" replaced by
  the four names the script uses.
- Commented-out debug code removed; nested conditionals flattened.
- "index" blob ids are omitted because the file contents changed.

 challenge2.py | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 challenge2.py

diff --git a/challenge2.py b/challenge2.py
new file mode 100644
--- /dev/null
+++ b/challenge2.py
@@ -0,0 +1,60 @@
+"""Relabel reads from seqFastq.fq and histogram their cut-site positions.
+
+indivIDs.txt supplies whitespace-separated (tag, ID) pairs.  For each sequence
+line where the 8 bases directly before "AATTC" are a listed tag, IDseq.fasta
+receives a ">ID" header line followed by the read with the tag removed.
+"""
+import re
+
+import pandas
+from plotnine import aes, geom_histogram, ggplot, theme_classic
+
+IDFile = open("indivIDs.txt","r")
+IDs = {}
+
+for ln in IDFile:
+    fields = ln.split()
+    if not fields:
+        # Blank line: skip it instead of failing on fields[0].
+        continue
+    if fields[0] in IDs:
+        print("Duplicate: " + fields[0])
+        continue
+    IDs[fields[0]] = fields[1]
+
+IDFile.close()
+
+sequenceFile = open("seqFastq.fq","r")
+outputFile = open("IDseq.fasta","w")
+
+cutSites = []
+
+# Captures the 8 bases immediately before the "AATTC" restriction site.
+regex = re.compile(r"([ATCG]{8})AATTC")
+# A sequence line is one that starts with 10 uppercase letters: some reads
+# begin with N, and a quality line is unlikely to have 10 uppercase in a row.
+sequenceReg = re.compile(r"[A-Z]{10}")
+
+for ln in sequenceFile:
+    if not re.match(sequenceReg, ln):
+        continue
+    match = re.search(regex, ln)
+    if not match or match.group(1) not in IDs:
+        continue
+    # Record the index just past the 8-base tag, where "AATTC" begins.
+    cutSites.append(match.start(1) + 8)
+    # Cut the tag out of the read before writing it.
+    ln = ln[:match.start(1)] + ln[match.end(1):]
+    outputFile.write('>' + IDs[match.group(1)] + "\n")
+    outputFile.write(ln)
+
+# Close both files so IDseq.fasta is flushed before plotting starts.
+sequenceFile.close()
+outputFile.close()
+
+data=pandas.DataFrame({"Cut Site": cutSites})
+
+# NOTE(review): the plot built on the last line is never shown or saved when
+# this runs as a plain script -- confirm how it should be output.
+cutHG=ggplot(data,aes(x="Cut Site"))
+cutHG+geom_histogram()+theme_classic()

From 4c2b1f9c01bebe569753636fca01b37b46b005e1 Mon Sep 17 00:00:00 2001
From: Soren Holm
Date: Tue, 24 Oct 2017 21:04:17 -0400
Subject: [PATCH 3/3] Update challenge1.py

Challenge 1 complete
---
Review notes (v2):
- The delete patterns are raw strings; ":\d[\d:,]*" in a plain string is an
  invalid escape sequence.
- delete3 escapes its dots so only the literal "./.:.:.:.:." entry becomes
  "NA"; unescaped, each "." matched any character.
- The end-of-file newline fix moved into patch 1, so this patch only adds lines.
- "index" blob ids are omitted because the file contents changed.

 challenge1.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/challenge1.py b/challenge1.py
--- a/challenge1.py
+++ b/challenge1.py
@@ -8,6 +8,10 @@
 # character, not only a literal dot -- confirm that is intended.
 regexTX = re.compile(r"(CF|cf)\w*.[Aa]\w*.")
 regexFL = re.compile(r"(CF|cf).[Gg]\w*.")
+# Clean-up patterns for the data lines (used in the loop's else branch).
+delete1 = re.compile(r"0/0:")
+delete2 = re.compile(r":\d[\d:,]*")
+delete3 = re.compile(r"\./\.:\.:\.:\.:\.")
 
 for ln in inputFile:
     # Drop the line ending; every write below appends exactly one "\n".
@@ -22,6 +26,11 @@
         ln = re.sub(regexTX,"Cf.Sfa.",ln)
         outputFile.write(ln + "\n")
     else:
+        # Data line: drop "0/0:" and each ":<digits...>" run, then replace
+        # the literal "./.:.:.:.:." entry with "NA".
+        ln = re.sub(delete1, "", ln)
+        ln = re.sub(delete2, "", ln)
+        ln = re.sub(delete3, "NA", ln)
         outputFile.write(ln + "\n")
 
 inputFile.close()