lyy005 · zoeloh · Oct 23, 2017 · Oct 25, 2017 · Oct 25, 2017 · lyy005
diff --git a/challenge1.py b/challenge1.py
@@ -0,0 +1,29 @@
+import re
+
+inputFile = open("Cflorida.vcf","r")
+outputFile = open("CfloridaCounts.txt","w")
+
+regexTX = re.compile(r"(CF|cf)\w*.[Aa]\w*.")
+regexFL = re.compile(r"(CF|cf).[Gg]\w*.")
+delete1 = re.compile("0/0:")
+delete2 = re.compile(":\d[\d:,]*")
+delete3 = re.compile("./.:.:.:.:.")
+
+for ln in inputFile:
+    ln.strip()
+    if ln.startswith("##"):
+        outputFile.write(ln + "\n")
+        continue
+    if ln.startswith("#"):
+        ln = re.sub(regexFL,"Cf.Gai.",ln)
+        ln = re.sub(regexTX,"Cf.Sfa.",ln)
+        outputFile.write(ln + "\n")
+    else:
+        ln = re.sub(delete1, "", ln)
+        ln = re.sub(delete2, "", ln)
+        ln = re.sub(delete3, "NA", ln)
+        outputFile.write(ln + "\n")
+
+inputFile.close()
+outputFile.close()
+
diff --git a/challenge2.py b/challenge2.py
@@ -0,0 +1,47 @@
+import re
+
+IDFile = open("indivIDs.txt","r")
+IDs = {}
+
+for ln in IDFile:
+    ln = ln.strip()
+    fields = ln.split()
+    if fields[0] in IDs:
+        print("Duplicate: " + fields[0])
+        continue
+    else:
+        IDs[fields[0]] = fields[1]
+
+IDFile.close()
+
+sequenceFile = open("seqFastq.fq","r")
+outputFile = open("IDseq.fasta","w")
+
+cutSites = []
+
+regex = re.compile(r"([ATCG]{8})AATTC") #gets the group of 8 bases just before the restriction site in a  group
+sequenceReg = re.compile(r"[A-Z]{10}")  #matches uppercase strings. I saw some had an N to begin with. I chose 10 because it is unlikely that the quality line has 10 uppercase in a row
+
+for ln in sequenceFile:
+    if re.match(sequenceReg, ln):
+        match = re.search(regex, ln)
+        if not match:
+            #print("No match on the restriction site in sequence: " + ln)
+            continue
+        if match.group(1) in IDs:
+            cutSites.append(match.start(1) + 8)
+            ln = ln[:match.start(1)] + ln[match.end(1):] #cuts out the part matched by regex from string
+
+            outputFile.write('>' + IDs[match.group(1)] + "\n")
+            outputFile.write(ln)
+        #else:
+            #print("no match found on : " + match.group(1))
+            #outputFile.write("> No match on: " + match.group(1) + '\n')
+            #outputFile.write(ln)
+
+import pandas
+from plotnine import *
+data=pandas.DataFrame({"Cut Site": cutSites})
+
+cutHG=ggplot(data,aes(x="Cut Site"))
+cutHG+geom_histogram()+theme_classic()