From 7b9d7d991b394c0188efe978445447efc3900b8f Mon Sep 17 00:00:00 2001
From: Soren Holm <sholm@prudence.campus.nd.edu>
Date: Mon, 23 Oct 2017 15:17:00 -0400
Subject: [PATCH 1/3] Replace names in the VCF header line

---
 challenge1.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 challenge1.py

diff --git a/challenge1.py b/challenge1.py
new file mode 100644
index 0000000..e403698
--- /dev/null
+++ b/challenge1.py
@@ -0,0 +1,23 @@
+import re
+
+inputFile = open("Cflorida.vcf","r")  # read line by line in the loop below
+outputFile = open("CfloridaCounts.txt","w")  # every input line is written here; only single-"#" lines are rewritten
+
+regexTX = re.compile(r"(CF|cf)\w*.[Aa]\w*.")
+regexFL = re.compile(r"(CF|cf).[Gg]\w*.")
+
+for ln in inputFile:
+    ln.strip()
+    if ln.startswith("##"):  # NOTE(review): the strip() result above is discarded, so ln keeps any trailing newline and each write below adds another "\n"
+        outputFile.write(ln + "\n")
+        continue
+    if ln.startswith("#"):  # a single-"#" line: substitute the text matched by the two patterns
+        ln = re.sub(regexFL,"Cf.Gai.",ln)  # regexFL matches become "Cf.Gai."; applied before regexTX
+        ln = re.sub(regexTX,"Cf.Sfa.",ln)
+        outputFile.write(ln + "\n")
+    else:
+        outputFile.write(ln + "\n")
+        
+inputFile.close()
+outputFile.close()
+        
\ No newline at end of file

From 6a014f3b6caac09247ee6b95aacf637979914cb0 Mon Sep 17 00:00:00 2001
From: zoeloh <31715186+zoeloh@users.noreply.github.com>
Date: Tue, 24 Oct 2017 21:01:46 -0400
Subject: [PATCH 2/3] Create challenge2.py

Finished challenge 2
---
 challenge2.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 challenge2.py

diff --git a/challenge2.py b/challenge2.py
new file mode 100644
index 0000000..782dd19
--- /dev/null
+++ b/challenge2.py
@@ -0,0 +1,45 @@
+import re
+
+# Lookup table built from indivIDs.txt: first whitespace-separated field -> second field.
+# Keys are later compared against the 8 bases captured in front of "AATTC".
+IDs = {}
+
+with open("indivIDs.txt","r") as IDFile:  # closed automatically, even on error
+    for ln in IDFile:
+        fields = ln.split()
+        if len(fields) < 2:
+            continue  # blank or one-column line: nothing to map
+        if fields[0] in IDs:
+            # keep the first mapping and report the repeat
+            print("Duplicate: " + fields[0])
+            continue
+        IDs[fields[0]] = fields[1]
+
+cutSites = []
+
+regex = re.compile(r"([ATCG]{8})AATTC") #gets the group of 8 bases just before the restriction site in a  group
+sequenceReg = re.compile(r"[A-Z]{10}")  #matches uppercase strings. I saw some had an N to begin with. I chose 10 because it is unlikely that the quality line has 10 uppercase in a row
+
+# Both handles are managed by "with" so the FASTA output is flushed and closed.
+with open("seqFastq.fq","r") as sequenceFile, open("IDseq.fasta","w") as outputFile:
+    for ln in sequenceFile:
+        if not sequenceReg.match(ln):
+            continue  # line does not start with 10 uppercase letters
+        match = regex.search(ln)
+        if not match:
+            continue  # no 8-base group followed by AATTC on this line
+        barcode = match.group(1)
+        if barcode not in IDs:
+            continue  # captured bases are not a known ID key
+        cutSites.append(match.start(1) + 8)  # index just past the 8 captured bases
+        outputFile.write('>' + IDs[barcode] + "\n")
+        # write the line with the 8 captured bases cut out (its own newline is kept)
+        outputFile.write(ln[:match.start(1)] + ln[match.end(1):])
+
+import pandas
+from plotnine import *
+data=pandas.DataFrame({"Cut Site": cutSites})
+
+cutHG=ggplot(data,aes(x="Cut Site"))
+# NOTE(review): bare expression - its value is discarded when run as a plain script; presumably meant for an interactive session - confirm.
+cutHG+geom_histogram()+theme_classic()

From 4c2b1f9c01bebe569753636fca01b37b46b005e1 Mon Sep 17 00:00:00 2001
From: Soren Holm <jhssoren@hotmail.com>
Date: Tue, 24 Oct 2017 21:04:17 -0400
Subject: [PATCH 3/3] Update challenge1.py

Challenge 1 complete
---
 challenge1.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/challenge1.py b/challenge1.py
index e403698..6a1ab17 100644
--- a/challenge1.py
+++ b/challenge1.py
@@ -5,6 +5,13 @@
 
 regexTX = re.compile(r"(CF|cf)\w*.[Aa]\w*.")
 regexFL = re.compile(r"(CF|cf).[Gg]\w*.")
+# Clean-up patterns applied, in this order, to every non-header line.
+delete1 = re.compile(r"0/0:")  # the literal text "0/0:"
+delete2 = re.compile(r":\d[\d:,]*")  # ":" + a digit + any run of digits, ":" and ","
+# NOTE(review): the dots below are unescaped, so each one matches ANY character;
+# presumably the literal token "./.:.:.:.:." was intended - confirm before escaping.
+delete3 = re.compile("./.:.:.:.:.")
 
 for ln in inputFile:
-    ln.strip()
+    # str.strip() returns a new string; rebind it so the "\n" added on write is the only newline
+    ln = ln.strip()
@@ -16,8 +23,11 @@
         ln = re.sub(regexTX,"Cf.Sfa.",ln)
         outputFile.write(ln + "\n")
     else:
+        ln = re.sub(delete1, "", ln)
+        ln = re.sub(delete2, "", ln)
+        ln = re.sub(delete3, "NA", ln)
         outputFile.write(ln + "\n")
         
 inputFile.close()
 outputFile.close()
-        
\ No newline at end of file
+