lyy005 · mcorley1 · Oct 25, 2017 · Oct 26, 2017 · Oct 26, 2017 · Oct 26, 2017
diff --git a/CfloridaCounts.txt b/CfloridaCounts.txt
diff --git a/CfloridaCounts_2ndhalf.txt b/CfloridaCounts_2ndhalf.txt
diff --git a/Challenge2_Ex8_inprogress b/Challenge2_Ex8_inprogress
@@ -0,0 +1,66 @@
+# Work-in-progress outline: reads seqFastq.fq into a dictionary; the fastq loop is unfinished.
+# Import packages:
+import numpy
+import os
+import pandas
+import re
+from plotnine import *
+
+# Navigation to directory (the listdir result is not used):
+
+os.listdir('.')
+os.chdir('/Users/winghomitchell/Intro_Biocom_ND_319_Tutorial8')
+
+# Open files to read and write:
+
+IDtxt = open("indivIDs.txt","r")
+seqFastq = open("seqFastq.fq","r")
+# NOTE(review): the next line opens the same seqFastq.fq in "w" mode, which empties the file being read -- presumably a different output name was meant; confirm.
+IDseqFasta = open("seqFastq.fq","w")
+
+# Initialize empty dictionary (no lists are created yet):
+
+dictionary = {}
+# Store each seqFastq line's second whitespace-separated field under its first; stop at the first repeated key.
+for line in seqFastq:
+    line = line.strip()
+    cols = line.split()
+    # A line with fewer than two fields raises IndexError in the lookups below.
+    if cols[0] in dictionary:
+        print("Duplicate: " + cols[1])  # prints the second field; the repeated key is cols[0]
+        break
+    else:
+        dictionary[cols[0]] = cols[1]
+
+# Make dictionary of IDs (not written yet; NOTE(review): the loop above reads seqFastq, not IDtxt -- confirm which was meant):
+
+# Close ID file:
+IDtxt.close()
+
+# Assign RegEx to variable name, or compile to variable name:
+
+# RegEx = (AATTC)
+
+# While loop to process fastq lines (outline only; the bare if:/else: lines below are not valid Python yet):
+
+while line != "":    # line is whatever the for loop above last assigned
+    if: line = # condition and assignment still to be written
+        if:
+            if:
+
+            else:
+    else:
+        print("Error")
+        break
+
+# Graph histograms of good and bad start positions (not implemented yet):
+
+
+# Close other file (NOTE(review): IDseqFasta is opened above but never closed):
+seqFastq.close()
+
+
+
+
+
+
diff --git a/EX_8_Script_2ndhalf_part1 b/EX_8_Script_2ndhalf_part1
@@ -0,0 +1,38 @@
+# Rewrite Cflorida.vcf as CfloridaCounts_2ndhalf.txt: every field from the fifth
+# column on becomes its first "digit,digit" pair, or NA when it has none.
+import numpy
+import os
+import re
+from plotnine import *
+import pandas
+
+os.listdir('.')
+# NOTE(review): user-specific absolute path; the script fails anywhere it is missing.
+os.chdir('/Users/sampathkumarbalaji/EX_8/Intro_Biocom_ND_319_Tutorial8')
+
+pattern = '[0-9],[0-9]'
+# Compiled once up front; the loop below searches with this object.
+regex = re.compile(pattern)
+
+# The with statement closes both files on exit, so no close() calls follow it.
+with open("Cflorida.vcf", "r") as infile, open("CfloridaCounts_2ndhalf.txt", "w") as outfile:
+    for line in infile:
+        cols = line.strip().split()
+        # NOTE(review): every line is treated alike, so any '#' header lines are
+        # rewritten from the fifth column on too -- confirm that is intended.
+        for index in range(4, len(cols)):
+            # search() returns the first occurrence, the only one that was ever kept.
+            match = regex.search(cols[index])
+            cols[index] = match.group() if match else 'NA'
+        # No per-line debug print of match here: that name stays unbound until a
+        # line with more than four columns is seen, so printing it raised NameError.
+        outfile.write('\t'.join(cols) + '\n')
+
+
+
+
+
+
+
+
+
diff --git a/EX_8_Script_2ndhalf_part1_rev2 b/EX_8_Script_2ndhalf_part1_rev2
@@ -0,0 +1,36 @@
+# Reduce each field from the fifth column onward in Cflorida.vcf to its first
+# "digit,digit" pair (NA when absent) and save the result to CfloridaCounts_2ndhalf.txt.
+import numpy
+import os
+import re
+from plotnine import *
+import pandas
+
+os.listdir('.')
+# NOTE(review): hard-coded, user-specific working directory.
+os.chdir('/Users/sampathkumarbalaji/EX_8/Intro_Biocom_ND_319_Tutorial8')
+
+pattern = '[0-9],[0-9]'
+# setting up the regex format and compiling it; the loop below uses the compiled object
+regex = re.compile(pattern)
+# parsing input file Cflorida.vcf and writing onto the outfile CfloridaCounts_2ndhalf.txt;
+# leaving the with block closes both files, so they are not closed again afterwards
+with open("Cflorida.vcf", "r") as infile, open("CfloridaCounts_2ndhalf.txt", "w") as outfile:
+    for line in infile:
+        # splitting the stripped line into whitespace-separated columns to process further
+        cols = line.strip().split()
+        for index in range(4, len(cols)):
+            # look for the first text matching the pattern, the only match that is kept
+            match = regex.search(cols[index])
+            # if found, the field becomes the matched text; otherwise it becomes NA
+            cols[index] = match.group() if match else 'NA'
+        # the columns are rejoined with tabs and written onto the output file
+        outfile.write('\t'.join(cols) + '\n')
+
+
+
+
+
+
+
+
diff --git a/part1script.py b/part1script.py
@@ -0,0 +1,33 @@
+#Exercise 8, Question 1
+#10/13/17
+#Copy Cflorida.vcf to CfloridaCounts.txt: sample names standardized in the column
+#heading line, SNP fields reduced to allele counts, missing fields marked N/A
+import re
+
+#regex variables (raw strings, so \d and \. are passed to re untouched instead of
+#being read as invalid string escapes)
+TXsamples = r"([Cc][Ff](\d{2})?\.[Aa]\d?)"
+FLsamples = r"([Cc][Ff]\.[Gg](2|AI|ai))"
+allelect = r"\d/\d:(\d,\d):\d:\d+:\d+,\d+,\d+"
+missing = r"\./\.:\.:\.:\.:\."
+
+#Open files to read and write; the with statement closes both even if a line fails
+with open("Cflorida.vcf","r") as vcffile, open("CfloridaCounts.txt","w") as outfile:
+    for Line in vcffile:
+        Line=Line.strip()
+        if Line.startswith('##'):
+            #write unchanged header line to file
+            outfile.write(Line + "\n")
+        elif Line.startswith('#'):
+            #line with the column headings: it starts with a single '#', so a '#'
+            #further along a data line is no longer mistaken for it
+            TXreplaced = re.sub(TXsamples,"Cf.Sfa",Line)
+            FLreplaced = re.sub(FLsamples,"Cf.Gai",TXreplaced) #standardize sample names
+            outfile.write(FLreplaced + "\n")
+        else:
+            #now you're in the data
+            #replace full SNP info with the captured allele counts only
+            selAllele = re.sub(allelect, r"\1", Line)
+            #NOTE(review): the literal is "N/A" although it was described as NA -- confirm
+            missingData = re.sub(missing, "N/A", selAllele)
+            outfile.write(missingData +"\n")