Skip to content
1,001 changes: 1,001 additions & 0 deletions CfloridaCounts.txt

Large diffs are not rendered by default.

1,001 changes: 1,001 additions & 0 deletions CfloridaCounts_2ndhalf.txt

Large diffs are not rendered by default.

66 changes: 66 additions & 0 deletions Challenge2_Ex8_inprogress
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Import packages:

import numpy
import os
import pandas
import re
from plotnine import *

# Navigation to directory:
# NOTE(review): this script is an unfinished draft. The while-loop skeleton
# further down is not valid Python yet, so the file cannot run as-is.

# The listing returned by os.listdir is discarded; the call has no effect here.
os.listdir('.')
# Hard-coded absolute path: every relative file name below resolves against it.
os.chdir('/Users/winghomitchell/Intro_Biocom_ND_319_Tutorial8')

# Open files to read and write:

IDtxt = open("indivIDs.txt","r")
seqFastq = open("seqFastq.fq","r")

# NOTE(review): this opens the SAME path as seqFastq above, in "w" mode, which
# truncates the file on open -- the FASTQ input is emptied before it is read.
# Presumably a separate output (FASTA?) file name was intended -- TODO confirm.
IDseqFasta = open("seqFastq.fq","w")

# Initialize empty dictionary and lists:

dictionary = {}

# Builds a mapping from the first whitespace-separated field of each line to
# the second, stopping at the first repeated key.
# NOTE(review): the loop reads seqFastq, not IDtxt; IDtxt is closed below
# without ever being read. Presumably the ID file was meant here -- TODO confirm.
# NOTE(review): a blank line or a line with a single field raises IndexError
# on cols[0] / cols[1].
for line in seqFastq:
line = line.strip()
cols = line.split()

if cols[0] in dictionary:
print("Duplicate: " + cols[1])
break
else:
dictionary[cols[0]] = cols[1]

# Make dictionary of IDs:

# Close ID file:
IDtxt.close()

# Assign RegEx to variable name, or compile to variable name:

# RegEx = (AATTC)

# While loop to process fastq lines:

# NOTE(review): placeholder only. "if:" without a condition and "line =" without
# a value are syntax errors; the branches still have to be written.
# NOTE(review): nothing visible in this skeleton reads a new line, so `line`
# presumably still holds the last value from the for loop above -- TODO confirm
# how the next FASTQ record is meant to be fetched.
while line != "":
if: line =
if:
if:

else:
else:
print("Error")
break

# Graph histograms of good and bad start positions:


# Close other file:
# NOTE(review): IDseqFasta (opened for writing above) is never closed.
seqFastq.close()






38 changes: 38 additions & 0 deletions EX_8_Script_2ndhalf_part1
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy
import os
import re
from plotnine import *
os.listdir('.')
os.chdir('/Users/sampathkumarbalaji/EX_8/Intro_Biocom_ND_319_Tutorial8')
import pandas

# Allele-count pair such as "3,12". One-or-more digits on each side so that
# counts of 10 and above are kept whole instead of being cut to one digit.
pattern = r'[0-9]+,[0-9]+'
regex = re.compile(pattern)


def reduce_to_counts(line, first_sample_col=4):
    """Return *line* with every sample column reduced to its allele counts.

    Lines starting with '#' (meta-information and the column-heading line)
    are returned untouched so the header survives the conversion. For every
    other line, each whitespace-separated column from *first_sample_col*
    onward is replaced by the first "digits,digits" pair it contains, or by
    'NA' when it contains none. Columns are re-joined with tabs.

    *line* is expected to be already stripped of its trailing newline.
    """
    if line.startswith('#'):
        return line
    cols = line.split()
    for index in range(first_sample_col, len(cols)):
        match = regex.search(cols[index])
        cols[index] = match.group(0) if match else 'NA'
    return '\t'.join(cols)


def main(in_path="Cflorida.vcf", out_path="CfloridaCounts_2ndhalf.txt"):
    """Convert *in_path* line by line and write the result to *out_path*.

    Both paths are resolved against the current working directory.
    """
    # The with-statement closes both files, even if a line fails to convert.
    with open(in_path, "r") as infile, open(out_path, "w") as outfile:
        for line in infile:
            outfile.write(reduce_to_counts(line.strip()) + '\n')


if __name__ == "__main__":
    main()





36 changes: 36 additions & 0 deletions EX_8_Script_2ndhalf_part1_rev2
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import numpy
import os
import re
from plotnine import *
os.listdir('.')
os.chdir('/Users/sampathkumarbalaji/EX_8/Intro_Biocom_ND_319_Tutorial8')
import pandas

# Regex for an allele-count pair ("ref,alt"). Each side accepts one or more
# digits; a single-digit class would shorten "10,25" to "0,2".
pattern = r'[0-9]+,[0-9]+'
# Compiled once here and reused for every column of every line.
regex = re.compile(pattern)


def counts_only(field):
    """Return the first count pair found in *field*, or 'NA' if there is none."""
    found = regex.findall(field)
    return found[0] if found else 'NA'


def rewrite_line(line):
    """Return one converted output line for the stripped input *line*.

    Header lines (anything starting with '#') pass through unchanged, so the
    meta-information and the column headings are not overwritten with 'NA'.
    On data lines the first four whitespace-separated columns are kept and
    every later column is reduced with counts_only(); the columns are then
    joined with tabs.
    """
    if line.startswith('#'):
        return line
    cols = line.split()
    # Slicing copes with short lines: cols[4:] is simply empty for them.
    return '\t'.join(cols[:4] + [counts_only(col) for col in cols[4:]])


def convert(in_path="Cflorida.vcf", out_path="CfloridaCounts_2ndhalf.txt"):
    """Read *in_path*, convert every line and write the result to *out_path*."""
    # No explicit close() calls: leaving the with-block closes both files.
    with open(in_path, "r") as infile, open(out_path, "w") as outfile:
        for line in infile:
            outfile.write(rewrite_line(line.strip()) + '\n')


if __name__ == "__main__":
    convert()





33 changes: 33 additions & 0 deletions part1script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#Exercise 8, Question 1
#10/13/17
import re

#regex variables (raw strings, so \d and \. reach the regex engine untouched)
TXsamples = r"([Cc][Ff](\d{2})?\.[Aa]\d?)"
FLsamples = r"([Cc][Ff]\.[Gg](2|AI|ai))"
#genotype field; \d+ in the count pair and the depth accepts values of 10 and above
allelect = r"\d/\d:(\d+,\d+):\d+:\d+:\d+,\d+,\d+"
missing = r"\./\.:\.:\.:\.:\."


def convert_line(Line):
    """Return the output version of one stripped input line.

    - Lines starting with '##' are returned unchanged.
    - The line starting with a single '#' holds the column headings: sample
      names matching TXsamples become "Cf.Sfa" and those matching FLsamples
      become "Cf.Gai".
    - Every other line is data: each full genotype field is replaced by its
      allele-count pair, and each missing-data field by "N/A".

    The markers are tested at the start of the line only, so a '#' that
    appears inside a data line does not turn it into a header.
    """
    if Line.startswith('##'):
        return Line
    if Line.startswith('#'):
        TXreplaced = re.sub(TXsamples, "Cf.Sfa", Line)
        return re.sub(FLsamples, "Cf.Gai", TXreplaced)
    selAllele = re.sub(allelect, r"\1", Line)
    return re.sub(missing, "N/A", selAllele)


def main(vcf_path="Cflorida.vcf", out_path="CfloridaCounts.txt"):
    """Convert *vcf_path* line by line and write the result to *out_path*."""
    #with-statement closes both files even if a line raises part-way through
    with open(vcf_path, "r") as vcffile, open(out_path, "w") as outfile:
        for Line in vcffile:
            outfile.write(convert_line(Line.strip()) + "\n")


if __name__ == "__main__":
    main()

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good job