diff --git a/challenge1.py b/challenge1.py new file mode 100644 index 0000000..6a1ab17 --- /dev/null +++ b/challenge1.py @@ -0,0 +1,29 @@ +import re + +inputFile = open("Cflorida.vcf","r") +outputFile = open("CfloridaCounts.txt","w") + +regexTX = re.compile(r"(CF|cf)\w*.[Aa]\w*.") +regexFL = re.compile(r"(CF|cf).[Gg]\w*.") +delete1 = re.compile("0/0:") +delete2 = re.compile(":\d[\d:,]*") +delete3 = re.compile("./.:.:.:.:.") + +for ln in inputFile: + ln.strip() + if ln.startswith("##"): + outputFile.write(ln + "\n") + continue + if ln.startswith("#"): + ln = re.sub(regexFL,"Cf.Gai.",ln) + ln = re.sub(regexTX,"Cf.Sfa.",ln) + outputFile.write(ln + "\n") + else: + ln = re.sub(delete1, "", ln) + ln = re.sub(delete2, "", ln) + ln = re.sub(delete3, "NA", ln) + outputFile.write(ln + "\n") + +inputFile.close() +outputFile.close() + diff --git a/challenge2.py b/challenge2.py new file mode 100644 index 0000000..782dd19 --- /dev/null +++ b/challenge2.py @@ -0,0 +1,47 @@ +import re + +IDFile = open("indivIDs.txt","r") +IDs = {} + +for ln in IDFile: + ln = ln.strip() + fields = ln.split() + if fields[0] in IDs: + print("Duplicate: " + fields[0]) + continue + else: + IDs[fields[0]] = fields[1] + +IDFile.close() + +sequenceFile = open("seqFastq.fq","r") +outputFile = open("IDseq.fasta","w") + +cutSites = [] + +regex = re.compile(r"([ATCG]{8})AATTC") #gets the group of 8 bases just before the restriction site in a group +sequenceReg = re.compile(r"[A-Z]{10}") #matches uppercase strings. I saw some had an N to begin with. I chose 10 because it is unlikely that the quality line has 10 uppercase in a row + +for ln in sequenceFile: + if re.match(sequenceReg, ln): + match = re.search(regex, ln) + if not match: + #print("No match on the restriction site in sequence: " + ln) + continue + if match.group(1) in IDs: + cutSites.append(match.start(1) + 8) + ln = ln[:match.start(1)] + ln[match.end(1):] #cuts out the part matched by regex from string + + outputFile.write('>' + IDs[match.group(1)] + "\n") + outputFile.write(ln) + #else: + #print("no match found on : " + match.group(1)) + #outputFile.write("> No match on: " + match.group(1) + '\n') + #outputFile.write(ln) + +import pandas +from plotnine import * +data=pandas.DataFrame({"Cut Site": cutSites}) + +cutHG=ggplot(data,aes(x="Cut Site")) +cutHG+geom_histogram()+theme_classic()