lyy005 · kinskeep · Oct 6, 2017 · Oct 6, 2017 · Oct 6, 2017 · Oct 9, 2017
diff --git a/exercise7 b/exercise7
@@ -0,0 +1,82 @@
+####Exercise7####
+#Question 1
+#summarise each FASTA record (ID, length, percent GC) and plot both as histograms
+import pandas
+import plotnine
+from plotnine import aes, geom_histogram, ggplot
+
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[] #not filled in by this script
+
+#the with-block closes the file even if a line fails to parse
+#NOTE: assumes each sequence sits on a single line after its header
+with open("Lecture11.fasta","r") as InFile:
+    for Line in InFile:
+        # remove newline character from file line
+        Line=Line.strip()
+        if not Line:
+            # blank line: nothing to measure, and Seqlength would be 0
+            continue
+        # header lines begin with '>'; every other line is sequence
+        if Line.startswith('>'):
+            sequenceID.append(Line[1:])
+        else:
+            Seqlength = float(len(Line))
+            sequenceLength.append(Seqlength)
+            # count the number of G's and C's regardless of letter case
+            Bases=Line.upper()
+            nG=Bases.count("G")
+            nC=Bases.count("C")
+            # percent GC for this sequence
+            gcTotal = (nG+nC)/Seqlength*100
+            percentGC.append(gcTotal)
+
+#dataframe of resulting info
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+
+#Histogram of sequence lengths
+#(a bare plot expression is only displayed in an interactive session)
+p=(ggplot(data=seqDF) +
+    aes(x="sequenceLength") +
+    geom_histogram(binwidth=4))
+p
+
+#Histogram of Percent GC
+g=(ggplot(data=seqDF) +
+    aes(x="percentGC") +
+    geom_histogram(binwidth=5))
+g
+
+#Question 2
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+
+#load the oil-change / repair-cost table (comma separated)
+Part2=pandas.read_csv("part2datacopy.txt", sep=",")
+
+#base scatterplot; axis labels and an "lm" trendline are layered on afterwards
+a=(ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))
+    +theme_classic()+geom_point())
+a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm")
+
+#Question 3
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+Data = pandas.read_csv("data.txt", sep=',') #load the dataset
+print(Data)
+
+#bar graph with region as x and the mean of observations as y
+meanBars=geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)
+d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
+d+meanBars
+
+#jittered scatter plot of everything observed
+a=ggplot(Data,aes(x="region",y="observations"))
+a+geom_jitter()+coord_cartesian()
diff --git a/part1script.py b/part1script.py
@@ -0,0 +1,49 @@
+import numpy
+import pandas
+from plotnine import *
+
+
+#Question 1
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[] #not filled in by this script
+
+#loop through each line in fasta file to process sequences
+#the with-block closes the file even if a line fails to parse
+#NOTE: assumes each sequence sits on a single line after its header
+with open("Lecture11.fasta","r") as InFile:
+    for Line in InFile:
+        Line=Line.strip() #removes white space, tab, space, newline characters
+        if not Line:
+            continue #blank line: nothing to measure, and seqLen would be 0
+        if Line.startswith('>'): #header lines begin with '>'
+            sequenceID.append(Line[1:])
+        else:
+            seqLen=float(len(Line))
+            #count G and C regardless of letter case
+            Bases=Line.upper()
+            nG=Bases.count("G")
+            nC=Bases.count("C")
+
+            #append values to lists
+            sequenceLength.append(seqLen)
+            percentGC.append((nG+nC)/seqLen*100)
+
+#combine lists into dataframe
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+
+#plots histogram of sequence length
+b=ggplot(seqDF,aes(x="sequenceLength"))
+b+geom_histogram(binwidth=5)+theme_classic()
+
+#plots histogram of percent GC
+b=ggplot(seqDF,aes(x="percentGC"))
+b+geom_histogram(binwidth=5)+theme_classic()
+
+
+
+
+
+
diff --git a/part2datacopy.txt b/part2datacopy.txt
@@ -0,0 +1 @@
+oil changes per year,cost of repairs($)3,3005,3002,5003,4001,7004,4006,1004,2503,4502,6500,60010,07,150

diff --git a/part2script.py b/part2script.py
@@ -0,0 +1,11 @@
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+
+#load the oil-change / repair-cost table (comma separated)
+Part2=pandas.read_csv("part2datacopy.txt", sep=",")
+#base scatterplot; axis labels and an "lm" trendline are layered on afterwards
+a=(ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))
+    +theme_classic()+geom_point())
+a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm")
diff --git a/part3script.py b/part3script.py
@@ -0,0 +1,16 @@
+#Question 3
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+#load the dataset
+Data = pandas.read_csv("data.txt", sep=',')
+
+#bar graph with region as x and the mean of observations as y
+meanBars=geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)
+d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
+d+meanBars
+
+#jittered scatter plot of all observations
+a=ggplot(Data,aes(x="region",y="observations"))
+a+geom_jitter()+coord_cartesian()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		oil changes per year,cost of repairs($)3,3005,3002,5003,4001,7004,4006,1004,2503,4502,6500,60010,07,150
Expand Down