lyy005 · twhmitchell · Oct 6, 2017 · Oct 12, 2017 · Oct 12, 2017 · Oct 12, 2017
diff --git a/EX_7_Script_Final b/EX_7_Script_Final
@@ -0,0 +1,75 @@
+#Part 1 - Amanda
+import pandas
+from plotnine import *
+
+InFile=open("Lecture11.fasta","r") #Open fasta file as read-only
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+for line in InFile: #Loop through each line in fasta file
+    if '>' in line: #Check line for >, if present, skip to next line
+        continue
+    else:
+        seqLen=float(len(line)) #Calculate length of sequence
+        nG=line.count("G") #Count individual G and C contents
+        nC=line.count("C")
+        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
+
+InFile.close() #Close file
+#############################################################################
+#Part 2 - Thomas
+import numpy
+import pandas
+from plotnine import *
+
+icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0) #Read tab-separated table; first row supplies the column names
+icecream.shape #Bare expressions: they only echo a value when run in an interactive session
+icecream.head(20)
+
+a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point() #Scatter plot of sales against temp
+a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm") #Label the axes and overlay a linear-model ("lm") smooth
+#############################################################################
+#Part 3 - Balaji
+import numpy
+import os
+import matplotlib.pyplot as plt
+from plotnine import *
+os.listdir('.')
+os.chdir('/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7')
+import pandas
+
+#to parse and read
+data_txt = pandas.read_csv("data.txt")
+directions = ['north','south','east','west']
+
+#create dataframe
+A=numpy.zeros((4,2))
+mean_DF=pandas.DataFrame(A,columns=['region', 'mean_dir'])
+
+#assigning values to data frame elements with mean of 4 regions
+for i in range(0,4) :
+    mean_DF.mean_dir[i] = numpy.mean(data_txt[data_txt.region==directions[i]].observations)
+    mean_DF.region[i] = directions[i]
+
+a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
+a+geom_bar(aes(x="region",y="mean_dir"),stat="summary")
+
+b=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot
+b+geom_jitter()+theme_classic()
+
+#The bar graph plots the mean of each region; the means are nearly identical (~15), so the bars look almost the same.
+#But on the scatter plot we see that north has points centered around 15, while east and west are evenly spread and south
+#has a bi-modal distribution.
+
+
diff --git a/EX_7_Script_Q3 b/EX_7_Script_Q3
@@ -0,0 +1,30 @@
+import numpy
+import os
+import matplotlib.pyplot as plt
+from plotnine import *
+os.listdir('.')
+os.chdir('/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7')
+import pandas
+
+#to parse and read
+data_txt = pandas.read_csv("data.txt")
+directions = ['north','south','east','west']
+
+#create dataframe
+A=numpy.zeros((4,2))
+mean_DF=pandas.DataFrame(A,columns=['region', 'mean_dir'])
+
+#assigning values to data frame elements with mean of 4 regions
+for i in range(0,4) :
+    mean_DF.mean_dir[i] = numpy.mean(data_txt[data_txt.region==directions[i]].observations)
+    mean_DF.region[i] = directions[i]
+
+a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
+a+geom_bar(aes(x="region",y="mean_dir"),stat="summary")
+
+b=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot
+b+geom_jitter()+theme_classic()
+
+#The bar graph plots the mean of each region; the means are nearly identical (~15), so the bars look almost the same.
+#But on the scatter plot we see that north has points centered around 15, while east and west are evenly spread and south
+#has a bi-modal distribution.
diff --git a/Exercise7.py b/Exercise7.py
@@ -0,0 +1,65 @@
+#Part 1
+import pandas
+from plotnine import *
+
+InFile=open("Lecture11.fasta","r") #Open fasta file as read-only
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+for line in InFile: #Loop through each line in fasta file
+    if '>' in line: #Check line for >, if present, skip to next line
+        continue
+    else:
+        seqLen=float(len(line)) #Calculate length of sequence
+        nG=line.count("G") #Count individual G and C contents
+        nC=line.count("C")
+        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
+
+InFile.close() #Close file
+
+#Part 3
+import numpy
+
+data=pandas.read_csv("data.txt", header=0, sep=",") #Open file as data frame
+
+dataN=data[data.region=="north"] #Subset data frame & find mean for all populations
+nMean=numpy.mean(dataN.observations)
+
+dataE=data[data.region=="east"]
+eMean=numpy.mean(dataE.observations)
+
+dataW=data[data.region=="west"]
+wMean=numpy.mean(dataW.observations)
+
+dataS=data[data.region=="south"]
+sMean=numpy.mean(dataS.observations)
+
+means=pandas.DataFrame(columns=('region', 'mean')) #Combine means into new data frame
+means.region='north','south','east','west'
+means.iloc[0,1]=nMean
+means.iloc[1,1]=sMean
+means.iloc[2,1]=eMean
+means.iloc[3,1]=wMean
+
+c=ggplot(means, aes(x="region",y="mean")) #Plot means on bar graph
+c+geom_col()+theme_classic()
+
+d=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot
+d+geom_jitter()+theme_classic()
+
+#Graphs tell different stories - only on the scatter plot does it become apparent that the observations
+#in the south region are two discrete populations, rather than a continuous spread like the others.
+#Additionally, the mean for the West region makes it look as though it has the smallest values, whereas
+#the scatterplot shows that it has both the lowest and the highest values, over a very large spread.
+#The mean barplot is really only an accurate representation for the North region.
diff --git a/Exercise7Part1.py b/Exercise7Part1.py
@@ -0,0 +1,29 @@
+#Part 1
+import pandas
+from plotnine import *
+
+InFile=open("Lecture11.fasta","r") #Open fasta file as read-only
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+for line in InFile: #Loop through each line in fasta file
+    if '>' in line: #Check line for >, if present, skip to next line
+        continue
+    else:
+        seqLen=float(len(line)) #Calculate length of sequence
+        nG=line.count("G") #Count individual G and C contents
+        nC=line.count("C")
+        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
+
+InFile.close() #Close file
diff --git a/code b/code
@@ -0,0 +1,10 @@
+import numpy
+import pandas
+from plotnine import *
+
+icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0) #Read tab-separated table; first row supplies the column names
+icecream.shape #Bare expressions: they only echo a value when run in an interactive session
+icecream.head(20)
+
+a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point() #Scatter plot of sales against temp
+a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm") #Label the axes and overlay a linear-model ("lm") smooth
diff --git a/icream_sales.txt b/icream_sales.txt
@@ -0,0 +1,13 @@
+temp	sales
+14.2	215
+16.4	325
+11.9	185
+15.2	332
+18.5	406
+22.1	522
+19.4	412
+25.1	614
+23.4	544
+18.1	421
+22.6	445
+17.2	408
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		“temp” “sales”
		14.2 21516.4 32511.9 18515.2 33218.5 40622.1 52219.4 41225.1 61423.4 54418.1 42122.6 44517.2 408
Expand Down