From 0327e7943f33a9370de28e06345e1b5bf53318a2 Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Fri, 6 Oct 2017 11:00:46 -0400 Subject: [PATCH 1/9] made file --- .idea/vcs.xml | 6 ++++++ exercise 7.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 .idea/vcs.xml create mode 100644 exercise 7.py diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/exercise 7.py b/exercise 7.py new file mode 100644 index 0000000..295cf84 --- /dev/null +++ b/exercise 7.py @@ -0,0 +1,18 @@ +#exercise 7# +#Dan Bruzzese and Zoe Loh + + + + + +# question 1 + + +#question2 + + + + + + +#question 3 \ No newline at end of file From a77b8108893293d15f562f87fb4aff5d5bbd42e1 Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Fri, 6 Oct 2017 11:15:47 -0400 Subject: [PATCH 2/9] started q3 --- exercise 7.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/exercise 7.py b/exercise 7.py index 295cf84..7d22240 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -8,6 +8,8 @@ # question 1 + + #question2 @@ -15,4 +17,22 @@ -#question 3 \ No newline at end of file +#question 3 +#making the plot +from plotnine import * + +import pandas +dat = pandas.read_csv("data.txt") + +print dat.head(n=5) + +#need graph for mean + +p=(ggplot(data=dat) + + aes( "region", "observations") + + geom_bar(stat = "identity") + + theme_classic() +) + +print p + From 80473e5766f2613f9103979a2384dc2d0d74a263 Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Tue, 10 Oct 2017 02:25:22 -0400 Subject: [PATCH 3/9] work on q3 --- exercise 7.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/exercise 7.py b/exercise 7.py index 7d22240..2af229f 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -24,15 +24,14 @@ import pandas dat = pandas.read_csv("data.txt") -print dat.head(n=5) +#barplot for mean observations in a region +dat_grp= dat['observations'].groupby(dat['region']) #group observations by region +dat_mean= dat_grp.mean() # mean of the dat grp into a list -#need graph for mean +df = pandas.DataFrame({'col':dat_mean}) #turn list into a dataframe +print (df) +print df[0:4] +#only has one row.... -p=(ggplot(data=dat) - + aes( "region", "observations") - + geom_bar(stat = "identity") - + theme_classic() -) -print p From d7ee4b5e44bf6e94b176c88662858e5003f9570c Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Tue, 10 Oct 2017 12:17:41 -0400 Subject: [PATCH 4/9] q3 barplot --- exercise 7.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/exercise 7.py b/exercise 7.py index 2af229f..27ce2b5 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -17,21 +17,22 @@ -#question 3 -#making the plot +#########question 3################ from plotnine import * - -import pandas -dat = pandas.read_csv("data.txt") +import pandas as pd +dat = pd.read_csv("data.txt") #barplot for mean observations in a region -dat_grp= dat['observations'].groupby(dat['region']) #group observations by region -dat_mean= dat_grp.mean() # mean of the dat grp into a list +grouped= dat.groupby(["region"]).mean().reset_index() #mean observations by region +print grouped +grouped.columns = ['region', 'mean_observations'] +p= (ggplot(data=grouped) + + aes(x='region', y= 'mean_observations',fill= 'region') + + geom_bar(stat = "identity") + + theme_classic() + ) +print p -df = pandas.DataFrame({'col':dat_mean}) #turn list into a dataframe -print (df) -print df[0:4] -#only has one row.... From 399fc3496492433a787ce38573154c801ed867d1 Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Tue, 10 Oct 2017 12:43:11 -0400 Subject: [PATCH 5/9] q3 scatterplot --- exercise 7.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/exercise 7.py b/exercise 7.py index 27ce2b5..7b9d24b 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -33,6 +33,11 @@ ) print p - - +#scatterplot +d= (ggplot(data=dat) + + aes(y='observations', x='region', fill= 'region') + + geom_point(alpha= .01) + + theme_classic() + ) +print d From d69f5f3b871b4a0b6c8a946a7f25c976cc9628bc Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Tue, 10 Oct 2017 12:47:33 -0400 Subject: [PATCH 6/9] q3 finished! and works! let me know if you want me to help with q1 --- exercise 7.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/exercise 7.py b/exercise 7.py index 7b9d24b..60fadd5 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -40,4 +40,5 @@ + theme_classic() ) print d - + # why= the bar chart shows us the mean of observations from each region +#while the scatter plot shows us the value of all observations from each region From afe0ca396955e0999f6ef08edba8df82a43e322c Mon Sep 17 00:00:00 2001 From: Zoe Loh Date: Wed, 11 Oct 2017 21:49:00 -0400 Subject: [PATCH 7/9] problem 1 and 2 --- exercise 7.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/exercise 7.py b/exercise 7.py index 7d22240..7940b3f 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -6,15 +6,39 @@ # question 1 - - - +import pandas +File=open("Lecture11.fasta","r") +plotData = pandas.DataFrame(columns = ["Sequence Length" , "GC content"]) + +for line in File: + line = line.strip() + if ">" in line: + continue + else: + #First the length of the sequence and the percent gc count is calculated + Length = (len(line)-1) + #Because it is integer division we must force python do divide as if it was real numbers by using float() + GCcount = (float((line.count("G"))+line.count("C"))/len(line)) + #The values are inserted into a dataframe for plotting + row = pandas.DataFrame({"Sequence Length": Length, "GC content": GCcount}, index=[0]) + plotData = plotData.append(row) +#GC histogram plot +a=ggplot(plotData,aes(x="GC content")) +a+geom_histogram()+theme_classic() + +#sequence length histogram plot +b=ggplot(plotData,aes(x="Sequence Length")) +b+geom_histogram()+theme_classic() #question2 +import pandas +from plotnine import * +data=pandas.read_csv("heartrate.txt",sep=",",header=0) - - +#Here I make the scatter plot showing how running speed and heart rate are related +plot=ggplot(data,aes(x="Heart rate",y="Running speed")) +plot+geom_point()+coord_cartesian()+stat_smooth(method="lm") #question 3 From 248cb606f2460ab951331db1f0e7e6f8d94bcf04 Mon Sep 17 00:00:00 2001 From: Zoe Loh Date: Wed, 11 Oct 2017 22:41:23 -0400 Subject: [PATCH 8/9] file for graph --- heartrate.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 heartrate.txt diff --git a/heartrate.txt b/heartrate.txt new file mode 100644 index 0000000..63dcc94 --- /dev/null +++ b/heartrate.txt @@ -0,0 +1,22 @@ +"Heart rate","Running speed" +80,0 +85,2 +87,3 +90,3 +94,3 +97,4 +102,4 +60,-5 +110,5 +117,6 +120,6 +124,7 +130,7 +138,8 +143,8 +150,9 +157,10 +160,11 +165,12 +170,12.5 +185,14 From 6fcd03057b90d8b233c5ba42df31855af4276851 Mon Sep 17 00:00:00 2001 From: omegadan01 Date: Thu, 12 Oct 2017 00:16:57 -0400 Subject: [PATCH 9/9] added print command to ggplots (didnt work for me otherwise) tweaked the alpha in my code Code looks great and runs!! --- exercise 7.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/exercise 7.py b/exercise 7.py index 088d49d..37d91f2 100644 --- a/exercise 7.py +++ b/exercise 7.py @@ -7,6 +7,7 @@ # question 1 import pandas +from plotnine import * File=open("Lecture11.fasta","r") plotData = pandas.DataFrame(columns = ["Sequence Length" , "GC content"]) @@ -24,11 +25,13 @@ plotData = plotData.append(row) #GC histogram plot a=ggplot(plotData,aes(x="GC content")) -a+geom_histogram()+theme_classic() +aa= a+geom_histogram()+theme_classic() +print aa #sequence length histogram plot b=ggplot(plotData,aes(x="Sequence Length")) -b+geom_histogram()+theme_classic() +bb=b+geom_histogram()+theme_classic() +print bb #question2 @@ -38,8 +41,8 @@ #Here I make the scatter plot showing how running speed and heart rate are related plot=ggplot(data,aes(x="Heart rate",y="Running speed")) -plot+geom_point()+coord_cartesian()+stat_smooth(method="lm") - +p=plot+geom_point()+coord_cartesian()+stat_smooth(method="lm") +print p #########question 3################ from plotnine import * @@ -60,7 +63,7 @@ #scatterplot d= (ggplot(data=dat) + aes(y='observations', x='region', fill= 'region') - + geom_point(alpha= .01) + + geom_point(alpha= .1) + theme_classic() ) print d