-
Notifications
You must be signed in to change notification settings - Fork 10
Exercise Submission #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
84d6133
534e9ce
90ecfc3
fbaf8a2
e507b14
68e6535
5c2294b
fbb7d48
829b99f
b6a8115
e6cd363
05b8aca
51f6869
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| #Part 1 - Amanda | ||
| import pandas | ||
| from plotnine import * | ||
|
|
||
# Part 1: length and GC content of every sequence in a FASTA file,
# shown as two histograms.
sequenceLength = []  # length of each sequence line
percentGC = []       # percent G+C of each sequence line

# The with-block closes the file even if parsing fails part-way through;
# a later InFile.close() call on the already-closed file is a harmless no-op.
with open("Lecture11.fasta", "r") as InFile:
    for line in InFile:
        # Drop the trailing newline (and surrounding whitespace) so it is not
        # counted as part of the sequence length.
        line = line.strip()
        if not line or '>' in line:
            continue  # skip blank lines and FASTA header lines

        # Each remaining line is treated as one complete sequence.
        seqLen = float(len(line))
        nG = line.count("G")
        nC = line.count("C")
        percGC = (nG + nC) / seqLen * 100

        sequenceLength.append(seqLen)
        percentGC.append(percGC)

# Combine the two lists into one data frame for plotting.
seqDF = pandas.DataFrame(
    list(zip(sequenceLength, percentGC)),
    columns=['sequenceLength', 'percentGC'],
)

# Histogram of sequence lengths.
a = ggplot(seqDF, aes(x="sequenceLength"))
a + geom_histogram() + theme_classic()

# Histogram of percent GC.
b = ggplot(seqDF, aes(x="percentGC"))
b + geom_histogram() + theme_classic()
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good job |
||
| InFile.close() #Close file | ||
| ############################################################################# | ||
| #Part 2 - Thomas | ||
| import numpy | ||
| import pandas | ||
| from plotnine import * | ||
|
|
||
# Part 2: ice cream sales plotted against temperature.
# The input is tab-delimited and its first row holds the column names.
icecream = pandas.read_csv("icream_sales.txt", header=0, sep="\t")
icecream.shape      # bare expression: only echoed in an interactive session
icecream.head(20)   # bare expression: only echoed in an interactive session

# Scatter plot of the raw points; a smoothed "lm" trend line and axis labels
# are added on top of it.
a = (
    ggplot(icecream, aes(x="temp", y="sales"))
    + theme_classic()
    + geom_point()
)
a + xlab("Temperature (C)") + ylab("Sales In Dollars") + stat_smooth(method="lm")
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good job |
||
| ############################################################################# | ||
| #Part 3 - Balaji | ||
| import numpy | ||
| import os | ||
| import matplotlib.pyplot as plt | ||
| from plotnine import * | ||
| os.listdir('.') | ||
| os.chdir('/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7') | ||
| import pandas | ||
|
|
||
| #to parse and read | ||
# Read the observations table (comma-separated, header in the first row).
data_txt = pandas.read_csv("data.txt")
directions = ['north', 'south', 'east', 'west']

# Build the summary table in a single step. Pre-filling a float frame with
# zeros and then writing through chained attribute assignment
# (mean_DF.region[i] = ...) puts strings into a float column and is not
# guaranteed to modify the frame in current pandas.
mean_DF = pandas.DataFrame({
    'region': directions,
    'mean_dir': [
        numpy.mean(data_txt[data_txt.region == direction].observations)
        for direction in directions
    ],
})
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Or you can use: |
||
# Bar chart of the mean observation for each region.
a = ggplot(mean_DF) + theme_classic() + xlab("region") + ylab("mean_dir")
a + geom_bar(aes(x="region", y="mean_dir"), stat="summary")

# Jittered scatter plot of every observation, grouped by region.
# The frame read from data.txt is named data_txt; the name `data` is never
# defined in this script and raised a NameError.
b = ggplot(data_txt, aes(x="region", y="observations"))
b + geom_jitter() + theme_classic()
|
|
||
| #Bar Graph had mean plotted across the regions which almost had same values (~15). They almost looked the same. | ||
| #But on the scatter plot we see that north has points centered around 15 while east and west are equally spread and south | ||
| #has a bi-modal distribution. | ||
|
|
||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good job |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| import numpy | ||
| import os | ||
| import matplotlib.pyplot as plt | ||
| from plotnine import * | ||
| os.listdir('.') | ||
| os.chdir('/Users/sampathkumarbalaji/EX_7/Intro_Biocom_ND_319_Tutorial7') | ||
| import pandas | ||
|
|
||
| #to parse and read | ||
# Read the observations table (comma-separated, header in the first row).
data_txt = pandas.read_csv("data.txt")
directions = ['north', 'south', 'east', 'west']

# Build the summary table in a single step. Pre-filling a float frame with
# zeros and then writing through chained attribute assignment
# (mean_DF.region[i] = ...) puts strings into a float column and is not
# guaranteed to modify the frame in current pandas.
mean_DF = pandas.DataFrame({
    'region': directions,
    'mean_dir': [
        numpy.mean(data_txt[data_txt.region == direction].observations)
        for direction in directions
    ],
})

# Bar chart of the mean observation for each region.
a = ggplot(mean_DF) + theme_classic() + xlab("region") + ylab("mean_dir")
a + geom_bar(aes(x="region", y="mean_dir"), stat="summary")

# Jittered scatter plot of every observation, grouped by region.
# The frame read from data.txt is named data_txt; the name `data` is never
# defined in this script and raised a NameError.
b = ggplot(data_txt, aes(x="region", y="observations"))
b + geom_jitter() + theme_classic()
|
|
||
| #Bar Graph had mean plotted across the regions which almost had same values (~15). They almost looked the same. | ||
| #But on the scatter plot we see that north has points centered around 15 while east and west are equally spread and south | ||
| #has a bi-modal distribution. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| #Part 1 | ||
| import pandas | ||
| from plotnine import * | ||
|
|
||
# Part 1: length and GC content of every sequence in a FASTA file,
# shown as two histograms.
sequenceLength = []  # length of each sequence line
percentGC = []       # percent G+C of each sequence line

# The with-block closes the file as soon as parsing is done (or fails),
# so no separate InFile.close() call is needed.
with open("Lecture11.fasta", "r") as InFile:
    for line in InFile:
        # Drop the trailing newline (and surrounding whitespace) so it is not
        # counted as part of the sequence length.
        line = line.strip()
        if not line or '>' in line:
            continue  # skip blank lines and FASTA header lines

        # Each remaining line is treated as one complete sequence.
        seqLen = float(len(line))
        nG = line.count("G")
        nC = line.count("C")
        percGC = (nG + nC) / seqLen * 100

        sequenceLength.append(seqLen)
        percentGC.append(percGC)

# Combine the two lists into one data frame for plotting.
seqDF = pandas.DataFrame(
    list(zip(sequenceLength, percentGC)),
    columns=['sequenceLength', 'percentGC'],
)

# Histogram of sequence lengths.
a = ggplot(seqDF, aes(x="sequenceLength"))
a + geom_histogram() + theme_classic()

# Histogram of percent GC.
b = ggplot(seqDF, aes(x="percentGC"))
b + geom_histogram() + theme_classic()
|
|
||
| #Part 3 | ||
| import numpy | ||
|
|
||
# Read the observations table (comma-separated, header in the first row).
data = pandas.read_csv("data.txt", header=0, sep=",")

# Subset the frame by region and take the mean observation of each subset.
dataN = data[data.region == "north"]
nMean = numpy.mean(dataN.observations)

dataE = data[data.region == "east"]
eMean = numpy.mean(dataE.observations)

dataW = data[data.region == "west"]
wMean = numpy.mean(dataW.observations)

dataS = data[data.region == "south"]
sMean = numpy.mean(dataS.observations)

# Build the summary frame directly from the values. Starting from an empty
# frame and filling it cell by cell with iloc is fragile and can leave `mean`
# as a non-numeric column; constructing it here keeps the column numeric so
# it plots on a continuous axis.
means = pandas.DataFrame({
    'region': ['north', 'south', 'east', 'west'],
    'mean': [nMean, sMean, eMean, wMean],
})

# Bar chart of the mean observation for each region.
c = ggplot(means, aes(x="region", y="mean"))
c + geom_col() + theme_classic()

# Jittered scatter plot of every observation, grouped by region.
d = ggplot(data, aes(x="region", y="observations"))
d + geom_jitter() + theme_classic()
|
|
||
| #Graphs tell different stories - only on the scatter plot does it become apparent that the observations | ||
| #in the south region are two discrete populations, rather than a continuous spread like the others. | ||
| #Additionally, the mean for the West region makes it look as though it has the smallest values, whereas | ||
| #the scatterplot shows that it has both the lowest and the highest values, over a very large spread. | ||
| #The mean barplot is really only an accurate representation for the North region. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| #Part 1 | ||
| import pandas | ||
| from plotnine import * | ||
|
|
||
# Part 1: length and GC content of every sequence in a FASTA file,
# shown as two histograms.
sequenceLength = []  # length of each sequence line
percentGC = []       # percent G+C of each sequence line

# The with-block closes the file as soon as parsing is done (or fails),
# so no separate InFile.close() call is needed.
with open("Lecture11.fasta", "r") as InFile:
    for line in InFile:
        # Drop the trailing newline (and surrounding whitespace) so it is not
        # counted as part of the sequence length.
        line = line.strip()
        if not line or '>' in line:
            continue  # skip blank lines and FASTA header lines

        # Each remaining line is treated as one complete sequence.
        seqLen = float(len(line))
        nG = line.count("G")
        nC = line.count("C")
        percGC = (nG + nC) / seqLen * 100

        sequenceLength.append(seqLen)
        percentGC.append(percGC)

# Combine the two lists into one data frame for plotting.
seqDF = pandas.DataFrame(
    list(zip(sequenceLength, percentGC)),
    columns=['sequenceLength', 'percentGC'],
)

# Histogram of sequence lengths.
a = ggplot(seqDF, aes(x="sequenceLength"))
a + geom_histogram() + theme_classic()

# Histogram of percent GC.
b = ggplot(seqDF, aes(x="percentGC"))
b + geom_histogram() + theme_classic()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| import numpy | ||
| import pandas | ||
| from plotnine import * | ||
|
|
||
# Ice cream sales plotted against temperature.
# The input is tab-delimited and its first row holds the column names.
icecream = pandas.read_csv("icream_sales.txt", header=0, sep="\t")
icecream.shape      # bare expression: only echoed in an interactive session
icecream.head(20)   # bare expression: only echoed in an interactive session

# Scatter plot of the raw points; a smoothed "lm" trend line and axis labels
# are added on top of it.
a = (
    ggplot(icecream, aes(x="temp", y="sales"))
    + theme_classic()
    + geom_point()
)
a + xlab("Temperature (C)") + ylab("Sales In Dollars") + stat_smooth(method="lm")
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| “temp” “sales” | ||
| 14.2 21516.4 32511.9 18515.2 33218.5 40622.1 52219.4 41225.1 61423.4 54418.1 42122.6 44517.2 408 | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
line=line.strip() # remove the new line character