"""Analysis for Tutorial 7 (AnalysisTutorial7.py).

Question 1 summarises every sequence in ``Lecture11.fasta`` (length, percent
GC, melting temperature) and plots histograms of length and GC content.
Question 2 plots ``Prod`` against ``chlA`` from ``Q2_lakeData.txt`` with a
linear trend line.  Question 3 contrasts a bar plot of regional means with a
scatter plot of the raw observations in ``data.txt``.

Run the script from the directory that contains the three input files.
"""

# Load packages
import numpy
import pandas

# Longest sequence (in bases) for which a melting temperature is calculated.
MAX_TM_LENGTH = 14
# Placeholder stored when no melting temperature is calculated.
TM_NOT_CALCULATED = -9999


def read_fasta(lines):
    """Yield one ``(sequenceID, sequence)`` pair per FASTA record.

    Parameters
    ----------
    lines : iterable of str
        Lines of FASTA text, for example an open file handle.

    Yields
    ------
    tuple of (str, str)
        The header text after the leading ``>`` and the full sequence.
        Sequence text wrapped over several lines is joined into one string
        and blank lines are ignored, so IDs and sequences always stay paired.

    Raises
    ------
    ValueError
        If sequence text appears before the first ``>`` header line.
    """
    seq_id = None
    chunks = []
    for raw in lines:
        # Remove the newline character (and surrounding whitespace)
        line = raw.strip()
        if not line:
            continue
        if line.startswith(">"):
            # A new header closes the record collected so far
            if seq_id is not None:
                yield seq_id, "".join(chunks)
            seq_id = line[1:]
            chunks = []
        elif seq_id is None:
            raise ValueError(
                "sequence data found before the first '>' header line")
        else:
            chunks.append(line)
    if seq_id is not None:
        yield seq_id, "".join(chunks)


def sequence_stats(sequence):
    """Return ``(sequenceLength, percentGC, meltingTemp)`` for one sequence.

    The length is returned as a float.  The melting temperature is
    ``2*(nG+nC) + 2*length`` for sequences of at most ``MAX_TM_LENGTH``
    bases; longer sequences get ``TM_NOT_CALCULATED``.  An empty sequence
    has no defined GC content, so it yields ``nan`` for percent GC and
    ``TM_NOT_CALCULATED`` for the melting temperature instead of dividing
    by zero.
    """
    length = float(len(sequence))
    # Count the number of G's and C's
    n_gc = sequence.count("G") + sequence.count("C")
    if length == 0:
        return length, float("nan"), TM_NOT_CALCULATED
    if length <= MAX_TM_LENGTH:
        melting_temp = 2 * n_gc + 2 * length
    else:
        melting_temp = TM_NOT_CALCULATED
    return length, n_gc / length * 100, melting_temp


def summarize_fasta(lines):
    """Return a list of ``(sequenceID, sequenceLength, percentGC, meltingTemp)``.

    ``lines`` is any iterable of FASTA text lines; one tuple is produced per
    record, in file order.
    """
    return [(seq_id,) + sequence_stats(sequence)
            for seq_id, sequence in read_fasta(lines)]


def show_plot(plot):
    """Display a plotnine plot explicitly.

    A plot written as a bare expression is only drawn by an interactive
    console; when the file runs as a script the object is simply discarded.
    Recent plotnine releases expose ``ggplot.show()`` for this; older
    releases draw the figure when the object is printed.
    """
    if hasattr(plot, "show"):
        plot.show()
    else:
        print(plot)


def main():
    """Run Questions 1-3, displaying each figure in turn."""
    # Imported by name, and only here, so the helpers above can be imported
    # without the plotting stack installed.
    from plotnine import (aes, geom_bar, geom_histogram, geom_jitter,
                          geom_point, ggplot, stat_smooth, theme_classic,
                          xlab, ylab)

    ############################################
    ################ Question 1 ################
    ############################################

    # Read in fasta; the with-block closes the file even if parsing fails
    with open("Lecture11.fasta", "r") as fasta:
        seqDF = pandas.DataFrame(
            summarize_fasta(fasta),
            columns=['sequenceID', 'sequenceLength', 'percentGC',
                     'meltingTemp'])

    # Create histogram of sequence length
    plot1 = ggplot(seqDF, aes(x="sequenceLength"))
    show_plot(plot1 + geom_histogram() + theme_classic())

    # Create histogram of GC content
    plot2 = ggplot(seqDF, aes(x="percentGC"))
    show_plot(plot2 + geom_histogram() + theme_classic())

    ############################################
    ################ Question 2 ################
    ############################################

    # The data are in the GitHub repository as "Q2_lakeData.txt".
    lakeData = pandas.read_csv("Q2_lakeData.txt", sep="\t")

    # Scatter plot of Prod vs chlA with trendline
    a = (ggplot(lakeData, aes(x="chlA", y="Prod"))
         + theme_classic() + geom_point())
    show_plot(a + xlab("Concentration of Chlorophyll A")
              + ylab("Methane Production") + stat_smooth(method="lm"))

    ############################################
    ################ Question 3 ################
    ############################################

    # Read in data.txt
    data3 = pandas.read_csv("data.txt", sep=",", header=0)

    # Bar plot of the mean observation in each region
    plot4 = ggplot(data3) + xlab("Region") + ylab("Mean Observation")
    show_plot(plot4
              + geom_bar(aes(x="factor(region)", y="observations"),
                         stat="summary", fun_y=numpy.mean)
              + theme_classic())

    # Scatter plot of all the observations.  Only geom_jitter is used:
    # adding geom_point as well would draw every observation twice.
    plot5 = (ggplot(data3, aes(x="region", y="observations"))
             + xlab("Region") + ylab("Observation"))
    show_plot(plot5 + geom_jitter() + theme_classic())

    ### The two plots tell very different stories. While each region does
    ### have a very similar mean, the data are distributed differently around
    ### the mean. Both East and West have very large standard deviations,
    ### while the North region does not. The South region is more like a
    ### bimodal distribution with the two maxima on opposite sides of the
    ### mean.


if __name__ == "__main__":
    main()
+ + +#scatter plot of observations diff --git a/Q2_lakeData.txt b/Q2_lakeData.txt new file mode 100644 index 0000000..9b8fab6 --- /dev/null +++ b/Q2_lakeData.txt @@ -0,0 +1 @@ +lakeID chlA pH DOC TP TN Prod BA 4.4 7 6.5 22.8 840 4493 BE 12.8 5.2 9.4 21.5 531 5329 BO 20.7 5.4 19.5 48.7 1389 9261 BR 40.2 8.3 6.6 86.9 1187 12649 CB 16.7 4.1 18.2 33 990 8137 CR 4 5.9 4.5 11.1 1111 2842 FO 6.3 5.5 11.1 52.6 696 3052 HB 13.8 5.3 23 30.7 1678 11418 MO 7.2 7.8 22.6 36.2 1450 8397 NG 32.3 4.7 23.4 17.6 2239 18701 PA 4.3 7.2 5.1 40.9 1024 9279 PE 3.4 7.7 6.4 12 1252 12256 TU 7 6.7 13.4 15.3 1122 8085 WL 8.1 6 7.4 22.1 803 13075 \ No newline at end of file