diff --git a/Ex7.py b/Ex7.py
new file mode 100644
index 0000000..bae1bce
--- /dev/null
+++ b/Ex7.py
@@ -0,0 +1,93 @@
+# NOTE: run this script from the tutorial directory. In a shell (not Python):
+#   cd Desktop/Intro_Biocom_ND_319_Tutorial7/
+
+import pandas as pd
+import plotnine
+
+# Sentinel stored when a sequence is too long for the melting temperature formula.
+NO_MELTING_TEMP = -9999
+
+# Read the FASTA file into [sequenceID, sequence] pairs. Sequence text wrapped
+# over several lines is joined so that each record is measured as a whole.
+records = []
+with open("Lecture11.fasta", "r") as InFile:
+    for Line in InFile:
+        # remove newline character from file line
+        Line = Line.strip()
+        if not Line:
+            # a blank line holds no bases; measuring it would divide by zero
+            continue
+        if Line.startswith('>'):
+            # header line: keep the sequence ID without the ">" character
+            records.append([Line[1:], ""])
+        elif records:
+            # sequence line: extend the sequence of the most recent header
+            # (text appearing before the first header has no ID and is skipped)
+            records[-1][1] += Line
+
+# create lists for storing information about sequences
+sequenceID = []
+sequenceLength = []
+percentGC = []
+meltingTemp = []
+
+for seqID, sequence in records:
+    if not sequence:
+        # a header without bases has no length or GC content to report
+        continue
+    # kept as a float so the stored values match the original script's output
+    seqLen = float(len(sequence))
+    # count the number of G's and C's
+    nGC = sequence.count("G") + sequence.count("C")
+
+    # if the sequence is 14 or fewer bases calculate melting temperature;
+    # 2*GC + 2*length equals 4 per G/C base plus 2 per remaining base
+    if seqLen <= 14:
+        Tm = 2 * nGC + 2 * seqLen
+    else:
+        Tm = NO_MELTING_TEMP
+
+    # append values to the lists
+    sequenceID.append(seqID)
+    sequenceLength.append(seqLen)
+    percentGC.append(nGC / seqLen * 100)
+    meltingTemp.append(Tm)
+
+# combine lists into dataframe
+seqDF = pd.DataFrame({
+    'sequenceID': sequenceID,
+    'sequenceLength': sequenceLength,
+    'percentGC': percentGC,
+    'meltingTemp': meltingTemp,
+})
+
+# NOTE: the bare "plot + geom" expressions below only render in an interactive
+# session; in a plain script the result must be drawn or saved explicitly.
+
+#ex 1.1, histogram of sequence length
+plot1=plotnine.ggplot(seqDF, plotnine.aes('sequenceLength'))
+plot1 + plotnine.geom_histogram(binwidth=5)
+
+#ex 1.2, histogram of GC content
+plot2=plotnine.ggplot(seqDF, plotnine.aes('percentGC'))
+plot2 + plotnine.geom_histogram(binwidth=5)
+
+#ex 2, data of speed and stopping distance of cars.csv
+cars=pd.read_csv("cars.csv")
+
+plot3=plotnine.ggplot(cars, plotnine.aes('speed','dist'))
+plot3 + plotnine.geom_point() + plotnine.geom_smooth()
+
+
+#ex 3
+data=pd.read_table("data.txt", delimiter=",")
+
+#ex3 plot 1, barplot of means
+plot4=plotnine.ggplot(data, plotnine.aes('region', 'observations'))
+plot4 + plotnine.geom_bar(stat='summary')
+
+#ex3 plot 2, scatter plot
+plot5=plotnine.ggplot(data, plotnine.aes('region','observations'))
+plot5 + plotnine.geom_jitter()
+
+#the bar plot showing the means leads me to believe the data are all very similar, since the means are all very close. However the scatterplot shows that the distribution of the data in each group is very different.
diff --git a/cars.csv b/cars.csv
new file mode 100644
index 0000000..34b3d46
--- /dev/null
+++ b/cars.csv
@@ -0,0 +1,51 @@
+"","speed","dist"
+"1",4,2
+"2",4,10
+"3",7,4
+"4",7,22
+"5",8,16
+"6",9,10
+"7",10,18
+"8",10,26
+"9",10,34
+"10",11,17
+"11",11,28
+"12",12,14
+"13",12,20
+"14",12,24
+"15",12,28
+"16",13,26
+"17",13,34
+"18",13,34
+"19",13,46
+"20",14,26
+"21",14,36
+"22",14,60
+"23",14,80
+"24",15,20
+"25",15,26
+"26",15,54
+"27",16,32
+"28",16,40
+"29",17,32
+"30",17,40
+"31",17,50
+"32",18,42
+"33",18,56
+"34",18,76
+"35",18,84
+"36",19,36
+"37",19,46
+"38",19,68
+"39",20,32
+"40",20,48
+"41",20,52
+"42",20,56
+"43",20,64
+"44",22,66
+"45",23,54
+"46",24,70
+"47",24,92
+"48",24,93
+"49",24,120
+"50",25,85