diff --git a/exercise7 b/exercise7
new file mode 100755
index 0000000..f471461
--- /dev/null
+++ b/exercise7
@@ -0,0 +1,78 @@
+####Exercise7####
+#Question 1
+#load dataset
+import pandas
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+#read the fasta file; the with-block closes it even if a line fails to parse
+#NOTE: assumes each record is one header line followed by one sequence line
+with open("Lecture11.fasta","r") as InFile:
+    for Line in InFile:
+        # remove newline character from file line
+        Line=Line.strip()
+        # skip blank lines so an empty line cannot cause a divide-by-zero
+        if not Line:
+            continue
+        # header lines start with '>'; everything after it is the sequence ID
+        if Line.startswith('>'):
+            sequenceID.append(Line[1:])
+        else:
+            # sequence length as a float so the percentage below is a true division
+            Seqlength = float(len(Line))
+            sequenceLength.append(Seqlength)
+            # count the number of G's and C's
+            nG=Line.count("G")
+            nC=Line.count("C")
+            # percent GC content of this sequence
+            gcTotal = (nG+nC)/Seqlength*100
+            percentGC.append(gcTotal)
+
+#dataframe of resulting info
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+
+#Histogram of sequence lengths
+import plotnine
+from plotnine import *
+p=(ggplot(data=seqDF) +
+    aes(x="sequenceLength") +
+    geom_histogram(binwidth=4))
+p
+#Histogram of Percent GC
+g=(ggplot(data=seqDF) +
+    aes(x="percentGC") +
+    geom_histogram(binwidth=5))
+g
+
+#Question 2
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+
+#read in file
+Part2=pandas.read_csv("part2datacopy.txt", sep=",")
+#print(Part2)
+
+#plotting data in scatterplot with trendline
+a=ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))+theme_classic()+geom_point()
+a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm")
+
+#Question 3
+#load the dataset
+import pandas
+import numpy
+Data = pandas.read_csv("data.txt", sep=',')
+print (Data)
+
+#making bar graph with region as x and ave as y
+import plotnine
+from plotnine import *
+d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
+d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)
+
+#scatter plot of everything observed
+a=ggplot(Data,aes(x="region",y="observations"))
+a+geom_jitter()+coord_cartesian()
diff --git a/part1script.py b/part1script.py
new file mode 100644
index 0000000..eceada6
--- /dev/null
+++ b/part1script.py
@@ -0,0 +1,41 @@
+import numpy
+import pandas
+from plotnine import *
+
+
+#Question 1
+#create lists for storing information about sequences
+sequenceID=[]
+sequenceLength=[]
+percentGC=[]
+meltingTemp=[]
+
+#loop through each line in fasta file to process sequences
+#the with-block closes the file even if a line fails to parse
+#NOTE: assumes each record is one header line followed by one sequence line
+with open("Lecture11.fasta","r") as InFile:
+    for Line in InFile:
+        Line=Line.strip() #removes white space, tab, space, newline characters
+        if not Line: #skip blank lines so they cannot cause a divide-by-zero
+            continue
+        if Line.startswith('>'): #header line; the rest of it is the sequence ID
+            sequenceID.append(Line[1:])
+        else:
+            seqLen=float(len(Line))
+            nG=Line.count("G")
+            nC=Line.count("C")
+
+            #append values to lists
+            sequenceLength.append(seqLen)
+            percentGC.append((nG+nC)/seqLen*100)
+
+#combine lists into dataframe
+seqDF = pandas.DataFrame(list(zip(sequenceID,sequenceLength,percentGC)),columns=['sequenceID','sequenceLength','percentGC'])
+
+#plots histogram of sequence length
+b=ggplot(seqDF,aes(x="sequenceLength"))
+b+geom_histogram(binwidth=5)+theme_classic()
+
+#plots histogram of percent GC
+b=ggplot(seqDF,aes(x="percentGC"))
+b+geom_histogram(binwidth=5)+theme_classic()
diff --git a/part2datacopy.txt b/part2datacopy.txt
new file mode 100644
index 0000000..82b3374
--- /dev/null
+++ b/part2datacopy.txt
@@ -0,0 +1 @@
+oil changes per year,cost of repairs($) 3,300 5,300 2,500 3,400 1,700 4,400 6,100 4,250 3,450 2,650 0,600 10,0 7,150
\ No newline at end of file
diff --git a/part2script.py b/part2script.py
new file mode 100644
index 0000000..6f3d765
--- /dev/null
+++ b/part2script.py
@@ -0,0 +1,11 @@
+import numpy
+import pandas
+import plotnine
+from plotnine import *
+
+Part2=pandas.read_csv("part2datacopy.txt", sep=",")
+#print(Part2)
+
+#plotting data in scatterplot with trendline
+a=ggplot(Part2,aes(x="oil changes per year",y="cost of repairs($)"))+theme_classic()+geom_point()
+a+xlab("oil changes per year")+ylab("cost of repairs($)")+stat_smooth(method="lm")
diff --git a/part3script.py b/part3script.py
new file mode 100644
index 0000000..595e38c
--- /dev/null
+++ b/part3script.py
@@ -0,0 +1,16 @@
+#Question 3
+#load the dataset
+import pandas
+import numpy
+Data = pandas.read_csv("data.txt", sep=',')
+#print (Data)
+
+#making bar graph with region as x and ave as y
+import plotnine
+from plotnine import *
+d=ggplot(Data)+theme_classic()+xlab("region")+ylab("Average")
+d+geom_bar(aes(x="factor(region)",y="observations"),stat="summary",fun_y=numpy.mean)
+
+#scatter plot of all observations
+a=ggplot(Data,aes(x="region",y="observations"))
+a+geom_jitter()+coord_cartesian()