diff --git a/EX_7_Script_Final b/EX_7_Script_Final
new file mode 100644
index 0000000..2ccf6cf
--- /dev/null
+++ b/EX_7_Script_Final
@@ -0,0 +1,70 @@
+#Part 1 - Amanda
+#Histograms of sequence length and %GC for every sequence line in Lecture11.fasta
+import pandas
+from plotnine import *
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+with open("Lecture11.fasta","r") as InFile: #Open fasta file as read-only; closed automatically on exit
+    for line in InFile: #Loop through each line in fasta file
+        line=line.strip() #Drop the trailing newline so it is not counted as part of the sequence
+        if not line or '>' in line: #Skip blank lines and any line containing >
+            continue
+        seqLen=float(len(line)) #Calculate length of sequence
+        nG=line.count("G") #Count individual G and C contents
+        nC=line.count("C")
+        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
+#############################################################################
+#Part 2 - Thomas
+#Scatter plot of ice cream sales against temperature with a linear trend line
+import numpy
+import pandas
+from plotnine import *
+
+#names= replaces the header row, whose labels are wrapped in curly quotes in icream_sales.txt
+icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0,names=["temp","sales"])
+icecream.shape
+icecream.head(20)
+
+a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point()
+a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm")
+#############################################################################
+#Part 3 - Balaji
+#Bar plot of the mean observation per region, and jitter plot of every observation
+#Run from the directory that contains data.txt
+import numpy
+import os
+import matplotlib.pyplot as plt
+from plotnine import *
+import pandas
+
+#to parse and read
+data_txt = pandas.read_csv("data.txt")
+directions = ['north','south','east','west']
+
+#create dataframe holding the mean observation of each of the 4 regions
+region_means = [numpy.mean(data_txt[data_txt.region==direction].observations) for direction in directions]
+mean_DF=pandas.DataFrame({'region': directions, 'mean_dir': region_means},columns=['region', 'mean_dir'])
+
+a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
+a+geom_bar(aes(x="region",y="mean_dir"),stat="identity") #Means are already computed, so plot them as-is
+
+b=ggplot(data_txt, aes(x="region", y="observations")) #Plot all observations on scatter plot
+b+geom_jitter()+theme_classic()
+
+#The bar graph plots the mean of each region; the means are almost the same (~15), so the bars look alike.
+#But on the scatter plot north has points centered around 15, while east and west are evenly spread and south
+#has a bi-modal distribution.
+
+
diff --git a/EX_7_Script_Q3 b/EX_7_Script_Q3
new file mode 100644
index 0000000..ccb5e69
--- /dev/null
+++ b/EX_7_Script_Q3
@@ -0,0 +1,25 @@
+#Bar plot of the mean observation per region, and jitter plot of every observation
+#Run from the directory that contains data.txt
+import numpy
+import os
+import matplotlib.pyplot as plt
+from plotnine import *
+import pandas
+
+#to parse and read
+data_txt = pandas.read_csv("data.txt")
+directions = ['north','south','east','west']
+
+#create dataframe holding the mean observation of each of the 4 regions
+region_means = [numpy.mean(data_txt[data_txt.region==direction].observations) for direction in directions]
+mean_DF=pandas.DataFrame({'region': directions, 'mean_dir': region_means},columns=['region', 'mean_dir'])
+
+a=ggplot(mean_DF)+theme_classic()+xlab("region")+ylab("mean_dir")
+a+geom_bar(aes(x="region",y="mean_dir"),stat="identity") #Means are already computed, so plot them as-is
+
+b=ggplot(data_txt, aes(x="region", y="observations")) #Plot all observations on scatter plot
+b+geom_jitter()+theme_classic()
+
+#The bar graph plots the mean of each region; the means are almost the same (~15), so the bars look alike.
+#But on the scatter plot north has points centered around 15, while east and west are evenly spread and south
+#has a bi-modal distribution.
diff --git a/Exercise7.py b/Exercise7.py
new file mode 100755
index 0000000..a8c4a28
--- /dev/null
+++ b/Exercise7.py
@@ -0,0 +1,60 @@
+#Part 1
+#Histograms of sequence length and %GC for every sequence line in Lecture11.fasta
+import pandas
+from plotnine import *
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+with open("Lecture11.fasta","r") as InFile: #Open fasta file as read-only; closed automatically on exit
+    for line in InFile: #Loop through each line in fasta file
+        line=line.strip() #Drop the trailing newline so it is not counted as part of the sequence
+        if not line or '>' in line: #Skip blank lines and any line containing >
+            continue
+        seqLen=float(len(line)) #Calculate length of sequence
+        nG=line.count("G") #Count individual G and C contents
+        nC=line.count("C")
+        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
+
+#Part 3
+#Bar plot of the mean observation per region, and jitter plot of every observation in data.txt
+import numpy
+
+data=pandas.read_csv("data.txt", header=0, sep=",") #Open file as data frame
+
+dataN=data[data.region=="north"] #Subset data frame & find mean for all populations
+nMean=numpy.mean(dataN.observations)
+
+dataE=data[data.region=="east"]
+eMean=numpy.mean(dataE.observations)
+
+dataW=data[data.region=="west"]
+wMean=numpy.mean(dataW.observations)
+
+dataS=data[data.region=="south"]
+sMean=numpy.mean(dataS.observations)
+
+#Build the frame in one step instead of filling an empty frame cell by cell
+means=pandas.DataFrame({'region': ['north','south','east','west'], 'mean': [nMean,sMean,eMean,wMean]},columns=['region', 'mean']) #Combine means into new data frame
+
+c=ggplot(means, aes(x="region",y="mean")) #Plot means on bar graph
+c+geom_col()+theme_classic()
+
+d=ggplot(data, aes(x="region", y="observations")) #Plot all observations on scatter plot
+d+geom_jitter()+theme_classic()
+
+#Graphs tell different stories - only on the scatter plot does it become apparent that the observations
+#in the south region are two discrete populations, rather than a continuous spread like the others.
+#Additionally, the mean for the West region makes it look as though it has the smallest values, whereas
+#the scatterplot shows that it has both the lowest and the highest values, over a very large spread.
+#The mean barplot is really only an accurate representation for the North region.
\ No newline at end of file
diff --git a/Exercise7Part1.py b/Exercise7Part1.py
new file mode 100755
index 0000000..52b7988
--- /dev/null
+++ b/Exercise7Part1.py
@@ -0,0 +1,27 @@
+#Part 1
+#Histograms of sequence length and %GC for every sequence line in Lecture11.fasta
+import pandas
+from plotnine import *
+
+sequenceLength=[] #Set up variables to accept/store sequence data as it is calculated
+percentGC=[]
+
+with open("Lecture11.fasta","r") as InFile: #Open fasta file as read-only; closed automatically on exit
+    for line in InFile: #Loop through each line in fasta file
+        line=line.strip() #Drop the trailing newline so it is not counted as part of the sequence
+        if not line or '>' in line: #Skip blank lines and any line containing >
+            continue
+        seqLen=float(len(line)) #Calculate length of sequence
+        nG=line.count("G") #Count individual G and C contents
+        nC=line.count("C")
+        percGC=float(((nG+nC)/seqLen)*100) #Calculate % GC
+
+        sequenceLength.append(seqLen) #Append length of individual sequences to list
+        percentGC.append(percGC) #Append %GC of individual sequences to list
+
+seqDF=pandas.DataFrame(list(zip(sequenceLength,percentGC)),columns=['sequenceLength','percentGC']) #combine lists into dataframe for easier plotting
+a=ggplot(seqDF, aes(x="sequenceLength")) #Create plot of sequence lengths
+a+geom_histogram()+theme_classic() #Plot as histogram
+
+b=ggplot(seqDF, aes(x="percentGC")) #Create plot of %GC
+b+geom_histogram()+theme_classic() #Plot as histogram
\ No newline at end of file
diff --git a/code b/code
new file mode 100644
index 0000000..a54bf3a
--- /dev/null
+++ b/code
@@ -0,0 +1,12 @@
+#Scatter plot of ice cream sales against temperature with a linear trend line
+import numpy
+import pandas
+from plotnine import *
+
+#names= replaces the header row, whose labels are wrapped in curly quotes in icream_sales.txt
+icecream=pandas.read_csv("icream_sales.txt",sep="\t",header=0,names=["temp","sales"])
+icecream.shape
+icecream.head(20)
+
+a=ggplot(icecream,aes(x="temp",y="sales"))+theme_classic()+geom_point()
+a+xlab("Temperature (C)")+ylab("Sales In Dollars")+stat_smooth(method="lm")
\ No newline at end of file
diff --git a/icream_sales.txt b/icream_sales.txt
new file mode 100644
index 0000000..7faa37b
--- /dev/null
+++ b/icream_sales.txt
@@ -0,0 +1,2 @@
+“temp” “sales”
+14.2 215 16.4 325 11.9 185 15.2 332 18.5 406 22.1 522 19.4 412 25.1 614 23.4 544 18.1 421 22.6 445 17.2 408
\ No newline at end of file