diff --git a/Ex12Q1.py b/Ex12Q1.py new file mode 100755 index 0000000..d677ae2 --- /dev/null +++ b/Ex12Q1.py @@ -0,0 +1,23 @@ +import numpy +import pandas +import plotnine +from plotnine import * +from scipy.optimize import minimize +from scipy.stats import norm + +chicken=pandas.read_csv("chickwts.txt", sep=",") +chicken.shape +ggplot(chicken,aes(x="feed",y="weight"))+geom_dotplot(binaxis="y",stackdir="center", stackratio=0.5, dotsize=0.2)+theme_classic() + +#subset data into different data frames +sub1=chicken.loc[chicken.feed.isin(['soybean','sunflower']),:] + +#Make new data frame: y is the chick weight, x is the 'group' column (x=0 or x=1) +#var2=pandas.DataFrame({'y':var1.col2name, 'x':}) +sub1frame=pandas.DataFrame({'y':sub1.weight,'x':0}) + +#Designate 'treatment' group (sunflower feed) as x=1 +#var2.loc[var1.col1name=='name of treatment group', 'x']=1 +sub1frame.loc[sub1.feed=='sunflower','x']=1 + + diff --git a/Ex12Q2.py b/Ex12Q2.py new file mode 100755 index 0000000..c31e30f --- /dev/null +++ b/Ex12Q2.py @@ -0,0 +1,4 @@ +import numpy +import pandas +import re + diff --git a/Exercise12-Q1.html b/Exercise12-Q1.html new file mode 100644 index 0000000..ed21850 --- /dev/null +++ b/Exercise12-Q1.html @@ -0,0 +1,12715 @@ + + +
+This dataset describes the attributes of chicks. The first column contains the weight of each chick as a numeric value, and the second column contains a character string describing the type of feed it received.
+#Import packages
+import numpy
+import pandas
+import scipy
+import scipy.integrate as spint
+from scipy.stats import norm
+from scipy.optimize import minimize
+from scipy.stats import chi2
+import plotnine
+from plotnine import *
+
+#Load Data
+#The code below reads the "weight" and "feed" columns of this comma-separated file
+chicken=pandas.read_csv("chickwts.txt", sep=",")
+#Generate plot that shows the weights of chicks vs. feed type(scatter)
+plot1= ggplot(chicken,aes(x="feed",y="weight"))+geom_dotplot(binaxis="y",stackdir="center", stackratio=0.5, dotsize=0.2)+theme_classic()
+
+#print(...) with parentheses runs on both Python 2 and 3 and matches the other print calls in this file
+print(plot1)
+
+#Generate plot that shows average weight vs. feed type (bar)
+plot2= ggplot(chicken, aes(y="weight",x="feed"))+geom_bar(stat="summary",fun_y=numpy.mean)
+print(plot2)
+#Subset the Data to Only Have the Types of Feed we are Interested in
+chicksub=chicken.loc[chicken.feed.isin(['soybean', 'sunflower']),:]
+
+#Make Dataframe for Modeling
+#y is the observed weight; x is the group indicator, 0 by default and 1 for sunflower-fed chicks
+chickFrame= pandas.DataFrame({'y':chicksub.weight,'x':0})
+chickFrame.loc[chicksub.feed=='sunflower','x']=1
+#Define Null
+def nllikeNull(pNull,obsNull):
+    """Negative log-likelihood of the null (single-mean) normal model.
+
+    pNull   -- sequence [B0, sigma]: the common mean and the standard deviation.
+    obsNull -- object with a numeric column/attribute 'y' holding the observations.
+
+    Returns the summed negative normal log-density of obsNull.y, or +inf
+    when sigma is not positive.
+    """
+    B0Null=pNull[0]
+    sigmaNull=pNull[1]
+    #The optimizer is unconstrained and may propose sigma<=0, for which the
+    #normal density is undefined; report such points as infinitely bad
+    #instead of handing a NaN back to the optimizer.
+    if sigmaNull<=0:
+        return float("inf")
+    expectedNull=B0Null
+    nllNull=-1*norm(expectedNull,sigmaNull).logpdf(obsNull.y).sum()
+    return nllNull
+
+#Define Alternative
+def nlllikeAlt(pAlt,obsAlt):
+    """Negative log-likelihood of the alternative (two-group) normal model.
+
+    pAlt   -- sequence [B0, B1, sigma]: intercept, group effect and standard deviation.
+    obsAlt -- object with columns/attributes 'x' (group indicator) and 'y' (observations).
+
+    The expected value of each observation is B0 + B1*x. Returns the summed
+    negative normal log-density of obsAlt.y, or +inf when sigma is not positive.
+    """
+    B0Alt=pAlt[0]
+    B1Alt=pAlt[1]
+    sigmaAlt=pAlt[2]
+    #The optimizer is unconstrained and may propose sigma<=0, for which the
+    #normal density is undefined; report such points as infinitely bad
+    #instead of handing a NaN back to the optimizer.
+    if sigmaAlt<=0:
+        return float("inf")
+    expectedAlt=B0Alt+B1Alt*obsAlt.x
+    nllAlt=-1*norm(expectedAlt,sigmaAlt).logpdf(obsAlt.y).sum()
+    return nllAlt
+#Null Model
+#Starting values for the two parameters nllikeNull reads from its first argument: [B0, sigma]
+initialGuessNull=numpy.array([1,1])
+#Minimize the null negative log-likelihood with Nelder-Mead; chickFrame is passed through to nllikeNull as obsNull
+fitNullChickFrame=minimize(nllikeNull,initialGuessNull, method="Nelder-Mead",options={'disp':True},args=chickFrame)
+print("Estimated Parameters: Null Model")
+print(fitNullChickFrame.x)
+print("NLL: Null Model")
+#Keep the minimized objective value; it is used later to compute the D statistic
+nllNullChickFrame=fitNullChickFrame.fun
+#Print NLL value for Null Model
+print(nllNullChickFrame)
+#Alternative Model
+#Starting values for the three parameters nlllikeAlt reads from its first argument: [B0, B1, sigma]
+initialGuessAlt=numpy.array([1,1,1])
+#Minimize the alternative negative log-likelihood with Nelder-Mead; chickFrame is passed through to nlllikeAlt as obsAlt
+fitAltChickFrame=minimize(nlllikeAlt,initialGuessAlt,method="Nelder-Mead",options={'disp':True},args=chickFrame)
+print("Estimated Parameters: Alternative Model")
+print(fitAltChickFrame.x)
+print("NLL: Alternative Model")
+#Keep the minimized objective value; it is used later to compute the D statistic
+nllAltChickFrame=fitAltChickFrame.fun
+#Print NLL value for Alternative Model
+print(nllAltChickFrame)
+Because our p-value is less than 0.05, we reject the null hypothesis in favor of the alternative hypothesis. This means that there is a statistically significant difference in weight between the chicks that were fed soybean feed and those that were fed sunflower feed.
+ +#Calculate D value
+DchickFrame=2*(nllNullChickFrame-nllAltChickFrame)
+print("D Value")
+print(DchickFrame)
+
+#Calculate p value
+pChickFrame=1-scipy.stats.chi2.cdf(x=DchickFrame,df=1)
+print("p-value")
+print(pChickFrame)
+Matching patterns using regular expressions is useful when you want to look for strings in a file with a lot of data. In this exercise we will look at 3 scenarios and use regular expressions to match patterns.
+In order to use regular expressions in Python, you first need to import the regular expression package (re). Then, for each scenario, you will need to simulate some data, build the necessary regular expression(s), and filter the data with them.
 +This pattern will match times after noon but before midnight when reported in 24-hour or "military" format (e.g. 15:30). The following regular expressions work because they only match strings whose hour starts with a 1 followed by a digit from 2 to 9 (e.g. 12:xx) or with a 2 (e.g. 20:xx). The print statement then combines the results of the two filters so that you can see all values after noon and before midnight.
+ +#Import packages needed
+import re
+#Simulate Data
+times=['00:30','o1:30','02:30','03:30','04:30','05:30','06:30','07:30','08:30','09:30','10:30','11:30','12:30','13:30','14:30','15:30','16:30','17:30','18:30','19:30','20:30','21:30','22:30','23:30']
+#Build expressions
+regex1=re.compile('[1][2-9]:\d{2}')
+regex2=re.compile('[2][0-9]:\d{2}')
+#Filter
+print(filter(regex1.match,times)+filter(regex2.match,times))
+This pattern will match genus and species names that are expressed in the format G. species (e.g. H. sapiens). This code will look for a capital letter (A-Z), followed by a period, then a space and finally 2-25 lowercase letters. As we use the .match function, it will look for instances where the string starts with the specified pattern.
+ +#Simulate Data
+names=['M. avium','Bubbles','T. cruzi','J. F. Kennedy','B. megaterium','Kei-ichi Uchiya','mumbo.jumbo','T. rex', 'S. pyogenes','h. sapiens']
+#Build Expression
+regex=re.compile('[A-Z]\.\s[a-z]{2,25}')
+#Filter
+print(filter(regex.match,names))
+As you can see, this expression works because it ignored items like 'Bubbles', 'J. F. Kennedy', etc. This is because they did not fit the required format. In the instance of 'Bubbles', it was ignored because it didn't have a period or space following the first letter. In the 'J. F. Kennedy' example, it was ignored because it failed the second part of the expression, which requires a lowercase word following the first period and space.
+ +This pattern will match social security numbers in the proper format(e.g. 389-05-4771). This code will look for 3 digits followed by a dash, then 2 digits followed by a dash and finally 4 digits.
+ +#Simulate Data
+data=['389-05-4771','123-45-6789','McDougal Littell','876-54-3210','111-22-3333','Goofy','888-77-6666','22-333-44']
+#Build Expression
+regex=re.compile('\d{3}\-\d{2}\-\d{4}')
+#Filter
+print(filter(regex.match,data))
+This regular expression worked because it ignored data like 'Goofy' and '22-333-44'. Naturally, 'Goofy' was ignored because it didn't start with a digit, and '22-333-44' was ignored because it didn't start with 3 digits.
+ +